📄 bigsort.c
字号:
/*JS*********************************************************************** Program : BIGSORT* Language: ANSI-C* Author : Joerg Schoen* Purpose : Sort huge amounts of data by doing a multi-phase* sorting on temporary files. For a description of the* algorithm see Section 2.4.4 in* Niklaus Wirth: "Algorithmen und Datenstrukturen mit* Modula-2"** This implementation automatically recognizes if the input data fits* completely into memory and then does a in-place sort via quicksort.**************************************************************************/#ifndef lintstatic const char rcsid[] = "$Id: bigsort.c,v 1.12 1998/04/01 17:07:48 joerg Stab joerg $";#endif/********* INCLUDE *********/#include <jsalloca.h>#include <stddef.h>#include <stdio.h>#include <stdlib.h>#include <string.h>#include <jsconfig.h>#include <jssubs.h>#ifndef CONFIG_NO_POSIX# include <unistd.h>#endif/*#define DEBUG*//********* DEFINES *********/#ifndef BSH_File/* START-DEFINITIONS */#include <stdio.h> /* For "FILE" definition *//* Structure to access a sorted stream */typedef struct { /* 0: Memory list ("BSH_Buff"); 1: File access ("BSH_File") */ int BSH_Type; long BSH_Len,BSH_Count,BSH_Curr; union { FILE *bsh_File; void **bsh_Buff; } c;} BigSortHandle;#define BSH_File c.bsh_File#define BSH_Buff c.bsh_Buff/* For variable mode from bigSortOpen */#define BIGSORT_FDISTINCT (1<<0) /* remove duplicate entries *//* Obtain length of sorted stream */#define bigSortLength(bsh) ((bsh)->BSH_Count)/* Seek or rewind a stream and get current position */#define bigSortSeek(bsh,pos) ((bsh)->BSH_Curr = (pos))#define bigSortRewind(bsh) bigSortSeek(bsh,0)#define bigSortCurrPos(bsh) ((bsh)->BSH_Curr)/* Get next set in sequence (sequential access) */#define bigSortNext(bsh,buff) bigSortGet(bsh,-1,buff)/* Same as bigSortNext, but do not advance internal pointer */#define bigSortPeek(bsh,buff) bigSortGet(bsh,-2,buff)/* END-DEFINITIONS */#endif/* Align fields for comparison routine *//*#define CONFIG_BIGSORT_ALIGN*//* Maximum allocated memory */#ifndef BIGSORT_MAXMEM# define BIGSORT_MAXMEM 500000#endif/* Internal structures */typedef struct { FILE *SEQ_File; char *SEQ_Buffer; long SEQ_Count; int SEQ_NRuns,SEQ_DiffRuns;} Sequence;typedef struct { long BSI_Len; void *BSI_UPtr; int (*BSI_Compare)(void *uptr,const void *a,const void *b); long BSI_Count; Sequence *BSI_Seqs; int BSI_NSeqs; int BSI_Level; int BSI_Curr; /* current output sequence */ void **BSI_Temp;} BigSort;#define Prototype extern/********* PROTOTYPES *********/Prototype BigSortHandle *bigSortOpen(BigSortHandle *bsh,long len, int (*get)(void *uptr,void *buff,long nr), int (*compare)(void *uptr, const void *a, const void *b), void *uPtr,int mode);Prototype int bigSortClose(BigSortHandle *bsh,int mode);Prototype long bigSortGet(BigSortHandle *bsh,long nr,void *buff);Prototype long bigSortFind(BigSortHandle *bsh,const void *cont, int (*compare)(void *uptr,const void *a, const void *b),void *uPtr, void *dest);static int distRuns(BigSort *pbs,int n, int (*get)(void *uptr,void *buff,long nr), int mode);static void bsSift(void **array,int (*compare)(void *uptr, const void *a, const void *b), void *uPtr,long l,long r);static void sortSpecial(void **array,long n, int (*compare)(void *uptr,const void *a, const void *b), void *uPtr);static int selectRun(BigSort *pbs);static int mergeRuns(BigSort *pbs,int mode);static FILE *bsTmpFile(void);Prototype long BSortMaxMem;Prototype int BSortNSeqs;Prototype const char *BSortTempDir;/********* GLOBAL VARIABLES *********/long BSortMaxMem = BIGSORT_MAXMEM;int BSortNSeqs = 32; /* at least 3 */const char *BSortTempDir;/* ERROR-DEFINITIONS from bigSortOpen label _ERR_BSORTOPEN ord 18 BSortMaxMem to small*//*JS********************************************************************** Reads a stream until EOF is signaled and sorts the units of size len* using the given comparison function. The routine get must return 0* on success, -1 on error and 1 on EOF. A handle is returned which* allows direct or sequential access to the sorted units. If no* comparison routine is given (compare == NULL), the stream is simply* transferred to a direct-access one.*************************************************************************/BigSortHandle *bigSortOpen(BigSortHandle *bsh,long len, int (*get)(void *uptr,void *buff,long nr), int (*compare)(void *uptr,const void *a, const void *b),void *uPtr,int mode)/************************************************************************/{ long i,j; BigSort bs; int nSeqs; /* Set up internal structure */ bs.BSI_Len = len; bs.BSI_UPtr = uPtr; bs.BSI_Compare = compare; bs.BSI_NSeqs = nSeqs = BSortNSeqs; bs.BSI_Level = 0; bs.BSI_Curr = 0; bs.BSI_Count = 0; bs.BSI_Temp = NULL; /* *** 1. Distribute elements *** */ /* How much elements may we store in memory (at least 2)? */ if((i = (BSortMaxMem - nSeqs * (len + sizeof(void *))) / (len + sizeof(void *))) < 2) { JSErrNo = _ERR_BSORTOPEN + 0; goto error2; } if((j = distRuns(&bs,i,get,mode)) < 0) goto error2; if(j > 0) { /* *** Simple case: All elements fit in memory, so stop *** */ if(bsh == NULL && (bsh = (BigSortHandle *)malloc(sizeof(*bsh))) == NULL) goto error2; bsh->BSH_Type = 0; bsh->BSH_Len = len; bsh->BSH_Curr = 0; bsh->BSH_Count = bs.BSI_Count; bsh->BSH_Buff = bs.BSI_Temp; return(bsh); } /* *** 2. Mix sequences *** */ if(compare) { if(mergeRuns(&bs,mode) < 0) goto error;#ifdef DEBUG if(bs.BSI_Seqs[0].SEQ_Count != bs.BSI_Count) fprintf(stderr,"ERROR: Different count %ld != %ld!\n", bs.BSI_Seqs[0].SEQ_Count,bs.BSI_Count);#endif } /* *** Finished, set up return value *** */ if(bsh == NULL && (bsh = (BigSortHandle *)malloc(sizeof(*bsh))) == NULL) goto error; bsh->BSH_Type = 1; bsh->BSH_Len = len; bsh->BSH_Curr = 0; bsh->BSH_Count = bs.BSI_Count; bsh->BSH_File = bs.BSI_Seqs[0].SEQ_File; /* Close everything and clean up */#ifdef CONFIG_BIGSORT_ALIGN free(bs.BSI_Seqs[0].SEQ_Buffer);#endif for(i = 1 ; i < bs.BSI_NSeqs ; i++) { if(bs.BSI_Seqs[i].SEQ_File && fclose(bs.BSI_Seqs[i].SEQ_File)) goto error;#ifdef CONFIG_BIGSORT_ALIGN free(bs.BSI_Seqs[i].SEQ_Buffer);#endif } free(bs.BSI_Seqs);#ifndef CONFIG_BIGSORT_ALIGN /* Now its time to free temporary work space from distRuns */ free(bs.BSI_Temp);#endif return(bsh);error: if(bs.BSI_Seqs) {#ifdef CONFIG_BIGSORT_ALIGN free(bs.BSI_Seqs[0].SEQ_Buffer);#endif for(i = 1 ; i < bs.BSI_NSeqs ; i++) { if(bs.BSI_Seqs[i].SEQ_File && fclose(bs.BSI_Seqs[i].SEQ_File)) goto error;#ifdef CONFIG_BIGSORT_ALIGN free(bs.BSI_Seqs[i].SEQ_Buffer);#endif } free(bs.BSI_Seqs); }#ifndef CONFIG_BIGSORT_ALIGN free(bs.BSI_Temp);#endiferror2: return(NULL);}/*JS********************************************************************** Closes the handle obtained by bigSortOpen. If mode is non-zero, bsh* itself is not deallocated.*************************************************************************/int bigSortClose(BigSortHandle *bsh,int mode)/************************************************************************/{ if(bsh->BSH_Type == 0) {#ifdef CONFIG_BIGSORT_ALIGN long i; for(i = 0 ; i < bsh->BSH_Count ; i++) free(bsh->BSH_Buff[i]);#endif free(bsh->BSH_Buff); } else if(fclose(bsh->BSH_File)) { if(!mode) free(bsh); return(-1); } if(!mode) free(bsh); return(0);}/*JS********************************************************************** Provides direct access to the sorted stream. Read set nr (0 .. n-1)* into buff and return ordinal of it. On error -1 is returned and -2* on EOF. If nr == -1, the next set in the sequence is returned* (sequential access). nr == -2 is similar to nr == -1, but the* current position is not moved.*************************************************************************/long bigSortGet(BigSortHandle *bsh,long nr,void *buff)/************************************************************************/{ long number,len; /* Current position */ number = (nr >= 0) ? nr : bsh->BSH_Curr; /* Check for EOF */ if(number >= bsh->BSH_Count) return(-2); len = bsh->BSH_Len; if(bsh->BSH_Type == 0) { /* Memory list */ memcpy(buff,bsh->BSH_Buff[number],len); } else if((nr >= 0 && fseek(bsh->BSH_File,number * len,SEEK_SET)) || fread(buff,len,1,bsh->BSH_File) != 1 || /* Go back to current position if required */ (nr == -2 && fseek(bsh->BSH_File,number * len,SEEK_SET))) return(-1); if(nr != -2) bsh->BSH_Curr = number + 1; return(number);}/*JS********************************************************************** Provides direct access to the stream, allowing to search for a* specific set.*************************************************************************/long bigSortFind(BigSortHandle *bsh,const void *cont, int (*compare)(void *uptr,const void *a,const void *b), void *uPtr,void *dest)/************************************************************************/{ long l,r; l = 0; r = bsh->BSH_Count; if(bsh->BSH_Type == 1) { void *temp; if((temp = malloc(bsh->BSH_Len)) == NULL) return(-1); /* ** Binary search in file ** */ while(l < r) { long m = (l + r) / 2; if(fseek(bsh->BSH_File,m * bsh->BSH_Len,SEEK_SET) || fread(temp,bsh->BSH_Len,1,bsh->BSH_File) != 1) { free(temp); return(-1); } if((*compare)(uPtr,temp,cont) < 0) l = m + 1; else r = m; } free(temp); if(dest && (fseek(bsh->BSH_File,l * bsh->BSH_Len,SEEK_SET) || fread(dest,bsh->BSH_Len,1,bsh->BSH_File) != 1)) return(-1); return(l); } /* ** Binary search in memory list ** */ while(l < r) { long m = (l + r) / 2; if((*compare)(uPtr,bsh->BSH_Buff[m],cont) < 0) l = m + 1; else r = m; } if(dest) memcpy(dest,bsh->BSH_Buff[l],bsh->BSH_Len); return(l);}/*JS********************************************************************** Distribute initial data to the sequences, sorting chunks of a fixed* size using a modified heap sort.*************************************************************************/static int distRuns(BigSort *pbs,int n, int (*get)(void *uptr,void *buff,long nr),int mode)/************************************************************************/{ void **a,*tmp; long i,len,s,r; len = pbs->BSI_Len; /* Set up temporary array */ if((a = (void **)#ifdef CONFIG_BIGSORT_ALIGN malloc((n + 1) * sizeof(*a))#else malloc((n + 1) * (sizeof(*a) + len) + sizeof(*a))#endif ) == NULL) goto error;#ifndef CONFIG_BIGSORT_ALIGN /* Hack: Assuming most strict alignment for type "double" and * sizeof(double) == 2 * sizeof(void *), the following * calculation makes sure that fields are correctly aligned. */ a[0] = (void *)&a[(n + 2) & ~1];#endif /* *** To recognize small amounts of data more quickly, * *** try to read first as much data as possible. */ for(i = 0 ; i < n ; i++) { int ret;#ifdef CONFIG_BIGSORT_ALIGN if((a[i] = (void *)malloc(len)) == NULL) goto error;#else if(i) a[i] = (char *)(a[i - 1]) + len;#endif if((ret = (*get)(pbs->BSI_UPtr,a[i],pbs->BSI_Count)) < 0) goto error; /* EOF? */ if(ret > 0) break; pbs->BSI_Count++; } if(i < n) {#ifndef CONFIG_BIGSORT_ALIGN char *from,*to; ptrdiff_t diff;#endif /* Special case: Everything in memory. Sort a[0 .. i-1] * if required and return */#ifdef CONFIG_BIGSORT_ALIGN free(a[i]);#endif if(pbs->BSI_Compare) { sortSpecial(a,i,pbs->BSI_Compare,pbs->BSI_UPtr); if((mode & BIGSORT_FDISTINCT) && i > 1) { /* Remove duplicate entries */ for(s = 0, r = 1 ; r < i ; r++) { if((*(pbs->BSI_Compare))(pbs->BSI_UPtr,a[s],a[r]) == 0) {#ifdef CONFIG_BIGSORT_ALIGN free(a[r]);#endif continue; } s++; if(s < r)#ifdef CONFIG_BIGSORT_ALIGN a[s] = a[r];#else memcpy(a[s],a[r],len);#endif } pbs->BSI_Count = ++s; /* In case we did not align, we cannot reduce memory by setting * i = s, since the sorting routine shuffled everything around. */#ifdef CONFIG_BIGSORT_ALIGN i = s;#endif } }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -