tuplesort.c
来自「PostgreSQL7.4.6 for Linux」· C语言 代码 · 共 2,177 行 · 第 1/5 页
C
2,177 行
/*------------------------------------------------------------------------- * * tuplesort.c * Generalized tuple sorting routines. * * This module handles sorting of heap tuples, index tuples, or single * Datums (and could easily support other kinds of sortable objects, * if necessary). It works efficiently for both small and large amounts * of data. Small amounts are sorted in-memory using qsort(). Large * amounts are sorted using temporary files and a standard external sort * algorithm. * * See Knuth, volume 3, for more than you want to know about the external * sorting algorithm. We divide the input into sorted runs using replacement * selection, in the form of a priority tree implemented as a heap * (essentially his Algorithm 5.2.3H), then merge the runs using polyphase * merge, Knuth's Algorithm 5.4.2D. The logical "tapes" used by Algorithm D * are implemented by logtape.c, which avoids space wastage by recycling * disk space as soon as each block is read from its "tape". * * We do not form the initial runs using Knuth's recommended replacement * selection data structure (Algorithm 5.4.1R), because it uses a fixed * number of records in memory at all times. Since we are dealing with * tuples that may vary considerably in size, we want to be able to vary * the number of records kept in memory to ensure full utilization of the * allowed sort memory space. So, we keep the tuples in a variable-size * heap, with the next record to go out at the top of the heap. Like * Algorithm 5.4.1R, each record is stored with the run number that it * must go into, and we use (run number, key) as the ordering key for the * heap. When the run number at the top of the heap changes, we know that * no more records of the prior run are left in the heap. * * The (approximate) amount of memory allowed for any one sort operation * is given in kilobytes by the external variable SortMem. Initially, * we absorb tuples and simply store them in an unsorted array as long as * we haven't exceeded SortMem. If we reach the end of the input without * exceeding SortMem, we sort the array using qsort() and subsequently return * tuples just by scanning the tuple array sequentially. If we do exceed * SortMem, we construct a heap using Algorithm H and begin to emit tuples * into sorted runs in temporary tapes, emitting just enough tuples at each * step to get back within the SortMem limit. Whenever the run number at * the top of the heap changes, we begin a new run with a new output tape * (selected per Algorithm D). After the end of the input is reached, * we dump out remaining tuples in memory into a final run (or two), * then merge the runs using Algorithm D. * * When merging runs, we use a heap containing just the frontmost tuple from * each source run; we repeatedly output the smallest tuple and insert the * next tuple from its source tape (if any). When the heap empties, the merge * is complete. The basic merge algorithm thus needs very little memory --- * only M tuples for an M-way merge, and M is at most six in the present code. * However, we can still make good use of our full SortMem allocation by * pre-reading additional tuples from each source tape. Without prereading, * our access pattern to the temporary file would be very erratic; on average * we'd read one block from each of M source tapes during the same time that * we're writing M blocks to the output tape, so there is no sequentiality of * access at all, defeating the read-ahead methods used by most Unix kernels. * Worse, the output tape gets written into a very random sequence of blocks * of the temp file, ensuring that things will be even worse when it comes * time to read that tape. A straightforward merge pass thus ends up doing a * lot of waiting for disk seeks. We can improve matters by prereading from * each source tape sequentially, loading about SortMem/M bytes from each tape * in turn. Then we run the merge algorithm, writing but not reading until * one of the preloaded tuple series runs out. Then we switch back to preread * mode, fill memory again, and repeat. This approach helps to localize both * read and write accesses. * * When the caller requests random access to the sort result, we form * the final sorted run on a logical tape which is then "frozen", so * that we can access it randomly. When the caller does not need random * access, we return from tuplesort_performsort() as soon as we are down * to one run per logical tape. The final merge is then performed * on-the-fly as the caller repeatedly calls tuplesort_gettuple; this * saves one cycle of writing all the data out to disk and reading it in. * * * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION * $Header: /cvsroot/pgsql/src/backend/utils/sort/tuplesort.c,v 1.37 2003/08/17 19:58:06 tgl Exp $ * *------------------------------------------------------------------------- */#include "postgres.h"#include "access/heapam.h"#include "access/nbtree.h"#include "catalog/pg_amop.h"#include "catalog/pg_operator.h"#include "miscadmin.h"#include "utils/catcache.h"#include "utils/datum.h"#include "utils/logtape.h"#include "utils/lsyscache.h"#include "utils/syscache.h"#include "utils/tuplesort.h"/* * Possible states of a Tuplesort object. These denote the states that * persist between calls of Tuplesort routines. */typedef enum{ TSS_INITIAL, /* Loading tuples; still within memory * limit */ TSS_BUILDRUNS, /* Loading tuples; writing to tape */ TSS_SORTEDINMEM, /* Sort completed entirely in memory */ TSS_SORTEDONTAPE, /* Sort completed, final run is on tape */ TSS_FINALMERGE /* Performing final merge on-the-fly */} TupSortStatus;/* * We use a seven-tape polyphase merge, which is the "sweet spot" on the * tapes-to-passes curve according to Knuth's figure 70 (section 5.4.2). */#define MAXTAPES 7 /* Knuth's T */#define TAPERANGE (MAXTAPES-1) /* Knuth's P *//* * Private state of a Tuplesort operation. */struct Tuplesortstate{ TupSortStatus status; /* enumerated value as shown above */ bool randomAccess; /* did caller request random access? */ long availMem; /* remaining memory available, in bytes */ LogicalTapeSet *tapeset; /* logtape.c object for tapes in a temp * file */ /* * These function pointers decouple the routines that must know what * kind of tuple we are sorting from the routines that don't need to * know it. They are set up by the tuplesort_begin_xxx routines. * * Function to compare two tuples; result is per qsort() convention, ie: * * <0, 0, >0 according as a<b, a=b, a>b. */ int (*comparetup) (Tuplesortstate *state, const void *a, const void *b); /* * Function to copy a supplied input tuple into palloc'd space. (NB: * we assume that a single pfree() is enough to release the tuple * later, so the representation must be "flat" in one palloc chunk.) * state->availMem must be decreased by the amount of space used. */ void *(*copytup) (Tuplesortstate *state, void *tup); /* * Function to write a stored tuple onto tape. The representation of * the tuple on tape need not be the same as it is in memory; * requirements on the tape representation are given below. After * writing the tuple, pfree() it, and increase state->availMem by the * amount of memory space thereby released. */ void (*writetup) (Tuplesortstate *state, int tapenum, void *tup); /* * Function to read a stored tuple from tape back into memory. 'len' * is the already-read length of the stored tuple. Create and return * a palloc'd copy, and decrease state->availMem by the amount of * memory space consumed. */ void *(*readtup) (Tuplesortstate *state, int tapenum, unsigned int len); /* * This array holds pointers to tuples in sort memory. If we are in * state INITIAL, the tuples are in no particular order; if we are in * state SORTEDINMEM, the tuples are in final sorted order; in states * BUILDRUNS and FINALMERGE, the tuples are organized in "heap" order * per Algorithm H. (Note that memtupcount only counts the tuples * that are part of the heap --- during merge passes, memtuples[] * entries beyond TAPERANGE are never in the heap and are used to hold * pre-read tuples.) In state SORTEDONTAPE, the array is not used. */ void **memtuples; /* array of pointers to palloc'd tuples */ int memtupcount; /* number of tuples currently present */ int memtupsize; /* allocated length of memtuples array */ /* * While building initial runs, this array holds the run number for * each tuple in memtuples[]. During merge passes, we re-use it to * hold the input tape number that each tuple in the heap was read * from, or to hold the index of the next tuple pre-read from the same * tape in the case of pre-read entries. This array is never * allocated unless we need to use tapes. Whenever it is allocated, * it has the same length as memtuples[]. */ int *memtupindex; /* index value associated with * memtuples[i] */ /* * While building initial runs, this is the current output run number * (starting at 0). Afterwards, it is the number of initial runs we * made. */ int currentRun; /* * These variables are only used during merge passes. mergeactive[i] * is true if we are reading an input run from (actual) tape number i * and have not yet exhausted that run. mergenext[i] is the memtuples * index of the next pre-read tuple (next to be loaded into the heap) * for tape i, or 0 if we are out of pre-read tuples. mergelast[i] * similarly points to the last pre-read tuple from each tape. * mergeavailmem[i] is the amount of unused space allocated for tape * i. mergefreelist and mergefirstfree keep track of unused locations * in the memtuples[] array. memtupindex[] links together pre-read * tuples for each tape as well as recycled locations in * mergefreelist. It is OK to use 0 as a null link in these lists, * because memtuples[0] is part of the merge heap and is never a * pre-read tuple. */ bool mergeactive[MAXTAPES]; /* Active input run source? */ int mergenext[MAXTAPES]; /* first preread tuple for each * source */ int mergelast[MAXTAPES]; /* last preread tuple for each * source */ long mergeavailmem[MAXTAPES]; /* availMem for prereading * tapes */ long spacePerTape; /* actual per-tape target usage */ int mergefreelist; /* head of freelist of recycled slots */ int mergefirstfree; /* first slot never used in this merge */ /* * Variables for Algorithm D. Note that destTape is a "logical" tape * number, ie, an index into the tp_xxx[] arrays. Be careful to keep * "logical" and "actual" tape numbers straight! */ int Level; /* Knuth's l */ int destTape; /* current output tape (Knuth's j, less 1) */ int tp_fib[MAXTAPES]; /* Target Fibonacci run counts * (A[]) */ int tp_runs[MAXTAPES]; /* # of real runs on each tape */ int tp_dummy[MAXTAPES]; /* # of dummy runs for each tape * (D[]) */ int tp_tapenum[MAXTAPES]; /* Actual tape numbers (TAPE[]) */ /* * These variables are used after completion of sorting to keep track * of the next tuple to return. (In the tape case, the tape's current * read position is also critical state.) */ int result_tape; /* actual tape number of finished output */ int current; /* array index (only used if SORTEDINMEM) */ bool eof_reached; /* reached EOF (needed for cursors) */ /* markpos_xxx holds marked position for mark and restore */ long markpos_block; /* tape block# (only used if SORTEDONTAPE) */ int markpos_offset; /* saved "current", or offset in tape * block */ bool markpos_eof; /* saved "eof_reached" */ /* * These variables are specific to the HeapTuple case; they are set by * tuplesort_begin_heap and used only by the HeapTuple routines. */ TupleDesc tupDesc; int nKeys; ScanKey scanKeys; SortFunctionKind *sortFnKinds; /* * These variables are specific to the IndexTuple case; they are set * by tuplesort_begin_index and used only by the IndexTuple routines. */ Relation indexRel; ScanKey indexScanKey; bool enforceUnique; /* complain if we find duplicate tuples */ /* * These variables are specific to the Datum case; they are set by * tuplesort_begin_datum and used only by the DatumTuple routines. */ Oid datumType; Oid sortOperator; FmgrInfo sortOpFn; /* cached lookup data for sortOperator */ SortFunctionKind sortFnKind; /* we need typelen and byval in order to know how to copy the Datums. */ int datumTypeLen; bool datumTypeByVal;};#define COMPARETUP(state,a,b) ((*(state)->comparetup) (state, a, b))#define COPYTUP(state,tup) ((*(state)->copytup) (state, tup))#define WRITETUP(state,tape,tup) ((*(state)->writetup) (state, tape, tup))#define READTUP(state,tape,len) ((*(state)->readtup) (state, tape, len))#define LACKMEM(state) ((state)->availMem < 0)#define USEMEM(state,amt) ((state)->availMem -= (amt))#define FREEMEM(state,amt) ((state)->availMem += (amt))/*-------------------- * * NOTES about on-tape representation of tuples: * * We require the first "unsigned int" of a stored tuple to be the total size * on-tape of the tuple, including itself (so it is never zero; an all-zero * unsigned int is used to delimit runs). The remainder of the stored tuple * may or may not match the in-memory representation of the tuple --- * any conversion needed is the job of the writetup and readtup routines. * * If state->randomAccess is true, then the stored representation of the * tuple must be followed by another "unsigned int" that is a copy of the * length --- so the total tape space used is actually sizeof(unsigned int) * more than the stored length value. This allows read-backwards. When * randomAccess is not true, the write/read routines may omit the extra * length word. * * writetup is expected to write both length words as well as the tuple * data. When readtup is called, the tape is positioned just after the * front length word; readtup must read the tuple data and advance past * the back length word (if present). * * The write/read routines can make use of the tuple description data * stored in the Tuplesortstate record, if needed. They are also expected * to adjust state->availMem by the amount of memory space (not tape space!) * released or consumed. There is no error return from either writetup * or readtup; they should ereport() on failure. * * * NOTES about memory consumption calculations: * * We count space allocated for tuples against the SortMem limit, plus * the space used by the variable-size arrays memtuples and memtupindex. * Fixed-size space (primarily the LogicalTapeSet I/O buffers) is not * counted. * * Note that we count actual space used (as shown by GetMemoryChunkSpace) * rather than the originally-requested size. This is important since * palloc can add substantial overhead. It's not a complete answer since * we won't count any wasted space in palloc allocation blocks, but it's * a lot better than what we were doing before 7.3. * *-------------------- *//* * For sorting single Datums, we build "pseudo tuples" that just carry * the datum's value and null flag. For pass-by-reference data types, * the actual data value appears after the DatumTupleHeader (MAXALIGNed, * of course), and the value field in the header is just a pointer to it. */typedef struct{ Datum val; bool isNull;} DatumTuple;static Tuplesortstate *tuplesort_begin_common(bool randomAccess);static void puttuple_common(Tuplesortstate *state, void *tuple);static void inittapes(Tuplesortstate *state);static void selectnewtape(Tuplesortstate *state);static void mergeruns(Tuplesortstate *state);static void mergeonerun(Tuplesortstate *state);static void beginmerge(Tuplesortstate *state);static void mergepreread(Tuplesortstate *state);static void dumptuples(Tuplesortstate *state, bool alltuples);static void tuplesort_heap_insert(Tuplesortstate *state, void *tuple, int tupleindex, bool checkIndex);static void tuplesort_heap_siftup(Tuplesortstate *state, bool checkIndex);static unsigned int getlen(Tuplesortstate *state, int tapenum, bool eofOK);static void markrunend(Tuplesortstate *state, int tapenum);static int qsort_comparetup(const void *a, const void *b);static int comparetup_heap(Tuplesortstate *state, const void *a, const void *b);static void *copytup_heap(Tuplesortstate *state, void *tup);static void writetup_heap(Tuplesortstate *state, int tapenum, void *tup);static void *readtup_heap(Tuplesortstate *state, int tapenum, unsigned int len);static int comparetup_index(Tuplesortstate *state, const void *a, const void *b);static void *copytup_index(Tuplesortstate *state, void *tup);static void writetup_index(Tuplesortstate *state, int tapenum, void *tup);static void *readtup_index(Tuplesortstate *state, int tapenum, unsigned int len);static int comparetup_datum(Tuplesortstate *state, const void *a, const void *b);static void *copytup_datum(Tuplesortstate *state, void *tup);static void writetup_datum(Tuplesortstate *state, int tapenum, void *tup);static void *readtup_datum(Tuplesortstate *state, int tapenum, unsigned int len);/* * Since qsort(3) will not pass any context info to qsort_comparetup(), * we have to use this ugly static variable. It is set to point to the * active Tuplesortstate object just before calling qsort. It should * not be used directly by anything except qsort_comparetup(). */static Tuplesortstate *qsort_tuplesortstate;/* * tuplesort_begin_xxx * * Initialize for a tuple sort operation. * * After calling tuplesort_begin, the caller should call tuplesort_puttuple * zero or more times, then call tuplesort_performsort when all the tuples * have been supplied. After performsort, retrieve the tuples in sorted * order by calling tuplesort_gettuple until it returns NULL. (If random * access was requested, rescan, markpos, and restorepos can also be called.) * For Datum sorts, putdatum/getdatum are used instead of puttuple/gettuple. * Call tuplesort_end to terminate the operation and release memory/disk space. */static Tuplesortstate *tuplesort_begin_common(bool randomAccess){ Tuplesortstate *state; state = (Tuplesortstate *) palloc0(sizeof(Tuplesortstate)); state->status = TSS_INITIAL; state->randomAccess = randomAccess; state->availMem = SortMem * 1024L; state->tapeset = NULL; state->memtupcount = 0; state->memtupsize = 1024; /* initial guess */ state->memtuples = (void **) palloc(state->memtupsize * sizeof(void *)); state->memtupindex = NULL; /* until and unless needed */ USEMEM(state, GetMemoryChunkSpace(state->memtuples)); state->currentRun = 0; /* Algorithm D variables will be initialized by inittapes, if needed */ state->result_tape = -1; /* flag that result tape has not been * formed */
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?