📄 nbtsort.c
字号:
/*------------------------------------------------------------------------- * btsort.c * * Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION * $Id: nbtsort.c,v 1.40.2.1 1999/08/02 05:24:41 scrappy Exp $ * * NOTES * * what we do is: * - generate a set of initial one-block runs, distributed round-robin * between the output tapes. * - for each pass, * - swap input and output tape sets, rewinding both and truncating * the output tapes. * - merge the current run in each input tape to the current output * tape. * - when each input run has been exhausted, switch to another output * tape and start processing another run. * - when we have fewer runs than tapes, we know we are ready to start * merging into the btree leaf pages. (i.e., we do not have to wait * until we have exactly one tape.) * - as we extract tuples from the final runs, we build the pages for * each level. when we have only one page on a level, it must be the * root -- it can be attached to the btree metapage and we are done. * * conventions: * - external interface routines take in and return "void *" for their * opaque handles. this is for modularity reasons. * * this code is moderately slow (~10% slower) compared to the regular * btree (insertion) build code on sorted or well-clustered data. on * random data, however, the insertion build code is unusable -- the * difference on a 60MB heap is a factor of 15 because the random * probes into the btree thrash the buffer pool. * * this code currently packs the pages to 100% of capacity. this is * not wise, since *any* insertion will cause splitting. filling to * something like the standard 70% steady-state load factor for btrees * would probably be better. * * somebody desperately needs to figure out how to do a better job of * balancing the merge passes -- the fan-in on the final merges can be * pretty poor, which is bad for performance. *------------------------------------------------------------------------- */#include <fcntl.h>#include "postgres.h"#include "access/nbtree.h"#ifdef BTREE_BUILD_STATS#define ShowExecutorStats pg_options[TRACE_EXECUTORSTATS]#endifstatic BTItem _bt_buildadd(Relation index, void *pstate, BTItem bti, int flags);static BTItem _bt_minitem(Page opage, BlockNumber oblkno, int atend);static void *_bt_pagestate(Relation index, int flags, int level, bool doupper);static void _bt_uppershutdown(Relation index, BTPageState *state);/* * turn on debugging output. * * XXX this code just does a numeric printf of the index key, so it's * only really useful for integer keys. *//*#define FASTBUILD_DEBUG*/#define FASTBUILD_SPOOL#define FASTBUILD_MERGE#define MAXTAPES (7)#define TAPEBLCKSZ (BLCKSZ << 2)extern int NDirectFileRead;extern int NDirectFileWrite;/* * this is what we use to shovel BTItems in and out of memory. it's * bigger than a standard block because we are doing a lot of strictly * sequential i/o. this is obviously something of a tradeoff since we * are potentially reading a bunch of zeroes off of disk in many * cases. * * BTItems are packed in and MAXALIGN'd. * * the fd should not be going out to disk, strictly speaking, but it's * the only thing like that so i'm not going to worry about wasting a * few bytes. */typedef struct{ int bttb_magic; /* magic number */ File bttb_fd; /* file descriptor */ int bttb_top; /* top of free space within bttb_data */ short bttb_ntup; /* number of tuples in this block */ short bttb_eor; /* End-Of-Run marker */ char bttb_data[TAPEBLCKSZ - 2 * sizeof(double)];} BTTapeBlock;/* * this structure holds the bookkeeping for a simple balanced multiway * merge. (polyphase merging is hairier than i want to get into right * now, and i don't see why i have to care how many "tapes" i use * right now. though if psort was in a condition that i could hack it * to do this, you bet i would.) */typedef struct{ int bts_ntapes; int bts_tape; BTTapeBlock **bts_itape; /* input tape blocks */ BTTapeBlock **bts_otape; /* output tape blocks */ bool isunique;} BTSpool;/*------------------------------------------------------------------------- * sorting comparison routine - returns {-1,0,1} depending on whether * the key in the left BTItem is {<,=,>} the key in the right BTItem. * * we want to use _bt_isortcmp as a comparison function for qsort(3), * but it needs extra arguments, so we "pass them in" as global * variables. ick. fortunately, they are the same throughout the * build, so we need do this only once. this is why you must call * _bt_isortcmpinit before the call to qsort(3). * * a NULL BTItem is always assumed to be greater than any actual * value; our heap routines (see below) assume that the smallest * element in the heap is returned. that way, NULL values from the * exhausted tapes can sift down to the bottom of the heap. in point * of fact we just don't replace the elements of exhausted tapes, but * what the heck. * *------------------------------------------------------------------------- */typedef struct{ Datum *btsk_datum; char *btsk_nulls; BTItem btsk_item;} BTSortKey;static Relation _bt_sortrel;static int _bt_nattr;static BTSpool *_bt_inspool;static void_bt_isortcmpinit(Relation index, BTSpool *spool){ _bt_sortrel = index; _bt_inspool = spool; _bt_nattr = index->rd_att->natts;}static int_bt_isortcmp(BTSortKey *k1, BTSortKey *k2){ Datum *k1_datum = k1->btsk_datum; Datum *k2_datum = k2->btsk_datum; char *k1_nulls = k1->btsk_nulls; char *k2_nulls = k2->btsk_nulls; bool equal_isnull = false; int i; if (k1->btsk_item == (BTItem) NULL) { if (k2->btsk_item == (BTItem) NULL) return 0; /* 1 = 2 */ return 1; /* 1 > 2 */ } else if (k2->btsk_item == (BTItem) NULL) return -1; /* 1 < 2 */ for (i = 0; i < _bt_nattr; i++) { if (k1_nulls[i] != ' ') /* k1 attr is NULL */ { if (k2_nulls[i] != ' ') /* the same for k2 */ { equal_isnull = true; continue; } return 1; /* NULL ">" NOT_NULL */ } else if (k2_nulls[i] != ' ') /* k2 attr is NULL */ return -1; /* NOT_NULL "<" NULL */ if (_bt_invokestrat(_bt_sortrel, i + 1, BTGreaterStrategyNumber, k1_datum[i], k2_datum[i])) return 1; /* 1 > 2 */ else if (_bt_invokestrat(_bt_sortrel, i + 1, BTGreaterStrategyNumber, k2_datum[i], k1_datum[i])) return -1; /* 1 < 2 */ } if (_bt_inspool->isunique && !equal_isnull) { _bt_spooldestroy((void *) _bt_inspool); elog(ERROR, "Cannot create unique index. Table contains non-unique values"); } return 0; /* 1 = 2 */}static void_bt_setsortkey(Relation index, BTItem bti, BTSortKey *sk){ sk->btsk_item = (BTItem) NULL; sk->btsk_datum = (Datum *) NULL; sk->btsk_nulls = (char *) NULL; if (bti != (BTItem) NULL) { IndexTuple it = &(bti->bti_itup); TupleDesc itdesc = index->rd_att; Datum *dp = (Datum *) palloc(_bt_nattr * sizeof(Datum)); char *np = (char *) palloc(_bt_nattr * sizeof(char)); bool isnull; int i; for (i = 0; i < _bt_nattr; i++) { dp[i] = index_getattr(it, i + 1, itdesc, &isnull); if (isnull) np[i] = 'n'; else np[i] = ' '; } sk->btsk_item = bti; sk->btsk_datum = dp; sk->btsk_nulls = np; }}/*------------------------------------------------------------------------- * priority queue methods * * these were more-or-less lifted from the heap section of the 1984 * edition of gonnet's book on algorithms and data structures. they * are coded so that the smallest element in the heap is returned (we * use them for merging sorted runs). * * XXX these probably ought to be generic library functions. *------------------------------------------------------------------------- */typedef struct{ int btpqe_tape; /* tape identifier */ BTSortKey btpqe_item; /* pointer to BTItem in tape buffer */} BTPriQueueElem;#define MAXELEM MAXTAPEStypedef struct{ int btpq_nelem; BTPriQueueElem btpq_queue[MAXELEM]; Relation btpq_rel;} BTPriQueue;/* be sure to call _bt_isortcmpinit first */#define GREATER(a, b) \ (_bt_isortcmp(&((a)->btpqe_item), &((b)->btpqe_item)) > 0)static void_bt_pqsift(BTPriQueue *q, int parent){ int child; BTPriQueueElem e; for (child = parent * 2 + 1; child < q->btpq_nelem; child = parent * 2 + 1) { if (child < q->btpq_nelem - 1) { if (GREATER(&(q->btpq_queue[child]), &(q->btpq_queue[child + 1]))) ++child; } if (GREATER(&(q->btpq_queue[parent]), &(q->btpq_queue[child]))) { e = q->btpq_queue[child]; /* struct = */ q->btpq_queue[child] = q->btpq_queue[parent]; /* struct = */ q->btpq_queue[parent] = e; /* struct = */ parent = child; } else parent = child + 1; }}static int_bt_pqnext(BTPriQueue *q, BTPriQueueElem *e){ if (q->btpq_nelem < 1) { /* already empty */ return -1; } *e = q->btpq_queue[0]; /* struct = */ if (--q->btpq_nelem < 1) { /* now empty, don't sift */ return 0; } q->btpq_queue[0] = q->btpq_queue[q->btpq_nelem]; /* struct = */ _bt_pqsift(q, 0); return 0;}static void_bt_pqadd(BTPriQueue *q, BTPriQueueElem *e){ int child, parent; if (q->btpq_nelem >= MAXELEM) elog(ERROR, "_bt_pqadd: queue overflow"); child = q->btpq_nelem++; while (child > 0) { parent = child / 2; if (GREATER(e, &(q->btpq_queue[parent]))) break; else { q->btpq_queue[child] = q->btpq_queue[parent]; /* struct = */ child = parent; } } q->btpq_queue[child] = *e; /* struct = */}/*------------------------------------------------------------------------- * tape methods *------------------------------------------------------------------------- */#define BTITEMSZ(btitem) \ ((btitem) ? \ (IndexTupleDSize((btitem)->bti_itup) + \ (sizeof(BTItemData) - sizeof(IndexTupleData))) : \ 0)#define SPCLEFT(tape) \ (sizeof((tape)->bttb_data) - (tape)->bttb_top)#define EMPTYTAPE(tape) \ ((tape)->bttb_ntup <= 0)#define BTTAPEMAGIC 0x19660226/* * reset the tape header for its next use without doing anything to * the physical tape file. (setting bttb_top to 0 makes the block * empty.) */static void_bt_tapereset(BTTapeBlock *tape){ tape->bttb_eor = 0; tape->bttb_top = 0; tape->bttb_ntup = 0;}/* * rewind the physical tape file. */static void_bt_taperewind(BTTapeBlock *tape){ FileSeek(tape->bttb_fd, 0L, SEEK_SET);}/* * destroy the contents of the physical tape file without destroying * the tape data structure or removing the physical tape file. * * we use the VFD version of ftruncate(2) to do this rather than * unlinking and recreating the file. you still have to wait while * the OS frees up all of the file system blocks and stuff, but at * least you don't have to delete and reinsert the directory entries. */static void_bt_tapeclear(BTTapeBlock *tape){ /* blow away the contents of the old file */ _bt_taperewind(tape);#ifdef NOT_USED FileSync(tape->bttb_fd);#endif FileTruncate(tape->bttb_fd, 0); /* reset the buffer */ _bt_tapereset(tape);}/* * create a new BTTapeBlock, allocating memory for the data structure * as well as opening a physical tape file. */static BTTapeBlock *_bt_tapecreate(void){ BTTapeBlock *tape = (BTTapeBlock *) palloc(sizeof(BTTapeBlock)); if (tape == (BTTapeBlock *) NULL) elog(ERROR, "_bt_tapecreate: out of memory"); tape->bttb_magic = BTTAPEMAGIC; tape->bttb_fd = OpenTemporaryFile(); Assert(tape->bttb_fd >= 0); /* initialize the buffer */ _bt_tapereset(tape); return tape;}/* * destroy the BTTapeBlock structure and its physical tape file. */static void_bt_tapedestroy(BTTapeBlock *tape){ FileUnlink(tape->bttb_fd); pfree((void *) tape);}/* * flush the tape block to the file, marking End-Of-Run if requested. */static void_bt_tapewrite(BTTapeBlock *tape, int eor){ tape->bttb_eor = eor; FileWrite(tape->bttb_fd, (char *) tape, TAPEBLCKSZ); NDirectFileWrite += TAPEBLCKSZ / BLCKSZ; _bt_tapereset(tape);}/* * read a tape block from the file, overwriting the current contents * of the buffer. * * returns: * - 0 if there are no more blocks in the tape or in this run (call * _bt_tapereset to clear the End-Of-Run marker) * - 1 if a valid block was read */static int_bt_taperead(BTTapeBlock *tape){ File fd; int nread; if (tape->bttb_eor) { return 0; /* we are already at End-Of-Run */ } /* * we're clobbering the old tape block, but we do need to save the VFD * (the one in the block we're reading is bogus). */ fd = tape->bttb_fd; nread = FileRead(fd, (char *) tape, TAPEBLCKSZ); tape->bttb_fd = fd; if (nread != TAPEBLCKSZ) { Assert(nread == 0); /* we are at EOF */ return 0; } Assert(tape->bttb_magic == BTTAPEMAGIC); NDirectFileRead += TAPEBLCKSZ / BLCKSZ;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -