📄 nbtsort.c

📁 关系型数据库 Postgresql 6.5.2
💻 C
📖 第 1 页 / 共 3 页
字号:
12 3 下一页
/*------------------------------------------------------------------------- * btsort.c * * Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION *	  $Id: nbtsort.c,v 1.40.2.1 1999/08/02 05:24:41 scrappy Exp $ * * NOTES * * what we do is: * - generate a set of initial one-block runs, distributed round-robin *	 between the output tapes. * - for each pass, *	 - swap input and output tape sets, rewinding both and truncating *	   the output tapes. *	 - merge the current run in each input tape to the current output *	   tape. *	   - when each input run has been exhausted, switch to another output *		 tape and start processing another run. * - when we have fewer runs than tapes, we know we are ready to start *	 merging into the btree leaf pages.  (i.e., we do not have to wait *	 until we have exactly one tape.) * - as we extract tuples from the final runs, we build the pages for *	 each level.  when we have only one page on a level, it must be the *	 root -- it can be attached to the btree metapage and we are done. * * conventions: * - external interface routines take in and return "void *" for their *	 opaque handles.  this is for modularity reasons. * * this code is moderately slow (~10% slower) compared to the regular * btree (insertion) build code on sorted or well-clustered data.  on * random data, however, the insertion build code is unusable -- the * difference on a 60MB heap is a factor of 15 because the random * probes into the btree thrash the buffer pool. * * this code currently packs the pages to 100% of capacity.  this is * not wise, since *any* insertion will cause splitting.  filling to * something like the standard 70% steady-state load factor for btrees * would probably be better. * * somebody desperately needs to figure out how to do a better job of * balancing the merge passes -- the fan-in on the final merges can be * pretty poor, which is bad for performance. *------------------------------------------------------------------------- */#include <fcntl.h>#include "postgres.h"#include "access/nbtree.h"#ifdef BTREE_BUILD_STATS#define ShowExecutorStats pg_options[TRACE_EXECUTORSTATS]#endifstatic BTItem _bt_buildadd(Relation index, void *pstate, BTItem bti, int flags);static BTItem _bt_minitem(Page opage, BlockNumber oblkno, int atend);static void *_bt_pagestate(Relation index, int flags, int level, bool doupper);static void _bt_uppershutdown(Relation index, BTPageState *state);/* * turn on debugging output. * * XXX this code just does a numeric printf of the index key, so it's * only really useful for integer keys. *//*#define FASTBUILD_DEBUG*/#define FASTBUILD_SPOOL#define FASTBUILD_MERGE#define MAXTAPES		(7)#define TAPEBLCKSZ		(BLCKSZ << 2)extern int	NDirectFileRead;extern int	NDirectFileWrite;/* * this is what we use to shovel BTItems in and out of memory.	it's * bigger than a standard block because we are doing a lot of strictly * sequential i/o.	this is obviously something of a tradeoff since we * are potentially reading a bunch of zeroes off of disk in many * cases. * * BTItems are packed in and MAXALIGN'd. * * the fd should not be going out to disk, strictly speaking, but it's * the only thing like that so i'm not going to worry about wasting a * few bytes. */typedef struct{	int			bttb_magic;		/* magic number */	File		bttb_fd;		/* file descriptor */	int			bttb_top;		/* top of free space within bttb_data */	short		bttb_ntup;		/* number of tuples in this block */	short		bttb_eor;		/* End-Of-Run marker */	char		bttb_data[TAPEBLCKSZ - 2 * sizeof(double)];} BTTapeBlock;/* * this structure holds the bookkeeping for a simple balanced multiway * merge.  (polyphase merging is hairier than i want to get into right * now, and i don't see why i have to care how many "tapes" i use * right now.  though if psort was in a condition that i could hack it * to do this, you bet i would.) */typedef struct{	int			bts_ntapes;	int			bts_tape;	BTTapeBlock **bts_itape;	/* input tape blocks */	BTTapeBlock **bts_otape;	/* output tape blocks */	bool		isunique;} BTSpool;/*------------------------------------------------------------------------- * sorting comparison routine - returns {-1,0,1} depending on whether * the key in the left BTItem is {<,=,>} the key in the right BTItem. * * we want to use _bt_isortcmp as a comparison function for qsort(3), * but it needs extra arguments, so we "pass them in" as global * variables.  ick.  fortunately, they are the same throughout the * build, so we need do this only once.  this is why you must call * _bt_isortcmpinit before the call to qsort(3). * * a NULL BTItem is always assumed to be greater than any actual * value; our heap routines (see below) assume that the smallest * element in the heap is returned.  that way, NULL values from the * exhausted tapes can sift down to the bottom of the heap.  in point * of fact we just don't replace the elements of exhausted tapes, but * what the heck. * *------------------------------------------------------------------------- */typedef struct{	Datum	   *btsk_datum;	char	   *btsk_nulls;	BTItem		btsk_item;} BTSortKey;static Relation _bt_sortrel;static int	_bt_nattr;static BTSpool *_bt_inspool;static void_bt_isortcmpinit(Relation index, BTSpool *spool){	_bt_sortrel = index;	_bt_inspool = spool;	_bt_nattr = index->rd_att->natts;}static int_bt_isortcmp(BTSortKey *k1, BTSortKey *k2){	Datum	   *k1_datum = k1->btsk_datum;	Datum	   *k2_datum = k2->btsk_datum;	char	   *k1_nulls = k1->btsk_nulls;	char	   *k2_nulls = k2->btsk_nulls;	bool		equal_isnull = false;	int			i;	if (k1->btsk_item == (BTItem) NULL)	{		if (k2->btsk_item == (BTItem) NULL)			return 0;			/* 1 = 2 */		return 1;				/* 1 > 2 */	}	else if (k2->btsk_item == (BTItem) NULL)		return -1;				/* 1 < 2 */	for (i = 0; i < _bt_nattr; i++)	{		if (k1_nulls[i] != ' ') /* k1 attr is NULL */		{			if (k2_nulls[i] != ' ')		/* the same for k2 */			{				equal_isnull = true;				continue;			}			return 1;			/* NULL ">" NOT_NULL */		}		else if (k2_nulls[i] != ' ')	/* k2 attr is NULL */			return -1;			/* NOT_NULL "<" NULL */		if (_bt_invokestrat(_bt_sortrel, i + 1, BTGreaterStrategyNumber,							k1_datum[i], k2_datum[i]))			return 1;			/* 1 > 2 */		else if (_bt_invokestrat(_bt_sortrel, i + 1, BTGreaterStrategyNumber,								 k2_datum[i], k1_datum[i]))			return -1;			/* 1 < 2 */	}	if (_bt_inspool->isunique && !equal_isnull)	{		_bt_spooldestroy((void *) _bt_inspool);		elog(ERROR, "Cannot create unique index. Table contains non-unique values");	}	return 0;					/* 1 = 2 */}static void_bt_setsortkey(Relation index, BTItem bti, BTSortKey *sk){	sk->btsk_item = (BTItem) NULL;	sk->btsk_datum = (Datum *) NULL;	sk->btsk_nulls = (char *) NULL;	if (bti != (BTItem) NULL)	{		IndexTuple	it = &(bti->bti_itup);		TupleDesc	itdesc = index->rd_att;		Datum	   *dp = (Datum *) palloc(_bt_nattr * sizeof(Datum));		char	   *np = (char *) palloc(_bt_nattr * sizeof(char));		bool		isnull;		int			i;		for (i = 0; i < _bt_nattr; i++)		{			dp[i] = index_getattr(it, i + 1, itdesc, &isnull);			if (isnull)				np[i] = 'n';			else				np[i] = ' ';		}		sk->btsk_item = bti;		sk->btsk_datum = dp;		sk->btsk_nulls = np;	}}/*------------------------------------------------------------------------- * priority queue methods * * these were more-or-less lifted from the heap section of the 1984 * edition of gonnet's book on algorithms and data structures.  they * are coded so that the smallest element in the heap is returned (we * use them for merging sorted runs). * * XXX these probably ought to be generic library functions. *------------------------------------------------------------------------- */typedef struct{	int			btpqe_tape;		/* tape identifier */	BTSortKey	btpqe_item;		/* pointer to BTItem in tape buffer */} BTPriQueueElem;#define MAXELEM MAXTAPEStypedef struct{	int			btpq_nelem;	BTPriQueueElem btpq_queue[MAXELEM];	Relation	btpq_rel;} BTPriQueue;/* be sure to call _bt_isortcmpinit first */#define GREATER(a, b) \	(_bt_isortcmp(&((a)->btpqe_item), &((b)->btpqe_item)) > 0)static void_bt_pqsift(BTPriQueue *q, int parent){	int			child;	BTPriQueueElem e;	for (child = parent * 2 + 1;		 child < q->btpq_nelem;		 child = parent * 2 + 1)	{		if (child < q->btpq_nelem - 1)		{			if (GREATER(&(q->btpq_queue[child]), &(q->btpq_queue[child + 1])))				++child;		}		if (GREATER(&(q->btpq_queue[parent]), &(q->btpq_queue[child])))		{			e = q->btpq_queue[child];	/* struct = */			q->btpq_queue[child] = q->btpq_queue[parent];		/* struct = */			q->btpq_queue[parent] = e;	/* struct = */			parent = child;		}		else			parent = child + 1;	}}static int_bt_pqnext(BTPriQueue *q, BTPriQueueElem *e){	if (q->btpq_nelem < 1)	{							/* already empty */		return -1;	}	*e = q->btpq_queue[0];		/* struct = */	if (--q->btpq_nelem < 1)	{							/* now empty, don't sift */		return 0;	}	q->btpq_queue[0] = q->btpq_queue[q->btpq_nelem];	/* struct = */	_bt_pqsift(q, 0);	return 0;}static void_bt_pqadd(BTPriQueue *q, BTPriQueueElem *e){	int			child,				parent;	if (q->btpq_nelem >= MAXELEM)		elog(ERROR, "_bt_pqadd: queue overflow");	child = q->btpq_nelem++;	while (child > 0)	{		parent = child / 2;		if (GREATER(e, &(q->btpq_queue[parent])))			break;		else		{			q->btpq_queue[child] = q->btpq_queue[parent];		/* struct = */			child = parent;		}	}	q->btpq_queue[child] = *e;	/* struct = */}/*------------------------------------------------------------------------- * tape methods *------------------------------------------------------------------------- */#define BTITEMSZ(btitem) \	((btitem) ? \	 (IndexTupleDSize((btitem)->bti_itup) + \	  (sizeof(BTItemData) - sizeof(IndexTupleData))) : \	 0)#define SPCLEFT(tape) \	(sizeof((tape)->bttb_data) - (tape)->bttb_top)#define EMPTYTAPE(tape) \	((tape)->bttb_ntup <= 0)#define BTTAPEMAGIC		0x19660226/* * reset the tape header for its next use without doing anything to * the physical tape file.	(setting bttb_top to 0 makes the block * empty.) */static void_bt_tapereset(BTTapeBlock *tape){	tape->bttb_eor = 0;	tape->bttb_top = 0;	tape->bttb_ntup = 0;}/* * rewind the physical tape file. */static void_bt_taperewind(BTTapeBlock *tape){	FileSeek(tape->bttb_fd, 0L, SEEK_SET);}/* * destroy the contents of the physical tape file without destroying * the tape data structure or removing the physical tape file. * * we use the VFD version of ftruncate(2) to do this rather than * unlinking and recreating the file.  you still have to wait while * the OS frees up all of the file system blocks and stuff, but at * least you don't have to delete and reinsert the directory entries. */static void_bt_tapeclear(BTTapeBlock *tape){	/* blow away the contents of the old file */	_bt_taperewind(tape);#ifdef NOT_USED	FileSync(tape->bttb_fd);#endif	FileTruncate(tape->bttb_fd, 0);	/* reset the buffer */	_bt_tapereset(tape);}/* * create a new BTTapeBlock, allocating memory for the data structure * as well as opening a physical tape file. */static BTTapeBlock *_bt_tapecreate(void){	BTTapeBlock *tape = (BTTapeBlock *) palloc(sizeof(BTTapeBlock));	if (tape == (BTTapeBlock *) NULL)		elog(ERROR, "_bt_tapecreate: out of memory");	tape->bttb_magic = BTTAPEMAGIC;	tape->bttb_fd = OpenTemporaryFile();	Assert(tape->bttb_fd >= 0);	/* initialize the buffer */	_bt_tapereset(tape);	return tape;}/* * destroy the BTTapeBlock structure and its physical tape file. */static void_bt_tapedestroy(BTTapeBlock *tape){	FileUnlink(tape->bttb_fd);	pfree((void *) tape);}/* * flush the tape block to the file, marking End-Of-Run if requested. */static void_bt_tapewrite(BTTapeBlock *tape, int eor){	tape->bttb_eor = eor;	FileWrite(tape->bttb_fd, (char *) tape, TAPEBLCKSZ);	NDirectFileWrite += TAPEBLCKSZ / BLCKSZ;	_bt_tapereset(tape);}/* * read a tape block from the file, overwriting the current contents * of the buffer. * * returns: * - 0 if there are no more blocks in the tape or in this run (call *	 _bt_tapereset to clear the End-Of-Run marker) * - 1 if a valid block was read */static int_bt_taperead(BTTapeBlock *tape){	File		fd;	int			nread;	if (tape->bttb_eor)	{		return 0;				/* we are already at End-Of-Run */	}	/*	 * we're clobbering the old tape block, but we do need to save the VFD	 * (the one in the block we're reading is bogus).	 */	fd = tape->bttb_fd;	nread = FileRead(fd, (char *) tape, TAPEBLCKSZ);	tape->bttb_fd = fd;	if (nread != TAPEBLCKSZ)	{		Assert(nread == 0);		/* we are at EOF */		return 0;	}	Assert(tape->bttb_magic == BTTAPEMAGIC);	NDirectFileRead += TAPEBLCKSZ / BLCKSZ;
12 3 下一页
💿 文件大小 6496 K
👤 上传用户 huajunun
📂 所属分类数据库系统
🏷️ 相关标签

#Postgresql #数据库
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -