nbtsearch.c

来自「PostgreSQL 8.1.4的源码 适用于Linux下的开源数据库系统」· C语言 代码 · 共 1,313 行 · 第 1/3 页

C
1,313
字号
/*------------------------------------------------------------------------- * * nbtsearch.c *	  Search code for postgres btrees. * * * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.96.2.1 2005/11/22 18:23:04 momjian Exp $ * *------------------------------------------------------------------------- */#include "postgres.h"#include "access/genam.h"#include "access/nbtree.h"#include "pgstat.h"#include "utils/lsyscache.h"static Buffer _bt_walk_left(Relation rel, Buffer buf);static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);/* *	_bt_search() -- Search the tree for a particular scankey, *		or more precisely for the first leaf page it could be on. * * When nextkey is false (the usual case), we are looking for the first * item >= scankey.  When nextkey is true, we are looking for the first * item strictly greater than scankey. * * Return value is a stack of parent-page pointers.  *bufP is set to the * address of the leaf-page buffer, which is read-locked and pinned. * No locks are held on the parent pages, however! * * NOTE that the returned buffer is read-locked regardless of the access * parameter.  However, access = BT_WRITE will allow an empty root page * to be created and returned.	When access = BT_READ, an empty index * will result in *bufP being set to InvalidBuffer. */BTStack_bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,		   Buffer *bufP, int access){	BTStack		stack_in = NULL;	/* Get the root page to start with */	*bufP = _bt_getroot(rel, access);	/* If index is empty and access = BT_READ, no root page is created. */	if (!BufferIsValid(*bufP))		return (BTStack) NULL;	/* Loop iterates once per level descended in the tree */	for (;;)	{		Page		page;		BTPageOpaque opaque;		OffsetNumber offnum;		ItemId		itemid;		BTItem		btitem;		IndexTuple	itup;		BlockNumber blkno;		BlockNumber par_blkno;		BTStack		new_stack;		/*		 * Race -- the page we just grabbed may have split since we read its		 * pointer in the parent (or metapage).  If it has, we may need to		 * move right to its new sibling.  Do that.		 */		*bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey, BT_READ);		/* if this is a leaf page, we're done */		page = BufferGetPage(*bufP);		opaque = (BTPageOpaque) PageGetSpecialPointer(page);		if (P_ISLEAF(opaque))			break;		/*		 * Find the appropriate item on the internal page, and get the child		 * page that it points to.		 */		offnum = _bt_binsrch(rel, *bufP, keysz, scankey, nextkey);		itemid = PageGetItemId(page, offnum);		btitem = (BTItem) PageGetItem(page, itemid);		itup = &(btitem->bti_itup);		blkno = ItemPointerGetBlockNumber(&(itup->t_tid));		par_blkno = BufferGetBlockNumber(*bufP);		/*		 * We need to save the location of the index entry we chose in the		 * parent page on a stack. In case we split the tree, we'll use the		 * stack to work back up to the parent page.  We also save the actual		 * downlink (TID) to uniquely identify the index entry, in case it		 * moves right while we're working lower in the tree.  See the paper		 * by Lehman and Yao for how this is detected and handled. (We use the		 * child link to disambiguate duplicate keys in the index -- Lehman		 * and Yao disallow duplicate keys.)		 */		new_stack = (BTStack) palloc(sizeof(BTStackData));		new_stack->bts_blkno = par_blkno;		new_stack->bts_offset = offnum;		memcpy(&new_stack->bts_btitem, btitem, sizeof(BTItemData));		new_stack->bts_parent = stack_in;		/* drop the read lock on the parent page, acquire one on the child */		*bufP = _bt_relandgetbuf(rel, *bufP, blkno, BT_READ);		/* okay, all set to move down a level */		stack_in = new_stack;	}	return stack_in;}/* *	_bt_moveright() -- move right in the btree if necessary. * * When we follow a pointer to reach a page, it is possible that * the page has changed in the meanwhile.  If this happens, we're * guaranteed that the page has "split right" -- that is, that any * data that appeared on the page originally is either on the page * or strictly to the right of it. * * When nextkey is false (the usual case), we are looking for the first * item >= scankey.  When nextkey is true, we are looking for the first * item strictly greater than scankey. * * This routine decides whether or not we need to move right in the * tree by examining the high key entry on the page.  If that entry * is strictly less than the scankey, or <= the scankey in the nextkey=true * case, then we followed the wrong link and we need to move right. * * On entry, we have the buffer pinned and a lock of the type specified by * 'access'.  If we move right, we release the buffer and lock and acquire * the same on the right sibling.  Return value is the buffer we stop at. */Buffer_bt_moveright(Relation rel,			  Buffer buf,			  int keysz,			  ScanKey scankey,			  bool nextkey,			  int access){	Page		page;	BTPageOpaque opaque;	int32		cmpval;	page = BufferGetPage(buf);	opaque = (BTPageOpaque) PageGetSpecialPointer(page);	/*	 * When nextkey = false (normal case): if the scan key that brought us to	 * this page is > the high key stored on the page, then the page has split	 * and we need to move right.  (If the scan key is equal to the high key,	 * we might or might not need to move right; have to scan the page first	 * anyway.)	 *	 * When nextkey = true: move right if the scan key is >= page's high key.	 *	 * The page could even have split more than once, so scan as far as	 * needed.	 *	 * We also have to move right if we followed a link that brought us to a	 * dead page.	 */	cmpval = nextkey ? 0 : 1;	while (!P_RIGHTMOST(opaque) &&		   (P_IGNORE(opaque) ||			_bt_compare(rel, keysz, scankey, page, P_HIKEY) >= cmpval))	{		/* step right one page */		BlockNumber rblkno = opaque->btpo_next;		buf = _bt_relandgetbuf(rel, buf, rblkno, access);		page = BufferGetPage(buf);		opaque = (BTPageOpaque) PageGetSpecialPointer(page);	}	if (P_IGNORE(opaque))		elog(ERROR, "fell off the end of \"%s\"",			 RelationGetRelationName(rel));	return buf;}/* *	_bt_binsrch() -- Do a binary search for a key on a particular page. * * When nextkey is false (the usual case), we are looking for the first * item >= scankey.  When nextkey is true, we are looking for the first * item strictly greater than scankey. * * The scankey we get has the compare function stored in the procedure * entry of each data struct.  We invoke this regproc to do the * comparison for every key in the scankey. * * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first * key >= given scankey, or > scankey if nextkey is true.  (NOTE: in * particular, this means it is possible to return a value 1 greater than the * number of keys on the page, if the scankey is > all keys on the page.) * * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber * of the last key < given scankey, or last key <= given scankey if nextkey * is true.  (Since _bt_compare treats the first data key of such a page as * minus infinity, there will be at least one key < scankey, so the result * always points at one of the keys on the page.)  This key indicates the * right place to descend to be sure we find all leaf keys >= given scankey * (or leaf keys > given scankey when nextkey is true). * * This procedure is not responsible for walking right, it just examines * the given page.	_bt_binsrch() has no lock or refcount side effects * on the buffer. */OffsetNumber_bt_binsrch(Relation rel,			Buffer buf,			int keysz,			ScanKey scankey,			bool nextkey){	Page		page;	BTPageOpaque opaque;	OffsetNumber low,				high;	int32		result,				cmpval;	page = BufferGetPage(buf);	opaque = (BTPageOpaque) PageGetSpecialPointer(page);	low = P_FIRSTDATAKEY(opaque);	high = PageGetMaxOffsetNumber(page);	/*	 * If there are no keys on the page, return the first available slot. Note	 * this covers two cases: the page is really empty (no keys), or it	 * contains only a high key.  The latter case is possible after vacuuming.	 * This can never happen on an internal page, however, since they are	 * never empty (an internal page must have children).	 */	if (high < low)		return low;	/*	 * Binary search to find the first key on the page >= scan key, or first	 * key > scankey when nextkey is true.	 *	 * For nextkey=false (cmpval=1), the loop invariant is: all slots before	 * 'low' are < scan key, all slots at or after 'high' are >= scan key.	 *	 * For nextkey=true (cmpval=0), the loop invariant is: all slots before	 * 'low' are <= scan key, all slots at or after 'high' are > scan key.	 *	 * We can fall out when high == low.	 */	high++;						/* establish the loop invariant for high */	cmpval = nextkey ? 0 : 1;	/* select comparison value */	while (high > low)	{		OffsetNumber mid = low + ((high - low) / 2);		/* We have low <= mid < high, so mid points at a real slot */		result = _bt_compare(rel, keysz, scankey, page, mid);		if (result >= cmpval)			low = mid + 1;		else			high = mid;	}	/*	 * At this point we have high == low, but be careful: they could point	 * past the last slot on the page.	 *	 * On a leaf page, we always return the first key >= scan key (resp. >	 * scan key), which could be the last slot + 1.	 */	if (P_ISLEAF(opaque))		return low;	/*	 * On a non-leaf page, return the last key < scan key (resp. <= scan key).	 * There must be one if _bt_compare() is playing by the rules.	 */	Assert(low > P_FIRSTDATAKEY(opaque));	return OffsetNumberPrev(low);}/*---------- *	_bt_compare() -- Compare scankey to a particular tuple on the page. * *	keysz: number of key conditions to be checked (might be less than the *	total length of the scan key!) *	page/offnum: location of btree item to be compared to. * *		This routine returns: *			<0 if scankey < tuple at offnum; *			 0 if scankey == tuple at offnum; *			>0 if scankey > tuple at offnum. *		NULLs in the keys are treated as sortable values.  Therefore *		"equality" does not necessarily mean that the item should be *		returned to the caller as a matching key! * * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be * "minus infinity": this routine will always claim it is less than the * scankey.  The actual key value stored (if any, which there probably isn't) * does not matter.  This convention allows us to implement the Lehman and * Yao convention that the first down-link pointer is before the first key. * See backend/access/nbtree/README for details. *---------- */int32_bt_compare(Relation rel,			int keysz,			ScanKey scankey,			Page page,			OffsetNumber offnum){	TupleDesc	itupdesc = RelationGetDescr(rel);	BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);	BTItem		btitem;	IndexTuple	itup;	int			i;	/*	 * Force result ">" if target item is first data item on an internal page	 * --- see NOTE above.	 */	if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))		return 1;	btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));	itup = &(btitem->bti_itup);	/*	 * The scan key is set up with the attribute number associated with each	 * term in the key.  It is important that, if the index is multi-key, the	 * scan contain the first k key attributes, and that they be in order.	If	 * you think about how multi-key ordering works, you'll understand why	 * this is.	 *	 * We don't test for violation of this condition here, however.  The	 * initial setup for the index scan had better have gotten it right (see	 * _bt_first).	 */	for (i = 1; i <= keysz; i++)	{		Datum		datum;		bool		isNull;		int32		result;		datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);		/* see comments about NULLs handling in btbuild */		if (scankey->sk_flags & SK_ISNULL)		/* key is NULL */		{			if (isNull)				result = 0;		/* NULL "=" NULL */			else				result = 1;		/* NULL ">" NOT_NULL */		}		else if (isNull)		/* key is NOT_NULL and item is NULL */		{			result = -1;		/* NOT_NULL "<" NULL */		}		else		{			/*			 * The sk_func needs to be passed the index value as left arg and			 * the sk_argument as right arg (they might be of different			 * types).	Since it is convenient for callers to think of			 * _bt_compare as comparing the scankey to the index item, we have			 * to flip the sign of the comparison result.			 *			 * Note: curious-looking coding is to avoid overflow if comparison			 * function returns INT_MIN.  There is no risk of overflow for			 * positive results.			 */			result = DatumGetInt32(FunctionCall2(&scankey->sk_func,												 datum,												 scankey->sk_argument));			result = (result < 0) ? 1 : -result;		}		/* if the keys are unequal, return the difference */		if (result != 0)			return result;		scankey++;	}	/* if we get here, the keys are equal */	return 0;}/* *	_bt_next() -- Get the next item in a scan. * *		On entry, we have a valid currentItemData in the scan, and a *		read lock and pin count on the page that contains that item. *		We return the next item in the scan, or false if no more. *		On successful exit, the page containing the new item is locked *		and pinned; on failure exit, no lock or pin is held. */bool_bt_next(IndexScanDesc scan, ScanDirection dir){	Relation	rel;	Buffer		buf;	Page		page;	OffsetNumber offnum;	ItemPointer current;	BTItem		btitem;	IndexTuple	itup;	BTScanOpaque so;	bool		continuescan;	rel = scan->indexRelation;	so = (BTScanOpaque) scan->opaque;	current = &(scan->currentItemData);	/* we still have the buffer pinned and locked */	buf = so->btso_curbuf;	Assert(BufferIsValid(buf));

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?