nbtsearch.c

来自「postgresql8.3.4源码,开源数据库」· C语言代码 · 共 1,492 行 · 第 1/3 页
1,492 行
/*------------------------------------------------------------------------- * * nbtsearch.c *	  Search code for postgres btrees. * * * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.116 2008/01/01 19:45:46 momjian Exp $ * *------------------------------------------------------------------------- */#include "postgres.h"#include "access/genam.h"#include "access/nbtree.h"#include "pgstat.h"#include "utils/lsyscache.h"static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,			 OffsetNumber offnum);static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);static Buffer _bt_walk_left(Relation rel, Buffer buf);static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);/* *	_bt_search() -- Search the tree for a particular scankey, *		or more precisely for the first leaf page it could be on. * * The passed scankey must be an insertion-type scankey (see nbtree/README), * but it can omit the rightmost column(s) of the index. * * When nextkey is false (the usual case), we are looking for the first * item >= scankey.  When nextkey is true, we are looking for the first * item strictly greater than scankey. * * Return value is a stack of parent-page pointers.  *bufP is set to the * address of the leaf-page buffer, which is read-locked and pinned. * No locks are held on the parent pages, however! * * NOTE that the returned buffer is read-locked regardless of the access * parameter.  However, access = BT_WRITE will allow an empty root page * to be created and returned.	When access = BT_READ, an empty index * will result in *bufP being set to InvalidBuffer. */BTStack_bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,		   Buffer *bufP, int access){	BTStack		stack_in = NULL;	/* Get the root page to start with */	*bufP = _bt_getroot(rel, access);	/* If index is empty and access = BT_READ, no root page is created. */	if (!BufferIsValid(*bufP))		return (BTStack) NULL;	/* Loop iterates once per level descended in the tree */	for (;;)	{		Page		page;		BTPageOpaque opaque;		OffsetNumber offnum;		ItemId		itemid;		IndexTuple	itup;		BlockNumber blkno;		BlockNumber par_blkno;		BTStack		new_stack;		/*		 * Race -- the page we just grabbed may have split since we read its		 * pointer in the parent (or metapage).  If it has, we may need to		 * move right to its new sibling.  Do that.		 */		*bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey, BT_READ);		/* if this is a leaf page, we're done */		page = BufferGetPage(*bufP);		opaque = (BTPageOpaque) PageGetSpecialPointer(page);		if (P_ISLEAF(opaque))			break;		/*		 * Find the appropriate item on the internal page, and get the child		 * page that it points to.		 */		offnum = _bt_binsrch(rel, *bufP, keysz, scankey, nextkey);		itemid = PageGetItemId(page, offnum);		itup = (IndexTuple) PageGetItem(page, itemid);		blkno = ItemPointerGetBlockNumber(&(itup->t_tid));		par_blkno = BufferGetBlockNumber(*bufP);		/*		 * We need to save the location of the index entry we chose in the		 * parent page on a stack. In case we split the tree, we'll use the		 * stack to work back up to the parent page.  We also save the actual		 * downlink (TID) to uniquely identify the index entry, in case it		 * moves right while we're working lower in the tree.  See the paper		 * by Lehman and Yao for how this is detected and handled. (We use the		 * child link to disambiguate duplicate keys in the index -- Lehman		 * and Yao disallow duplicate keys.)		 */		new_stack = (BTStack) palloc(sizeof(BTStackData));		new_stack->bts_blkno = par_blkno;		new_stack->bts_offset = offnum;		memcpy(&new_stack->bts_btentry, itup, sizeof(IndexTupleData));		new_stack->bts_parent = stack_in;		/* drop the read lock on the parent page, acquire one on the child */		*bufP = _bt_relandgetbuf(rel, *bufP, blkno, BT_READ);		/* okay, all set to move down a level */		stack_in = new_stack;	}	return stack_in;}/* *	_bt_moveright() -- move right in the btree if necessary. * * When we follow a pointer to reach a page, it is possible that * the page has changed in the meanwhile.  If this happens, we're * guaranteed that the page has "split right" -- that is, that any * data that appeared on the page originally is either on the page * or strictly to the right of it. * * This routine decides whether or not we need to move right in the * tree by examining the high key entry on the page.  If that entry * is strictly less than the scankey, or <= the scankey in the nextkey=true * case, then we followed the wrong link and we need to move right. * * The passed scankey must be an insertion-type scankey (see nbtree/README), * but it can omit the rightmost column(s) of the index. * * When nextkey is false (the usual case), we are looking for the first * item >= scankey.  When nextkey is true, we are looking for the first * item strictly greater than scankey. * * On entry, we have the buffer pinned and a lock of the type specified by * 'access'.  If we move right, we release the buffer and lock and acquire * the same on the right sibling.  Return value is the buffer we stop at. */Buffer_bt_moveright(Relation rel,			  Buffer buf,			  int keysz,			  ScanKey scankey,			  bool nextkey,			  int access){	Page		page;	BTPageOpaque opaque;	int32		cmpval;	page = BufferGetPage(buf);	opaque = (BTPageOpaque) PageGetSpecialPointer(page);	/*	 * When nextkey = false (normal case): if the scan key that brought us to	 * this page is > the high key stored on the page, then the page has split	 * and we need to move right.  (If the scan key is equal to the high key,	 * we might or might not need to move right; have to scan the page first	 * anyway.)	 *	 * When nextkey = true: move right if the scan key is >= page's high key.	 *	 * The page could even have split more than once, so scan as far as	 * needed.	 *	 * We also have to move right if we followed a link that brought us to a	 * dead page.	 */	cmpval = nextkey ? 0 : 1;	while (!P_RIGHTMOST(opaque) &&		   (P_IGNORE(opaque) ||			_bt_compare(rel, keysz, scankey, page, P_HIKEY) >= cmpval))	{		/* step right one page */		BlockNumber rblkno = opaque->btpo_next;		buf = _bt_relandgetbuf(rel, buf, rblkno, access);		page = BufferGetPage(buf);		opaque = (BTPageOpaque) PageGetSpecialPointer(page);	}	if (P_IGNORE(opaque))		elog(ERROR, "fell off the end of index \"%s\"",			 RelationGetRelationName(rel));	return buf;}/* *	_bt_binsrch() -- Do a binary search for a key on a particular page. * * The passed scankey must be an insertion-type scankey (see nbtree/README), * but it can omit the rightmost column(s) of the index. * * When nextkey is false (the usual case), we are looking for the first * item >= scankey.  When nextkey is true, we are looking for the first * item strictly greater than scankey. * * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first * key >= given scankey, or > scankey if nextkey is true.  (NOTE: in * particular, this means it is possible to return a value 1 greater than the * number of keys on the page, if the scankey is > all keys on the page.) * * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber * of the last key < given scankey, or last key <= given scankey if nextkey * is true.  (Since _bt_compare treats the first data key of such a page as * minus infinity, there will be at least one key < scankey, so the result * always points at one of the keys on the page.)  This key indicates the * right place to descend to be sure we find all leaf keys >= given scankey * (or leaf keys > given scankey when nextkey is true). * * This procedure is not responsible for walking right, it just examines * the given page.	_bt_binsrch() has no lock or refcount side effects * on the buffer. */OffsetNumber_bt_binsrch(Relation rel,			Buffer buf,			int keysz,			ScanKey scankey,			bool nextkey){	Page		page;	BTPageOpaque opaque;	OffsetNumber low,				high;	int32		result,				cmpval;	page = BufferGetPage(buf);	opaque = (BTPageOpaque) PageGetSpecialPointer(page);	low = P_FIRSTDATAKEY(opaque);	high = PageGetMaxOffsetNumber(page);	/*	 * If there are no keys on the page, return the first available slot. Note	 * this covers two cases: the page is really empty (no keys), or it	 * contains only a high key.  The latter case is possible after vacuuming.	 * This can never happen on an internal page, however, since they are	 * never empty (an internal page must have children).	 */	if (high < low)		return low;	/*	 * Binary search to find the first key on the page >= scan key, or first	 * key > scankey when nextkey is true.	 *	 * For nextkey=false (cmpval=1), the loop invariant is: all slots before	 * 'low' are < scan key, all slots at or after 'high' are >= scan key.	 *	 * For nextkey=true (cmpval=0), the loop invariant is: all slots before	 * 'low' are <= scan key, all slots at or after 'high' are > scan key.	 *	 * We can fall out when high == low.	 */	high++;						/* establish the loop invariant for high */	cmpval = nextkey ? 0 : 1;	/* select comparison value */	while (high > low)	{		OffsetNumber mid = low + ((high - low) / 2);		/* We have low <= mid < high, so mid points at a real slot */		result = _bt_compare(rel, keysz, scankey, page, mid);		if (result >= cmpval)			low = mid + 1;		else			high = mid;	}	/*	 * At this point we have high == low, but be careful: they could point	 * past the last slot on the page.	 *	 * On a leaf page, we always return the first key >= scan key (resp. >	 * scan key), which could be the last slot + 1.	 */	if (P_ISLEAF(opaque))		return low;	/*	 * On a non-leaf page, return the last key < scan key (resp. <= scan key).	 * There must be one if _bt_compare() is playing by the rules.	 */	Assert(low > P_FIRSTDATAKEY(opaque));	return OffsetNumberPrev(low);}/*---------- *	_bt_compare() -- Compare scankey to a particular tuple on the page. * * The passed scankey must be an insertion-type scankey (see nbtree/README), * but it can omit the rightmost column(s) of the index. * *	keysz: number of key conditions to be checked (might be less than the *		number of index columns!) *	page/offnum: location of btree item to be compared to. * *		This routine returns: *			<0 if scankey < tuple at offnum; *			 0 if scankey == tuple at offnum; *			>0 if scankey > tuple at offnum. *		NULLs in the keys are treated as sortable values.  Therefore *		"equality" does not necessarily mean that the item should be *		returned to the caller as a matching key! * * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be * "minus infinity": this routine will always claim it is less than the * scankey.  The actual key value stored (if any, which there probably isn't) * does not matter.  This convention allows us to implement the Lehman and * Yao convention that the first down-link pointer is before the first key. * See backend/access/nbtree/README for details. *---------- */int32_bt_compare(Relation rel,			int keysz,			ScanKey scankey,			Page page,			OffsetNumber offnum){	TupleDesc	itupdesc = RelationGetDescr(rel);	BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);	IndexTuple	itup;	int			i;	/*	 * Force result ">" if target item is first data item on an internal page	 * --- see NOTE above.	 */	if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))		return 1;	itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));	/*	 * The scan key is set up with the attribute number associated with each	 * term in the key.  It is important that, if the index is multi-key, the	 * scan contain the first k key attributes, and that they be in order.	If	 * you think about how multi-key ordering works, you'll understand why	 * this is.	 *	 * We don't test for violation of this condition here, however.  The	 * initial setup for the index scan had better have gotten it right (see	 * _bt_first).	 */	for (i = 1; i <= keysz; i++)	{		Datum		datum;		bool		isNull;		int32		result;		datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);		/* see comments about NULLs handling in btbuild */		if (scankey->sk_flags & SK_ISNULL)		/* key is NULL */		{			if (isNull)				result = 0;		/* NULL "=" NULL */			else if (scankey->sk_flags & SK_BT_NULLS_FIRST)				result = -1;	/* NULL "<" NOT_NULL */			else				result = 1;		/* NULL ">" NOT_NULL */		}		else if (isNull)		/* key is NOT_NULL and item is NULL */		{			if (scankey->sk_flags & SK_BT_NULLS_FIRST)				result = 1;		/* NOT_NULL ">" NULL */			else				result = -1;	/* NOT_NULL "<" NULL */		}		else		{			/*			 * The sk_func needs to be passed the index value as left arg and			 * the sk_argument as right arg (they might be of different			 * types).	Since it is convenient for callers to think of			 * _bt_compare as comparing the scankey to the index item, we have			 * to flip the sign of the comparison result.  (Unless it's a DESC			 * column, in which case we *don't* flip the sign.)			 */			result = DatumGetInt32(FunctionCall2(&scankey->sk_func,												 datum,												 scankey->sk_argument));			if (!(scankey->sk_flags & SK_BT_DESC))				result = -result;		}		/* if the keys are unequal, return the difference */		if (result != 0)			return result;		scankey++;	}	/* if we get here, the keys are equal */	return 0;}/* *	_bt_first() -- Find the first item in a scan. * *		We need to be clever about the direction of scan, the search *		conditions, and the tree ordering.	We find the first item (or, *		if backwards scan, the last item) in the tree that satisfies the *		qualifications in the scan key.  On success exit, the page containing *		the current index tuple is pinned but not locked, and data about *		the matching tuple(s) on the page has been loaded into so->currPos, *		and scan->xs_ctup.t_self is set to the heap TID of the current tuple. * * If there are no matching items in the index, we return FALSE, with no * pins or locks held. * * Note that scan->keyData[], and the so->keyData[] scankey built from it, * are both search-type scankeys (see nbtree/README for more about this). * Within this routine, we build a temporary insertion-type scankey to use * in locating the scan start position. */bool_bt_first(IndexScanDesc scan, ScanDirection dir){	Relation	rel = scan->indexRelation;	BTScanOpaque so = (BTScanOpaque) scan->opaque;	Buffer		buf;	BTStack		stack;	OffsetNumber offnum;	StrategyNumber strat;	bool		nextkey;	bool		goback;	ScanKey		startKeys[INDEX_MAX_KEYS];	ScanKeyData scankeys[INDEX_MAX_KEYS];	int			keysCount = 0;	int			i;	StrategyNumber strat_total;	pgstat_count_index_scan(rel);	/*	 * Examine the scan keys and eliminate any redundant keys; also mark the	 * keys that must be matched to continue the scan.	 */	_bt_preprocess_keys(scan);	/*	 * Quit now if _bt_preprocess_keys() discovered that the scan keys can	 * never be satisfied (eg, x == 1 AND x > 2).	 */	if (!so->qual_ok)		return false;	/*----------	 * Examine the scan keys to discover where we need to start the scan.	 *	 * We want to identify the keys that can be used as starting boundaries;	 * these are =, >, or >= keys for a forward scan or =, <, <= keys for	 * a backwards scan.  We can use keys for multiple attributes so long as	 * the prior attributes had only =, >= (resp. =, <=) keys.	Once we accept	 * a > or < boundary or find an attribute with no boundary (which can be	 * thought of as the same as "> -infinity"), we can't use keys for any	 * attributes to its right, because it would break our simplistic notion	 * of what initial positioning strategy to use.	 *	 * When the scan keys include cross-type operators, _bt_preprocess_keys	 * may not be able to eliminate redundant keys; in such cases we will	 * arbitrarily pick a usable one for each attribute.  This is correct	 * but possibly not optimal behavior.  (For example, with keys like	 * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when	 * x=5 would be more efficient.)  Since the situation only arises given	 * a poorly-worded query plus an incomplete opfamily, live with it.	 *	 * When both equality and inequality keys appear for a single attribute	 * (again, only possible when cross-type operators appear), we *must*	 * select one of the equality keys for the starting point, because	 * _bt_checkkeys() will stop the scan as soon as an equality qual fails.	 * For example, if we have keys like "x >= 4 AND x = 10" and we elect to	 * start at x=4, we will fail and stop before reaching x=10.  If multiple	 * equality quals survive preprocessing, however, it doesn't matter which	 * one we use --- by definition, they are either redundant or
nbtsearch.c - 源码说明

本页面展示了「postgresql8.3.4源码,开源数据库」中的 nbtsearch.c 源码文件，采用 C语言编程语言编写，共 1,492 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与postgresql相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?