📄 nbtsearch.c
字号:
/*------------------------------------------------------------------------- * * nbtsearch.c * Search code for postgres btrees. * * * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.116 2008/01/01 19:45:46 momjian Exp $ * *------------------------------------------------------------------------- */#include "postgres.h"#include "access/genam.h"#include "access/nbtree.h"#include "pgstat.h"#include "utils/lsyscache.h"static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum);static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);static Buffer _bt_walk_left(Relation rel, Buffer buf);static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);/* * _bt_search() -- Search the tree for a particular scankey, * or more precisely for the first leaf page it could be on. * * The passed scankey must be an insertion-type scankey (see nbtree/README), * but it can omit the rightmost column(s) of the index. * * When nextkey is false (the usual case), we are looking for the first * item >= scankey. When nextkey is true, we are looking for the first * item strictly greater than scankey. * * Return value is a stack of parent-page pointers. *bufP is set to the * address of the leaf-page buffer, which is read-locked and pinned. * No locks are held on the parent pages, however! * * NOTE that the returned buffer is read-locked regardless of the access * parameter. However, access = BT_WRITE will allow an empty root page * to be created and returned. When access = BT_READ, an empty index * will result in *bufP being set to InvalidBuffer. */BTStack_bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey, Buffer *bufP, int access){ BTStack stack_in = NULL; /* Get the root page to start with */ *bufP = _bt_getroot(rel, access); /* If index is empty and access = BT_READ, no root page is created. */ if (!BufferIsValid(*bufP)) return (BTStack) NULL; /* Loop iterates once per level descended in the tree */ for (;;) { Page page; BTPageOpaque opaque; OffsetNumber offnum; ItemId itemid; IndexTuple itup; BlockNumber blkno; BlockNumber par_blkno; BTStack new_stack; /* * Race -- the page we just grabbed may have split since we read its * pointer in the parent (or metapage). If it has, we may need to * move right to its new sibling. Do that. */ *bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey, BT_READ); /* if this is a leaf page, we're done */ page = BufferGetPage(*bufP); opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (P_ISLEAF(opaque)) break; /* * Find the appropriate item on the internal page, and get the child * page that it points to. */ offnum = _bt_binsrch(rel, *bufP, keysz, scankey, nextkey); itemid = PageGetItemId(page, offnum); itup = (IndexTuple) PageGetItem(page, itemid); blkno = ItemPointerGetBlockNumber(&(itup->t_tid)); par_blkno = BufferGetBlockNumber(*bufP); /* * We need to save the location of the index entry we chose in the * parent page on a stack. In case we split the tree, we'll use the * stack to work back up to the parent page. We also save the actual * downlink (TID) to uniquely identify the index entry, in case it * moves right while we're working lower in the tree. See the paper * by Lehman and Yao for how this is detected and handled. (We use the * child link to disambiguate duplicate keys in the index -- Lehman * and Yao disallow duplicate keys.) */ new_stack = (BTStack) palloc(sizeof(BTStackData)); new_stack->bts_blkno = par_blkno; new_stack->bts_offset = offnum; memcpy(&new_stack->bts_btentry, itup, sizeof(IndexTupleData)); new_stack->bts_parent = stack_in; /* drop the read lock on the parent page, acquire one on the child */ *bufP = _bt_relandgetbuf(rel, *bufP, blkno, BT_READ); /* okay, all set to move down a level */ stack_in = new_stack; } return stack_in;}/* * _bt_moveright() -- move right in the btree if necessary. * * When we follow a pointer to reach a page, it is possible that * the page has changed in the meanwhile. If this happens, we're * guaranteed that the page has "split right" -- that is, that any * data that appeared on the page originally is either on the page * or strictly to the right of it. * * This routine decides whether or not we need to move right in the * tree by examining the high key entry on the page. If that entry * is strictly less than the scankey, or <= the scankey in the nextkey=true * case, then we followed the wrong link and we need to move right. * * The passed scankey must be an insertion-type scankey (see nbtree/README), * but it can omit the rightmost column(s) of the index. * * When nextkey is false (the usual case), we are looking for the first * item >= scankey. When nextkey is true, we are looking for the first * item strictly greater than scankey. * * On entry, we have the buffer pinned and a lock of the type specified by * 'access'. If we move right, we release the buffer and lock and acquire * the same on the right sibling. Return value is the buffer we stop at. */Buffer_bt_moveright(Relation rel, Buffer buf, int keysz, ScanKey scankey, bool nextkey, int access){ Page page; BTPageOpaque opaque; int32 cmpval; page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); /* * When nextkey = false (normal case): if the scan key that brought us to * this page is > the high key stored on the page, then the page has split * and we need to move right. (If the scan key is equal to the high key, * we might or might not need to move right; have to scan the page first * anyway.) * * When nextkey = true: move right if the scan key is >= page's high key. * * The page could even have split more than once, so scan as far as * needed. * * We also have to move right if we followed a link that brought us to a * dead page. */ cmpval = nextkey ? 0 : 1; while (!P_RIGHTMOST(opaque) && (P_IGNORE(opaque) || _bt_compare(rel, keysz, scankey, page, P_HIKEY) >= cmpval)) { /* step right one page */ BlockNumber rblkno = opaque->btpo_next; buf = _bt_relandgetbuf(rel, buf, rblkno, access); page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); } if (P_IGNORE(opaque)) elog(ERROR, "fell off the end of index \"%s\"", RelationGetRelationName(rel)); return buf;}/* * _bt_binsrch() -- Do a binary search for a key on a particular page. * * The passed scankey must be an insertion-type scankey (see nbtree/README), * but it can omit the rightmost column(s) of the index. * * When nextkey is false (the usual case), we are looking for the first * item >= scankey. When nextkey is true, we are looking for the first * item strictly greater than scankey. * * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first * key >= given scankey, or > scankey if nextkey is true. (NOTE: in * particular, this means it is possible to return a value 1 greater than the * number of keys on the page, if the scankey is > all keys on the page.) * * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber * of the last key < given scankey, or last key <= given scankey if nextkey * is true. (Since _bt_compare treats the first data key of such a page as * minus infinity, there will be at least one key < scankey, so the result * always points at one of the keys on the page.) This key indicates the * right place to descend to be sure we find all leaf keys >= given scankey * (or leaf keys > given scankey when nextkey is true). * * This procedure is not responsible for walking right, it just examines * the given page. _bt_binsrch() has no lock or refcount side effects * on the buffer. */OffsetNumber_bt_binsrch(Relation rel, Buffer buf, int keysz, ScanKey scankey, bool nextkey){ Page page; BTPageOpaque opaque; OffsetNumber low, high; int32 result, cmpval; page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); low = P_FIRSTDATAKEY(opaque); high = PageGetMaxOffsetNumber(page); /* * If there are no keys on the page, return the first available slot. Note * this covers two cases: the page is really empty (no keys), or it * contains only a high key. The latter case is possible after vacuuming. * This can never happen on an internal page, however, since they are * never empty (an internal page must have children). */ if (high < low) return low; /* * Binary search to find the first key on the page >= scan key, or first * key > scankey when nextkey is true. * * For nextkey=false (cmpval=1), the loop invariant is: all slots before * 'low' are < scan key, all slots at or after 'high' are >= scan key. * * For nextkey=true (cmpval=0), the loop invariant is: all slots before * 'low' are <= scan key, all slots at or after 'high' are > scan key. * * We can fall out when high == low. */ high++; /* establish the loop invariant for high */ cmpval = nextkey ? 0 : 1; /* select comparison value */ while (high > low) { OffsetNumber mid = low + ((high - low) / 2); /* We have low <= mid < high, so mid points at a real slot */ result = _bt_compare(rel, keysz, scankey, page, mid); if (result >= cmpval) low = mid + 1; else high = mid; } /* * At this point we have high == low, but be careful: they could point * past the last slot on the page. * * On a leaf page, we always return the first key >= scan key (resp. > * scan key), which could be the last slot + 1. */ if (P_ISLEAF(opaque)) return low; /* * On a non-leaf page, return the last key < scan key (resp. <= scan key). * There must be one if _bt_compare() is playing by the rules. */ Assert(low > P_FIRSTDATAKEY(opaque)); return OffsetNumberPrev(low);}/*---------- * _bt_compare() -- Compare scankey to a particular tuple on the page. * * The passed scankey must be an insertion-type scankey (see nbtree/README), * but it can omit the rightmost column(s) of the index. * * keysz: number of key conditions to be checked (might be less than the * number of index columns!) * page/offnum: location of btree item to be compared to. * * This routine returns: * <0 if scankey < tuple at offnum; * 0 if scankey == tuple at offnum; * >0 if scankey > tuple at offnum. * NULLs in the keys are treated as sortable values. Therefore * "equality" does not necessarily mean that the item should be * returned to the caller as a matching key! * * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be * "minus infinity": this routine will always claim it is less than the * scankey. The actual key value stored (if any, which there probably isn't) * does not matter. This convention allows us to implement the Lehman and * Yao convention that the first down-link pointer is before the first key. * See backend/access/nbtree/README for details. *---------- */int32_bt_compare(Relation rel, int keysz, ScanKey scankey, Page page, OffsetNumber offnum){ TupleDesc itupdesc = RelationGetDescr(rel); BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); IndexTuple itup; int i; /* * Force result ">" if target item is first data item on an internal page * --- see NOTE above. */ if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque)) return 1; itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); /* * The scan key is set up with the attribute number associated with each * term in the key. It is important that, if the index is multi-key, the * scan contain the first k key attributes, and that they be in order. If * you think about how multi-key ordering works, you'll understand why * this is. * * We don't test for violation of this condition here, however. The * initial setup for the index scan had better have gotten it right (see * _bt_first). */ for (i = 1; i <= keysz; i++) { Datum datum; bool isNull; int32 result; datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull); /* see comments about NULLs handling in btbuild */ if (scankey->sk_flags & SK_ISNULL) /* key is NULL */ { if (isNull) result = 0; /* NULL "=" NULL */ else if (scankey->sk_flags & SK_BT_NULLS_FIRST) result = -1; /* NULL "<" NOT_NULL */ else result = 1; /* NULL ">" NOT_NULL */ } else if (isNull) /* key is NOT_NULL and item is NULL */ { if (scankey->sk_flags & SK_BT_NULLS_FIRST) result = 1; /* NOT_NULL ">" NULL */ else result = -1; /* NOT_NULL "<" NULL */ } else { /* * The sk_func needs to be passed the index value as left arg and * the sk_argument as right arg (they might be of different * types). Since it is convenient for callers to think of * _bt_compare as comparing the scankey to the index item, we have * to flip the sign of the comparison result. (Unless it's a DESC * column, in which case we *don't* flip the sign.) */ result = DatumGetInt32(FunctionCall2(&scankey->sk_func, datum, scankey->sk_argument)); if (!(scankey->sk_flags & SK_BT_DESC)) result = -result; } /* if the keys are unequal, return the difference */ if (result != 0) return result; scankey++; } /* if we get here, the keys are equal */ return 0;}/* * _bt_first() -- Find the first item in a scan. * * We need to be clever about the direction of scan, the search * conditions, and the tree ordering. We find the first item (or, * if backwards scan, the last item) in the tree that satisfies the * qualifications in the scan key. On success exit, the page containing * the current index tuple is pinned but not locked, and data about * the matching tuple(s) on the page has been loaded into so->currPos, * and scan->xs_ctup.t_self is set to the heap TID of the current tuple. * * If there are no matching items in the index, we return FALSE, with no * pins or locks held. * * Note that scan->keyData[], and the so->keyData[] scankey built from it, * are both search-type scankeys (see nbtree/README for more about this). * Within this routine, we build a temporary insertion-type scankey to use * in locating the scan start position. */bool_bt_first(IndexScanDesc scan, ScanDirection dir){ Relation rel = scan->indexRelation; BTScanOpaque so = (BTScanOpaque) scan->opaque; Buffer buf; BTStack stack; OffsetNumber offnum; StrategyNumber strat; bool nextkey; bool goback; ScanKey startKeys[INDEX_MAX_KEYS]; ScanKeyData scankeys[INDEX_MAX_KEYS]; int keysCount = 0; int i; StrategyNumber strat_total; pgstat_count_index_scan(rel); /* * Examine the scan keys and eliminate any redundant keys; also mark the * keys that must be matched to continue the scan. */ _bt_preprocess_keys(scan); /* * Quit now if _bt_preprocess_keys() discovered that the scan keys can * never be satisfied (eg, x == 1 AND x > 2). */ if (!so->qual_ok) return false; /*---------- * Examine the scan keys to discover where we need to start the scan. * * We want to identify the keys that can be used as starting boundaries; * these are =, >, or >= keys for a forward scan or =, <, <= keys for * a backwards scan. We can use keys for multiple attributes so long as * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept * a > or < boundary or find an attribute with no boundary (which can be * thought of as the same as "> -infinity"), we can't use keys for any * attributes to its right, because it would break our simplistic notion * of what initial positioning strategy to use. * * When the scan keys include cross-type operators, _bt_preprocess_keys * may not be able to eliminate redundant keys; in such cases we will * arbitrarily pick a usable one for each attribute. This is correct * but possibly not optimal behavior. (For example, with keys like * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when * x=5 would be more efficient.) Since the situation only arises given * a poorly-worded query plus an incomplete opfamily, live with it. * * When both equality and inequality keys appear for a single attribute * (again, only possible when cross-type operators appear), we *must* * select one of the equality keys for the starting point, because * _bt_checkkeys() will stop the scan as soon as an equality qual fails. * For example, if we have keys like "x >= 4 AND x = 10" and we elect to * start at x=4, we will fail and stop before reaching x=10. If multiple * equality quals survive preprocessing, however, it doesn't matter which * one we use --- by definition, they are either redundant or
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -