📄 nbtsearch.c
字号:
/*------------------------------------------------------------------------- * * nbtsearch.c * Search code for postgres btrees. * * * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.80 2003/08/08 21:41:27 momjian Exp $ * *------------------------------------------------------------------------- */#include "postgres.h"#include "access/genam.h"#include "access/nbtree.h"static Buffer _bt_walk_left(Relation rel, Buffer buf);static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);/* * _bt_search() -- Search the tree for a particular scankey, * or more precisely for the first leaf page it could be on. * * Return value is a stack of parent-page pointers. *bufP is set to the * address of the leaf-page buffer, which is read-locked and pinned. * No locks are held on the parent pages, however! * * NOTE that the returned buffer is read-locked regardless of the access * parameter. However, access = BT_WRITE will allow an empty root page * to be created and returned. When access = BT_READ, an empty index * will result in *bufP being set to InvalidBuffer. */BTStack_bt_search(Relation rel, int keysz, ScanKey scankey, Buffer *bufP, int access){ BTStack stack_in = NULL; /* Get the root page to start with */ *bufP = _bt_getroot(rel, access); /* If index is empty and access = BT_READ, no root page is created. */ if (!BufferIsValid(*bufP)) return (BTStack) NULL; /* Loop iterates once per level descended in the tree */ for (;;) { Page page; BTPageOpaque opaque; OffsetNumber offnum; ItemId itemid; BTItem btitem; IndexTuple itup; BlockNumber blkno; BlockNumber par_blkno; BTStack new_stack; /* * Race -- the page we just grabbed may have split since we read * its pointer in the parent (or metapage). If it has, we may * need to move right to its new sibling. Do that. */ *bufP = _bt_moveright(rel, *bufP, keysz, scankey, BT_READ); /* if this is a leaf page, we're done */ page = BufferGetPage(*bufP); opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (P_ISLEAF(opaque)) break; /* * Find the appropriate item on the internal page, and get the * child page that it points to. */ offnum = _bt_binsrch(rel, *bufP, keysz, scankey); itemid = PageGetItemId(page, offnum); btitem = (BTItem) PageGetItem(page, itemid); itup = &(btitem->bti_itup); blkno = ItemPointerGetBlockNumber(&(itup->t_tid)); par_blkno = BufferGetBlockNumber(*bufP); /* * We need to save the location of the index entry we chose in the * parent page on a stack. In case we split the tree, we'll use * the stack to work back up to the parent page. We also save the * actual downlink (TID) to uniquely identify the index entry, in * case it moves right while we're working lower in the tree. See * the paper by Lehman and Yao for how this is detected and * handled. (We use the child link to disambiguate duplicate keys * in the index -- Lehman and Yao disallow duplicate keys.) */ new_stack = (BTStack) palloc(sizeof(BTStackData)); new_stack->bts_blkno = par_blkno; new_stack->bts_offset = offnum; memcpy(&new_stack->bts_btitem, btitem, sizeof(BTItemData)); new_stack->bts_parent = stack_in; /* drop the read lock on the parent page, acquire one on the child */ _bt_relbuf(rel, *bufP); *bufP = _bt_getbuf(rel, blkno, BT_READ); /* okay, all set to move down a level */ stack_in = new_stack; } return stack_in;}/* * _bt_moveright() -- move right in the btree if necessary. * * When we follow a pointer to reach a page, it is possible that * the page has changed in the meanwhile. If this happens, we're * guaranteed that the page has "split right" -- that is, that any * data that appeared on the page originally is either on the page * or strictly to the right of it. * * This routine decides whether or not we need to move right in the * tree by examining the high key entry on the page. If that entry * is strictly less than one we expect to be on the page, then our * picture of the page is incorrect and we need to move right. * * On entry, we have the buffer pinned and a lock of the proper type. * If we move right, we release the buffer and lock and acquire the * same on the right sibling. Return value is the buffer we stop at. */Buffer_bt_moveright(Relation rel, Buffer buf, int keysz, ScanKey scankey, int access){ Page page; BTPageOpaque opaque; page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); /* * If the scan key that brought us to this page is > the high key * stored on the page, then the page has split and we need to move * right. (If the scan key is equal to the high key, we might or * might not need to move right; have to scan the page first anyway.) * It could even have split more than once, so scan as far as needed. * * We also have to move right if we followed a link that brought us to a * dead page. */ while (!P_RIGHTMOST(opaque) && (P_IGNORE(opaque) || _bt_compare(rel, keysz, scankey, page, P_HIKEY) > 0)) { /* step right one page */ BlockNumber rblkno = opaque->btpo_next; _bt_relbuf(rel, buf); buf = _bt_getbuf(rel, rblkno, access); page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); } if (P_IGNORE(opaque)) elog(ERROR, "fell off the end of \"%s\"", RelationGetRelationName(rel)); return buf;}/* * _bt_binsrch() -- Do a binary search for a key on a particular page. * * The scankey we get has the compare function stored in the procedure * entry of each data struct. We invoke this regproc to do the * comparison for every key in the scankey. * * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first * key >= given scankey. (NOTE: in particular, this means it is possible * to return a value 1 greater than the number of keys on the page, * if the scankey is > all keys on the page.) * * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber * of the last key < given scankey. (Since _bt_compare treats the first * data key of such a page as minus infinity, there will be at least one * key < scankey, so the result always points at one of the keys on the * page.) This key indicates the right place to descend to be sure we * find all leaf keys >= given scankey. * * This procedure is not responsible for walking right, it just examines * the given page. _bt_binsrch() has no lock or refcount side effects * on the buffer. */OffsetNumber_bt_binsrch(Relation rel, Buffer buf, int keysz, ScanKey scankey){ TupleDesc itupdesc; Page page; BTPageOpaque opaque; OffsetNumber low, high; int32 result; itupdesc = RelationGetDescr(rel); page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); low = P_FIRSTDATAKEY(opaque); high = PageGetMaxOffsetNumber(page); /* * If there are no keys on the page, return the first available slot. * Note this covers two cases: the page is really empty (no keys), or * it contains only a high key. The latter case is possible after * vacuuming. This can never happen on an internal page, however, * since they are never empty (an internal page must have children). */ if (high < low) return low; /* * Binary search to find the first key on the page >= scan key. Loop * invariant: all slots before 'low' are < scan key, all slots at or * after 'high' are >= scan key. We can fall out when high == low. */ high++; /* establish the loop invariant for high */ while (high > low) { OffsetNumber mid = low + ((high - low) / 2); /* We have low <= mid < high, so mid points at a real slot */ result = _bt_compare(rel, keysz, scankey, page, mid); if (result > 0) low = mid + 1; else high = mid; } /* * At this point we have high == low, but be careful: they could point * past the last slot on the page. * * On a leaf page, we always return the first key >= scan key (which * could be the last slot + 1). */ if (P_ISLEAF(opaque)) return low; /* * On a non-leaf page, return the last key < scan key. There must be * one if _bt_compare() is playing by the rules. */ Assert(low > P_FIRSTDATAKEY(opaque)); return OffsetNumberPrev(low);}/*---------- * _bt_compare() -- Compare scankey to a particular tuple on the page. * * keysz: number of key conditions to be checked (might be less than the * total length of the scan key!) * page/offnum: location of btree item to be compared to. * * This routine returns: * <0 if scankey < tuple at offnum; * 0 if scankey == tuple at offnum; * >0 if scankey > tuple at offnum. * NULLs in the keys are treated as sortable values. Therefore * "equality" does not necessarily mean that the item should be * returned to the caller as a matching key! * * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be * "minus infinity": this routine will always claim it is less than the * scankey. The actual key value stored (if any, which there probably isn't) * does not matter. This convention allows us to implement the Lehman and * Yao convention that the first down-link pointer is before the first key. * See backend/access/nbtree/README for details. *---------- */int32_bt_compare(Relation rel, int keysz, ScanKey scankey, Page page, OffsetNumber offnum){ TupleDesc itupdesc = RelationGetDescr(rel); BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); BTItem btitem; IndexTuple itup; int i; /* * Force result ">" if target item is first data item on an internal * page --- see NOTE above. */ if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque)) return 1; btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum)); itup = &(btitem->bti_itup); /* * The scan key is set up with the attribute number associated with * each term in the key. It is important that, if the index is * multi-key, the scan contain the first k key attributes, and that * they be in order. If you think about how multi-key ordering works, * you'll understand why this is. * * We don't test for violation of this condition here, however. The * initial setup for the index scan had better have gotten it right * (see _bt_first). */ for (i = 0; i < keysz; i++) { ScanKey entry = &scankey[i]; Datum datum; bool isNull; int32 result; datum = index_getattr(itup, entry->sk_attno, itupdesc, &isNull); /* see comments about NULLs handling in btbuild */ if (entry->sk_flags & SK_ISNULL) /* key is NULL */ { if (isNull) result = 0; /* NULL "=" NULL */ else result = 1; /* NULL ">" NOT_NULL */ } else if (isNull) /* key is NOT_NULL and item is NULL */ { result = -1; /* NOT_NULL "<" NULL */ } else { result = DatumGetInt32(FunctionCall2(&entry->sk_func, entry->sk_argument, datum)); } /* if the keys are unequal, return the difference */ if (result != 0) return result; } /* if we get here, the keys are equal */ return 0;}/* * _bt_next() -- Get the next item in a scan. * * On entry, we have a valid currentItemData in the scan, and a * read lock and pin count on the page that contains that item. * We return the next item in the scan, or false if no more. * On successful exit, the page containing the new item is locked * and pinned; on failure exit, no lock or pin is held. */bool_bt_next(IndexScanDesc scan, ScanDirection dir){ Relation rel; Buffer buf; Page page; OffsetNumber offnum; ItemPointer current; BTItem btitem; IndexTuple itup; BTScanOpaque so; bool continuescan; rel = scan->indexRelation; so = (BTScanOpaque) scan->opaque; current = &(scan->currentItemData); /* we still have the buffer pinned and locked */ buf = so->btso_curbuf; Assert(BufferIsValid(buf)); do { /* step one tuple in the appropriate direction */ if (!_bt_step(scan, &buf, dir)) return false; /* current is the next candidate tuple to return */ offnum = ItemPointerGetOffsetNumber(current); page = BufferGetPage(buf); btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum)); itup = &btitem->bti_itup; if (_bt_checkkeys(scan, itup, dir, &continuescan)) { /* tuple passes all scan key conditions, so return it */ scan->xs_ctup.t_self = itup->t_tid;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -