📄 nbtsearch.c
字号:
/*------------------------------------------------------------------------- * * btsearch.c * search code for postgres btrees. * * Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION * $Header: /usr/local/cvsroot/pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.45.2.1 1999/08/02 05:56:42 scrappy Exp $ * *------------------------------------------------------------------------- */#include "postgres.h"#include "access/genam.h"#include "access/nbtree.h"static BTStack _bt_searchr(Relation rel, int keysz, ScanKey scankey, Buffer *bufP, BTStack stack_in);static OffsetNumber _bt_firsteq(Relation rel, TupleDesc itupdesc, Page page, Size keysz, ScanKey scankey, OffsetNumber offnum);static int _bt_compare(Relation rel, TupleDesc itupdesc, Page page, int keysz, ScanKey scankey, OffsetNumber offnum);static bool _bt_twostep(IndexScanDesc scan, Buffer *bufP, ScanDirection dir);static RetrieveIndexResult _bt_endpoint(IndexScanDesc scan, ScanDirection dir);/* * _bt_search() -- Search for a scan key in the index. * * This routine is actually just a helper that sets things up and * calls a recursive-descent search routine on the tree. */BTStack_bt_search(Relation rel, int keysz, ScanKey scankey, Buffer *bufP){ *bufP = _bt_getroot(rel, BT_READ); return _bt_searchr(rel, keysz, scankey, bufP, (BTStack) NULL);}/* * _bt_searchr() -- Search the tree recursively for a particular scankey. */static BTStack_bt_searchr(Relation rel, int keysz, ScanKey scankey, Buffer *bufP, BTStack stack_in){ BTStack stack; OffsetNumber offnum; Page page; BTPageOpaque opaque; BlockNumber par_blkno; BlockNumber blkno; ItemId itemid; BTItem btitem; BTItem item_save; int item_nbytes; IndexTuple itup; /* if this is a leaf page, we're done */ page = BufferGetPage(*bufP); opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (opaque->btpo_flags & BTP_LEAF) return stack_in; /* * Find the appropriate item on the internal page, and get the child * page that it points to. */ par_blkno = BufferGetBlockNumber(*bufP); offnum = _bt_binsrch(rel, *bufP, keysz, scankey, BT_DESCENT); itemid = PageGetItemId(page, offnum); btitem = (BTItem) PageGetItem(page, itemid); itup = &(btitem->bti_itup); blkno = ItemPointerGetBlockNumber(&(itup->t_tid)); /* * We need to save the bit image of the index entry we chose in the * parent page on a stack. In case we split the tree, we'll use this * bit image to figure out what our real parent page is, in case the * parent splits while we're working lower in the tree. See the paper * by Lehman and Yao for how this is detected and handled. (We use * unique OIDs to disambiguate duplicate keys in the index -- Lehman * and Yao disallow duplicate keys). */ item_nbytes = ItemIdGetLength(itemid); item_save = (BTItem) palloc(item_nbytes); memmove((char *) item_save, (char *) btitem, item_nbytes); stack = (BTStack) palloc(sizeof(BTStackData)); stack->bts_blkno = par_blkno; stack->bts_offset = offnum; stack->bts_btitem = item_save; stack->bts_parent = stack_in; /* drop the read lock on the parent page and acquire one on the child */ _bt_relbuf(rel, *bufP, BT_READ); *bufP = _bt_getbuf(rel, blkno, BT_READ); /* * Race -- the page we just grabbed may have split since we read its * pointer in the parent. If it has, we may need to move right to its * new sibling. Do that. */ *bufP = _bt_moveright(rel, *bufP, keysz, scankey, BT_READ); /* okay, all set to move down a level */ return _bt_searchr(rel, keysz, scankey, bufP, stack);}/* * _bt_moveright() -- move right in the btree if necessary. * * When we drop and reacquire a pointer to a page, it is possible that * the page has changed in the meanwhile. If this happens, we're * guaranteed that the page has "split right" -- that is, that any * data that appeared on the page originally is either on the page * or strictly to the right of it. * * This routine decides whether or not we need to move right in the * tree by examining the high key entry on the page. If that entry * is strictly less than one we expect to be on the page, then our * picture of the page is incorrect and we need to move right. * * On entry, we have the buffer pinned and a lock of the proper type. * If we move right, we release the buffer and lock and acquire the * same on the right sibling. */Buffer_bt_moveright(Relation rel, Buffer buf, int keysz, ScanKey scankey, int access){ Page page; BTPageOpaque opaque; ItemId hikey; BlockNumber rblkno; int natts = rel->rd_rel->relnatts; page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); /* if we're on a rightmost page, we don't need to move right */ if (P_RIGHTMOST(opaque)) return buf; /* by convention, item 0 on non-rightmost pages is the high key */ hikey = PageGetItemId(page, P_HIKEY); /* * If the scan key that brought us to this page is >= the high key * stored on the page, then the page has split and we need to move * right. */ if (_bt_skeycmp(rel, keysz, scankey, page, hikey, BTGreaterEqualStrategyNumber)) { /* move right as long as we need to */ do { OffsetNumber offmax = PageGetMaxOffsetNumber(page); /* * If this page consists of all duplicate keys (hikey and * first key on the page have the same value), then we don't * need to step right. * * NOTE for multi-column indices: we may do scan using keys not * for all attrs. But we handle duplicates using all attrs in * _bt_insert/_bt_spool code. And so we've to compare scankey * with _last_ item on this page to do not lose "good" tuples * if number of attrs > keysize. Example: (2,0) - last items * on this page, (2,1) - first item on next page (hikey), our * scankey is x = 2. Scankey == (2,1) because of we compare * first attrs only, but we shouldn't to move right of here. - * vadim 04/15/97 * * Also, if this page is not LEAF one (and # of attrs > keysize) * then we can't move too. - vadim 10/22/97 */ if (_bt_skeycmp(rel, keysz, scankey, page, hikey, BTEqualStrategyNumber)) { if (opaque->btpo_flags & BTP_CHAIN) { Assert((opaque->btpo_flags & BTP_LEAF) || offmax > P_HIKEY); break; } if (offmax > P_HIKEY) { if (natts == keysz) /* sanity checks */ { if (_bt_skeycmp(rel, keysz, scankey, page, PageGetItemId(page, P_FIRSTKEY), BTEqualStrategyNumber)) elog(FATAL, "btree: BTP_CHAIN flag was expected in %s (access = %s)", rel->rd_rel->relname.data, access ? "bt_write" : "bt_read"); if (_bt_skeycmp(rel, keysz, scankey, page, PageGetItemId(page, offmax), BTEqualStrategyNumber)) elog(FATAL, "btree: unexpected equal last item"); if (_bt_skeycmp(rel, keysz, scankey, page, PageGetItemId(page, offmax), BTLessStrategyNumber)) elog(FATAL, "btree: unexpected greater last item"); /* move right */ } else if (!(opaque->btpo_flags & BTP_LEAF)) break; else if (_bt_skeycmp(rel, keysz, scankey, page, PageGetItemId(page, offmax), BTLessEqualStrategyNumber)) break; } } /* step right one page */ rblkno = opaque->btpo_next; _bt_relbuf(rel, buf, access); buf = _bt_getbuf(rel, rblkno, access); page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); hikey = PageGetItemId(page, P_HIKEY); } while (!P_RIGHTMOST(opaque) && _bt_skeycmp(rel, keysz, scankey, page, hikey, BTGreaterEqualStrategyNumber)); } return buf;}/* * _bt_skeycmp() -- compare a scan key to a particular item on a page using * a requested strategy (<, <=, =, >=, >). * * We ignore the unique OIDs stored in the btree item here. Those * numbers are intended for use internally only, in repositioning a * scan after a page split. They do not impose any meaningful ordering. * * The comparison is A <op> B, where A is the scan key and B is the * tuple pointed at by itemid on page. */bool_bt_skeycmp(Relation rel, Size keysz, ScanKey scankey, Page page, ItemId itemid, StrategyNumber strat){ BTItem item; IndexTuple indexTuple; TupleDesc tupDes; ScanKey entry; int i; Datum attrDatum; Datum keyDatum; bool compare; bool isNull; bool useEqual = false; bool keyNull; if (strat == BTLessEqualStrategyNumber) { useEqual = true; strat = BTLessStrategyNumber; } else if (strat == BTGreaterEqualStrategyNumber) { useEqual = true; strat = BTGreaterStrategyNumber; } item = (BTItem) PageGetItem(page, itemid); indexTuple = &(item->bti_itup); tupDes = RelationGetDescr(rel); /* see if the comparison is true for all of the key attributes */ for (i = 1; i <= keysz; i++) { entry = &scankey[i - 1]; Assert(entry->sk_attno == i); attrDatum = index_getattr(indexTuple, entry->sk_attno, tupDes, &isNull); keyDatum = entry->sk_argument; /* see comments about NULLs handling in btbuild */ if (entry->sk_flags & SK_ISNULL) /* key is NULL */ { Assert(entry->sk_procedure == F_NULLVALUE); keyNull = true; if (isNull) compare = (strat == BTEqualStrategyNumber) ? true : false; else compare = (strat == BTGreaterStrategyNumber) ? true : false; } else if (isNull) /* key is NOT_NULL and item is NULL */ { keyNull = false; compare = (strat == BTLessStrategyNumber) ? true : false; } else { keyNull = false; compare = _bt_invokestrat(rel, i, strat, keyDatum, attrDatum); } if (compare) /* true for one of ">, <, =" */ { if (strat != BTEqualStrategyNumber) return true; } else/* false for one of ">, <, =" */ { if (strat == BTEqualStrategyNumber) return false; /* * if original strat was "<=, >=" OR "<, >" but some * attribute(s) left - need to test for Equality */ if (useEqual || i < keysz) { if (keyNull || isNull) compare = (keyNull && isNull) ? true : false; else compare = _bt_invokestrat(rel, i, BTEqualStrategyNumber, keyDatum, attrDatum); if (compare) /* key' and item' attributes are equal */ continue; /* - try to compare next attributes */ } return false; } } return true;}/* * _bt_binsrch() -- Do a binary search for a key on a particular page. * * The scankey we get has the compare function stored in the procedure * entry of each data struct. We invoke this regproc to do the * comparison for every key in the scankey. _bt_binsrch() returns * the OffsetNumber of the first matching key on the page, or the * OffsetNumber at which the matching key would appear if it were * on this page. * * By the time this procedure is called, we're sure we're looking * at the right page -- don't need to walk right. _bt_binsrch() has * no lock or refcount side effects on the buffer. */OffsetNumber_bt_binsrch(Relation rel, Buffer buf, int keysz, ScanKey scankey, int srchtype){ TupleDesc itupdesc; Page page; BTPageOpaque opaque; OffsetNumber low, mid, high; int natts = rel->rd_rel->relnatts; int result; itupdesc = RelationGetDescr(rel); page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); /* by convention, item 1 on any non-rightmost page is the high key */ low = mid = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY; high = PageGetMaxOffsetNumber(page); /* * Since for non-rightmost pages, the first item on the page is the * high key, there are two notions of emptiness. One is if nothing * appears on the page. The other is if nothing but the high key * does. The reason we test high <= low, rather than high == low, is * that after vacuuming there may be nothing *but* the high key on a * page. In that case, given the scheme above, low = 2 and high = 1. */ if (PageIsEmpty(page)) return low; if ((!P_RIGHTMOST(opaque) && high <= low)) { if (high < low || (srchtype == BT_DESCENT && !(opaque->btpo_flags & BTP_LEAF))) return low; /* It's insertion and high == low == 2 */ result = _bt_compare(rel, itupdesc, page, keysz, scankey, low); if (result > 0) return OffsetNumberNext(low); return low; } while ((high - low) > 1) { mid = low + ((high - low) / 2); result = _bt_compare(rel, itupdesc, page, keysz, scankey, mid); if (result > 0) low = mid; else if (result < 0) high = mid - 1; else { mid = _bt_firsteq(rel, itupdesc, page, keysz, scankey, mid); /* * NOTE for multi-column indices: we may do scan using keys * not for all attrs. But we handle duplicates using all attrs * in _bt_insert/_bt_spool code. And so while searching on * internal pages having number of attrs > keysize we want to * point at the last item < the scankey, not at the first item * = the scankey (!!!), and let _bt_moveright decide later * whether to move right or not (see comments and example * there). Note also that INSERTions are not affected by this * code (natts == keysz). - vadim 04/15/97 */ if (natts == keysz || opaque->btpo_flags & BTP_LEAF) return mid; low = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY; if (mid == low) return mid; return OffsetNumberPrev(mid); } } /* * We terminated because the endpoints got too close together. There * are two cases to take care of. * * For non-insertion searches on internal pages, we want to point at the * last key <, or first key =, the scankey on the page. This * guarantees that we'll descend the tree correctly. (NOTE comments * above for multi-column indices). * * For all other cases, we want to point at the first key >= the scankey * on the page. This guarantees that scans and insertions will happen * correctly. */ if (!(opaque->btpo_flags & BTP_LEAF) && srchtype == BT_DESCENT) { /* We want the last key <, or first key * ==, the scan key. */ result = _bt_compare(rel, itupdesc, page, keysz, scankey, high); if (result == 0) { mid = _bt_firsteq(rel, itupdesc, page, keysz, scankey, high); /* * If natts > keysz we want last item < the scan key. See * comments above for multi-column indices. */ if (natts == keysz) return mid; low = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY; if (mid == low) return mid; return OffsetNumberPrev(mid); } else if (result > 0) return high; else return low; } else/* we want the first key >= the scan key */ { result = _bt_compare(rel, itupdesc, page, keysz, scankey, low); if (result <= 0) return low; else { if (low == high) return OffsetNumberNext(low); result = _bt_compare(rel, itupdesc, page, keysz, scankey, high); if (result <= 0) return high; else return OffsetNumberNext(high); } }}static OffsetNumber_bt_firsteq(Relation rel, TupleDesc itupdesc, Page page, Size keysz, ScanKey scankey, OffsetNumber offnum)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -