📄 nbtinsert.c
						 "right sibling");
			itup_off = rightoff;
			itup_blkno = BufferGetBlockNumber(rbuf);
			rightoff = OffsetNumberNext(rightoff);
		}
	}

	/*
	 * We have to grab the right sibling (if any) and fix the prev pointer
	 * there. We are guaranteed that this is deadlock-free since no other
	 * writer will be holding a lock on that page and trying to move left, and
	 * all readers release locks on a page before trying to fetch its
	 * neighbors.
	 */
	if (!P_RIGHTMOST(ropaque))
	{
		sbuf = _bt_getbuf(rel, ropaque->btpo_next, BT_WRITE);
		spage = BufferGetPage(sbuf);
		sopaque = (BTPageOpaque) PageGetSpecialPointer(spage);
		if (sopaque->btpo_prev != ropaque->btpo_prev)
			elog(PANIC, "right sibling's left-link doesn't match");
	}

	/*
	 * Right sibling is locked, new siblings are prepared, but original page
	 * is not updated yet. Log changes before continuing.
	 *
	 * NO EREPORT(ERROR) till right sibling is updated.
	 */
	START_CRIT_SECTION();

	if (!P_RIGHTMOST(ropaque))
		sopaque->btpo_prev = BufferGetBlockNumber(rbuf);

	/* XLOG stuff */
	if (!rel->rd_istemp)
	{
		xl_btree_split xlrec;
		uint8		xlinfo;
		XLogRecPtr	recptr;
		XLogRecData rdata[4];

		xlrec.target.node = rel->rd_node;
		ItemPointerSet(&(xlrec.target.tid), itup_blkno, itup_off);
		if (newitemonleft)
			xlrec.otherblk = BufferGetBlockNumber(rbuf);
		else
			xlrec.otherblk = BufferGetBlockNumber(buf);
		xlrec.leftblk = lopaque->btpo_prev;
		xlrec.rightblk = ropaque->btpo_next;
		xlrec.level = lopaque->btpo.level;

		/*
		 * Direct access to page is not good but faster - we should implement
		 * some new func in page API. Note we only store the tuples
		 * themselves, knowing that the item pointers are in the same order
		 * and can be reconstructed by scanning the tuples.
		 */
		xlrec.leftlen = ((PageHeader) leftpage)->pd_special -
			((PageHeader) leftpage)->pd_upper;
		rdata[0].data = (char *) &xlrec;
		rdata[0].len = SizeOfBtreeSplit;
		rdata[0].buffer = InvalidBuffer;
		rdata[0].next = &(rdata[1]);

		rdata[1].data = (char *) leftpage + ((PageHeader) leftpage)->pd_upper;
		rdata[1].len = xlrec.leftlen;
		rdata[1].buffer = InvalidBuffer;
		rdata[1].next = &(rdata[2]);

		rdata[2].data = (char *) rightpage + ((PageHeader) rightpage)->pd_upper;
		rdata[2].len = ((PageHeader) rightpage)->pd_special -
			((PageHeader) rightpage)->pd_upper;
		rdata[2].buffer = InvalidBuffer;
		rdata[2].next = NULL;

		if (!P_RIGHTMOST(ropaque))
		{
			rdata[2].next = &(rdata[3]);
			rdata[3].data = NULL;
			rdata[3].len = 0;
			rdata[3].buffer = sbuf;
			rdata[3].buffer_std = true;
			rdata[3].next = NULL;
		}

		if (P_ISROOT(oopaque))
			xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L_ROOT : XLOG_BTREE_SPLIT_R_ROOT;
		else
			xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R;

		recptr = XLogInsert(RM_BTREE_ID, xlinfo, rdata);

		PageSetLSN(leftpage, recptr);
		PageSetTLI(leftpage, ThisTimeLineID);
		PageSetLSN(rightpage, recptr);
		PageSetTLI(rightpage, ThisTimeLineID);
		if (!P_RIGHTMOST(ropaque))
		{
			PageSetLSN(spage, recptr);
			PageSetTLI(spage, ThisTimeLineID);
		}
	}

	/*
	 * By here, the original data page has been split into two new halves, and
	 * these are correct. The algorithm requires that the left page never
	 * move during a split, so we copy the new left page back on top of the
	 * original. Note that this is not a waste of time, since we also require
	 * (in the page management code) that the center of a page always be
	 * clean, and the most efficient way to guarantee this is just to compact
	 * the data by reinserting it into a new left page.
	 */
	PageRestoreTempPage(leftpage, origpage);

	END_CRIT_SECTION();

	/* write and release the old right sibling */
	if (!P_RIGHTMOST(ropaque))
		_bt_wrtbuf(rel, sbuf);

	/* split's done */
	return rbuf;
}

/*
 *	_bt_findsplitloc() -- find an appropriate place to split a page.
 *
 * The idea here is to equalize the free space that will be on each split
 * page, *after accounting for the inserted tuple*. (If we fail to account
 * for it, we might find ourselves with too little room on the page that
 * it needs to go into!)
 *
 * If the page is the rightmost page on its level, we instead try to arrange
 * for twice as much free space on the right as on the left. In this way,
 * when we are inserting successively increasing keys (consider sequences,
 * timestamps, etc) we will end up with a tree whose pages are about 67% full,
 * instead of the 50% full result that we'd get without this special case.
 * (We could bias it even further to make the initially-loaded tree more full.
 * But since the steady-state load for a btree is about 70%, we'd likely just
 * be making more page-splitting work for ourselves later on, when we start
 * seeing updates to existing tuples.)
 *
 * We are passed the intended insert position of the new tuple, expressed as
 * the offsetnumber of the tuple it must go in front of. (This could be
 * maxoff+1 if the tuple is to go at the end.)
 *
 * We return the index of the first existing tuple that should go on the
 * righthand page, plus a boolean indicating whether the new tuple goes on
 * the left or right page. The bool is necessary to disambiguate the case
 * where firstright == newitemoff.
 */
static OffsetNumber
_bt_findsplitloc(Relation rel,
				 Page page,
				 OffsetNumber newitemoff,
				 Size newitemsz,
				 bool *newitemonleft)
{
	BTPageOpaque opaque;
	OffsetNumber offnum;
	OffsetNumber maxoff;
	ItemId		itemid;
	FindSplitData state;
	int			leftspace,
				rightspace,
				goodenough,
				dataitemtotal,
				dataitemstoleft;

	opaque = (BTPageOpaque) PageGetSpecialPointer(page);

	/* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
	newitemsz += sizeof(ItemIdData);
	state.newitemsz = newitemsz;
	state.is_leaf = P_ISLEAF(opaque);
	state.is_rightmost = P_RIGHTMOST(opaque);
	state.have_split = false;

	/* Total free space available on a btree page, after fixed overhead */
	leftspace = rightspace =
		PageGetPageSize(page) - SizeOfPageHeaderData -
		MAXALIGN(sizeof(BTPageOpaqueData));

	/*
	 * Finding the best possible split would require checking all the possible
	 * split points, because of the high-key and left-key special cases.
	 * That's probably more work than it's worth; instead, stop as soon as we
	 * find a "good-enough" split, where good-enough is defined as an
	 * imbalance in free space of no more than pagesize/16 (arbitrary...) This
	 * should let us stop near the middle on most pages, instead of plowing to
	 * the end.
	 */
	goodenough = leftspace / 16;

	/* The right page will have the same high key as the old page */
	if (!P_RIGHTMOST(opaque))
	{
		itemid = PageGetItemId(page, P_HIKEY);
		rightspace -= (int) (MAXALIGN(ItemIdGetLength(itemid)) +
							 sizeof(ItemIdData));
	}

	/* Count up total space in data items without actually scanning 'em */
	dataitemtotal = rightspace - (int) PageGetFreeSpace(page);
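	/*
	 * Illustrative figures (an assumption, based on the default 8 kB BLCKSZ):
	 * after subtracting the page header and the MAXALIGN'd BTPageOpaqueData,
	 * leftspace and rightspace each start out a little above 8100 bytes, so
	 * goodenough = leftspace / 16 comes to roughly 500 bytes.  The scan below
	 * can therefore stop at the first candidate split whose two halves differ
	 * in free space by no more than about half a kilobyte, which for evenly
	 * sized tuples is normally found near the middle of the page.
	 */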
	/*
	 * Scan through the data items and calculate space usage for a split at
	 * each possible position.
	 */
	dataitemstoleft = 0;
	maxoff = PageGetMaxOffsetNumber(page);

	for (offnum = P_FIRSTDATAKEY(opaque);
		 offnum <= maxoff;
		 offnum = OffsetNumberNext(offnum))
	{
		Size		itemsz;
		int			leftfree,
					rightfree;

		itemid = PageGetItemId(page, offnum);
		itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);

		/*
		 * We have to allow for the current item becoming the high key of the
		 * left page; therefore it counts against left space as well as right
		 * space.
		 */
		leftfree = leftspace - dataitemstoleft - (int) itemsz;
		rightfree = rightspace - (dataitemtotal - dataitemstoleft);

		/*
		 * Will the new item go to left or right of split?
		 */
		if (offnum > newitemoff)
			_bt_checksplitloc(&state, offnum, leftfree, rightfree,
							  true, itemsz);
		else if (offnum < newitemoff)
			_bt_checksplitloc(&state, offnum, leftfree, rightfree,
							  false, itemsz);
		else
		{
			/* need to try it both ways! */
			_bt_checksplitloc(&state, offnum, leftfree, rightfree,
							  true, itemsz);
			/* here we are contemplating newitem as first on right */
			_bt_checksplitloc(&state, offnum, leftfree, rightfree,
							  false, newitemsz);
		}

		/* Abort scan once we find a good-enough choice */
		if (state.have_split && state.best_delta <= goodenough)
			break;

		dataitemstoleft += itemsz;
	}

	/*
	 * I believe it is not possible to fail to find a feasible split, but just
	 * in case ...
	 */
	if (!state.have_split)
		elog(ERROR, "could not find a feasible split point for \"%s\"",
			 RelationGetRelationName(rel));

	*newitemonleft = state.newitemonleft;
	return state.firstright;
}

/*
 * Subroutine to analyze a particular possible split choice (ie, firstright
 * and newitemonleft settings), and record the best split so far in *state.
 */
static void
_bt_checksplitloc(FindSplitData *state, OffsetNumber firstright,
				  int leftfree, int rightfree,
				  bool newitemonleft, Size firstrightitemsz)
{
	/*
	 * Account for the new item on whichever side it is to be put.
	 */
	if (newitemonleft)
		leftfree -= (int) state->newitemsz;
	else
		rightfree -= (int) state->newitemsz;

	/*
	 * If we are not on the leaf level, we will be able to discard the key
	 * data from the first item that winds up on the right page.
	 */
	if (!state->is_leaf)
		rightfree += (int) firstrightitemsz -
			(int) (MAXALIGN(sizeof(BTItemData)) + sizeof(ItemIdData));

	/*
	 * If feasible split point, remember best delta.
	 */
	if (leftfree >= 0 && rightfree >= 0)
	{
		int			delta;

		if (state->is_rightmost)
		{
			/*
			 * On a rightmost page, try to equalize right free space with
			 * twice the left free space. See comments for _bt_findsplitloc.
			 */
			delta = (2 * leftfree) - rightfree;
		}
		else
		{
			/* Otherwise, aim for equal free space on both sides */
			delta = leftfree - rightfree;
		}

		if (delta < 0)
			delta = -delta;
		if (!state->have_split || delta < state->best_delta)
		{
			state->have_split = true;
			state->newitemonleft = newitemonleft;
			state->firstright = firstright;
			state->best_delta = delta;
		}
	}
}
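/*
 * A hypothetical, self-contained sketch, not part of nbtinsert.c: it mirrors
 * the delta rule applied by _bt_checksplitloc above to show why the
 * rightmost-page case settles near a two-thirds-full left page.  The
 * constants "usable" and "total_data" are assumed round numbers for an 8 kB
 * page, ignoring line-pointer and high-key overhead.
 */
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	const int	usable = 8150;		/* assumed usable bytes per half */
	const int	total_data = 8000;	/* assumed bytes of data being split */
	int			leftdata;
	int			best_delta = -1;
	int			best_leftdata = 0;

	/* try a candidate split every 100 bytes of left-page data */
	for (leftdata = 0; leftdata <= total_data; leftdata += 100)
	{
		int			leftfree = usable - leftdata;
		int			rightfree = usable - (total_data - leftdata);
		int			delta = abs(2 * leftfree - rightfree);	/* rightmost rule */

		if (leftfree >= 0 && rightfree >= 0 &&
			(best_delta < 0 || delta < best_delta))
		{
			best_delta = delta;
			best_leftdata = leftdata;
		}
	}

	/* prints about 66, matching the "about 67% full" claim above */
	printf("left page ends up about %d%% full\n",
		   100 * best_leftdata / usable);
	return 0;
}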
/*
 * _bt_insert_parent() -- Insert downlink into parent after a page split.
 *
 * On entry, buf and rbuf are the left and right split pages, which we
 * still hold write locks on per the L&Y algorithm. We release the
 * write locks once we have write lock on the parent page. (Any sooner,
 * and it'd be possible for some other process to try to split or delete
 * one of these pages, and get confused because it cannot find the downlink.)
 *
 * stack - stack showing how we got here. May be NULL in cases that don't
 *			have to be efficient (concurrent ROOT split, WAL recovery)
 * is_root - we split the true root
 * is_only - we split a page alone on its level (might have been fast root)
 *
 * This is exported so it can be called by nbtxlog.c.
 */
void
_bt_insert_parent(Relation rel,
				  Buffer buf,
				  Buffer rbuf,
				  BTStack stack,
				  bool is_root,
				  bool is_only)
{
	/*
	 * Here we have to do something Lehman and Yao don't talk about: deal with
	 * a root split and construction of a new root. If our stack is empty
	 * then we have just split a node on what had been the root level when we
	 * descended the tree. If it was still the root then we perform a
	 * new-root construction. If it *wasn't* the root anymore, search to find
	 * the next higher level that someone constructed meanwhile, and find the
	 * right place to insert as for the normal case.
	 *
	 * If we have to search for the parent level, we do so by re-descending
	 * from the root. This is not super-efficient, but it's rare enough not
	 * to matter. (This path is also taken when called from WAL recovery ---
	 * we have no stack in that case.)
	 */
	if (is_root)
	{
		Buffer		rootbuf;

		Assert(stack == NULL);
		Assert(is_only);
		/* create a new root node and update the metapage */
		rootbuf = _bt_newroot(rel, buf, rbuf);
		/* release the split buffers */
		_bt_wrtbuf(rel, rootbuf);
		_bt_wrtbuf(rel, rbuf);
		_bt_wrtbuf(rel, buf);
	}
	else
	{
		BlockNumber bknum = BufferGetBlockNumber(buf);
		BlockNumber rbknum = BufferGetBlockNumber(rbuf);
		Page		page = BufferGetPage(buf);
		BTItem		new_item;
		BTStackData fakestack;
		BTItem		ritem;
		Buffer		pbuf;

		if (stack == NULL)
		{
			BTPageOpaque lpageop;

			if (!InRecovery)
				elog(DEBUG2, "concurrent ROOT page split");