📄 nbtinsert.c
字号:
ropaque->btpo_flags |= BTP_SPLIT_END; } /* * Right sibling is locked, new siblings are prepared, but original page * is not updated yet. * * NO EREPORT(ERROR) till right sibling is updated. We can get away with * not starting the critical section till here because we haven't been * scribbling on the original page yet, and we don't care about the new * sibling until it's linked into the btree. */ START_CRIT_SECTION(); /* * By here, the original data page has been split into two new halves, and * these are correct. The algorithm requires that the left page never * move during a split, so we copy the new left page back on top of the * original. Note that this is not a waste of time, since we also require * (in the page management code) that the center of a page always be * clean, and the most efficient way to guarantee this is just to compact * the data by reinserting it into a new left page. (XXX the latter * comment is probably obsolete.) * * We need to do this before writing the WAL record, so that XLogInsert * can WAL log an image of the page if necessary. */ PageRestoreTempPage(leftpage, origpage); MarkBufferDirty(buf); MarkBufferDirty(rbuf); if (!P_RIGHTMOST(ropaque)) { sopaque->btpo_prev = BufferGetBlockNumber(rbuf); MarkBufferDirty(sbuf); } /* XLOG stuff */ if (!rel->rd_istemp) { xl_btree_split xlrec; uint8 xlinfo; XLogRecPtr recptr; XLogRecData rdata[7]; XLogRecData *lastrdata; xlrec.node = rel->rd_node; xlrec.leftsib = BufferGetBlockNumber(buf); xlrec.rightsib = BufferGetBlockNumber(rbuf); xlrec.rnext = ropaque->btpo_next; xlrec.level = ropaque->btpo.level; xlrec.firstright = firstright; rdata[0].data = (char *) &xlrec; rdata[0].len = SizeOfBtreeSplit; rdata[0].buffer = InvalidBuffer; lastrdata = &rdata[0]; if (ropaque->btpo.level > 0) { /* Log downlink on non-leaf pages */ lastrdata->next = lastrdata + 1; lastrdata++; lastrdata->data = (char *) &newitem->t_tid.ip_blkid; lastrdata->len = sizeof(BlockIdData); lastrdata->buffer = InvalidBuffer; /* * We must also log the left page's high key, because the right * page's leftmost key is suppressed on non-leaf levels. Show it * as belonging to the left page buffer, so that it is not stored * if XLogInsert decides it needs a full-page image of the left * page. */ lastrdata->next = lastrdata + 1; lastrdata++; itemid = PageGetItemId(origpage, P_HIKEY); item = (IndexTuple) PageGetItem(origpage, itemid); lastrdata->data = (char *) item; lastrdata->len = MAXALIGN(IndexTupleSize(item)); lastrdata->buffer = buf; /* backup block 1 */ lastrdata->buffer_std = true; } /* * Log the new item and its offset, if it was inserted on the left * page. (If it was put on the right page, we don't need to explicitly * WAL log it because it's included with all the other items on the * right page.) Show the new item as belonging to the left page * buffer, so that it is not stored if XLogInsert decides it needs a * full-page image of the left page. We store the offset anyway, * though, to support archive compression of these records. */ if (newitemonleft) { lastrdata->next = lastrdata + 1; lastrdata++; lastrdata->data = (char *) &newitemoff; lastrdata->len = sizeof(OffsetNumber); lastrdata->buffer = InvalidBuffer; lastrdata->next = lastrdata + 1; lastrdata++; lastrdata->data = (char *) newitem; lastrdata->len = MAXALIGN(newitemsz); lastrdata->buffer = buf; /* backup block 1 */ lastrdata->buffer_std = true; } else if (ropaque->btpo.level == 0) { /* * Although we don't need to WAL-log the new item, we still need * XLogInsert to consider storing a full-page image of the left * page, so make an empty entry referencing that buffer. This also * ensures that the left page is always backup block 1. */ lastrdata->next = lastrdata + 1; lastrdata++; lastrdata->data = NULL; lastrdata->len = 0; lastrdata->buffer = buf; /* backup block 1 */ lastrdata->buffer_std = true; } /* * Log the contents of the right page in the format understood by * _bt_restore_page(). We set lastrdata->buffer to InvalidBuffer, * because we're going to recreate the whole page anyway, so it should * never be stored by XLogInsert. * * Direct access to page is not good but faster - we should implement * some new func in page API. Note we only store the tuples * themselves, knowing that they were inserted in item-number order * and so the item pointers can be reconstructed. See comments for * _bt_restore_page(). */ lastrdata->next = lastrdata + 1; lastrdata++; lastrdata->data = (char *) rightpage + ((PageHeader) rightpage)->pd_upper; lastrdata->len = ((PageHeader) rightpage)->pd_special - ((PageHeader) rightpage)->pd_upper; lastrdata->buffer = InvalidBuffer; /* Log the right sibling, because we've changed its' prev-pointer. */ if (!P_RIGHTMOST(ropaque)) { lastrdata->next = lastrdata + 1; lastrdata++; lastrdata->data = NULL; lastrdata->len = 0; lastrdata->buffer = sbuf; /* backup block 2 */ lastrdata->buffer_std = true; } lastrdata->next = NULL; if (isroot) xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L_ROOT : XLOG_BTREE_SPLIT_R_ROOT; else xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R; recptr = XLogInsert(RM_BTREE_ID, xlinfo, rdata); PageSetLSN(origpage, recptr); PageSetTLI(origpage, ThisTimeLineID); PageSetLSN(rightpage, recptr); PageSetTLI(rightpage, ThisTimeLineID); if (!P_RIGHTMOST(ropaque)) { PageSetLSN(spage, recptr); PageSetTLI(spage, ThisTimeLineID); } } END_CRIT_SECTION(); /* release the old right sibling */ if (!P_RIGHTMOST(ropaque)) _bt_relbuf(rel, sbuf); /* split's done */ return rbuf;}/* * _bt_findsplitloc() -- find an appropriate place to split a page. * * The idea here is to equalize the free space that will be on each split * page, *after accounting for the inserted tuple*. (If we fail to account * for it, we might find ourselves with too little room on the page that * it needs to go into!) * * If the page is the rightmost page on its level, we instead try to arrange * to leave the left split page fillfactor% full. In this way, when we are * inserting successively increasing keys (consider sequences, timestamps, * etc) we will end up with a tree whose pages are about fillfactor% full, * instead of the 50% full result that we'd get without this special case. * This is the same as nbtsort.c produces for a newly-created tree. Note * that leaf and nonleaf pages use different fillfactors. * * We are passed the intended insert position of the new tuple, expressed as * the offsetnumber of the tuple it must go in front of. (This could be * maxoff+1 if the tuple is to go at the end.) * * We return the index of the first existing tuple that should go on the * righthand page, plus a boolean indicating whether the new tuple goes on * the left or right page. The bool is necessary to disambiguate the case * where firstright == newitemoff. */static OffsetNumber_bt_findsplitloc(Relation rel, Page page, OffsetNumber newitemoff, Size newitemsz, bool *newitemonleft){ BTPageOpaque opaque; OffsetNumber offnum; OffsetNumber maxoff; ItemId itemid; FindSplitData state; int leftspace, rightspace, goodenough, olddataitemstotal, olddataitemstoleft; bool goodenoughfound; opaque = (BTPageOpaque) PageGetSpecialPointer(page); /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */ newitemsz += sizeof(ItemIdData); /* Total free space available on a btree page, after fixed overhead */ leftspace = rightspace = PageGetPageSize(page) - SizeOfPageHeaderData - MAXALIGN(sizeof(BTPageOpaqueData)); /* The right page will have the same high key as the old page */ if (!P_RIGHTMOST(opaque)) { itemid = PageGetItemId(page, P_HIKEY); rightspace -= (int) (MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData)); } /* Count up total space in data items without actually scanning 'em */ olddataitemstotal = rightspace - (int) PageGetExactFreeSpace(page); state.newitemsz = newitemsz; state.is_leaf = P_ISLEAF(opaque); state.is_rightmost = P_RIGHTMOST(opaque); state.have_split = false; if (state.is_leaf) state.fillfactor = RelationGetFillFactor(rel, BTREE_DEFAULT_FILLFACTOR); else state.fillfactor = BTREE_NONLEAF_FILLFACTOR; state.newitemonleft = false; /* these just to keep compiler quiet */ state.firstright = 0; state.best_delta = 0; state.leftspace = leftspace; state.rightspace = rightspace; state.olddataitemstotal = olddataitemstotal; state.newitemoff = newitemoff; /* * Finding the best possible split would require checking all the possible * split points, because of the high-key and left-key special cases. * That's probably more work than it's worth; instead, stop as soon as we * find a "good-enough" split, where good-enough is defined as an * imbalance in free space of no more than pagesize/16 (arbitrary...) This * should let us stop near the middle on most pages, instead of plowing to * the end. */ goodenough = leftspace / 16; /* * Scan through the data items and calculate space usage for a split at * each possible position. */ olddataitemstoleft = 0; goodenoughfound = false; maxoff = PageGetMaxOffsetNumber(page); for (offnum = P_FIRSTDATAKEY(opaque); offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { Size itemsz; itemid = PageGetItemId(page, offnum); itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData); /* * Will the new item go to left or right of split? */ if (offnum > newitemoff) _bt_checksplitloc(&state, offnum, true, olddataitemstoleft, itemsz); else if (offnum < newitemoff) _bt_checksplitloc(&state, offnum, false, olddataitemstoleft, itemsz); else { /* need to try it both ways! */ _bt_checksplitloc(&state, offnum, true, olddataitemstoleft, itemsz); _bt_checksplitloc(&state, offnum, false, olddataitemstoleft, itemsz); } /* Abort scan once we find a good-enough choice */ if (state.have_split && state.best_delta <= goodenough) { goodenoughfound = true; break; } olddataitemstoleft += itemsz; } /* * If the new item goes as the last item, check for splitting so that all * the old items go to the left page and the new item goes to the right * page. */ if (newitemoff > maxoff && !goodenoughfound) _bt_checksplitloc(&state, newitemoff, false, olddataitemstotal, 0); /* * I believe it is not possible to fail to find a feasible split, but just * in case ... */ if (!state.have_split) elog(ERROR, "could not find a feasible split point for index \"%s\"", RelationGetRelationName(rel)); *newitemonleft = state.newitemonleft; return state.firstright;}/* * Subroutine to analyze a particular possible split choice (ie, firstright * and newitemonleft settings), and record the best split so far in *state. * * firstoldonright is the offset of the first item on the original page * that goes to the right page, and firstoldonrightsz is the size of that * tuple. firstoldonright can be > max offset, which means that all the old * items go to the left page and only the new item goes to the right page. * In that case, firstoldonrightsz is not used. * * olddataitemstoleft is the total size of all old items to the left of * firstoldonright. */static void_bt_checksplitloc(FindSplitData *state, OffsetNumber firstoldonright, bool newitemonleft, int olddataitemstoleft, Size firstoldonrightsz){ int leftfree, rightfree; Size firstrightitemsz; bool newitemisfirstonright; /* Is the new item going to be the first item on the right page? */ newitemisfirstonright = (firstoldonright == state->newitemoff && !newitemonleft); if (newitemisfirstonright) firstrightitemsz = state->newitemsz; else firstrightitemsz = firstoldonrightsz; /* Account for all the old tuples */ leftfree = state->leftspace - olddataitemstoleft; rightfree = state->rightspace - (state->olddataitemstotal - olddataitemstoleft); /* * The first item on the right page becomes the high key of the left page; * therefore it counts against left space as well as right space. */ leftfree -= firstrightitemsz; /* account for the new item */ if (newitemonleft) leftfree -= (int) state->newitemsz; else rightfree -= (int) state->newitemsz; /* * If we are not on the leaf level, we will be able to discard the key * data from the first item that winds up on the right page. */ if (!state->is_leaf) rightfree += (int) firstrightitemsz - (int) (MAXALIGN(sizeof(IndexTupleData)) + sizeof(ItemIdData)); /* * If feasible split point, remember best delta. */ if (leftfree >= 0 && rightfree >= 0) { int delta; if (state->is_rightmost) { /* * If splitting a rightmost page, try to put (100-fillfactor)% of * free space on left page. See comments for _bt_findsplitloc. */ delta = (state->fillfactor * leftfree) - ((100 - state->fillfactor) * rightfree); } else { /* Otherwise, aim for equal free space on both sides */ delta = leftfree - rightfree; } if (delta < 0) delta = -delta; if (!state->have_split || delta < state->best_delta) { state->have_split = true; state->newitemonleft = newitemonleft; state->firstright = firstoldonright; state->best_delta = delta; } }}/* * _bt_insert_parent() -- Insert downlink into parent after a page split. * * On entry, buf and rbuf are the left and right split pages, which we * still hold write locks on per the L&Y algorithm. We release the * write locks once we have write lock on the parent page. (Any sooner, * and it'd be possible for some other process to try to split or delete * one of these pages, and get confused because it cannot find the downlink.) * * stack - stack showing how we got here. May be NULL in cases that don't * have to be efficient (concurrent ROOT split, WAL recovery) * is_root - we split the true root * is_only - we split a page alone on its level (might have been fast root) * * This is exported so it can be called by nbtxlog.c. */void_bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf, BTStack stack, bool is_root, bool is_only){ /* * Here we have to do something Lehman and Yao don't talk about: deal with * a root split and construction of a new root. If our stack is empty * then we have just split a node on what had been the root level when we * descended the tree. If it was still the root then we perform a * new-root construction. If it *wasn't* the root anymore, search to find * the next higher level that someone constructed meanwhile, and find the * right place to insert as for the normal case. * * If we have to search for the parent level, we do so by re-descending * from the root. This is not super-efficient, but it's rare enough not * to matter. (This path is also taken when called from WAL recovery --- * we have no stack in that case.)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -