📄 nbtinsert.c

📁 postgresql8.3.4源码,开源数据库
💻 C
📖 第 1 页 / 共 4 页
字号:
			ropaque->btpo_flags |= BTP_SPLIT_END;	}	/*	 * Right sibling is locked, new siblings are prepared, but original page	 * is not updated yet.	 *	 * NO EREPORT(ERROR) till right sibling is updated.  We can get away with	 * not starting the critical section till here because we haven't been	 * scribbling on the original page yet, and we don't care about the new	 * sibling until it's linked into the btree.	 */	START_CRIT_SECTION();	/*	 * By here, the original data page has been split into two new halves, and	 * these are correct.  The algorithm requires that the left page never	 * move during a split, so we copy the new left page back on top of the	 * original.  Note that this is not a waste of time, since we also require	 * (in the page management code) that the center of a page always be	 * clean, and the most efficient way to guarantee this is just to compact	 * the data by reinserting it into a new left page.  (XXX the latter	 * comment is probably obsolete.)	 *	 * We need to do this before writing the WAL record, so that XLogInsert	 * can WAL log an image of the page if necessary.	 */	PageRestoreTempPage(leftpage, origpage);	MarkBufferDirty(buf);	MarkBufferDirty(rbuf);	if (!P_RIGHTMOST(ropaque))	{		sopaque->btpo_prev = BufferGetBlockNumber(rbuf);		MarkBufferDirty(sbuf);	}	/* XLOG stuff */	if (!rel->rd_istemp)	{		xl_btree_split xlrec;		uint8		xlinfo;		XLogRecPtr	recptr;		XLogRecData rdata[7];		XLogRecData *lastrdata;		xlrec.node = rel->rd_node;		xlrec.leftsib = BufferGetBlockNumber(buf);		xlrec.rightsib = BufferGetBlockNumber(rbuf);		xlrec.rnext = ropaque->btpo_next;		xlrec.level = ropaque->btpo.level;		xlrec.firstright = firstright;		rdata[0].data = (char *) &xlrec;		rdata[0].len = SizeOfBtreeSplit;		rdata[0].buffer = InvalidBuffer;		lastrdata = &rdata[0];		if (ropaque->btpo.level > 0)		{			/* Log downlink on non-leaf pages */			lastrdata->next = lastrdata + 1;			lastrdata++;			lastrdata->data = (char *) &newitem->t_tid.ip_blkid;			lastrdata->len = sizeof(BlockIdData);			lastrdata->buffer = InvalidBuffer;			/*			 * We must also log the left page's high key, because the right			 * page's leftmost key is suppressed on non-leaf levels.  Show it			 * as belonging to the left page buffer, so that it is not stored			 * if XLogInsert decides it needs a full-page image of the left			 * page.			 */			lastrdata->next = lastrdata + 1;			lastrdata++;			itemid = PageGetItemId(origpage, P_HIKEY);			item = (IndexTuple) PageGetItem(origpage, itemid);			lastrdata->data = (char *) item;			lastrdata->len = MAXALIGN(IndexTupleSize(item));			lastrdata->buffer = buf;	/* backup block 1 */			lastrdata->buffer_std = true;		}		/*		 * Log the new item and its offset, if it was inserted on the left		 * page. (If it was put on the right page, we don't need to explicitly		 * WAL log it because it's included with all the other items on the		 * right page.) Show the new item as belonging to the left page		 * buffer, so that it is not stored if XLogInsert decides it needs a		 * full-page image of the left page.  We store the offset anyway,		 * though, to support archive compression of these records.		 */		if (newitemonleft)		{			lastrdata->next = lastrdata + 1;			lastrdata++;			lastrdata->data = (char *) &newitemoff;			lastrdata->len = sizeof(OffsetNumber);			lastrdata->buffer = InvalidBuffer;			lastrdata->next = lastrdata + 1;			lastrdata++;			lastrdata->data = (char *) newitem;			lastrdata->len = MAXALIGN(newitemsz);			lastrdata->buffer = buf;	/* backup block 1 */			lastrdata->buffer_std = true;		}		else if (ropaque->btpo.level == 0)		{			/*			 * Although we don't need to WAL-log the new item, we still need			 * XLogInsert to consider storing a full-page image of the left			 * page, so make an empty entry referencing that buffer. This also			 * ensures that the left page is always backup block 1.			 */			lastrdata->next = lastrdata + 1;			lastrdata++;			lastrdata->data = NULL;			lastrdata->len = 0;			lastrdata->buffer = buf;	/* backup block 1 */			lastrdata->buffer_std = true;		}		/*		 * Log the contents of the right page in the format understood by		 * _bt_restore_page(). We set lastrdata->buffer to InvalidBuffer,		 * because we're going to recreate the whole page anyway, so it should		 * never be stored by XLogInsert.		 *		 * Direct access to page is not good but faster - we should implement		 * some new func in page API.  Note we only store the tuples		 * themselves, knowing that they were inserted in item-number order		 * and so the item pointers can be reconstructed.  See comments for		 * _bt_restore_page().		 */		lastrdata->next = lastrdata + 1;		lastrdata++;		lastrdata->data = (char *) rightpage +			((PageHeader) rightpage)->pd_upper;		lastrdata->len = ((PageHeader) rightpage)->pd_special -			((PageHeader) rightpage)->pd_upper;		lastrdata->buffer = InvalidBuffer;		/* Log the right sibling, because we've changed its' prev-pointer. */		if (!P_RIGHTMOST(ropaque))		{			lastrdata->next = lastrdata + 1;			lastrdata++;			lastrdata->data = NULL;			lastrdata->len = 0;			lastrdata->buffer = sbuf;	/* backup block 2 */			lastrdata->buffer_std = true;		}		lastrdata->next = NULL;		if (isroot)			xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L_ROOT : XLOG_BTREE_SPLIT_R_ROOT;		else			xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R;		recptr = XLogInsert(RM_BTREE_ID, xlinfo, rdata);		PageSetLSN(origpage, recptr);		PageSetTLI(origpage, ThisTimeLineID);		PageSetLSN(rightpage, recptr);		PageSetTLI(rightpage, ThisTimeLineID);		if (!P_RIGHTMOST(ropaque))		{			PageSetLSN(spage, recptr);			PageSetTLI(spage, ThisTimeLineID);		}	}	END_CRIT_SECTION();	/* release the old right sibling */	if (!P_RIGHTMOST(ropaque))		_bt_relbuf(rel, sbuf);	/* split's done */	return rbuf;}/* *	_bt_findsplitloc() -- find an appropriate place to split a page. * * The idea here is to equalize the free space that will be on each split * page, *after accounting for the inserted tuple*.  (If we fail to account * for it, we might find ourselves with too little room on the page that * it needs to go into!) * * If the page is the rightmost page on its level, we instead try to arrange * to leave the left split page fillfactor% full.  In this way, when we are * inserting successively increasing keys (consider sequences, timestamps, * etc) we will end up with a tree whose pages are about fillfactor% full, * instead of the 50% full result that we'd get without this special case. * This is the same as nbtsort.c produces for a newly-created tree.  Note * that leaf and nonleaf pages use different fillfactors. * * We are passed the intended insert position of the new tuple, expressed as * the offsetnumber of the tuple it must go in front of.  (This could be * maxoff+1 if the tuple is to go at the end.) * * We return the index of the first existing tuple that should go on the * righthand page, plus a boolean indicating whether the new tuple goes on * the left or right page.	The bool is necessary to disambiguate the case * where firstright == newitemoff. */static OffsetNumber_bt_findsplitloc(Relation rel,				 Page page,				 OffsetNumber newitemoff,				 Size newitemsz,				 bool *newitemonleft){	BTPageOpaque opaque;	OffsetNumber offnum;	OffsetNumber maxoff;	ItemId		itemid;	FindSplitData state;	int			leftspace,				rightspace,				goodenough,				olddataitemstotal,				olddataitemstoleft;	bool		goodenoughfound;	opaque = (BTPageOpaque) PageGetSpecialPointer(page);	/* Passed-in newitemsz is MAXALIGNED but does not include line pointer */	newitemsz += sizeof(ItemIdData);	/* Total free space available on a btree page, after fixed overhead */	leftspace = rightspace =		PageGetPageSize(page) - SizeOfPageHeaderData -		MAXALIGN(sizeof(BTPageOpaqueData));	/* The right page will have the same high key as the old page */	if (!P_RIGHTMOST(opaque))	{		itemid = PageGetItemId(page, P_HIKEY);		rightspace -= (int) (MAXALIGN(ItemIdGetLength(itemid)) +							 sizeof(ItemIdData));	}	/* Count up total space in data items without actually scanning 'em */	olddataitemstotal = rightspace - (int) PageGetExactFreeSpace(page);	state.newitemsz = newitemsz;	state.is_leaf = P_ISLEAF(opaque);	state.is_rightmost = P_RIGHTMOST(opaque);	state.have_split = false;	if (state.is_leaf)		state.fillfactor = RelationGetFillFactor(rel,												 BTREE_DEFAULT_FILLFACTOR);	else		state.fillfactor = BTREE_NONLEAF_FILLFACTOR;	state.newitemonleft = false;	/* these just to keep compiler quiet */	state.firstright = 0;	state.best_delta = 0;	state.leftspace = leftspace;	state.rightspace = rightspace;	state.olddataitemstotal = olddataitemstotal;	state.newitemoff = newitemoff;	/*	 * Finding the best possible split would require checking all the possible	 * split points, because of the high-key and left-key special cases.	 * That's probably more work than it's worth; instead, stop as soon as we	 * find a "good-enough" split, where good-enough is defined as an	 * imbalance in free space of no more than pagesize/16 (arbitrary...) This	 * should let us stop near the middle on most pages, instead of plowing to	 * the end.	 */	goodenough = leftspace / 16;	/*	 * Scan through the data items and calculate space usage for a split at	 * each possible position.	 */	olddataitemstoleft = 0;	goodenoughfound = false;	maxoff = PageGetMaxOffsetNumber(page);	for (offnum = P_FIRSTDATAKEY(opaque);		 offnum <= maxoff;		 offnum = OffsetNumberNext(offnum))	{		Size		itemsz;		itemid = PageGetItemId(page, offnum);		itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);		/*		 * Will the new item go to left or right of split?		 */		if (offnum > newitemoff)			_bt_checksplitloc(&state, offnum, true,							  olddataitemstoleft, itemsz);		else if (offnum < newitemoff)			_bt_checksplitloc(&state, offnum, false,							  olddataitemstoleft, itemsz);		else		{			/* need to try it both ways! */			_bt_checksplitloc(&state, offnum, true,							  olddataitemstoleft, itemsz);			_bt_checksplitloc(&state, offnum, false,							  olddataitemstoleft, itemsz);		}		/* Abort scan once we find a good-enough choice */		if (state.have_split && state.best_delta <= goodenough)		{			goodenoughfound = true;			break;		}		olddataitemstoleft += itemsz;	}	/*	 * If the new item goes as the last item, check for splitting so that all	 * the old items go to the left page and the new item goes to the right	 * page.	 */	if (newitemoff > maxoff && !goodenoughfound)		_bt_checksplitloc(&state, newitemoff, false, olddataitemstotal, 0);	/*	 * I believe it is not possible to fail to find a feasible split, but just	 * in case ...	 */	if (!state.have_split)		elog(ERROR, "could not find a feasible split point for index \"%s\"",			 RelationGetRelationName(rel));	*newitemonleft = state.newitemonleft;	return state.firstright;}/* * Subroutine to analyze a particular possible split choice (ie, firstright * and newitemonleft settings), and record the best split so far in *state. * * firstoldonright is the offset of the first item on the original page * that goes to the right page, and firstoldonrightsz is the size of that * tuple. firstoldonright can be > max offset, which means that all the old * items go to the left page and only the new item goes to the right page. * In that case, firstoldonrightsz is not used. * * olddataitemstoleft is the total size of all old items to the left of * firstoldonright. */static void_bt_checksplitloc(FindSplitData *state,				  OffsetNumber firstoldonright,				  bool newitemonleft,				  int olddataitemstoleft,				  Size firstoldonrightsz){	int			leftfree,				rightfree;	Size		firstrightitemsz;	bool		newitemisfirstonright;	/* Is the new item going to be the first item on the right page? */	newitemisfirstonright = (firstoldonright == state->newitemoff							 && !newitemonleft);	if (newitemisfirstonright)		firstrightitemsz = state->newitemsz;	else		firstrightitemsz = firstoldonrightsz;	/* Account for all the old tuples */	leftfree = state->leftspace - olddataitemstoleft;	rightfree = state->rightspace -		(state->olddataitemstotal - olddataitemstoleft);	/*	 * The first item on the right page becomes the high key of the left page;	 * therefore it counts against left space as well as right space.	 */	leftfree -= firstrightitemsz;	/* account for the new item */	if (newitemonleft)		leftfree -= (int) state->newitemsz;	else		rightfree -= (int) state->newitemsz;	/*	 * If we are not on the leaf level, we will be able to discard the key	 * data from the first item that winds up on the right page.	 */	if (!state->is_leaf)		rightfree += (int) firstrightitemsz -			(int) (MAXALIGN(sizeof(IndexTupleData)) + sizeof(ItemIdData));	/*	 * If feasible split point, remember best delta.	 */	if (leftfree >= 0 && rightfree >= 0)	{		int			delta;		if (state->is_rightmost)		{			/*			 * If splitting a rightmost page, try to put (100-fillfactor)% of			 * free space on left page. See comments for _bt_findsplitloc.			 */			delta = (state->fillfactor * leftfree)				- ((100 - state->fillfactor) * rightfree);		}		else		{			/* Otherwise, aim for equal free space on both sides */			delta = leftfree - rightfree;		}		if (delta < 0)			delta = -delta;		if (!state->have_split || delta < state->best_delta)		{			state->have_split = true;			state->newitemonleft = newitemonleft;			state->firstright = firstoldonright;			state->best_delta = delta;		}	}}/* * _bt_insert_parent() -- Insert downlink into parent after a page split. * * On entry, buf and rbuf are the left and right split pages, which we * still hold write locks on per the L&Y algorithm.  We release the * write locks once we have write lock on the parent page.	(Any sooner, * and it'd be possible for some other process to try to split or delete * one of these pages, and get confused because it cannot find the downlink.) * * stack - stack showing how we got here.  May be NULL in cases that don't *			have to be efficient (concurrent ROOT split, WAL recovery) * is_root - we split the true root * is_only - we split a page alone on its level (might have been fast root) * * This is exported so it can be called by nbtxlog.c. */void_bt_insert_parent(Relation rel,				  Buffer buf,				  Buffer rbuf,				  BTStack stack,				  bool is_root,				  bool is_only){	/*	 * Here we have to do something Lehman and Yao don't talk about: deal with	 * a root split and construction of a new root.  If our stack is empty	 * then we have just split a node on what had been the root level when we	 * descended the tree.	If it was still the root then we perform a	 * new-root construction.  If it *wasn't* the root anymore, search to find	 * the next higher level that someone constructed meanwhile, and find the	 * right place to insert as for the normal case.	 *	 * If we have to search for the parent level, we do so by re-descending	 * from the root.  This is not super-efficient, but it's rare enough not	 * to matter.  (This path is also taken when called from WAL recovery ---	 * we have no stack in that case.)
💿 文件大小 17116 K
👤 上传用户 fsweb
📂 所属分类其他书籍
🏷️ 相关标签

#postgresql #源码 #开源 #数据库
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -