📄 nbtinsert.c
						 "right sibling");
			itup_off = rightoff;
			itup_blkno = BufferGetBlockNumber(rbuf);
			rightoff = OffsetNumberNext(rightoff);
		}
	}

	/*
	 * We have to grab the right sibling (if any) and fix the prev pointer
	 * there. We are guaranteed that this is deadlock-free since no other
	 * writer will be holding a lock on that page and trying to move left, and
	 * all readers release locks on a page before trying to fetch its
	 * neighbors.
	 */
	if (!P_RIGHTMOST(ropaque))
	{
		sbuf = _bt_getbuf(rel, ropaque->btpo_next, BT_WRITE);
		spage = BufferGetPage(sbuf);
		sopaque = (BTPageOpaque) PageGetSpecialPointer(spage);
		if (sopaque->btpo_prev != ropaque->btpo_prev)
			elog(PANIC, "right sibling's left-link doesn't match");
	}

	/*
	 * Right sibling is locked, new siblings are prepared, but original page
	 * is not updated yet. Log changes before continuing.
	 *
	 * NO EREPORT(ERROR) till right sibling is updated.
	 */
	START_CRIT_SECTION();

	if (!P_RIGHTMOST(ropaque))
		sopaque->btpo_prev = BufferGetBlockNumber(rbuf);

	/* XLOG stuff */
	if (!rel->rd_istemp)
	{
		xl_btree_split xlrec;
		uint8		xlinfo;
		XLogRecPtr	recptr;
		XLogRecData rdata[4];

		xlrec.target.node = rel->rd_node;
		ItemPointerSet(&(xlrec.target.tid), itup_blkno, itup_off);
		if (newitemonleft)
			xlrec.otherblk = BufferGetBlockNumber(rbuf);
		else
			xlrec.otherblk = BufferGetBlockNumber(buf);
		xlrec.leftblk = lopaque->btpo_prev;
		xlrec.rightblk = ropaque->btpo_next;
		xlrec.level = lopaque->btpo.level;

		/*
		 * Direct access to page is not good but faster - we should implement
		 * some new func in page API. Note we only store the tuples
		 * themselves, knowing that the item pointers are in the same order
		 * and can be reconstructed by scanning the tuples.
		 */
		xlrec.leftlen = ((PageHeader) leftpage)->pd_special -
			((PageHeader) leftpage)->pd_upper;
		rdata[0].data = (char *) &xlrec;
		rdata[0].len = SizeOfBtreeSplit;
		rdata[0].buffer = InvalidBuffer;
		rdata[0].next = &(rdata[1]);

		rdata[1].data = (char *) leftpage + ((PageHeader) leftpage)->pd_upper;
		rdata[1].len = xlrec.leftlen;
		rdata[1].buffer = InvalidBuffer;
		rdata[1].next = &(rdata[2]);

		rdata[2].data = (char *) rightpage + ((PageHeader) rightpage)->pd_upper;
		rdata[2].len = ((PageHeader) rightpage)->pd_special -
			((PageHeader) rightpage)->pd_upper;
		rdata[2].buffer = InvalidBuffer;
		rdata[2].next = NULL;

		if (!P_RIGHTMOST(ropaque))
		{
			rdata[2].next = &(rdata[3]);
			rdata[3].data = NULL;
			rdata[3].len = 0;
			rdata[3].buffer = sbuf;
			rdata[3].buffer_std = true;
			rdata[3].next = NULL;
		}

		if (P_ISROOT(oopaque))
			xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L_ROOT : XLOG_BTREE_SPLIT_R_ROOT;
		else
			xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R;

		recptr = XLogInsert(RM_BTREE_ID, xlinfo, rdata);

		PageSetLSN(leftpage, recptr);
		PageSetTLI(leftpage, ThisTimeLineID);
		PageSetLSN(rightpage, recptr);
		PageSetTLI(rightpage, ThisTimeLineID);
		if (!P_RIGHTMOST(ropaque))
		{
			PageSetLSN(spage, recptr);
			PageSetTLI(spage, ThisTimeLineID);
		}
	}

	/*
	 * By here, the original data page has been split into two new halves, and
	 * these are correct. The algorithm requires that the left page never
	 * move during a split, so we copy the new left page back on top of the
	 * original. Note that this is not a waste of time, since we also require
	 * (in the page management code) that the center of a page always be
	 * clean, and the most efficient way to guarantee this is just to compact
	 * the data by reinserting it into a new left page.
	 */
	PageRestoreTempPage(leftpage, origpage);

	END_CRIT_SECTION();

	/* write and release the old right sibling */
	if (!P_RIGHTMOST(ropaque))
		_bt_wrtbuf(rel, sbuf);

	/* split's done */
	return rbuf;
}

/*
 *	_bt_findsplitloc() -- find an appropriate place to split a page.
 *
 * The idea here is to equalize the free space that will be on each split
 * page, *after accounting for the inserted tuple*. (If we fail to account
 * for it, we might find ourselves with too little room on the page that
 * it needs to go into!)
 *
 * If the page is the rightmost page on its level, we instead try to arrange
 * for twice as much free space on the right as on the left. In this way,
 * when we are inserting successively increasing keys (consider sequences,
 * timestamps, etc) we will end up with a tree whose pages are about 67% full,
 * instead of the 50% full result that we'd get without this special case.
 * (We could bias it even further to make the initially-loaded tree more full.
 * But since the steady-state load for a btree is about 70%, we'd likely just
 * be making more page-splitting work for ourselves later on, when we start
 * seeing updates to existing tuples.)
 *
 * We are passed the intended insert position of the new tuple, expressed as
 * the offsetnumber of the tuple it must go in front of. (This could be
 * maxoff+1 if the tuple is to go at the end.)
 *
 * We return the index of the first existing tuple that should go on the
 * righthand page, plus a boolean indicating whether the new tuple goes on
 * the left or right page. The bool is necessary to disambiguate the case
 * where firstright == newitemoff.
 */
static OffsetNumber
_bt_findsplitloc(Relation rel,
				 Page page,
				 OffsetNumber newitemoff,
				 Size newitemsz,
				 bool *newitemonleft)
{
	BTPageOpaque opaque;
	OffsetNumber offnum;
	OffsetNumber maxoff;
	ItemId		itemid;
	FindSplitData state;
	int			leftspace,
				rightspace,
				goodenough,
				dataitemtotal,
				dataitemstoleft;

	opaque = (BTPageOpaque) PageGetSpecialPointer(page);

	/* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
	newitemsz += sizeof(ItemIdData);
	state.newitemsz = newitemsz;
	state.is_leaf = P_ISLEAF(opaque);
	state.is_rightmost = P_RIGHTMOST(opaque);
	state.have_split = false;

	/* Total free space available on a btree page, after fixed overhead */
	leftspace = rightspace =
		PageGetPageSize(page) - SizeOfPageHeaderData -
		MAXALIGN(sizeof(BTPageOpaqueData));

	/*
	 * Finding the best possible split would require checking all the possible
	 * split points, because of the high-key and left-key special cases.
	 * That's probably more work than it's worth; instead, stop as soon as we
	 * find a "good-enough" split, where good-enough is defined as an
	 * imbalance in free space of no more than pagesize/16 (arbitrary...) This
	 * should let us stop near the middle on most pages, instead of plowing to
	 * the end.
	 */
	goodenough = leftspace / 16;

	/* The right page will have the same high key as the old page */
	if (!P_RIGHTMOST(opaque))
	{
		itemid = PageGetItemId(page, P_HIKEY);
		rightspace -= (int) (MAXALIGN(ItemIdGetLength(itemid)) +
							 sizeof(ItemIdData));
	}

	/* Count up total space in data items without actually scanning 'em */
	dataitemtotal = rightspace - (int) PageGetFreeSpace(page);
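	/*
	 * Illustrative figures (an assumption, based on the default 8 kB BLCKSZ):
	 * after subtracting the page header and the MAXALIGN'd BTPageOpaqueData,
	 * leftspace and rightspace each start out a little above 8100 bytes, so
	 * goodenough = leftspace / 16 comes to roughly 500 bytes.  The scan below
	 * can therefore stop at the first candidate split whose two halves differ
	 * in free space by no more than about half a kilobyte, which for evenly
	 * sized tuples is normally found near the middle of the page.
	 */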
	/*
	 * Scan through the data items and calculate space usage for a split at
	 * each possible position.
	 */
	dataitemstoleft = 0;
	maxoff = PageGetMaxOffsetNumber(page);

	for (offnum = P_FIRSTDATAKEY(opaque);
		 offnum <= maxoff;
		 offnum = OffsetNumberNext(offnum))
	{
		Size		itemsz;
		int			leftfree,
					rightfree;

		itemid = PageGetItemId(page, offnum);
		itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);

		/*
		 * We have to allow for the current item becoming the high key of the
		 * left page; therefore it counts against left space as well as right
		 * space.
		 */
		leftfree = leftspace - dataitemstoleft - (int) itemsz;
		rightfree = rightspace - (dataitemtotal - dataitemstoleft);

		/*
		 * Will the new item go to left or right of split?
		 */
		if (offnum > newitemoff)
			_bt_checksplitloc(&state, offnum, leftfree, rightfree,
							  true, itemsz);
		else if (offnum < newitemoff)
			_bt_checksplitloc(&state, offnum, leftfree, rightfree,
							  false, itemsz);
		else
		{
			/* need to try it both ways! */
			_bt_checksplitloc(&state, offnum, leftfree, rightfree,
							  true, itemsz);
			/* here we are contemplating newitem as first on right */
			_bt_checksplitloc(&state, offnum, leftfree, rightfree,
							  false, newitemsz);
		}

		/* Abort scan once we find a good-enough choice */
		if (state.have_split && state.best_delta <= goodenough)
			break;

		dataitemstoleft += itemsz;
	}

	/*
	 * I believe it is not possible to fail to find a feasible split, but just
	 * in case ...
	 */
	if (!state.have_split)
		elog(ERROR, "could not find a feasible split point for \"%s\"",
			 RelationGetRelationName(rel));

	*newitemonleft = state.newitemonleft;
	return state.firstright;
}

/*
 * Subroutine to analyze a particular possible split choice (ie, firstright
 * and newitemonleft settings), and record the best split so far in *state.
 */
static void
_bt_checksplitloc(FindSplitData *state, OffsetNumber firstright,
				  int leftfree, int rightfree,
				  bool newitemonleft, Size firstrightitemsz)
{
	/*
	 * Account for the new item on whichever side it is to be put.
	 */
	if (newitemonleft)
		leftfree -= (int) state->newitemsz;
	else
		rightfree -= (int) state->newitemsz;

	/*
	 * If we are not on the leaf level, we will be able to discard the key
	 * data from the first item that winds up on the right page.
	 */
	if (!state->is_leaf)
		rightfree += (int) firstrightitemsz -
			(int) (MAXALIGN(sizeof(BTItemData)) + sizeof(ItemIdData));

	/*
	 * If feasible split point, remember best delta.
	 */
	if (leftfree >= 0 && rightfree >= 0)
	{
		int			delta;

		if (state->is_rightmost)
		{
			/*
			 * On a rightmost page, try to equalize right free space with
			 * twice the left free space. See comments for _bt_findsplitloc.
			 */
			delta = (2 * leftfree) - rightfree;
		}
		else
		{
			/* Otherwise, aim for equal free space on both sides */
			delta = leftfree - rightfree;
		}

		if (delta < 0)
			delta = -delta;
		if (!state->have_split || delta < state->best_delta)
		{
			state->have_split = true;
			state->newitemonleft = newitemonleft;
			state->firstright = firstright;
			state->best_delta = delta;
		}
	}
}
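/*
 * A hypothetical, self-contained sketch, not part of nbtinsert.c: it mirrors
 * the delta rule applied by _bt_checksplitloc above to show why the
 * rightmost-page case settles near a two-thirds-full left page.  The
 * constants "usable" and "total_data" are assumed round numbers for an 8 kB
 * page, ignoring line-pointer and high-key overhead.
 */
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	const int	usable = 8150;		/* assumed usable bytes per half */
	const int	total_data = 8000;	/* assumed bytes of data being split */
	int			leftdata;
	int			best_delta = -1;
	int			best_leftdata = 0;

	/* try a candidate split every 100 bytes of left-page data */
	for (leftdata = 0; leftdata <= total_data; leftdata += 100)
	{
		int			leftfree = usable - leftdata;
		int			rightfree = usable - (total_data - leftdata);
		int			delta = abs(2 * leftfree - rightfree);	/* rightmost rule */

		if (leftfree >= 0 && rightfree >= 0 &&
			(best_delta < 0 || delta < best_delta))
		{
			best_delta = delta;
			best_leftdata = leftdata;
		}
	}

	/* prints about 66, matching the "about 67% full" claim above */
	printf("left page ends up about %d%% full\n",
		   100 * best_leftdata / usable);
	return 0;
}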
/*
 * _bt_insert_parent() -- Insert downlink into parent after a page split.
 *
 * On entry, buf and rbuf are the left and right split pages, which we
 * still hold write locks on per the L&Y algorithm. We release the
 * write locks once we have write lock on the parent page. (Any sooner,
 * and it'd be possible for some other process to try to split or delete
 * one of these pages, and get confused because it cannot find the downlink.)
 *
 * stack - stack showing how we got here. May be NULL in cases that don't
 *			have to be efficient (concurrent ROOT split, WAL recovery)
 * is_root - we split the true root
 * is_only - we split a page alone on its level (might have been fast root)
 *
 * This is exported so it can be called by nbtxlog.c.
 */
void
_bt_insert_parent(Relation rel,
				  Buffer buf,
				  Buffer rbuf,
				  BTStack stack,
				  bool is_root,
				  bool is_only)
{
	/*
	 * Here we have to do something Lehman and Yao don't talk about: deal with
	 * a root split and construction of a new root. If our stack is empty
	 * then we have just split a node on what had been the root level when we
	 * descended the tree. If it was still the root then we perform a
	 * new-root construction. If it *wasn't* the root anymore, search to find
	 * the next higher level that someone constructed meanwhile, and find the
	 * right place to insert as for the normal case.
	 *
	 * If we have to search for the parent level, we do so by re-descending
	 * from the root. This is not super-efficient, but it's rare enough not
	 * to matter. (This path is also taken when called from WAL recovery ---
	 * we have no stack in that case.)
	 */
	if (is_root)
	{
		Buffer		rootbuf;

		Assert(stack == NULL);
		Assert(is_only);
		/* create a new root node and update the metapage */
		rootbuf = _bt_newroot(rel, buf, rbuf);
		/* release the split buffers */
		_bt_wrtbuf(rel, rootbuf);
		_bt_wrtbuf(rel, rbuf);
		_bt_wrtbuf(rel, buf);
	}
	else
	{
		BlockNumber bknum = BufferGetBlockNumber(buf);
		BlockNumber rbknum = BufferGetBlockNumber(rbuf);
		Page		page = BufferGetPage(buf);
		BTItem		new_item;
		BTStackData fakestack;
		BTItem		ritem;
		Buffer		pbuf;

		if (stack == NULL)
		{
			BTPageOpaque lpageop;

			if (!InRecovery)
				elog(DEBUG2, "concurrent ROOT page split");