/*
 * nbtsort.c
 * From the PostgreSQL 8.1.4 sources (open-source database system for Linux)
 * C source code -- 835 lines in total, page 1 of 2
 */
/*------------------------------------------------------------------------- * * nbtsort.c * Build a btree from sorted input by loading leaf pages sequentially. * * NOTES * * We use tuplesort.c to sort the given index tuples into order. * Then we scan the index tuples in order and build the btree pages * for each level. We load source tuples into leaf-level pages. * Whenever we fill a page at one level, we add a link to it to its * parent level (starting a new parent level if necessary). When * done, we write out each final page on each level, adding it to * its parent level. When we have only one page on a level, it must be * the root -- it can be attached to the btree metapage and we are done. * * This code is moderately slow (~10% slower) compared to the regular * btree (insertion) build code on sorted or well-clustered data. On * random data, however, the insertion build code is unusable -- the * difference on a 60MB heap is a factor of 15 because the random * probes into the btree thrash the buffer pool. (NOTE: the above * "10%" estimate is probably obsolete, since it refers to an old and * not very good external sort implementation that used to exist in * this module. tuplesort.c is almost certainly faster.) * * It is not wise to pack the pages entirely full, since then *any* * insertion would cause a split (and not only of the leaf page; the need * for a split would cascade right up the tree). The steady-state load * factor for btrees is usually estimated at 70%. We choose to pack leaf * pages to 90% and upper pages to 70%. This gives us reasonable density * (there aren't many upper pages if the keys are reasonable-size) without * incurring a lot of cascading splits during early insertions. 
* * Formerly the index pages being built were kept in shared buffers, but * that is of no value (since other backends have no interest in them yet) * and it created locking problems for CHECKPOINT, because the upper-level * pages were held exclusive-locked for long periods. Now we just build * the pages in local memory and smgrwrite() them as we finish them. They * will need to be re-read into shared buffers on first use after the build * finishes. * * Since the index will never be used unless it is completely built, * from a crash-recovery point of view there is no need to WAL-log the * steps of the build. After completing the index build, we can just sync * the whole file to disk using smgrimmedsync() before exiting this module. * This can be seen to be sufficient for crash recovery by considering that * it's effectively equivalent to what would happen if a CHECKPOINT occurred * just after the index build. However, it is clearly not sufficient if the * DBA is using the WAL log for PITR or replication purposes, since another * machine would not be able to reconstruct the index from WAL. Therefore, * we log the completed index pages to WAL if and only if WAL archiving is * active. * * * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsort.c,v 1.95.2.3 2006/03/10 20:18:25 tgl Exp $ * *------------------------------------------------------------------------- */#include "postgres.h"#include "access/nbtree.h"#include "access/xlog.h"#include "miscadmin.h"#include "storage/smgr.h"#include "utils/tuplesort.h"/* * Status record for spooling/sorting phase. (Note we may have two of * these due to the special requirements for uniqueness-checking with * dead tuples.) 
*/
struct BTSpool
{
    Tuplesortstate *sortstate;      /* state data for tuplesort.c */
    Relation    index;              /* index being built */
    bool        isunique;           /* check for duplicate keys? */
};

/*
 * Status record for a btree page being built.  We have one of these
 * for each active tree level.
 *
 * The reason we need to store a copy of the minimum key is that we'll
 * need to propagate it to the parent node when this page is linked
 * into its parent.  However, if the page is not a leaf page, the first
 * entry on the page doesn't need to contain a key, so we will not have
 * stored the key itself on the page.  (You might think we could skip
 * copying the minimum key on leaf pages, but actually we must have a
 * writable copy anyway because we'll poke the page's address into it
 * before passing it up to the parent...)
 */
typedef struct BTPageState
{
    Page        btps_page;          /* workspace for page building */
    BlockNumber btps_blkno;         /* block # to write this page at */
    BTItem      btps_minkey;        /* copy of minimum key (first item) on page */
    OffsetNumber btps_lastoff;      /* last item offset loaded */
    uint32      btps_level;         /* tree level (0 = leaf) */
    Size        btps_full;          /* "full" if less than this much free space */
    struct BTPageState *btps_next;  /* link to parent level, if any */
} BTPageState;

/*
 * Overall status record for index writing phase.
 */
typedef struct BTWriteState
{
    Relation    index;              /* index being written out */
    bool        btws_use_wal;       /* dump pages to WAL? */
    BlockNumber btws_pages_alloced; /* # pages allocated */
    BlockNumber btws_pages_written; /* # pages written out */
    Page        btws_zeropage;      /* workspace for filling zeroes */
} BTWriteState;

/*
 * Overall size in bytes of a BTItem: the contained index tuple's data size
 * plus the BTItem header overhead.  Yields 0 for a NULL pointer.
 */
#define BTITEMSZ(btitem) \
    ((btitem) ? \
     (IndexTupleDSize((btitem)->bti_itup) + \
      (sizeof(BTItemData) - sizeof(IndexTupleData))) : \
     0)

/* Forward declarations; the function bodies appear later in this file. */
static Page _bt_blnewpage(uint32 level);
static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level);
static void _bt_slideleft(Page page);
static void _bt_sortaddtup(Page page, Size itemsize,
               BTItem btitem, OffsetNumber itup_off);
static void _bt_buildadd(BTWriteState *wstate, BTPageState *state, BTItem bti);
static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state);
static void _bt_load(BTWriteState *wstate,
         BTSpool *btspool, BTSpool *btspool2);


/*
 * Interface routines
 */

/*
 * create and initialize a spool structure
 *
 * "index" is the index relation being built; "isunique" requests
 * uniqueness checking; "isdead" marks the secondary spool used for
 * dead tuples, which is given only work_mem of sort space.
 */
BTSpool *
_bt_spoolinit(Relation index, bool isunique, bool isdead)
{
    BTSpool    *btspool = (BTSpool *) palloc0(sizeof(BTSpool));
    int         btKbytes;

    btspool->index = index;
    btspool->isunique = isunique;

    /*
     * We size the sort area as maintenance_work_mem rather than work_mem to
     * speed index creation.  This should be OK since a single backend can't
     * run multiple index creations in parallel.  Note that creation of a
     * unique index actually requires two BTSpool objects.  We expect that the
     * second one (for dead tuples) won't get very full, so we give it only
     * work_mem.
     */
    btKbytes = isdead ? work_mem : maintenance_work_mem;
    btspool->sortstate = tuplesort_begin_index(index, isunique, btKbytes,
                                               false);

    /*
     * Currently, tuplesort provides sort functions on IndexTuples. If we kept
     * anything in a BTItem other than a regular IndexTuple, we'd need to
     * modify tuplesort to understand BTItems as such.
     */
    Assert(sizeof(BTItemData) == sizeof(IndexTupleData));

    return btspool;
}

/*
 * clean up a spool structure and its substructures.
 */
void
_bt_spooldestroy(BTSpool *btspool)
{
    tuplesort_end(btspool->sortstate);
    pfree(btspool);
}

/*
 * spool a btitem into the sort file.
*/
void
_bt_spool(BTItem btitem, BTSpool *btspool)
{
    /* A BTItem is really just an IndexTuple */
    tuplesort_puttuple(btspool->sortstate, (void *) btitem);
}

/*
 * given a spool loaded by successive calls to _bt_spool,
 * create an entire btree.
 *
 * btspool2 (the dead-tuple spool used by unique-index builds) may be NULL.
 */
void
_bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
{
    BTWriteState wstate;

#ifdef BTREE_BUILD_STATS
    if (log_btree_build_stats)
    {
        ShowUsage("BTREE BUILD (Spool) STATISTICS");
        ResetUsage();
    }
#endif   /* BTREE_BUILD_STATS */

    /* complete the sort(s) before reading tuples back out */
    tuplesort_performsort(btspool->sortstate);
    if (btspool2)
        tuplesort_performsort(btspool2->sortstate);

    wstate.index = btspool->index;

    /*
     * We need to log index creation in WAL iff WAL archiving is enabled AND
     * it's not a temp index.
     */
    wstate.btws_use_wal = XLogArchivingActive() && !wstate.index->rd_istemp;

    /* reserve the metapage */
    wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
    wstate.btws_pages_written = 0;
    wstate.btws_zeropage = NULL;    /* until needed */

    _bt_load(&wstate, btspool, btspool2);
}

/*
 * Internal routines.
 */

/*
 * allocate workspace for a new, clean btree page, not linked to any siblings.
 */
static Page
_bt_blnewpage(uint32 level)
{
    Page        page;
    BTPageOpaque opaque;

    page = (Page) palloc(BLCKSZ);

    /* Zero the page and set up standard page header info */
    _bt_pageinit(page, BLCKSZ);

    /* Initialize BT opaque state */
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    opaque->btpo_prev = opaque->btpo_next = P_NONE;
    opaque->btpo.level = level;
    opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF;

    /* Make the P_HIKEY line pointer appear allocated */
    ((PageHeader) page)->pd_lower += sizeof(ItemIdData);

    return page;
}

/*
 * emit a completed btree page, and release the working storage.
 */
static void
_bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
{
    /* Ensure rd_smgr is open (could have been closed by relcache flush!)
*/
    RelationOpenSmgr(wstate->index);

    /* XLOG stuff */
    if (wstate->btws_use_wal)
    {
        /* We use the heap NEWPAGE record type for this */
        xl_heap_newpage xlrec;
        XLogRecPtr  recptr;
        XLogRecData rdata[2];

        /* NO ELOG(ERROR) from here till newpage op is logged */
        START_CRIT_SECTION();

        xlrec.node = wstate->index->rd_node;
        xlrec.blkno = blkno;

        /* first rdata chain entry: the fixed-size record header */
        rdata[0].data = (char *) &xlrec;
        rdata[0].len = SizeOfHeapNewpage;
        rdata[0].buffer = InvalidBuffer;
        rdata[0].next = &(rdata[1]);

        /* second rdata chain entry: the full page image */
        rdata[1].data = (char *) page;
        rdata[1].len = BLCKSZ;
        rdata[1].buffer = InvalidBuffer;
        rdata[1].next = NULL;

        recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_NEWPAGE, rdata);

        PageSetLSN(page, recptr);
        PageSetTLI(page, ThisTimeLineID);

        END_CRIT_SECTION();
    }
    else
    {
        /* Leave the page LSN zero if not WAL-logged, but set TLI anyway */
        PageSetTLI(page, ThisTimeLineID);
    }

    /*
     * If we have to write pages nonsequentially, fill in the space with
     * zeroes until we come back and overwrite.  This is not logically
     * necessary on standard Unix filesystems (unwritten space will read as
     * zeroes anyway), but it should help to avoid fragmentation. The dummy
     * pages aren't WAL-logged though.
     */
    while (blkno > wstate->btws_pages_written)
    {
        if (!wstate->btws_zeropage)
            wstate->btws_zeropage = (Page) palloc0(BLCKSZ);
        smgrwrite(wstate->index->rd_smgr, wstate->btws_pages_written++,
                  (char *) wstate->btws_zeropage,
                  true);
    }

    /*
     * Now write the page.  We say isTemp = true even if it's not a temp
     * index, because there's no need for smgr to schedule an fsync for this
     * write; we'll do it ourselves before ending the build.
     */
    smgrwrite(wstate->index->rd_smgr, blkno, (char *) page, true);

    /* advance the high-water mark if this write extended the file */
    if (blkno == wstate->btws_pages_written)
        wstate->btws_pages_written++;

    pfree(page);
}

/*
 * allocate and initialize a new BTPageState.  the returned structure
 * is suitable for immediate use by _bt_buildadd.
*/
static BTPageState *
_bt_pagestate(BTWriteState *wstate, uint32 level)
{
    BTPageState *state = (BTPageState *) palloc0(sizeof(BTPageState));

    /* create initial page for level */
    state->btps_page = _bt_blnewpage(level);

    /* and assign it a page position */
    state->btps_blkno = wstate->btws_pages_alloced++;

    state->btps_minkey = NULL;
    /* initialize lastoff so first item goes into P_FIRSTKEY */
    state->btps_lastoff = P_HIKEY;
    state->btps_level = level;
    /* set "full" threshold based on level.  See notes at head of file. */
    if (level > 0)
        state->btps_full = (PageGetPageSize(state->btps_page) * 3) / 10;
    else
        state->btps_full = PageGetPageSize(state->btps_page) / 10;
    /* no parent level, yet */
    state->btps_next = NULL;

    return state;
}

/*
 * slide an array of ItemIds back one slot (from P_FIRSTKEY to
 * P_HIKEY, overwriting P_HIKEY).  we need to do this when we discover
 * that we have built an ItemId array in what has turned out to be a
 * P_RIGHTMOST page.
 */
static void
_bt_slideleft(Page page)
{
    OffsetNumber off;
    OffsetNumber maxoff;
    ItemId      previi;
    ItemId      thisii;

    if (!PageIsEmpty(page))
    {
        maxoff = PageGetMaxOffsetNumber(page);
        previi = PageGetItemId(page, P_HIKEY);
        for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off))
        {
            /* copy each ItemId into the slot to its left */
            thisii = PageGetItemId(page, off);
            *previi = *thisii;
            previi = thisii;
        }
        /* give back the line-pointer slot vacated at the end of the array */
        ((PageHeader) page)->pd_lower -= sizeof(ItemIdData);
    }
}

/*
 * Add an item to a page being built.
 *
 * The main difference between this routine and a bare PageAddItem call
 * is that this code knows that the leftmost data item on a non-leaf
 * btree page doesn't need to have a key.  Therefore, it strips such
 * items down to just the item header.
 *
 * This is almost like nbtinsert.c's _bt_pgaddtup(), but we can't use
 * that because it assumes that P_RIGHTMOST() will return the correct
 * answer for the page.  Here, we don't know yet if the page will be
 * rightmost.  Offset P_FIRSTKEY is always the first data key.
*/static void_bt_sortaddtup(Page page, Size itemsize, BTItem btitem, OffsetNumber itup_off)
/*
 * (Residual code-viewer UI text from the web page this file was captured
 * from -- keyboard shortcut legend: Copy code: Ctrl+C; Search code: Ctrl+F;
 * Full-screen mode: F11; Increase font size: Ctrl+=; Decrease font size:
 * Ctrl+-; Show shortcuts: ?)
 */