nbtsort.c
/*-------------------------------------------------------------------------
 *
 * nbtsort.c
 *		Build a btree from sorted input by loading leaf pages sequentially.
 *
 * NOTES
 *
 * We use tuplesort.c to sort the given index tuples into order.
 * Then we scan the index tuples in order and build the btree pages
 * for each level.  We load source tuples into leaf-level pages.
 * Whenever we fill a page at one level, we add a link to it to its
 * parent level (starting a new parent level if necessary).  When
 * done, we write out each final page on each level, adding it to
 * its parent level.  When we have only one page on a level, it must be
 * the root -- it can be attached to the btree metapage and we are done.
 *
 * This code is moderately slow (~10% slower) compared to the regular
 * btree (insertion) build code on sorted or well-clustered data.  On
 * random data, however, the insertion build code is unusable -- the
 * difference on a 60MB heap is a factor of 15 because the random
 * probes into the btree thrash the buffer pool.  (NOTE: the above
 * "10%" estimate is probably obsolete, since it refers to an old and
 * not very good external sort implementation that used to exist in
 * this module.  tuplesort.c is almost certainly faster.)
 *
 * It is not wise to pack the pages entirely full, since then *any*
 * insertion would cause a split (and not only of the leaf page; the need
 * for a split would cascade right up the tree).  The steady-state load
 * factor for btrees is usually estimated at 70%.  We choose to pack leaf
 * pages to the user-controllable fill factor (default 90%) while upper pages
 * are always packed to 70%.  This gives us reasonable density (there aren't
 * many upper pages if the keys are reasonable-size) without risking a lot of
 * cascading splits during early insertions.
 *
 * Formerly the index pages being built were kept in shared buffers, but
 * that is of no value (since other backends have no interest in them yet)
 * and it created locking problems for CHECKPOINT, because the upper-level
 * pages were held exclusive-locked for long periods.  Now we just build
 * the pages in local memory and smgrwrite or smgrextend them as we finish
 * them.  They will need to be re-read into shared buffers on first use after
 * the build finishes.
 *
 * Since the index will never be used unless it is completely built,
 * from a crash-recovery point of view there is no need to WAL-log the
 * steps of the build.  After completing the index build, we can just sync
 * the whole file to disk using smgrimmedsync() before exiting this module.
 * This can be seen to be sufficient for crash recovery by considering that
 * it's effectively equivalent to what would happen if a CHECKPOINT occurred
 * just after the index build.  However, it is clearly not sufficient if the
 * DBA is using the WAL log for PITR or replication purposes, since another
 * machine would not be able to reconstruct the index from WAL.  Therefore,
 * we log the completed index pages to WAL if and only if WAL archiving is
 * active.
 *
 *
 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsort.c,v 1.114 2008/01/01 19:45:46 momjian Exp $
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/heapam.h"
#include "access/nbtree.h"
#include "miscadmin.h"
#include "storage/smgr.h"
#include "utils/tuplesort.h"

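/*
 * A tiny worked example of the scheme described in NOTES above: suppose
 * the sorted input fills three leaf (level-0) pages.  As each leaf page
 * fills up, a downlink for it (its minimum key, with the leaf's block
 * number poked into the tuple) is added to a level-1 page, created on
 * demand.  When the load finishes, level 1 holds a single page with three
 * downlinks; being the only page on the topmost level, it becomes the
 * root and is recorded in the metapage.
 */
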
/*
 * Status record for spooling/sorting phase.  (Note we may have two of
 * these due to the special requirements for uniqueness-checking with
 * dead tuples.)
 */
struct BTSpool
{
    Tuplesortstate *sortstate;  /* state data for tuplesort.c */
    Relation    index;
    bool        isunique;
};

/*
 * Status record for a btree page being built.  We have one of these
 * for each active tree level.
 *
 * The reason we need to store a copy of the minimum key is that we'll
 * need to propagate it to the parent node when this page is linked
 * into its parent.  However, if the page is not a leaf page, the first
 * entry on the page doesn't need to contain a key, so we will not have
 * stored the key itself on the page.  (You might think we could skip
 * copying the minimum key on leaf pages, but actually we must have a
 * writable copy anyway because we'll poke the page's address into it
 * before passing it up to the parent...)
 */
typedef struct BTPageState
{
    Page        btps_page;      /* workspace for page building */
    BlockNumber btps_blkno;     /* block # to write this page at */
    IndexTuple  btps_minkey;    /* copy of minimum key (first item) on page */
    OffsetNumber btps_lastoff;  /* last item offset loaded */
    uint32      btps_level;     /* tree level (0 = leaf) */
    Size        btps_full;      /* "full" if less than this much free space */
    struct BTPageState *btps_next;  /* link to parent level, if any */
} BTPageState;

/*
 * Overall status record for index writing phase.
 */
typedef struct BTWriteState
{
    Relation    index;
    bool        btws_use_wal;   /* dump pages to WAL? */
    BlockNumber btws_pages_alloced;     /* # pages allocated */
    BlockNumber btws_pages_written;     /* # pages written out */
    Page        btws_zeropage;  /* workspace for filling zeroes */
} BTWriteState;


static Page _bt_blnewpage(uint32 level);
static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level);
static void _bt_slideleft(Page page);
static void _bt_sortaddtup(Page page, Size itemsize,
               IndexTuple itup, OffsetNumber itup_off);
static void _bt_buildadd(BTWriteState *wstate, BTPageState *state,
             IndexTuple itup);
static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state);
static void _bt_load(BTWriteState *wstate,
         BTSpool *btspool, BTSpool *btspool2);


/*
 * Interface routines
 */


/*
 * create and initialize a spool structure
 */
BTSpool *
_bt_spoolinit(Relation index, bool isunique, bool isdead)
{
    BTSpool    *btspool = (BTSpool *) palloc0(sizeof(BTSpool));
    int         btKbytes;

    btspool->index = index;
    btspool->isunique = isunique;

    /*
     * We size the sort area as maintenance_work_mem rather than work_mem to
     * speed index creation.  This should be OK since a single backend can't
     * run multiple index creations in parallel.  Note that creation of a
     * unique index actually requires two BTSpool objects.  We expect that
     * the second one (for dead tuples) won't get very full, so we give it
     * only work_mem.
     */
    btKbytes = isdead ? work_mem : maintenance_work_mem;
    btspool->sortstate = tuplesort_begin_index(index, isunique, btKbytes,
                                               false);

    return btspool;
}

/*
 * clean up a spool structure and its substructures.
 */
void
_bt_spooldestroy(BTSpool *btspool)
{
    tuplesort_end(btspool->sortstate);
    pfree(btspool);
}

/*
 * spool an index entry into the sort file.
 */
void
_bt_spool(IndexTuple itup, BTSpool *btspool)
{
    tuplesort_putindextuple(btspool->sortstate, itup);
}

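/*
 * Rough sketch of how a caller is expected to drive the spool interface
 * (nbtree.c's btbuild() follows this pattern; the variable names below
 * are illustrative only):
 *
 *      BTSpool *spool = _bt_spoolinit(index, isunique, false);
 *
 *      ... for each heap tuple, form an index tuple and ...
 *      _bt_spool(itup, spool);
 *
 *      ... then sort the spool and write out the whole tree ...
 *      _bt_leafbuild(spool, NULL);
 *      _bt_spooldestroy(spool);
 *
 * For a unique index a second spool holding dead tuples may be passed as
 * the second argument to _bt_leafbuild() instead of NULL.
 */
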
/*
 * given a spool loaded by successive calls to _bt_spool,
 * create an entire btree.
 */
void
_bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
{
    BTWriteState wstate;

#ifdef BTREE_BUILD_STATS
    if (log_btree_build_stats)
    {
        ShowUsage("BTREE BUILD (Spool) STATISTICS");
        ResetUsage();
    }
#endif   /* BTREE_BUILD_STATS */

    tuplesort_performsort(btspool->sortstate);
    if (btspool2)
        tuplesort_performsort(btspool2->sortstate);

    wstate.index = btspool->index;

    /*
     * We need to log index creation in WAL iff WAL archiving is enabled AND
     * it's not a temp index.
     */
    wstate.btws_use_wal = XLogArchivingActive() && !wstate.index->rd_istemp;

    /* reserve the metapage */
    wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
    wstate.btws_pages_written = 0;
    wstate.btws_zeropage = NULL;    /* until needed */

    _bt_load(&wstate, btspool, btspool2);
}


/*
 * Internal routines.
 */


/*
 * allocate workspace for a new, clean btree page, not linked to any siblings.
 */
static Page
_bt_blnewpage(uint32 level)
{
    Page        page;
    BTPageOpaque opaque;

    page = (Page) palloc(BLCKSZ);

    /* Zero the page and set up standard page header info */
    _bt_pageinit(page, BLCKSZ);

    /* Initialize BT opaque state */
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    opaque->btpo_prev = opaque->btpo_next = P_NONE;
    opaque->btpo.level = level;
    opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF;
    opaque->btpo_cycleid = 0;

    /* Make the P_HIKEY line pointer appear allocated */
    ((PageHeader) page)->pd_lower += sizeof(ItemIdData);

    return page;
}

/*
 * emit a completed btree page, and release the working storage.
 */
static void
_bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
{
    /* Ensure rd_smgr is open (could have been closed by relcache flush!) */
    RelationOpenSmgr(wstate->index);

    /* XLOG stuff */
    if (wstate->btws_use_wal)
    {
        /* We use the heap NEWPAGE record type for this */
        log_newpage(&wstate->index->rd_node, blkno, page);
    }
    else
    {
        /* Leave the page LSN zero if not WAL-logged, but set TLI anyway */
        PageSetTLI(page, ThisTimeLineID);
    }

    /*
     * If we have to write pages nonsequentially, fill in the space with
     * zeroes until we come back and overwrite.  This is not logically
     * necessary on standard Unix filesystems (unwritten space will read as
     * zeroes anyway), but it should help to avoid fragmentation.  The dummy
     * pages aren't WAL-logged though.
     */
    while (blkno > wstate->btws_pages_written)
    {
        if (!wstate->btws_zeropage)
            wstate->btws_zeropage = (Page) palloc0(BLCKSZ);
        smgrextend(wstate->index->rd_smgr, wstate->btws_pages_written++,
                   (char *) wstate->btws_zeropage,
                   true);
    }

    /*
     * Now write the page.  We say isTemp = true even if it's not a temp
     * index, because there's no need for smgr to schedule an fsync for this
     * write; we'll do it ourselves before ending the build.
     */
    if (blkno == wstate->btws_pages_written)
    {
        /* extending the file... */
        smgrextend(wstate->index->rd_smgr, blkno, (char *) page, true);
        wstate->btws_pages_written++;
    }
    else
    {
        /* overwriting a block we zero-filled before */
        smgrwrite(wstate->index->rd_smgr, blkno, (char *) page, true);
    }

    pfree(page);
}

/*
 * allocate and initialize a new BTPageState.  the returned structure
 * is suitable for immediate use by _bt_buildadd.
 */
static BTPageState *
_bt_pagestate(BTWriteState *wstate, uint32 level)
{
    BTPageState *state = (BTPageState *) palloc0(sizeof(BTPageState));

    /* create initial page for level */
    state->btps_page = _bt_blnewpage(level);

    /* and assign it a page position */
    state->btps_blkno = wstate->btws_pages_alloced++;

    state->btps_minkey = NULL;
    /* initialize lastoff so first item goes into P_FIRSTKEY */
    state->btps_lastoff = P_HIKEY;
    state->btps_level = level;
    /* set "full" threshold based on level.  See notes at head of file. */
    if (level > 0)
        state->btps_full = (BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100);
    else
        state->btps_full = RelationGetTargetPageFreeSpace(wstate->index,
                                                   BTREE_DEFAULT_FILLFACTOR);
    /* no parent level, yet */
    state->btps_next = NULL;

    return state;
}

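/*
 * Worked example of the "full" thresholds computed above, assuming the
 * default BLCKSZ of 8192 bytes: an upper (non-leaf) page uses
 * BTREE_NONLEAF_FILLFACTOR = 70, so btps_full = 8192 * 30 / 100 = 2457
 * bytes of free space, while a leaf page built with the default
 * fillfactor of 90 (BTREE_DEFAULT_FILLFACTOR) gets btps_full =
 * 8192 * 10 / 100 = 819 bytes.  The page-filling logic in _bt_buildadd()
 * finishes the current page and starts a new one once its free space
 * drops below this threshold (or the incoming tuple no longer fits).
 */
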
/*
 * slide an array of ItemIds back one slot (from P_FIRSTKEY to
 * P_HIKEY, overwriting P_HIKEY).  we need to do this when we discover
 * that we have built an ItemId array in what has turned out to be a
 * P_RIGHTMOST page.
 */
static void
_bt_slideleft(Page page)
{
    OffsetNumber off;
    OffsetNumber maxoff;
    ItemId      previi;
    ItemId      thisii;

    if (!PageIsEmpty(page))
    {
        maxoff = PageGetMaxOffsetNumber(page);
        previi = PageGetItemId(page, P_HIKEY);
        for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off))
        {
            thisii = PageGetItemId(page, off);
            *previi = *thisii;
            previi = thisii;
        }
        ((PageHeader) page)->pd_lower -= sizeof(ItemIdData);
    }
}

/*
 * Add an item to a page being built.
 *
 * The main difference between this routine and a bare PageAddItem call
 * is that this code knows that the leftmost data item on a non-leaf
 * btree page doesn't need to have a key.  Therefore, it strips such
 * items down to just the item header.
 *
 * This is almost like nbtinsert.c's _bt_pgaddtup(), but we can't use
 * that because it assumes that P_RIGHTMOST() will return the correct
 * answer for the page.  Here, we don't know yet if the page will be
 * rightmost.  Offset P_FIRSTKEY is always the first data key.
 */
static void
_bt_sortaddtup(Page page,
               Size itemsize,
               IndexTuple itup,
               OffsetNumber itup_off)
{
    BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    IndexTupleData trunctuple;

    if (!P_ISLEAF(opaque) && itup_off == P_FIRSTKEY)
    {
        trunctuple = *itup;
        trunctuple.t_info = sizeof(IndexTupleData);
        itup = &trunctuple;
        itemsize = sizeof(IndexTupleData);
    }

    if (PageAddItem(page, (Item) itup, itemsize, itup_off,
                    false, false) == InvalidOffsetNumber)
        elog(ERROR, "failed to add item to the index page");
}

/*----------