📄 nbtpage.c
字号:
* When this routine returns, the appropriate lock is set on the * requested buffer and its reference count has been incremented * (ie, the buffer is "locked and pinned"). */Buffer_bt_getbuf(Relation rel, BlockNumber blkno, int access){ Buffer buf; if (blkno != P_NEW) { /* Read an existing block of the relation */ buf = ReadBuffer(rel, blkno); LockBuffer(buf, access); } else { bool needLock; Page page; Assert(access == BT_WRITE); /* * First see if the FSM knows of any free pages. * * We can't trust the FSM's report unreservedly; we have to check * that the page is still free. (For example, an already-free * page could have been re-used between the time the last VACUUM * scanned it and the time the VACUUM made its FSM updates.) * * In fact, it's worse than that: we can't even assume that it's * safe to take a lock on the reported page. If somebody else * has a lock on it, or even worse our own caller does, we could * deadlock. (The own-caller scenario is actually not improbable. * Consider an index on a serial or timestamp column. Nearly all * splits will be at the rightmost page, so it's entirely likely * that _bt_split will call us while holding a lock on the page most * recently acquired from FSM. A VACUUM running concurrently with * the previous split could well have placed that page back in FSM.) * * To get around that, we ask for only a conditional lock on the * reported page. If we fail, then someone else is using the page, * and we may reasonably assume it's not free. (If we happen to be * wrong, the worst consequence is the page will be lost to use till * the next VACUUM, which is no big problem.) */ for (;;) { blkno = GetFreeIndexPage(&rel->rd_node); if (blkno == InvalidBlockNumber) break; buf = ReadBuffer(rel, blkno); if (ConditionalLockBuffer(buf)) { page = BufferGetPage(buf); if (_bt_page_recyclable(page)) { /* Okay to use page. 
Re-initialize and return it */ _bt_pageinit(page, BufferGetPageSize(buf)); return buf; } elog(DEBUG2, "FSM returned nonrecyclable page"); _bt_relbuf(rel, buf); } else { elog(DEBUG2, "FSM returned nonlockable page"); /* couldn't get lock, so just drop pin */ ReleaseBuffer(buf); } } /* * Extend the relation by one page. * * We have to use a lock to ensure no one else is extending the rel * at the same time, else we will both try to initialize the same * new page. We can skip locking for new or temp relations, * however, since no one else could be accessing them. */ needLock = !(rel->rd_isnew || rel->rd_istemp); if (needLock) LockPage(rel, 0, ExclusiveLock); buf = ReadBuffer(rel, P_NEW); /* * Release the file-extension lock; it's now OK for someone else * to extend the relation some more. */ if (needLock) UnlockPage(rel, 0, ExclusiveLock); /* Acquire appropriate buffer lock on new page */ LockBuffer(buf, access); /* Initialize the new page before returning it */ page = BufferGetPage(buf); _bt_pageinit(page, BufferGetPageSize(buf)); } /* ref count and lock type are correct */ return buf;}/* * _bt_relbuf() -- release a locked buffer. * * Lock and pin (refcount) are both dropped. Note that either read or * write lock can be dropped this way, but if we modified the buffer, * this is NOT the right way to release a write lock. */void_bt_relbuf(Relation rel, Buffer buf){ LockBuffer(buf, BUFFER_LOCK_UNLOCK); ReleaseBuffer(buf);}/* * _bt_wrtbuf() -- write a btree page to disk. * * This routine releases the lock held on the buffer and our refcount * for it. It is an error to call _bt_wrtbuf() without a write lock * and a pin on the buffer. * * NOTE: actually, the buffer manager just marks the shared buffer page * dirty here; the real I/O happens later. This is okay since we are not * relying on write ordering anyway. The WAL mechanism is responsible for * guaranteeing correctness after a crash. 
*/void_bt_wrtbuf(Relation rel, Buffer buf){ LockBuffer(buf, BUFFER_LOCK_UNLOCK); WriteBuffer(buf);}/* * _bt_wrtnorelbuf() -- write a btree page to disk, but do not release * our reference or lock. * * It is an error to call _bt_wrtnorelbuf() without a write lock * and a pin on the buffer. * * See above NOTE. */void_bt_wrtnorelbuf(Relation rel, Buffer buf){ WriteNoReleaseBuffer(buf);}/* * _bt_pageinit() -- Initialize a new page. * * On return, the page header is initialized; data space is empty; * special space is zeroed out. */void_bt_pageinit(Page page, Size size){ PageInit(page, size, sizeof(BTPageOpaqueData));}/* * _bt_page_recyclable() -- Is an existing page recyclable? * * This exists to make sure _bt_getbuf and btvacuumcleanup have the same * policy about whether a page is safe to re-use. */bool_bt_page_recyclable(Page page){ BTPageOpaque opaque; /* * It's possible to find an all-zeroes page in an index --- for * example, a backend might successfully extend the relation one page * and then crash before it is able to make a WAL entry for adding the * page. If we find a zeroed page then reclaim it. */ if (PageIsNew(page)) return true; /* * Otherwise, recycle if deleted and too old to have any processes * interested in it. */ opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (P_ISDELETED(opaque) && TransactionIdPrecedesOrEquals(opaque->btpo.xact, RecentXmin)) return true; return false;}/* * _bt_metaproot() -- Change the root page of the btree. * * Lehman and Yao require that the root page move around in order to * guarantee deadlock-free short-term, fine-granularity locking. When * we split the root page, we record the new parent in the metadata page * for the relation. This routine does the work. * * No direct preconditions, but if you don't have the write lock on * at least the old root page when you call this, you're making a big * mistake. On exit, metapage data is correct and we no longer have * a pin or lock on the metapage. 
* * Actually this is not used for splitting on-the-fly anymore. It's only used * in nbtsort.c at the completion of btree building, where we know we have * sole access to the index anyway. */void_bt_metaproot(Relation rel, BlockNumber rootbknum, uint32 level){ Buffer metabuf; Page metap; BTPageOpaque metaopaque; BTMetaPageData *metad; metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE); metap = BufferGetPage(metabuf); metaopaque = (BTPageOpaque) PageGetSpecialPointer(metap); Assert(metaopaque->btpo_flags & BTP_META); /* NO ELOG(ERROR) from here till newmeta op is logged */ START_CRIT_SECTION(); metad = BTPageGetMeta(metap); Assert(metad->btm_magic == BTREE_MAGIC || metad->btm_magic == 0); metad->btm_magic = BTREE_MAGIC; /* it's valid now for sure */ metad->btm_root = rootbknum; metad->btm_level = level; metad->btm_fastroot = rootbknum; metad->btm_fastlevel = level; /* XLOG stuff */ if (!rel->rd_istemp) { xl_btree_newmeta xlrec; XLogRecPtr recptr; XLogRecData rdata[1]; xlrec.node = rel->rd_node; xlrec.meta.root = metad->btm_root; xlrec.meta.level = metad->btm_level; xlrec.meta.fastroot = metad->btm_fastroot; xlrec.meta.fastlevel = metad->btm_fastlevel; rdata[0].buffer = InvalidBuffer; rdata[0].data = (char *) &xlrec; rdata[0].len = SizeOfBtreeNewmeta; rdata[0].next = NULL; recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWMETA, rdata); PageSetLSN(metap, recptr); PageSetSUI(metap, ThisStartUpID); } END_CRIT_SECTION(); _bt_wrtbuf(rel, metabuf);}/* * Delete item(s) from a btree page. * * This must only be used for deleting leaf items. Deleting an item on a * non-leaf page has to be done as part of an atomic action that includes * deleting the page it points to. * * This routine assumes that the caller has pinned and locked the buffer, * and will write the buffer afterwards. Also, the given itemnos *must* * appear in increasing order in the array. 
*/
void
_bt_delitems(Relation rel, Buffer buf,
             OffsetNumber *itemnos, int nitems)
{
    Page        page = BufferGetPage(buf);
    int         i;

    /* No ereport(ERROR) until changes are logged */
    START_CRIT_SECTION();

    /*
     * Delete the items in reverse order so we don't have to think about
     * adjusting item numbers for previous deletions.
     */
    for (i = nitems - 1; i >= 0; i--)
        PageIndexTupleDelete(page, itemnos[i]);

    /* XLOG stuff */
    if (!rel->rd_istemp)
    {
        xl_btree_delete xlrec;
        XLogRecPtr  recptr;
        XLogRecData rdata[2];

        xlrec.node = rel->rd_node;
        xlrec.block = BufferGetBlockNumber(buf);

        /* first chunk: the fixed-size delete record header */
        rdata[0].buffer = InvalidBuffer;
        rdata[0].data = (char *) &xlrec;
        rdata[0].len = SizeOfBtreeDelete;
        rdata[0].next = &(rdata[1]);

        /*
         * The target-offsets array is not in the buffer, but pretend that
         * it is.  When XLogInsert stores the whole buffer, the offsets
         * array need not be stored too.
         */
        rdata[1].buffer = buf;
        if (nitems > 0)
        {
            rdata[1].data = (char *) itemnos;
            rdata[1].len = nitems * sizeof(OffsetNumber);
        }
        else
        {
            /* nothing to delete: log an empty offsets chunk */
            rdata[1].data = NULL;
            rdata[1].len = 0;
        }
        rdata[1].next = NULL;

        recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE, rdata);

        PageSetLSN(page, recptr);
        PageSetSUI(page, ThisStartUpID);
    }

    END_CRIT_SECTION();
}

/*
 * _bt_pagedel() -- Delete a page from the b-tree.
 *
 * This action unlinks the page from the b-tree structure, removing all
 * pointers leading to it --- but not touching its own left and right links.
 * The page cannot be physically reclaimed right away, since other processes
 * may currently be trying to follow links leading to the page; they have to
 * be allowed to use its right-link to recover.  See nbtree/README.
 *
 * On entry, the target buffer must be pinned and read-locked.  This lock and
 * pin will be dropped before exiting.
 *
 * Returns the number of pages successfully deleted (zero on failure; could
 * be more than one if parent blocks were deleted).
 *
 * NOTE: this leaks memory.
Rather than trying to clean up everything * carefully, it's better to run it in a temp context that can be reset * frequently. */int_bt_pagedel(Relation rel, Buffer buf, bool vacuum_full){ BlockNumber target, leftsib, rightsib, parent; OffsetNumber poffset, maxoff; uint32 targetlevel, ilevel; ItemId itemid; BTItem targetkey, btitem; ScanKey itup_scankey; BTStack stack; Buffer lbuf, rbuf, pbuf; bool parent_half_dead; bool parent_one_child; bool rightsib_empty; Buffer metabuf = InvalidBuffer; Page metapg = NULL; BTMetaPageData *metad = NULL; Page page; BTPageOpaque opaque; /* * We can never delete rightmost pages nor root pages. While at it, * check that page is not already deleted and is empty. */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -