📄 nbtpage.c
字号:
rootbuf = metabuf; for (;;) { rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ); rootpage = BufferGetPage(rootbuf); rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage); if (!P_IGNORE(rootopaque)) break; /* it's dead, Jim. step right one page */ if (P_RIGHTMOST(rootopaque)) elog(ERROR, "no live root page found in \"%s\"", RelationGetRelationName(rel)); rootblkno = rootopaque->btpo_next; } /* Note: can't check btpo.level on deleted pages */ if (rootopaque->btpo.level != rootlevel) elog(ERROR, "root page %u of \"%s\" has level %u, expected %u", rootblkno, RelationGetRelationName(rel), rootopaque->btpo.level, rootlevel); return rootbuf;}/* * _bt_getbuf() -- Get a buffer by block number for read or write. * * blkno == P_NEW means to get an unallocated index page. * * When this routine returns, the appropriate lock is set on the * requested buffer and its reference count has been incremented * (ie, the buffer is "locked and pinned"). */Buffer_bt_getbuf(Relation rel, BlockNumber blkno, int access){ Buffer buf; if (blkno != P_NEW) { /* Read an existing block of the relation */ buf = ReadBuffer(rel, blkno); LockBuffer(buf, access); } else { bool needLock; Page page; Assert(access == BT_WRITE); /* * First see if the FSM knows of any free pages. * * We can't trust the FSM's report unreservedly; we have to check that * the page is still free. (For example, an already-free page could * have been re-used between the time the last VACUUM scanned it and * the time the VACUUM made its FSM updates.) * * In fact, it's worse than that: we can't even assume that it's safe * to take a lock on the reported page. If somebody else has a lock * on it, or even worse our own caller does, we could deadlock. (The * own-caller scenario is actually not improbable. Consider an index * on a serial or timestamp column. Nearly all splits will be at the * rightmost page, so it's entirely likely that _bt_split will call us * while holding a lock on the page most recently acquired from FSM. A * VACUUM running concurrently with the previous split could well have * placed that page back in FSM.) * * To get around that, we ask for only a conditional lock on the * reported page. If we fail, then someone else is using the page, * and we may reasonably assume it's not free. (If we happen to be * wrong, the worst consequence is the page will be lost to use till * the next VACUUM, which is no big problem.) */ for (;;) { blkno = GetFreeIndexPage(&rel->rd_node); if (blkno == InvalidBlockNumber) break; buf = ReadBuffer(rel, blkno); if (ConditionalLockBuffer(buf)) { page = BufferGetPage(buf); if (_bt_page_recyclable(page)) { /* Okay to use page. Re-initialize and return it */ _bt_pageinit(page, BufferGetPageSize(buf)); return buf; } elog(DEBUG2, "FSM returned nonrecyclable page"); _bt_relbuf(rel, buf); } else { elog(DEBUG2, "FSM returned nonlockable page"); /* couldn't get lock, so just drop pin */ ReleaseBuffer(buf); } } /* * Extend the relation by one page. * * We have to use a lock to ensure no one else is extending the rel at * the same time, else we will both try to initialize the same new * page. We can skip locking for new or temp relations, however, * since no one else could be accessing them. */ needLock = !RELATION_IS_LOCAL(rel); if (needLock) LockRelationForExtension(rel, ExclusiveLock); buf = ReadBuffer(rel, P_NEW); /* Acquire buffer lock on new page */ LockBuffer(buf, BT_WRITE); /* * Release the file-extension lock; it's now OK for someone else to * extend the relation some more. Note that we cannot release this * lock before we have buffer lock on the new page, or we risk a race * condition against btvacuumcleanup --- see comments therein. */ if (needLock) UnlockRelationForExtension(rel, ExclusiveLock); /* Initialize the new page before returning it */ page = BufferGetPage(buf); Assert(PageIsNew((PageHeader) page)); _bt_pageinit(page, BufferGetPageSize(buf)); } /* ref count and lock type are correct */ return buf;}/* * _bt_relandgetbuf() -- release a locked buffer and get another one. * * This is equivalent to _bt_relbuf followed by _bt_getbuf, with the * exception that blkno may not be P_NEW. Also, if obuf is InvalidBuffer * then it reduces to just _bt_getbuf; allowing this case simplifies some * callers. The motivation for using this is to avoid two entries to the * bufmgr when one will do. */Buffer_bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access){ Buffer buf; Assert(blkno != P_NEW); if (BufferIsValid(obuf)) LockBuffer(obuf, BUFFER_LOCK_UNLOCK); buf = ReleaseAndReadBuffer(obuf, rel, blkno); LockBuffer(buf, access); return buf;}/* * _bt_relbuf() -- release a locked buffer. * * Lock and pin (refcount) are both dropped. Note that either read or * write lock can be dropped this way, but if we modified the buffer, * this is NOT the right way to release a write lock. */void_bt_relbuf(Relation rel, Buffer buf){ LockBuffer(buf, BUFFER_LOCK_UNLOCK); ReleaseBuffer(buf);}/* * _bt_wrtbuf() -- write a btree page to disk. * * This routine releases the lock held on the buffer and our refcount * for it. It is an error to call _bt_wrtbuf() without a write lock * and a pin on the buffer. * * NOTE: actually, the buffer manager just marks the shared buffer page * dirty here; the real I/O happens later. This is okay since we are not * relying on write ordering anyway. The WAL mechanism is responsible for * guaranteeing correctness after a crash. */void_bt_wrtbuf(Relation rel, Buffer buf){ LockBuffer(buf, BUFFER_LOCK_UNLOCK); WriteBuffer(buf);}/* * _bt_wrtnorelbuf() -- write a btree page to disk, but do not release * our reference or lock. * * It is an error to call _bt_wrtnorelbuf() without a write lock * and a pin on the buffer. * * See above NOTE. */void_bt_wrtnorelbuf(Relation rel, Buffer buf){ WriteNoReleaseBuffer(buf);}/* * _bt_pageinit() -- Initialize a new page. * * On return, the page header is initialized; data space is empty; * special space is zeroed out. */void_bt_pageinit(Page page, Size size){ PageInit(page, size, sizeof(BTPageOpaqueData));}/* * _bt_page_recyclable() -- Is an existing page recyclable? * * This exists to make sure _bt_getbuf and btvacuumcleanup have the same * policy about whether a page is safe to re-use. */bool_bt_page_recyclable(Page page){ BTPageOpaque opaque; /* * It's possible to find an all-zeroes page in an index --- for example, a * backend might successfully extend the relation one page and then crash * before it is able to make a WAL entry for adding the page. If we find a * zeroed page then reclaim it. */ if (PageIsNew(page)) return true; /* * Otherwise, recycle if deleted and too old to have any processes * interested in it. */ opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (P_ISDELETED(opaque) && TransactionIdPrecedesOrEquals(opaque->btpo.xact, RecentXmin)) return true; return false;}/* * Delete item(s) from a btree page. * * This must only be used for deleting leaf items. Deleting an item on a * non-leaf page has to be done as part of an atomic action that includes * deleting the page it points to. * * This routine assumes that the caller has pinned and locked the buffer, * and will write the buffer afterwards. Also, the given itemnos *must* * appear in increasing order in the array. */void_bt_delitems(Relation rel, Buffer buf, OffsetNumber *itemnos, int nitems){ Page page = BufferGetPage(buf); /* No ereport(ERROR) until changes are logged */ START_CRIT_SECTION(); /* Fix the page */ PageIndexMultiDelete(page, itemnos, nitems); /* XLOG stuff */ if (!rel->rd_istemp) { xl_btree_delete xlrec; XLogRecPtr recptr; XLogRecData rdata[2]; xlrec.node = rel->rd_node; xlrec.block = BufferGetBlockNumber(buf); rdata[0].data = (char *) &xlrec; rdata[0].len = SizeOfBtreeDelete; rdata[0].buffer = InvalidBuffer; rdata[0].next = &(rdata[1]); /* * The target-offsets array is not in the buffer, but pretend that it * is. When XLogInsert stores the whole buffer, the offsets array * need not be stored too. */ if (nitems > 0) { rdata[1].data = (char *) itemnos; rdata[1].len = nitems * sizeof(OffsetNumber); } else { rdata[1].data = NULL; rdata[1].len = 0; } rdata[1].buffer = buf; rdata[1].buffer_std = true; rdata[1].next = NULL; recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE, rdata); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); } END_CRIT_SECTION();}/* * _bt_pagedel() -- Delete a page from the b-tree. * * This action unlinks the page from the b-tree structure, removing all * pointers leading to it --- but not touching its own left and right links. * The page cannot be physically reclaimed right away, since other processes * may currently be trying to follow links leading to the page; they have to * be allowed to use its right-link to recover. See nbtree/README. * * On entry, the target buffer must be pinned and read-locked. This lock and * pin will be dropped before exiting. * * Returns the number of pages successfully deleted (zero on failure; could * be more than one if parent blocks were deleted). * * NOTE: this leaks memory. Rather than trying to clean up everything * carefully, it's better to run it in a temp context that can be reset * frequently. */int_bt_pagedel(Relation rel, Buffer buf, bool vacuum_full){ BlockNumber target, leftsib, rightsib, parent; OffsetNumber poffset, maxoff; uint32 targetlevel, ilevel; ItemId itemid; BTItem targetkey, btitem; ScanKey itup_scankey; BTStack stack; Buffer lbuf, rbuf, pbuf; bool parent_half_dead; bool parent_one_child; bool rightsib_empty; Buffer metabuf = InvalidBuffer; Page metapg = NULL; BTMetaPageData *metad = NULL; Page page; BTPageOpaque opaque; /* * We can never delete rightmost pages nor root pages. While at it, check * that page is not already deleted and is empty. */ page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) || P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page)) { _bt_relbuf(rel, buf); return 0; }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -