📄 bufmgr.c
字号:
/*-------------------------------------------------------------------------
 *
 * bufmgr.c
 *    buffer manager interface routines
 *
 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *    $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.228 2008/01/01 19:45:51 momjian Exp $
 *
 *-------------------------------------------------------------------------
 */
/*
 * Principal entry points:
 *
 * ReadBuffer() -- find or create a buffer holding the requested page,
 *      and pin it so that no one can destroy it while this process
 *      is using it.
 *
 * ReleaseBuffer() -- unpin a buffer
 *
 * MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
 *      The disk write is delayed until buffer replacement or checkpoint.
 *
 * See also these files:
 *      freelist.c -- chooses victim for buffer replacement
 *      buf_table.c -- manages the buffer lookup table
 */
#include "postgres.h"

#include <sys/file.h>
#include <unistd.h>

#include "miscadmin.h"
#include "postmaster/bgwriter.h"
#include "storage/buf_internals.h"
#include "storage/bufpage.h"
#include "storage/ipc.h"
#include "storage/proc.h"
#include "storage/smgr.h"
#include "utils/resowner.h"
#include "pgstat.h"


/*
 * Note: these two macros only work on shared buffers, not local ones!
 *
 * BufHdrGetBlock yields the address of the page owned by a shared buffer
 * header: BufferBlocks plus buf_id * BLCKSZ bytes.  The multiplication is
 * done in Size so that it is not carried out in int arithmetic.
 *
 * BufferGetLSN reads the XLogRecPtr stored at the very start of that page
 * (presumably the page LSN -- the macro relies on it being the first field
 * of the page).
 */
#define BufHdrGetBlock(bufHdr)	((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
#define BufferGetLSN(bufHdr)	(*((XLogRecPtr*) BufHdrGetBlock(bufHdr)))

/*
 * Note: this macro only works on local buffers, not shared ones!
 *
 * The page pointer is fetched from LocalBufferBlockPointers at index
 * -(buf_id + 2); i.e. a buf_id of -2 maps to slot 0, -3 to slot 1, etc.
 */
#define LocalBufHdrGetBlock(bufHdr) \
	LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]

/* Bits in SyncOneBuffer's return value */
#define BUF_WRITTEN				0x01
#define BUF_REUSABLE			0x02


/* GUC variables */

/*
 * When true, ReadBuffer_common zeroes a page whose header fails validation
 * (with a WARNING) instead of raising an ERROR.
 */
bool		zero_damaged_pages = false;
int			bgwriter_lru_maxpages = 100;
double		bgwriter_lru_multiplier = 2.0;


long		NDirectFileRead;	/* some I/O's are direct file access. bypass
								 * bufmgr */
long		NDirectFileWrite;	/* e.g., I/O in psort and hashjoin.
*//* local state for StartBufferIO and related functions */static volatile BufferDesc *InProgressBuf = NULL;static bool IsForInput;/* local state for LockBufferForCleanup */static volatile BufferDesc *PinCountWaitBuf = NULL;static Buffer ReadBuffer_common(Relation reln, BlockNumber blockNum, bool zeroPage, BufferAccessStrategy strategy);static bool PinBuffer(volatile BufferDesc *buf, BufferAccessStrategy strategy);static void PinBuffer_Locked(volatile BufferDesc *buf);static void UnpinBuffer(volatile BufferDesc *buf, bool fixOwner);static void BufferSync(int flags);static int SyncOneBuffer(int buf_id, bool skip_recently_used);static void WaitIO(volatile BufferDesc *buf);static bool StartBufferIO(volatile BufferDesc *buf, bool forInput);static void TerminateBufferIO(volatile BufferDesc *buf, bool clear_dirty, int set_flag_bits);static void buffer_write_error_callback(void *arg);static volatile BufferDesc *BufferAlloc(Relation reln, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr);static void FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln);static void AtProcExit_Buffers(int code, Datum arg);/* * ReadBuffer -- returns a buffer containing the requested * block of the requested relation. If the blknum * requested is P_NEW, extend the relation file and * allocate a new block. (Caller is responsible for * ensuring that only one backend tries to extend a * relation at the same time!) * * Returns: the buffer number for the buffer containing * the block read. The returned buffer has been pinned. * Does not return on error --- elog's instead. * * Assume when this function is called, that reln has been * opened already. */BufferReadBuffer(Relation reln, BlockNumber blockNum){ return ReadBuffer_common(reln, blockNum, false, NULL);}/* * ReadBufferWithStrategy -- same as ReadBuffer, except caller can specify * a nondefault buffer access strategy. See buffer/README for details. 
*/BufferReadBufferWithStrategy(Relation reln, BlockNumber blockNum, BufferAccessStrategy strategy){ return ReadBuffer_common(reln, blockNum, false, strategy);}/* * ReadOrZeroBuffer -- like ReadBuffer, but if the page isn't in buffer * cache already, it's filled with zeros instead of reading it from * disk. Useful when the caller intends to fill the page from scratch, * since this saves I/O and avoids unnecessary failure if the * page-on-disk has corrupt page headers. * * Caution: do not use this to read a page that is beyond the relation's * current physical EOF; that is likely to cause problems in md.c when * the page is modified and written out. P_NEW is OK, though. */BufferReadOrZeroBuffer(Relation reln, BlockNumber blockNum){ return ReadBuffer_common(reln, blockNum, true, NULL);}/* * ReadBuffer_common -- common logic for ReadBuffer variants */static BufferReadBuffer_common(Relation reln, BlockNumber blockNum, bool zeroPage, BufferAccessStrategy strategy){ volatile BufferDesc *bufHdr; Block bufBlock; bool found; bool isExtend; bool isLocalBuf; /* Make sure we will have room to remember the buffer pin */ ResourceOwnerEnlargeBuffers(CurrentResourceOwner); isExtend = (blockNum == P_NEW); isLocalBuf = reln->rd_istemp; /* Open it at the smgr level if not already done */ RelationOpenSmgr(reln); /* Substitute proper block number if caller asked for P_NEW */ if (isExtend) blockNum = smgrnblocks(reln->rd_smgr); pgstat_count_buffer_read(reln); if (isLocalBuf) { ReadLocalBufferCount++; bufHdr = LocalBufferAlloc(reln, blockNum, &found); if (found) LocalBufferHitCount++; } else { ReadBufferCount++; /* * lookup the buffer. IO_IN_PROGRESS is set if the requested block is * not currently in memory. */ bufHdr = BufferAlloc(reln, blockNum, strategy, &found); if (found) BufferHitCount++; } /* At this point we do NOT hold any locks. 
*/ /* if it was already in the buffer pool, we're done */ if (found) { if (!isExtend) { /* Just need to update stats before we exit */ pgstat_count_buffer_hit(reln); if (VacuumCostActive) VacuumCostBalance += VacuumCostPageHit; return BufferDescriptorGetBuffer(bufHdr); } /* * We get here only in the corner case where we are trying to extend * the relation but we found a pre-existing buffer marked BM_VALID. * This can happen because mdread doesn't complain about reads beyond * EOF (when zero_damaged_pages is ON) and so a previous attempt to * read a block beyond EOF could have left a "valid" zero-filled * buffer. Unfortunately, we have also seen this case occurring * because of buggy Linux kernels that sometimes return an * lseek(SEEK_END) result that doesn't account for a recent write. In * that situation, the pre-existing buffer would contain valid data * that we don't want to overwrite. Since the legitimate case should * always have left a zero-filled buffer, complain if not PageIsNew. */ bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr); if (!PageIsNew((PageHeader) bufBlock)) ereport(ERROR, (errmsg("unexpected data beyond EOF in block %u of relation \"%s\"", blockNum, RelationGetRelationName(reln)), errhint("This has been seen to occur with buggy kernels; consider updating your system."))); /* * We *must* do smgrextend before succeeding, else the page will not * be reserved by the kernel, and the next P_NEW call will decide to * return the same page. Clear the BM_VALID bit, do the StartBufferIO * call that BufferAlloc didn't, and proceed. */ if (isLocalBuf) { /* Only need to adjust flags */ Assert(bufHdr->flags & BM_VALID); bufHdr->flags &= ~BM_VALID; } else { /* * Loop to handle the very small possibility that someone re-sets * BM_VALID between our clearing it and StartBufferIO inspecting * it. 
*/ do { LockBufHdr(bufHdr); Assert(bufHdr->flags & BM_VALID); bufHdr->flags &= ~BM_VALID; UnlockBufHdr(bufHdr); } while (!StartBufferIO(bufHdr, true)); } } /* * if we have gotten to this point, we have allocated a buffer for the * page but its contents are not yet valid. IO_IN_PROGRESS is set for it, * if it's a shared buffer. * * Note: if smgrextend fails, we will end up with a buffer that is * allocated but not marked BM_VALID. P_NEW will still select the same * block number (because the relation didn't get any longer on disk) and * so future attempts to extend the relation will find the same buffer (if * it's not been recycled) but come right back here to try smgrextend * again. */ Assert(!(bufHdr->flags & BM_VALID)); /* spinlock not needed */ bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr); if (isExtend) { /* new buffers are zero-filled */ MemSet((char *) bufBlock, 0, BLCKSZ); smgrextend(reln->rd_smgr, blockNum, (char *) bufBlock, reln->rd_istemp); } else { /* * Read in the page, unless the caller intends to overwrite it and * just wants us to allocate a buffer. 
*/ if (zeroPage) MemSet((char *) bufBlock, 0, BLCKSZ); else smgrread(reln->rd_smgr, blockNum, (char *) bufBlock); /* check for garbage data */ if (!PageHeaderIsValid((PageHeader) bufBlock)) { if (zero_damaged_pages) { ereport(WARNING, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("invalid page header in block %u of relation \"%s\"; zeroing out page", blockNum, RelationGetRelationName(reln)))); MemSet((char *) bufBlock, 0, BLCKSZ); } else ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("invalid page header in block %u of relation \"%s\"", blockNum, RelationGetRelationName(reln)))); } } if (isLocalBuf) { /* Only need to adjust flags */ bufHdr->flags |= BM_VALID; } else { /* Set BM_VALID, terminate IO, and wake up any waiters */ TerminateBufferIO(bufHdr, false, BM_VALID); } if (VacuumCostActive) VacuumCostBalance += VacuumCostPageMiss; return BufferDescriptorGetBuffer(bufHdr);}/* * BufferAlloc -- subroutine for ReadBuffer. Handles lookup of a shared * buffer. If no buffer exists already, selects a replacement * victim and evicts the old page, but does NOT read in new page. * * "strategy" can be a buffer replacement strategy object, or NULL for * the default strategy. The selected buffer's usage_count is advanced when * using the default strategy, but otherwise possibly not (see PinBuffer). * * The returned buffer is pinned and is already marked as holding the * desired page. If it already did have the desired page, *foundPtr is * set TRUE. Otherwise, *foundPtr is set FALSE and the buffer is marked * as IO_IN_PROGRESS; ReadBuffer will now need to do I/O to fill it. * * *foundPtr is actually redundant with the buffer's BM_VALID flag, but * we keep it for simplicity in ReadBuffer. * * No locks are held either at entry or exit. 
*/static volatile BufferDesc *BufferAlloc(Relation reln, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr){ BufferTag newTag; /* identity of requested block */ uint32 newHash; /* hash value for newTag */ LWLockId newPartitionLock; /* buffer partition lock for it */ BufferTag oldTag; /* previous identity of selected buffer */ uint32 oldHash; /* hash value for oldTag */ LWLockId oldPartitionLock; /* buffer partition lock for it */ BufFlags oldFlags; int buf_id; volatile BufferDesc *buf; bool valid; /* create a tag so we can lookup the buffer */ INIT_BUFFERTAG(newTag, reln, blockNum); /* determine its hash code and partition lock ID */ newHash = BufTableHashCode(&newTag); newPartitionLock = BufMappingPartitionLock(newHash); /* see if the block is in the buffer pool already */ LWLockAcquire(newPartitionLock, LW_SHARED); buf_id = BufTableLookup(&newTag, newHash); if (buf_id >= 0) { /* * Found it. Now, pin the buffer so no one can steal it from the * buffer pool, and check to see if the correct data has been loaded * into the buffer. */ buf = &BufferDescriptors[buf_id]; valid = PinBuffer(buf, strategy); /* Can release the mapping lock as soon as we've pinned it */ LWLockRelease(newPartitionLock); *foundPtr = TRUE; if (!valid) { /* * We can only get here if (a) someone else is still reading in * the page, or (b) a previous read attempt failed. We have to * wait for any active read attempt to finish, and then set up our * own read attempt if the page is still not BM_VALID. * StartBufferIO does it all. */ if (StartBufferIO(buf, true)) { /* * If we get here, previous attempts to read the buffer must * have failed ... but we shall bravely try again. */ *foundPtr = FALSE; } } return buf; } /* * Didn't find it in the buffer pool. We'll have to initialize a new * buffer. Remember to unlock the mapping lock while doing the work. 
*/ LWLockRelease(newPartitionLock); /* Loop here in case we have to try another victim buffer */ for (;;) { bool lock_held; /* * Select a victim buffer. The buffer is returned with its header * spinlock still held! Also (in most cases) the BufFreelistLock is * still held, since it would be bad to hold the spinlock while * possibly waking up other processes. */ buf = StrategyGetBuffer(strategy, &lock_held); Assert(buf->refcount == 0); /* Must copy buffer flags while we still hold the spinlock */ oldFlags = buf->flags; /* Pin the buffer and then release the buffer spinlock */ PinBuffer_Locked(buf); /* Now it's safe to release the freelist lock */ if (lock_held) LWLockRelease(BufFreelistLock); /* * If the buffer was dirty, try to write it out. There is a race * condition here, in that someone might dirty it after we released it
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -