📄 bufmgr.c
/*-------------------------------------------------------------------------
 *
 * bufmgr.c
 *	  buffer manager interface routines
 *
 * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.198.2.3 2006/01/06 00:04:26 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
/*
 * ReadBuffer() -- find or create a buffer holding the requested page,
 *		and pin it so that no one can destroy it while this process
 *		is using it.
 *
 * ReleaseBuffer() -- unpin the buffer
 *
 * WriteNoReleaseBuffer() -- mark the buffer contents as "dirty"
 *		but don't unpin.  The disk IO is delayed until buffer
 *		replacement.
 *
 * WriteBuffer() -- WriteNoReleaseBuffer() + ReleaseBuffer()
 *
 * BufferSync() -- flush all dirty buffers in the buffer pool.
 *
 * BgBufferSync() -- flush some dirty buffers in the buffer pool.
 *
 * InitBufferPool() -- Init the buffer module.
 *
 * See other files:
 *		freelist.c -- chooses victim for buffer replacement
 *		buf_table.c -- manages the buffer lookup table
 */
#include "postgres.h"

#include <sys/file.h>
#include <unistd.h>

#include "lib/stringinfo.h"
#include "miscadmin.h"
#include "storage/buf_internals.h"
#include "storage/bufmgr.h"
#include "storage/bufpage.h"
#include "storage/ipc.h"
#include "storage/proc.h"
#include "storage/smgr.h"
#include "utils/relcache.h"
#include "utils/resowner.h"
#include "pgstat.h"


/* Note: these two macros only work on shared buffers, not local ones! */
#define BufHdrGetBlock(bufHdr)	((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
#define BufferGetLSN(bufHdr)	(*((XLogRecPtr*) BufHdrGetBlock(bufHdr)))

/* Note: this macro only works on local buffers, not shared ones! */
#define LocalBufHdrGetBlock(bufHdr) \
	LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]

/* GUC variables */
bool		zero_damaged_pages = false;
double		bgwriter_lru_percent = 1.0;
double		bgwriter_all_percent = 0.333;
int			bgwriter_lru_maxpages = 5;
int			bgwriter_all_maxpages = 5;

long		NDirectFileRead;	/* some I/O's are direct file access. bypass
								 * bufmgr */
long		NDirectFileWrite;	/* e.g., I/O in psort and hashjoin. */

/* local state for StartBufferIO and related functions */
static volatile BufferDesc *InProgressBuf = NULL;
static bool IsForInput;

/* local state for LockBufferForCleanup */
static volatile BufferDesc *PinCountWaitBuf = NULL;

static bool PinBuffer(volatile BufferDesc *buf);
static void PinBuffer_Locked(volatile BufferDesc *buf);
static void UnpinBuffer(volatile BufferDesc *buf, bool fixOwner,
			bool normalAccess);
static bool SyncOneBuffer(int buf_id, bool skip_pinned);
static void WaitIO(volatile BufferDesc *buf);
static bool StartBufferIO(volatile BufferDesc *buf, bool forInput);
static void TerminateBufferIO(volatile BufferDesc *buf, bool clear_dirty,
				  int set_flag_bits);
static void buffer_write_error_callback(void *arg);
static volatile BufferDesc *BufferAlloc(Relation reln, BlockNumber blockNum,
			bool *foundPtr);
static void FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln);
static void AtProcExit_Buffers(int code, Datum arg);
static void write_buffer(Buffer buffer, bool unpin);
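
/*
 * Illustrative sketch, not part of bufmgr.c: how the block-address macros
 * above are meant to be dispatched.  Shared descriptors carry buf_id values
 * 0..NBuffers-1, indexing BufferBlocks directly; local descriptors carry
 * negative buf_id values, and -((buf_id) + 2) recovers the index into
 * LocalBufferBlockPointers, so the two macros must never be mixed.  The
 * guard symbol and function name here are hypothetical.
 */
#ifdef BUFMGR_ILLUSTRATION_ONLY		/* hypothetical guard; never defined */
static Block
example_get_block(volatile BufferDesc *bufHdr, bool isLocalBuf)
{
	/* pick the decoding that matches where the descriptor lives */
	return isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
}
#endif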
/*
 * ReadBuffer -- returns a buffer containing the requested
 *		block of the requested relation.  If the blknum
 *		requested is P_NEW, extend the relation file and
 *		allocate a new block.  (Caller is responsible for
 *		ensuring that only one backend tries to extend a
 *		relation at the same time!)
 *
 * Returns: the buffer number for the buffer containing
 *		the block read.  The returned buffer has been pinned.
 *		Does not return on error --- elog's instead.
 *
 * Assume when this function is called, that reln has been
 *		opened already.
 */
Buffer
ReadBuffer(Relation reln, BlockNumber blockNum)
{
	volatile BufferDesc *bufHdr;
	Block		bufBlock;
	bool		found;
	bool		isExtend;
	bool		isLocalBuf;

	/* Make sure we will have room to remember the buffer pin */
	ResourceOwnerEnlargeBuffers(CurrentResourceOwner);

	isExtend = (blockNum == P_NEW);
	isLocalBuf = reln->rd_istemp;

	/* Open it at the smgr level if not already done */
	RelationOpenSmgr(reln);

	/* Substitute proper block number if caller asked for P_NEW */
	if (isExtend)
		blockNum = smgrnblocks(reln->rd_smgr);

	pgstat_count_buffer_read(&reln->pgstat_info, reln);

	if (isLocalBuf)
	{
		ReadLocalBufferCount++;
		bufHdr = LocalBufferAlloc(reln, blockNum, &found);
		if (found)
			LocalBufferHitCount++;
	}
	else
	{
		ReadBufferCount++;

		/*
		 * lookup the buffer.  IO_IN_PROGRESS is set if the requested block
		 * is not currently in memory.
		 */
		bufHdr = BufferAlloc(reln, blockNum, &found);
		if (found)
			BufferHitCount++;
	}

	/* At this point we do NOT hold any locks. */

	/* if it was already in the buffer pool, we're done */
	if (found)
	{
		if (!isExtend)
		{
			/* Just need to update stats before we exit */
			pgstat_count_buffer_hit(&reln->pgstat_info, reln);

			if (VacuumCostActive)
				VacuumCostBalance += VacuumCostPageHit;

			return BufferDescriptorGetBuffer(bufHdr);
		}

		/*
		 * We get here only in the corner case where we are trying to extend
		 * the relation but we found a pre-existing buffer marked BM_VALID.
		 * (This can happen because mdread doesn't complain about reads
		 * beyond EOF --- which is arguably bogus, but changing it seems
		 * tricky.)  We *must* do smgrextend before succeeding, else the
		 * page will not be reserved by the kernel, and the next P_NEW call
		 * will decide to return the same page.  Clear the BM_VALID bit,
		 * do the StartBufferIO call that BufferAlloc didn't, and proceed.
		 */
		if (isLocalBuf)
		{
			/* Only need to adjust flags */
			Assert(bufHdr->flags & BM_VALID);
			bufHdr->flags &= ~BM_VALID;
		}
		else
		{
			/*
			 * Loop to handle the very small possibility that someone
			 * re-sets BM_VALID between our clearing it and StartBufferIO
			 * inspecting it.
			 */
			do
			{
				LockBufHdr(bufHdr);
				Assert(bufHdr->flags & BM_VALID);
				bufHdr->flags &= ~BM_VALID;
				UnlockBufHdr(bufHdr);
			} while (!StartBufferIO(bufHdr, true));
		}
	}

	/*
	 * if we have gotten to this point, we have allocated a buffer for the
	 * page but its contents are not yet valid.  IO_IN_PROGRESS is set for
	 * it, if it's a shared buffer.
	 *
	 * Note: if smgrextend fails, we will end up with a buffer that is
	 * allocated but not marked BM_VALID.  P_NEW will still select the same
	 * block number (because the relation didn't get any longer on disk) and
	 * so future attempts to extend the relation will find the same buffer
	 * (if it's not been recycled) but come right back here to try smgrextend
	 * again.
	 */
	Assert(!(bufHdr->flags & BM_VALID));		/* spinlock not needed */

	bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);

	if (isExtend)
	{
		/* new buffers are zero-filled */
		MemSet((char *) bufBlock, 0, BLCKSZ);
		smgrextend(reln->rd_smgr, blockNum, (char *) bufBlock,
				   reln->rd_istemp);
	}
	else
	{
		smgrread(reln->rd_smgr, blockNum, (char *) bufBlock);

		/* check for garbage data */
		if (!PageHeaderIsValid((PageHeader) bufBlock))
		{
			/*
			 * During WAL recovery, the first access to any data page should
			 * overwrite the whole page from the WAL; so a clobbered page
			 * header is not reason to fail.  Hence, when InRecovery we may
			 * always act as though zero_damaged_pages is ON.
			 */
			if (zero_damaged_pages || InRecovery)
			{
				ereport(WARNING,
						(errcode(ERRCODE_DATA_CORRUPTED),
						 errmsg("invalid page header in block %u of relation \"%s\"; zeroing out page",
								blockNum, RelationGetRelationName(reln))));
				MemSet((char *) bufBlock, 0, BLCKSZ);
			}
			else
				ereport(ERROR,
						(errcode(ERRCODE_DATA_CORRUPTED),
						 errmsg("invalid page header in block %u of relation \"%s\"",
								blockNum, RelationGetRelationName(reln))));
		}
	}

	if (isLocalBuf)
	{
		/* Only need to adjust flags */
		bufHdr->flags |= BM_VALID;
	}
	else
	{
		/* Set BM_VALID, terminate IO, and wake up any waiters */
		TerminateBufferIO(bufHdr, false, BM_VALID);
	}

	if (VacuumCostActive)
		VacuumCostBalance += VacuumCostPageMiss;

	return BufferDescriptorGetBuffer(bufHdr);
}
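
/*
 * Illustrative sketch, not part of bufmgr.c: the typical caller-side read
 * pattern implied by ReadBuffer()'s contract above.  The buffer comes back
 * pinned but not content-locked, so a reader takes a share lock before
 * examining the page, then unlocks and unpins.  The guard symbol, function
 * name, and arguments are hypothetical.
 */
#ifdef BUFMGR_ILLUSTRATION_ONLY		/* hypothetical guard; never defined */
static void
example_read_page(Relation rel, BlockNumber blkno)
{
	Buffer		buf;
	Page		page;

	buf = ReadBuffer(rel, blkno);	/* pin; may do I/O on a cache miss */

	/* take a share lock before looking at the page contents */
	LockBuffer(buf, BUFFER_LOCK_SHARE);
	page = BufferGetPage(buf);

	/* ... examine the page here ... */

	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
	ReleaseBuffer(buf);				/* drop the pin */
}
#endif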
/*
 * BufferAlloc -- subroutine for ReadBuffer.  Handles lookup of a shared
 *		buffer.  If no buffer exists already, selects a replacement
 *		victim and evicts the old page, but does NOT read in new page.
 *
 * The returned buffer is pinned and is already marked as holding the
 * desired page.  If it already did have the desired page, *foundPtr is
 * set TRUE.  Otherwise, *foundPtr is set FALSE and the buffer is marked
 * as IO_IN_PROGRESS; ReadBuffer will now need to do I/O to fill it.
 *
 * *foundPtr is actually redundant with the buffer's BM_VALID flag, but
 * we keep it for simplicity in ReadBuffer.
 *
 * No locks are held either at entry or exit.
 */
static volatile BufferDesc *
BufferAlloc(Relation reln,
			BlockNumber blockNum,
			bool *foundPtr)
{
	BufferTag	newTag;			/* identity of requested block */
	BufferTag	oldTag;
	BufFlags	oldFlags;
	int			buf_id;
	volatile BufferDesc *buf;
	bool		valid;

	/* create a tag so we can lookup the buffer */
	INIT_BUFFERTAG(newTag, reln, blockNum);

	/* see if the block is in the buffer pool already */
	LWLockAcquire(BufMappingLock, LW_SHARED);
	buf_id = BufTableLookup(&newTag);
	if (buf_id >= 0)
	{
		/*
		 * Found it.  Now, pin the buffer so no one can steal it from the
		 * buffer pool, and check to see if the correct data has been loaded
		 * into the buffer.
		 */
		buf = &BufferDescriptors[buf_id];

		valid = PinBuffer(buf);

		/* Can release the mapping lock as soon as we've pinned it */
		LWLockRelease(BufMappingLock);

		*foundPtr = TRUE;

		if (!valid)
		{
			/*
			 * We can only get here if (a) someone else is still reading in
			 * the page, or (b) a previous read attempt failed.  We have to
			 * wait for any active read attempt to finish, and then set up
			 * our own read attempt if the page is still not BM_VALID.
			 * StartBufferIO does it all.
			 */
			if (StartBufferIO(buf, true))
			{
				/*
				 * If we get here, previous attempts to read the buffer must
				 * have failed ... but we shall bravely try again.
				 */
				*foundPtr = FALSE;
			}
		}

		return buf;
	}

	/*
	 * Didn't find it in the buffer pool.  We'll have to initialize a new
	 * buffer.  Remember to unlock BufMappingLock while doing the work.
	 */
	LWLockRelease(BufMappingLock);

	/* Loop here in case we have to try another victim buffer */
	for (;;)
	{
		/*
		 * Select a victim buffer.  The buffer is returned with its header
		 * spinlock still held!  Also the BufFreelistLock is still held,
		 * since it would be bad to hold the spinlock while possibly waking
		 * up other processes.
		 */
		buf = StrategyGetBuffer();

		Assert(buf->refcount == 0);

		/* Must copy buffer flags while we still hold the spinlock */
		oldFlags = buf->flags;

		/* Pin the buffer and then release the buffer spinlock */
		PinBuffer_Locked(buf);

		/* Now it's safe to release the freelist lock */
		LWLockRelease(BufFreelistLock);
		/*
		 * If the buffer was dirty, try to write it out.  There is a race
		 * condition here, in that someone might dirty it after we released
		 * it above, or even while we are writing it out (since our
		 * share-lock won't prevent hint-bit updates).  We will recheck the
		 * dirty bit after re-locking the buffer header.
		 */
		if (oldFlags & BM_DIRTY)
		{
			/*
			 * We need a share-lock on the buffer contents to write it out
			 * (else we might write invalid data, eg because someone else is
			 * compacting the page contents while we write).  We must use a
			 * conditional lock acquisition here to avoid deadlock.  Even
			 * though the buffer was not pinned (and therefore surely not
			 * locked) when StrategyGetBuffer returned it, someone else could
			 * have pinned and exclusive-locked it by the time we get here.
			 * If we try to get the lock unconditionally, we'd block waiting
			 * for them; if they later block waiting for us, deadlock ensues.
			 * (This has been observed to happen when two backends are both
			 * trying to split btree index pages, and the second one just
			 * happens to be trying to split the page the first one got from
			 * StrategyGetBuffer.)
			 */
			if (LWLockConditionalAcquire(buf->content_lock, LW_SHARED))
			{
				FlushBuffer(buf, NULL);
				LWLockRelease(buf->content_lock);
			}
			else
			{
				/*
				 * Someone else has pinned the buffer, so give it up and loop
				 * back to get another one.
				 */
				UnpinBuffer(buf, true, false /* evidently recently used */ );
				continue;
			}
		}

		/*
		 * Acquire exclusive mapping lock in preparation for changing the
		 * buffer's association.
		 */
		LWLockAcquire(BufMappingLock, LW_EXCLUSIVE);

		/*
		 * Try to make a hashtable entry for the buffer under its new tag.
		 * This could fail because while we were writing someone else
		 * allocated another buffer for the same block we want to read in.
		 * Note that we have not yet removed the hashtable entry for the old
		 * tag.
		 */
		buf_id = BufTableInsert(&newTag, buf->buf_id);

		if (buf_id >= 0)
		{
			/*
			 * Got a collision.  Someone has already done what we were about
			 * to do.  We'll just handle this as if it were found in the
			 * buffer pool in the first place.  First, give up the buffer we
			 * were planning to use.  Don't allow it to be thrown in the free
			 * list (we don't want to hold both global locks at once).
			 */
			UnpinBuffer(buf, true, false);

			/* remaining code should match code at top of routine */

			buf = &BufferDescriptors[buf_id];

			valid = PinBuffer(buf);

			/* Can release the mapping lock as soon as we've pinned it */
			LWLockRelease(BufMappingLock);

			*foundPtr = TRUE;

			if (!valid)
			{
				/*
				 * We can only get here if (a) someone else is still reading
				 * in the page, or (b) a previous read attempt failed.  We
				 * have to wait for any active read attempt to finish, and
				 * then set up our own read attempt if the page is still not
				 * BM_VALID.  StartBufferIO does it all.
				 */
				if (StartBufferIO(buf, true))
				{
					/*
					 * If we get here, previous attempts to read the buffer
					 * must have failed ... but we shall bravely try again.
					 */
					*foundPtr = FALSE;
				}
			}

			return buf;
		}

		/*
		 * Need to lock the buffer header too in order to change its tag.
		 */
		LockBufHdr_NoHoldoff(buf);

		/*
		 * Somebody could have pinned or re-dirtied the buffer while we were
		 * doing the I/O and making the new hashtable entry.  If so, we can't
		 * recycle this buffer; we must undo everything we've done and start
		 * over with a new victim buffer.
		 */
		if (buf->refcount == 1 && !(buf->flags & BM_DIRTY))
			break;

		UnlockBufHdr_NoHoldoff(buf);
		BufTableDelete(&newTag);
		LWLockRelease(BufMappingLock);
		UnpinBuffer(buf, true, false /* evidently recently used */ );
	}
	/*
	 * Okay, it's finally safe to rename the buffer.
	 *
	 * Clearing BM_VALID here is necessary, clearing the dirtybits is just
	 * paranoia.  We also clear the usage_count since any recency of use of
	 * the old content is no longer relevant.
	 */
	oldTag = buf->tag;
	oldFlags = buf->flags;
	buf->tag = newTag;
	buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
	buf->flags |= BM_TAG_VALID;
	buf->usage_count = 0;

	UnlockBufHdr_NoHoldoff(buf);

	if (oldFlags & BM_TAG_VALID)
		BufTableDelete(&oldTag);

	LWLockRelease(BufMappingLock);

	/*
	 * Buffer contents are currently invalid.  Try to get the io_in_progress
	 * lock.  If StartBufferIO returns false, then someone else managed to
	 * read it before we did, so there's nothing left for BufferAlloc() to
	 * do.
	 */
	if (StartBufferIO(buf, true))
		*foundPtr = FALSE;
	else
		*foundPtr = TRUE;

	return buf;
}
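
/*
 * Illustrative sketch, not part of bufmgr.c: the caller-side pattern for
 * dirtying a page through this interface, following the WriteBuffer()
 * contract stated at the top of the file (mark the buffer dirty and unpin
 * it; the actual disk write is deferred to buffer replacement or a sync).
 * The guard symbol, function name, and arguments are hypothetical.
 */
#ifdef BUFMGR_ILLUSTRATION_ONLY		/* hypothetical guard; never defined */
static void
example_touch_page(Relation rel, BlockNumber blkno)
{
	Buffer		buf;

	buf = ReadBuffer(rel, blkno);	/* returned pinned */

	/* an exclusive content lock is needed to modify the page */
	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

	/* ... modify the page (and write WAL for the change!) ... */

	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
	WriteBuffer(buf);				/* mark dirty and drop the pin */
}
#endif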