📄 bufmgr.c
字号:
/*-------------------------------------------------------------------------
 *
 * bufmgr.c
 *	  buffer manager interface routines
 *
 * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.141.2.1 2003/12/01 16:53:30 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
/*
 *
 * BufferAlloc() -- lookup a buffer in the buffer table.  If
 *		it isn't there add it, but do not read data into memory.
 *		This is used when we are about to reinitialize the
 *		buffer so don't care what the current disk contents are.
 *		BufferAlloc() also pins the new buffer in memory.
 *
 * ReadBuffer() -- like BufferAlloc() but reads the data
 *		on a buffer cache miss.
 *
 * ReleaseBuffer() -- unpin the buffer
 *
 * WriteNoReleaseBuffer() -- mark the buffer contents as "dirty"
 *		but don't unpin.  The disk IO is delayed until buffer
 *		replacement.
 *
 * WriteBuffer() -- WriteNoReleaseBuffer() + ReleaseBuffer()
 *
 * BufferSync() -- flush all dirty buffers in the buffer pool.
 *
 * InitBufferPool() -- Init the buffer module.
* * See other files: * freelist.c -- chooses victim for buffer replacement * buf_table.c -- manages the buffer lookup table */#include "postgres.h"#include <sys/file.h>#include <math.h>#include <signal.h>#include "lib/stringinfo.h"#include "miscadmin.h"#include "storage/buf_internals.h"#include "storage/bufmgr.h"#include "storage/bufpage.h"#include "storage/proc.h"#include "storage/smgr.h"#include "utils/relcache.h"#include "pgstat.h"#define BufferGetLSN(bufHdr) \ (*((XLogRecPtr*) MAKE_PTR((bufHdr)->data)))/* GUC variable */bool zero_damaged_pages = false;static void WaitIO(BufferDesc *buf);static void StartBufferIO(BufferDesc *buf, bool forInput);static void TerminateBufferIO(BufferDesc *buf);static void ContinueBufferIO(BufferDesc *buf, bool forInput);static void buffer_write_error_callback(void *arg);/* * Macro : BUFFER_IS_BROKEN * Note that write error doesn't mean the buffer broken*/#define BUFFER_IS_BROKEN(buf) ((buf->flags & BM_IO_ERROR) && !(buf->flags & BM_DIRTY))static Buffer ReadBufferInternal(Relation reln, BlockNumber blockNum, bool bufferLockHeld);static BufferDesc *BufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr);static int BufferReplace(BufferDesc *bufHdr);#ifdef NOT_USEDvoid PrintBufferDescs(void);#endifstatic void write_buffer(Buffer buffer, bool unpin);/* * ReadBuffer -- returns a buffer containing the requested * block of the requested relation. If the blknum * requested is P_NEW, extend the relation file and * allocate a new block. (Caller is responsible for * ensuring that only one backend tries to extend a * relation at the same time!) * * Returns: the buffer number for the buffer containing * the block read, or NULL on an error. If successful, * the returned buffer has been pinned. * * Assume when this function is called, that reln has been * opened already. * * Note: a side effect of a P_NEW call is to update reln->rd_nblocks. 
*/#undef ReadBuffer /* conflicts with macro when BUFMGR_DEBUG * defined *//* * ReadBuffer */BufferReadBuffer(Relation reln, BlockNumber blockNum){ return ReadBufferInternal(reln, blockNum, false);}/* * ReadBufferInternal -- internal version of ReadBuffer with more options * * bufferLockHeld: if true, caller already acquired the bufmgr lock. * (This is assumed never to be true if dealing with a local buffer!) */static BufferReadBufferInternal(Relation reln, BlockNumber blockNum, bool bufferLockHeld){ BufferDesc *bufHdr; int status; bool found; bool isExtend; bool isLocalBuf; isExtend = (blockNum == P_NEW); isLocalBuf = reln->rd_istemp; if (isLocalBuf) { ReadLocalBufferCount++; pgstat_count_buffer_read(&reln->pgstat_info, reln); /* Substitute proper block number if caller asked for P_NEW */ if (isExtend) { blockNum = reln->rd_nblocks; reln->rd_nblocks++; } bufHdr = LocalBufferAlloc(reln, blockNum, &found); if (found) { LocalBufferHitCount++; pgstat_count_buffer_hit(&reln->pgstat_info, reln); } } else { ReadBufferCount++; pgstat_count_buffer_read(&reln->pgstat_info, reln); /* Substitute proper block number if caller asked for P_NEW */ if (isExtend) { /* must be sure we have accurate file length! */ blockNum = reln->rd_nblocks = smgrnblocks(DEFAULT_SMGR, reln); reln->rd_nblocks++; } /* * lookup the buffer. IO_IN_PROGRESS is set if the requested * block is not currently in memory. */ if (!bufferLockHeld) LWLockAcquire(BufMgrLock, LW_EXCLUSIVE); bufHdr = BufferAlloc(reln, blockNum, &found); if (found) { BufferHitCount++; pgstat_count_buffer_hit(&reln->pgstat_info, reln); } } /* At this point we do NOT hold the bufmgr lock. */ if (!bufHdr) return InvalidBuffer; /* if it's already in the buffer pool, we're done */ if (found) { /* That is, we're done if we expected to be able to find it ... 
*/ if (!isExtend) return BufferDescriptorGetBuffer(bufHdr); /* * If we found a buffer when we were expecting to extend the * relation, the implication is that a buffer was already created * for the next page position, but then smgrextend failed to write * the page. We'd better try the smgrextend again. But since * BufferAlloc won't have done StartBufferIO, we must do that * first. */ if (!isLocalBuf) { LWLockAcquire(BufMgrLock, LW_EXCLUSIVE); StartBufferIO(bufHdr, false); LWLockRelease(BufMgrLock); } } /* * if we have gotten to this point, the reln pointer must be ok and * the relation file must be open. */ if (isExtend) { /* new buffers are zero-filled */ MemSet((char *) MAKE_PTR(bufHdr->data), 0, BLCKSZ); status = smgrextend(DEFAULT_SMGR, reln, blockNum, (char *) MAKE_PTR(bufHdr->data)); } else { status = smgrread(DEFAULT_SMGR, reln, blockNum, (char *) MAKE_PTR(bufHdr->data)); /* check for garbage data */ if (status == SM_SUCCESS && !PageHeaderIsValid((PageHeader) MAKE_PTR(bufHdr->data))) { /* * During WAL recovery, the first access to any data page should * overwrite the whole page from the WAL; so a clobbered page * header is not reason to fail. Hence, when InRecovery we may * always act as though zero_damaged_pages is ON. */ if (zero_damaged_pages || InRecovery) { ereport(WARNING, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("invalid page header in block %u of relation \"%s\"; zeroing out page", blockNum, RelationGetRelationName(reln)))); MemSet((char *) MAKE_PTR(bufHdr->data), 0, BLCKSZ); } else ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("invalid page header in block %u of relation \"%s\"", blockNum, RelationGetRelationName(reln)))); } } if (isLocalBuf) { /* No shared buffer state to update... 
*/ if (status == SM_FAIL) { bufHdr->flags |= BM_IO_ERROR; return InvalidBuffer; } return BufferDescriptorGetBuffer(bufHdr); } /* lock buffer manager again to update IO IN PROGRESS */ LWLockAcquire(BufMgrLock, LW_EXCLUSIVE); if (status == SM_FAIL) { /* IO Failed. cleanup the data structures and go home */ if (!BufTableDelete(bufHdr)) { LWLockRelease(BufMgrLock); elog(FATAL, "buffer table broken after I/O error"); } /* remember that BufferAlloc() pinned the buffer */ UnpinBuffer(bufHdr); /* * Have to reset the flag so that anyone waiting for the buffer * can tell that the contents are invalid. */ bufHdr->flags |= BM_IO_ERROR; bufHdr->flags &= ~BM_IO_IN_PROGRESS; } else { /* IO Succeeded. clear the flags, finish buffer update */ bufHdr->flags &= ~(BM_IO_ERROR | BM_IO_IN_PROGRESS); } /* If anyone was waiting for IO to complete, wake them up now */ TerminateBufferIO(bufHdr); LWLockRelease(BufMgrLock); if (status == SM_FAIL) return InvalidBuffer; return BufferDescriptorGetBuffer(bufHdr);}/* * BufferAlloc -- Get a buffer from the buffer pool but don't * read it. If successful, the returned buffer is pinned. * * Returns: descriptor for buffer * * BufMgrLock must be held at entry. When this routine returns, * the BufMgrLock is guaranteed NOT to be held. */static BufferDesc *BufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr){ BufferDesc *buf, *buf2; BufferTag newTag; /* identity of requested block */ bool inProgress; /* buffer undergoing IO */ /* create a new tag so we can lookup the buffer */ /* assume that the relation is already open */ INIT_BUFFERTAG(&newTag, reln, blockNum); /* see if the block is in the buffer pool already */ buf = BufTableLookup(&newTag); if (buf != NULL) { /* * Found it. Now, (a) pin the buffer so no one steals it from the * buffer pool, (b) check IO_IN_PROGRESS, someone may be faulting * the buffer into the buffer pool. 
*/ PinBuffer(buf); inProgress = (buf->flags & BM_IO_IN_PROGRESS); *foundPtr = TRUE; if (inProgress) /* confirm end of IO */ { WaitIO(buf); inProgress = (buf->flags & BM_IO_IN_PROGRESS); } if (BUFFER_IS_BROKEN(buf)) { /* * I couldn't understand the following old comment. If there's * no IO for the buffer and the buffer is BROKEN, it should be * read again. So start a new buffer IO here. * * wierd race condition: * * We were waiting for someone else to read the buffer. While we * were waiting, the reader boof'd in some way, so the * contents of the buffer are still invalid. By saying that * we didn't find it, we can make the caller reinitialize the * buffer. If two processes are waiting for this block, both * will read the block. The second one to finish may * overwrite any updates made by the first. (Assume higher * level synchronization prevents this from happening). * * This is never going to happen, don't worry about it. */ *foundPtr = FALSE; }#ifdef BMTRACE _bm_trace((reln->rd_rel->relisshared ? 0 : MyDatabaseId), RelationGetRelid(reln), blockNum, BufferDescriptorGetBuffer(buf), BMT_ALLOCFND);#endif /* BMTRACE */ if (!(*foundPtr)) StartBufferIO(buf, true); LWLockRelease(BufMgrLock); return buf; } *foundPtr = FALSE; /* * Didn't find it in the buffer pool. We'll have to initialize a new * buffer. First, grab one from the free list. If it's dirty, flush * it to disk. Remember to unlock BufMgrLock while doing the IOs. */ inProgress = FALSE; for (buf = (BufferDesc *) NULL; buf == (BufferDesc *) NULL;) { buf = GetFreeBuffer(); /* GetFreeBuffer will abort if it can't find a free buffer */ Assert(buf); /* * There should be exactly one pin on the buffer after it is * allocated -- ours. If it had a pin it wouldn't have been on * the free list. No one else could have pinned it between * GetFreeBuffer and here because we have the BufMgrLock. 
*/ Assert(buf->refcount == 0); buf->refcount = 1; PrivateRefCount[BufferDescriptorGetBuffer(buf) - 1] = 1; if (buf->flags & BM_DIRTY || buf->cntxDirty) { bool smok; /* * skip write error buffers */ if ((buf->flags & BM_IO_ERROR) != 0) { UnpinBuffer(buf); buf = (BufferDesc *) NULL; continue; } /* * Set BM_IO_IN_PROGRESS to keep anyone from doing anything * with the contents of the buffer while we write it out. We * don't really care if they try to read it, but if they can * complete a BufferAlloc on it they can then scribble into * it, and we'd really like to avoid that while we are * flushing the buffer. Setting this flag should block them * in WaitIO until we're done. */ inProgress = TRUE; /* * All code paths that acquire this lock pin the buffer first; * since no one had it pinned (it just came off the free * list), no one else can have this lock. */ StartBufferIO(buf, false); /* * Write the buffer out, being careful to release BufMgrLock * before starting the I/O. */ smok = BufferReplace(buf); if (smok == FALSE) { ereport(WARNING, (errcode(ERRCODE_IO_ERROR), errmsg("could not write block %u of %u/%u", buf->tag.blockNum, buf->tag.rnode.tblNode, buf->tag.rnode.relNode))); inProgress = FALSE; buf->flags |= BM_IO_ERROR; buf->flags &= ~BM_IO_IN_PROGRESS; TerminateBufferIO(buf); UnpinBuffer(buf); buf = (BufferDesc *) NULL; } else { /* * BM_JUST_DIRTIED cleared by BufferReplace and shouldn't * be set by anyone. - vadim 01/17/97 */ if (buf->flags & BM_JUST_DIRTIED) { elog(PANIC, "content of block %u of %u/%u changed while flushing", buf->tag.blockNum, buf->tag.rnode.tblNode, buf->tag.rnode.relNode); } else buf->flags &= ~BM_DIRTY; buf->cntxDirty = false; } /* * Somebody could have pinned the buffer while we were doing * the I/O and had given up the BufMgrLock (though they would * be waiting for us to clear the BM_IO_IN_PROGRESS flag). * That's why this is a loop -- if so, we need to clear the * I/O flags, remove our pin and start all over again. 
* * People may be making buffers free at any time, so there's no * reason to think that we have an immediate disaster on our * hands. */ if (buf && buf->refcount > 1) { inProgress = FALSE; buf->flags &= ~BM_IO_IN_PROGRESS; TerminateBufferIO(buf); UnpinBuffer(buf); buf = (BufferDesc *) NULL; } /* * Somebody could have allocated another buffer for the same * block we are about to read in. (While we flush out the * dirty buffer, we don't hold the lock and someone could have * allocated another buffer for the same block. The problem is * we haven't gotten around to insert the new tag into the * buffer table. So we need to check here. -ay 3/95 */ buf2 = BufTableLookup(&newTag); if (buf2 != NULL) { /* * Found it. Someone has already done what we're about to * do. We'll just handle this as if it were found in the * buffer pool in the first place. */ if (buf != NULL) { buf->flags &= ~BM_IO_IN_PROGRESS; TerminateBufferIO(buf); /* give up old buffer since we don't need it any more */ UnpinBuffer(buf); } PinBuffer(buf2); inProgress = (buf2->flags & BM_IO_IN_PROGRESS); *foundPtr = TRUE; if (inProgress) { WaitIO(buf2); inProgress = (buf2->flags & BM_IO_IN_PROGRESS); } if (BUFFER_IS_BROKEN(buf2)) *foundPtr = FALSE; if (!(*foundPtr)) StartBufferIO(buf2, true); LWLockRelease(BufMgrLock); return buf2; } } } /* * At this point we should have the sole pin on a non-dirty buffer and * we may or may not already have the BM_IO_IN_PROGRESS flag set. */ /* * Change the name of the buffer in the lookup table: * * Need to update the lookup table before the read starts. If someone * comes along looking for the buffer while we are reading it in, we * don't want them to allocate a new buffer. For the same reason, we * didn't want to erase the buf table entry for the buffer we were * writing back until now, either. */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -