📄 xlog.c
字号:
/*-------------------------------------------------------------------------
 *
 * xlog.c
 *      PostgreSQL transaction log manager
 *
 *
 * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.222.2.3 2006/03/28 22:01:25 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include <ctype.h>
#include <fcntl.h>
#include <signal.h>
#include <time.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/time.h>

#include "access/clog.h"
#include "access/multixact.h"
#include "access/subtrans.h"
#include "access/twophase.h"
#include "access/xact.h"
#include "access/xlog.h"
#include "access/xlog_internal.h"
#include "access/xlogutils.h"
#include "catalog/catversion.h"
#include "catalog/pg_control.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "postmaster/bgwriter.h"
#include "storage/bufpage.h"
#include "storage/fd.h"
#include "storage/lwlock.h"
#include "storage/pmsignal.h"
#include "storage/proc.h"
#include "storage/procarray.h"
#include "storage/spin.h"
#include "utils/builtins.h"
#include "utils/guc.h"
#include "utils/nabstime.h"
#include "utils/pg_locale.h"
#include "utils/relcache.h"


/*
 * Because O_DIRECT bypasses the kernel buffers, and because we never
 * read those buffers except during crash recovery, it is a win to use
 * it in all cases where we sync on each write(). We could allow O_DIRECT
 * with fsync(), but because skipping the kernel buffer forces writes out
 * quickly, it seems best just to use it for O_SYNC. It is hard to imagine
 * how fsync() could be a win for O_DIRECT compared to O_SYNC and O_DIRECT.
 * Also, O_DIRECT is never enough to force data to the drives, it merely
 * tries to bypass the kernel cache, so we still need O_SYNC or fsync().
*/#ifdef O_DIRECT#define PG_O_DIRECT O_DIRECT#else#define PG_O_DIRECT 0#endif/* * This chunk of hackery attempts to determine which file sync methods * are available on the current platform, and to choose an appropriate * default method. We assume that fsync() is always available, and that * configure determined whether fdatasync() is. */#if defined(O_SYNC)#define BARE_OPEN_SYNC_FLAG O_SYNC#elif defined(O_FSYNC)#define BARE_OPEN_SYNC_FLAG O_FSYNC#endif#ifdef BARE_OPEN_SYNC_FLAG#define OPEN_SYNC_FLAG (BARE_OPEN_SYNC_FLAG | PG_O_DIRECT)#endif#if defined(O_DSYNC)#if defined(OPEN_SYNC_FLAG)/* O_DSYNC is distinct? */#if O_DSYNC != BARE_OPEN_SYNC_FLAG#define OPEN_DATASYNC_FLAG (O_DSYNC | PG_O_DIRECT)#endif#else /* !defined(OPEN_SYNC_FLAG) *//* Win32 only has O_DSYNC */#define OPEN_DATASYNC_FLAG (O_DSYNC | PG_O_DIRECT)#endif#endif#if defined(OPEN_DATASYNC_FLAG)#define DEFAULT_SYNC_METHOD_STR "open_datasync"#define DEFAULT_SYNC_METHOD SYNC_METHOD_OPEN#define DEFAULT_SYNC_FLAGBIT OPEN_DATASYNC_FLAG#elif defined(HAVE_FDATASYNC)#define DEFAULT_SYNC_METHOD_STR "fdatasync"#define DEFAULT_SYNC_METHOD SYNC_METHOD_FDATASYNC#define DEFAULT_SYNC_FLAGBIT 0#elif defined(HAVE_FSYNC_WRITETHROUGH_ONLY)#define DEFAULT_SYNC_METHOD_STR "fsync_writethrough"#define DEFAULT_SYNC_METHOD SYNC_METHOD_FSYNC_WRITETHROUGH#define DEFAULT_SYNC_FLAGBIT 0#else#define DEFAULT_SYNC_METHOD_STR "fsync"#define DEFAULT_SYNC_METHOD SYNC_METHOD_FSYNC#define DEFAULT_SYNC_FLAGBIT 0#endif/* * Limitation of buffer-alignment for direct IO depends on OS and filesystem, * but BLCKSZ is assumed to be enough for it. 
*/#ifdef O_DIRECT#define ALIGNOF_XLOG_BUFFER BLCKSZ#else#define ALIGNOF_XLOG_BUFFER ALIGNOF_BUFFER#endif/* File path names (all relative to $PGDATA) */#define BACKUP_LABEL_FILE "backup_label"#define RECOVERY_COMMAND_FILE "recovery.conf"#define RECOVERY_COMMAND_DONE "recovery.done"/* User-settable parameters */int CheckPointSegments = 3;int XLOGbuffers = 8;char *XLogArchiveCommand = NULL;char *XLOG_sync_method = NULL;const char XLOG_sync_method_default[] = DEFAULT_SYNC_METHOD_STR;bool fullPageWrites = true;#ifdef WAL_DEBUGbool XLOG_DEBUG = false;#endif/* * XLOGfileslop is used in the code as the allowed "fuzz" in the number of * preallocated XLOG segments --- we try to have at least XLOGfiles advance * segments but no more than XLOGfileslop segments. This could * be made a separate GUC variable, but at present I think it's sufficient * to hardwire it as 2*CheckPointSegments+1. Under normal conditions, a * checkpoint will free no more than 2*CheckPointSegments log segments, and * we want to recycle all of them; the +1 allows boundary cases to happen * without wasting a delete/create-segment cycle. */#define XLOGfileslop (2*CheckPointSegments + 1)/* these are derived from XLOG_sync_method by assign_xlog_sync_method */int sync_method = DEFAULT_SYNC_METHOD;static int open_sync_bit = DEFAULT_SYNC_FLAGBIT;#define XLOG_SYNC_BIT (enableFsync ? open_sync_bit : 0)/* * ThisTimeLineID will be same in all backends --- it identifies current * WAL timeline for the database system. */TimeLineID ThisTimeLineID = 0;/* Are we doing recovery from XLOG? */bool InRecovery = false;/* Are we recovering using offline XLOG archives? */static bool InArchiveRecovery = false;/* Was the last xlog file restored from archive, or local? 
*/static bool restoredFromArchive = false;/* options taken from recovery.conf */static char *recoveryRestoreCommand = NULL;static bool recoveryTarget = false;static bool recoveryTargetExact = false;static bool recoveryTargetInclusive = true;static TransactionId recoveryTargetXid;static time_t recoveryTargetTime;/* if recoveryStopsHere returns true, it saves actual stop xid/time here */static TransactionId recoveryStopXid;static time_t recoveryStopTime;static bool recoveryStopAfter;/* constraint set by read_backup_label */static XLogRecPtr recoveryMinXlogOffset = {0, 0};/* * During normal operation, the only timeline we care about is ThisTimeLineID. * During recovery, however, things are more complicated. To simplify life * for rmgr code, we keep ThisTimeLineID set to the "current" timeline as we * scan through the WAL history (that is, it is the line that was active when * the currently-scanned WAL record was generated). We also need these * timeline values: * * recoveryTargetTLI: the desired timeline that we want to end in. * * expectedTLIs: an integer list of recoveryTargetTLI and the TLIs of * its known parents, newest first (so recoveryTargetTLI is always the * first list member). Only these TLIs are expected to be seen in the WAL * segments we read, and indeed only these TLIs will be considered as * candidate WAL files to open at all. * * curFileTLI: the TLI appearing in the name of the current input WAL file. * (This is not necessarily the same as ThisTimeLineID, because we could * be scanning data that was copied from an ancestor timeline when the current * file was created.) During a sequential scan we do not allow this value * to decrease. */static TimeLineID recoveryTargetTLI;static List *expectedTLIs;static TimeLineID curFileTLI;/* * MyLastRecPtr points to the start of the last XLOG record inserted by the * current transaction. If MyLastRecPtr.xrecoff == 0, then the current * xact hasn't yet inserted any transaction-controlled XLOG records. 
* * Note that XLOG records inserted outside transaction control are not * reflected into MyLastRecPtr. They do, however, cause MyXactMadeXLogEntry * to be set true. The latter can be used to test whether the current xact * made any loggable changes (including out-of-xact changes, such as * sequence updates). * * When we insert/update/delete a tuple in a temporary relation, we do not * make any XLOG record, since we don't care about recovering the state of * the temp rel after a crash. However, we will still need to remember * whether our transaction committed or aborted in that case. So, we must * set MyXactMadeTempRelUpdate true to indicate that the XID will be of * interest later. */XLogRecPtr MyLastRecPtr = {0, 0};bool MyXactMadeXLogEntry = false;bool MyXactMadeTempRelUpdate = false;/* * ProcLastRecPtr points to the start of the last XLOG record inserted by the * current backend. It is updated for all inserts, transaction-controlled * or not. ProcLastRecEnd is similar but points to end+1 of last record. */static XLogRecPtr ProcLastRecPtr = {0, 0};XLogRecPtr ProcLastRecEnd = {0, 0};/* * RedoRecPtr is this backend's local copy of the REDO record pointer * (which is almost but not quite the same as a pointer to the most recent * CHECKPOINT record). We update this from the shared-memory copy, * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we * hold the Insert lock). See XLogInsert for details. We are also allowed * to update from XLogCtl->Insert.RedoRecPtr if we hold the info_lck; * see GetRedoRecPtr. A freshly spawned backend obtains the value during * InitXLOGAccess. */static XLogRecPtr RedoRecPtr;/*---------- * Shared-memory data structures for XLOG control * * LogwrtRqst indicates a byte position that we need to write and/or fsync * the log up to (all records before that point must be written or fsynced). * LogwrtResult indicates the byte positions we have already written/fsynced. 
* These structs are identical but are declared separately to indicate their * slightly different functions. * * We do a lot of pushups to minimize the amount of access to lockable * shared memory values. There are actually three shared-memory copies of * LogwrtResult, plus one unshared copy in each backend. Here's how it works: * XLogCtl->LogwrtResult is protected by info_lck * XLogCtl->Write.LogwrtResult is protected by WALWriteLock * XLogCtl->Insert.LogwrtResult is protected by WALInsertLock * One must hold the associated lock to read or write any of these, but * of course no lock is needed to read/write the unshared LogwrtResult. * * XLogCtl->LogwrtResult and XLogCtl->Write.LogwrtResult are both "always * right", since both are updated by a write or flush operation before * it releases WALWriteLock. The point of keeping XLogCtl->Write.LogwrtResult * is that it can be examined/modified by code that already holds WALWriteLock * without needing to grab info_lck as well. * * XLogCtl->Insert.LogwrtResult may lag behind the reality of the other two, * but is updated when convenient. Again, it exists for the convenience of * code that is already holding WALInsertLock but not the other locks. * * The unshared LogwrtResult may lag behind any or all of these, and again * is updated when convenient. * * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst * (protected by info_lck), but we don't need to cache any copies of it. * * Note that this all works because the request and result positions can only * advance forward, never back up, and so we can easily determine which of two * values is "more up to date". * * info_lck is only held long enough to read/update the protected variables, * so it's a plain spinlock. The other locks are held longer (potentially * over I/O operations), so we use LWLocks for them. These locks are: * * WALInsertLock: must be held to insert a record into the WAL buffers. 
* * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or * XLogFlush). * * ControlFileLock: must be held to read/update control file or create * new log file. * * CheckpointLock: must be held to do a checkpoint (ensures only one * checkpointer at a time; even though the postmaster won't launch * parallel checkpoint processes, we need this because manual checkpoints * could be launched simultaneously). * *---------- */typedef struct XLogwrtRqst{ XLogRecPtr Write; /* last byte + 1 to write out */ XLogRecPtr Flush; /* last byte + 1 to flush */} XLogwrtRqst;typedef struct XLogwrtResult{ XLogRecPtr Write; /* last byte + 1 written out */ XLogRecPtr Flush; /* last byte + 1 flushed */} XLogwrtResult;/* * Shared state data for XLogInsert. */typedef struct XLogCtlInsert{ XLogwrtResult LogwrtResult; /* a recent value of LogwrtResult */ XLogRecPtr PrevRecord; /* start of previously-inserted record */ int curridx; /* current block index in cache */ XLogPageHeader currpage; /* points to header of block in cache */ char *currpos; /* current insertion point in cache */ XLogRecPtr RedoRecPtr; /* current redo point for insertions */} XLogCtlInsert;/* * Shared state data for XLogWrite/XLogFlush. */typedef struct XLogCtlWrite{ XLogwrtResult LogwrtResult; /* current value of LogwrtResult */ int curridx; /* cache index of next block to write */} XLogCtlWrite;/* * Total shared-memory state for XLOG. */typedef struct XLogCtlData{ /* Protected by WALInsertLock: */ XLogCtlInsert Insert; /* Protected by info_lck: */ XLogwrtRqst LogwrtRqst; XLogwrtResult LogwrtResult; /* Protected by WALWriteLock: */ XLogCtlWrite Write; /* * These values do not change after startup, although the pointed-to pages * and xlblocks values certainly do. Permission to read/write the pages * and xlblocks values depends on WALInsertLock and WALWriteLock. 
*/ char *pages; /* buffers for unwritten XLOG pages */ XLogRecPtr *xlblocks; /* 1st byte ptr-s + BLCKSZ */ Size XLogCacheByte; /* # bytes in xlog buffers */ int XLogCacheBlck; /* highest allocated xlog buffer index */ TimeLineID ThisTimeLineID; slock_t info_lck; /* locks shared LogwrtRqst/LogwrtResult */} XLogCtlData;static XLogCtlData *XLogCtl = NULL;/* * We maintain an image of pg_control in shared memory. */static ControlFileData *ControlFile = NULL;/* * Macros for managing XLogInsert state. In most cases, the calling routine * has local copies of XLogCtl->Insert and/or XLogCtl->Insert->curridx, * so these are passed as parameters instead of being fetched via XLogCtl. *//* Free space remaining in the current xlog page buffer */#define INSERT_FREESPACE(Insert) \ (BLCKSZ - ((Insert)->currpos - (char *) (Insert)->currpage))/* Construct XLogRecPtr value for current insertion point */#define INSERT_RECPTR(recptr,Insert,curridx) \ ( \ (recptr).xlogid = XLogCtl->xlblocks[curridx].xlogid, \ (recptr).xrecoff = \ XLogCtl->xlblocks[curridx].xrecoff - INSERT_FREESPACE(Insert) \ )#define PrevBufIdx(idx) \ (((idx) == 0) ? XLogCtl->XLogCacheBlck : ((idx) - 1))#define NextBufIdx(idx) \ (((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1))/* * Private, possibly out-of-date copy of shared LogwrtResult. * See discussion above. */static XLogwrtResult LogwrtResult = {{0, 0}, {0, 0}};/* * openLogFile is -1 or a kernel FD for an open log file segment. * When it's open, openLogOff is the current seek offset in the file. * openLogId/openLogSeg identify the segment. These variables are only
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -