📄 xlog.c
字号:
* Since fsync is usually a horribly expensive operation, we try to * piggyback as much data as we can on each fsync: if we see any more data * entered into the xlog buffer, we'll write and fsync that too, so that * the final value of LogwrtResult.Flush is as large as possible. This * gives us some chance of avoiding another fsync immediately after. */ /* initialize to given target; may increase below */ WriteRqstPtr = record; /* read LogwrtResult and update local state */ { /* use volatile pointer to prevent code rearrangement */ volatile XLogCtlData *xlogctl = XLogCtl; SpinLockAcquire(&xlogctl->info_lck); if (XLByteLT(WriteRqstPtr, xlogctl->LogwrtRqst.Write)) WriteRqstPtr = xlogctl->LogwrtRqst.Write; LogwrtResult = xlogctl->LogwrtResult; SpinLockRelease(&xlogctl->info_lck); } /* done already? */ if (!XLByteLE(record, LogwrtResult.Flush)) { /* now wait for the write lock */ LWLockAcquire(WALWriteLock, LW_EXCLUSIVE); LogwrtResult = XLogCtl->Write.LogwrtResult; if (!XLByteLE(record, LogwrtResult.Flush)) { /* try to write/flush later additions to XLOG as well */ if (LWLockConditionalAcquire(WALInsertLock, LW_EXCLUSIVE)) { XLogCtlInsert *Insert = &XLogCtl->Insert; uint32 freespace = INSERT_FREESPACE(Insert); if (freespace < SizeOfXLogRecord) /* buffer is full */ WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx]; else { WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx]; WriteRqstPtr.xrecoff -= freespace; } LWLockRelease(WALInsertLock); WriteRqst.Write = WriteRqstPtr; WriteRqst.Flush = WriteRqstPtr; } else { WriteRqst.Write = WriteRqstPtr; WriteRqst.Flush = record; } XLogWrite(WriteRqst, false, false); } LWLockRelease(WALWriteLock); } END_CRIT_SECTION(); /* * If we still haven't flushed to the request point then we have a * problem; most likely, the requested flush point is past end of XLOG. * This has been seen to occur when a disk page has a corrupted LSN. 
* * Formerly we treated this as a PANIC condition, but that hurts the * system's robustness rather than helping it: we do not want to take down * the whole system due to corruption on one data page. In particular, if * the bad page is encountered again during recovery then we would be * unable to restart the database at all! (This scenario has actually * happened in the field several times with 7.1 releases. Note that we * cannot get here while InRedo is true, but if the bad page is brought in * and marked dirty during recovery then CreateCheckPoint will try to * flush it at the end of recovery.) * * The current approach is to ERROR under normal conditions, but only * WARNING during recovery, so that the system can be brought up even if * there's a corrupt LSN. Note that for calls from xact.c, the ERROR will * be promoted to PANIC since xact.c calls this routine inside a critical * section. However, calls from bufmgr.c are not within critical sections * and so we will not force a restart for a bad LSN on a data page. */ if (XLByteLT(LogwrtResult.Flush, record)) elog(InRecovery ? WARNING : ERROR, "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X", record.xlogid, record.xrecoff, LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);}/* * Flush xlog, but without specifying exactly where to flush to. * * We normally flush only completed blocks; but if there is nothing to do on * that basis, we check for unflushed async commits in the current incomplete * block, and flush through the latest one of those. Thus, if async commits * are not being used, we will flush complete blocks only. We can guarantee * that async commits reach disk after at most three cycles; normally only * one or two. (We allow XLogWrite to write "flexibly", meaning it can stop * at the end of the buffer ring; this makes a difference only with very high * load or long wal_writer_delay, but imposes one extra cycle for the worst * case for async commits.) 
* * This routine is invoked periodically by the background walwriter process. */voidXLogBackgroundFlush(void){ XLogRecPtr WriteRqstPtr; bool flexible = true; /* read LogwrtResult and update local state */ { /* use volatile pointer to prevent code rearrangement */ volatile XLogCtlData *xlogctl = XLogCtl; SpinLockAcquire(&xlogctl->info_lck); LogwrtResult = xlogctl->LogwrtResult; WriteRqstPtr = xlogctl->LogwrtRqst.Write; SpinLockRelease(&xlogctl->info_lck); } /* back off to last completed page boundary */ WriteRqstPtr.xrecoff -= WriteRqstPtr.xrecoff % XLOG_BLCKSZ; /* if we have already flushed that far, consider async commit records */ if (XLByteLE(WriteRqstPtr, LogwrtResult.Flush)) { /* use volatile pointer to prevent code rearrangement */ volatile XLogCtlData *xlogctl = XLogCtl; SpinLockAcquire(&xlogctl->info_lck); WriteRqstPtr = xlogctl->asyncCommitLSN; SpinLockRelease(&xlogctl->info_lck); flexible = false; /* ensure it all gets written */ } /* Done if already known flushed */ if (XLByteLE(WriteRqstPtr, LogwrtResult.Flush)) return;#ifdef WAL_DEBUG if (XLOG_DEBUG) elog(LOG, "xlog bg flush request %X/%X; write %X/%X; flush %X/%X", WriteRqstPtr.xlogid, WriteRqstPtr.xrecoff, LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff, LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);#endif START_CRIT_SECTION(); /* now wait for the write lock */ LWLockAcquire(WALWriteLock, LW_EXCLUSIVE); LogwrtResult = XLogCtl->Write.LogwrtResult; if (!XLByteLE(WriteRqstPtr, LogwrtResult.Flush)) { XLogwrtRqst WriteRqst; WriteRqst.Write = WriteRqstPtr; WriteRqst.Flush = WriteRqstPtr; XLogWrite(WriteRqst, flexible, false); } LWLockRelease(WALWriteLock); END_CRIT_SECTION();}/* * Flush any previous asynchronously-committed transactions' commit records. * * NOTE: it is unwise to assume that this provides any strong guarantees. * In particular, because of the inexact LSN bookkeeping used by clog.c, * we cannot assume that hint bits will be settable for these transactions. 
*/voidXLogAsyncCommitFlush(void){ XLogRecPtr WriteRqstPtr; /* use volatile pointer to prevent code rearrangement */ volatile XLogCtlData *xlogctl = XLogCtl; SpinLockAcquire(&xlogctl->info_lck); WriteRqstPtr = xlogctl->asyncCommitLSN; SpinLockRelease(&xlogctl->info_lck); XLogFlush(WriteRqstPtr);}/* * Test whether XLOG data has been flushed up to (at least) the given position. * * Returns true if a flush is still needed. (It may be that someone else * is already in process of flushing that far, however.) */boolXLogNeedsFlush(XLogRecPtr record){ /* Quick exit if already known flushed */ if (XLByteLE(record, LogwrtResult.Flush)) return false; /* read LogwrtResult and update local state */ { /* use volatile pointer to prevent code rearrangement */ volatile XLogCtlData *xlogctl = XLogCtl; SpinLockAcquire(&xlogctl->info_lck); LogwrtResult = xlogctl->LogwrtResult; SpinLockRelease(&xlogctl->info_lck); } /* check again */ if (XLByteLE(record, LogwrtResult.Flush)) return false; return true;}/* * Create a new XLOG file segment, or open a pre-existing one. * * log, seg: identify segment to be created/opened. * * *use_existent: if TRUE, OK to use a pre-existing file (else, any * pre-existing file will be deleted). On return, TRUE if a pre-existing * file was used. * * use_lock: if TRUE, acquire ControlFileLock while moving file into * place. This should be TRUE except during bootstrap log creation. The * caller must *not* hold the lock at call. * * Returns FD of opened file. * * Note: errors here are ERROR not PANIC because we might or might not be * inside a critical section (eg, during checkpoint there is no reason to * take down the system on failure). They will promote to PANIC if we are * in a critical section. 
*/
static int
XLogFileInit(uint32 log, uint32 seg,
			 bool *use_existent, bool use_lock)
{
	char		path[MAXPGPATH];
	char		tmppath[MAXPGPATH];
	char	   *zbuffer;
	uint32		installed_log;
	uint32		installed_seg;
	int			max_advance;
	int			fd;
	int			nbytes;

	XLogFilePath(path, ThisTimeLineID, log, seg);

	/*
	 * Try to use existent file (checkpoint maker may have created it already)
	 */
	if (*use_existent)
	{
		fd = BasicOpenFile(path, O_RDWR | PG_BINARY | XLOG_SYNC_BIT,
						   S_IRUSR | S_IWUSR);
		if (fd < 0)
		{
			/* a missing file just means we must create it below */
			if (errno != ENOENT)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
								path, log, seg)));
		}
		else
			return fd;
	}

	/*
	 * Initialize an empty (all zeroes) segment.  NOTE: it is possible that
	 * another process is doing the same thing.  If so, we will end up
	 * pre-creating an extra log segment.  That seems OK, and better than
	 * holding the lock throughout this lengthy process.
	 */
	elog(DEBUG2, "creating and filling new WAL file");

	snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());

	/* remove any leftover temp file so that O_EXCL creation can succeed */
	unlink(tmppath);

	/* do not use XLOG_SYNC_BIT here --- want to fsync only at end of fill */
	fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
					   S_IRUSR | S_IWUSR);
	if (fd < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not create file \"%s\": %m", tmppath)));

	/*
	 * Zero-fill the file.  We have to do this the hard way to ensure that all
	 * the file space has really been allocated --- on platforms that allow
	 * "holes" in files, just seeking to the end doesn't allocate intermediate
	 * space.  This way, we know that we have all the space and (after the
	 * fsync below) that all the indirect blocks are down on disk.  Therefore,
	 * fdatasync(2) or O_DSYNC will be sufficient to sync future writes to the
	 * log file.
	 *
	 * Note: palloc zbuffer, instead of just using a local char array, to
	 * ensure it is reasonably well-aligned; this may save a few cycles
	 * transferring data to the kernel.
	 */
	zbuffer = (char *) palloc0(XLOG_BLCKSZ);
	for (nbytes = 0; nbytes < XLogSegSize; nbytes += XLOG_BLCKSZ)
	{
		errno = 0;
		if ((int) write(fd, zbuffer, XLOG_BLCKSZ) != (int) XLOG_BLCKSZ)
		{
			int			save_errno = errno;

			/*
			 * If we fail to make the file, delete it to release disk space
			 */
			unlink(tmppath);

			/*
			 * Close the descriptor before reporting, so it is not left open
			 * when the ERROR is not promoted to PANIC.  NOTE(review): assumes
			 * descriptors returned by BasicOpenFile are not reclaimed by
			 * error cleanup --- confirm.
			 */
			close(fd);

			/* if write didn't set errno, assume problem is no disk space */
			errno = save_errno ? save_errno : ENOSPC;

			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not write to file \"%s\": %m", tmppath)));
		}
	}
	pfree(zbuffer);

	if (pg_fsync(fd) != 0)
	{
		int			save_errno = errno;

		/* same descriptor-leak concern as for a failed write above */
		close(fd);

		/* restore errno so %m reports the fsync failure, not close's result */
		errno = save_errno;

		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not fsync file \"%s\": %m", tmppath)));
	}

	if (close(fd))
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", tmppath)));

	/*
	 * Now move the segment into place with its final name.
	 *
	 * If caller didn't want to use a pre-existing file, get rid of any
	 * pre-existing file.  Otherwise, cope with possibility that someone else
	 * has created the file while we were filling ours: if so, use ours to
	 * pre-create a future log segment.
	 */
	installed_log = log;
	installed_seg = seg;
	max_advance = XLOGfileslop;
	if (!InstallXLogFileSegment(&installed_log, &installed_seg, tmppath,
								*use_existent, &max_advance,
								use_lock))
	{
		/* No need for any more future segments... */
		unlink(tmppath);
	}

	elog(DEBUG2, "done creating and filling new WAL file");

	/* Set flag to tell caller there was no existent file */
	*use_existent = false;

	/* Now open original target segment (might not be file I just made) */
	fd = BasicOpenFile(path, O_RDWR | PG_BINARY | XLOG_SYNC_BIT,
					   S_IRUSR | S_IWUSR);
	if (fd < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
						path, log, seg)));

	return fd;
}

/*
 * Create a new XLOG file segment by copying a pre-existing one.
 *
 * log, seg: identify segment to be created.
* * srcTLI, srclog, srcseg: identify segment to be copied (could be from * a different timeline) * * Currently this is only used during recovery, and so there are no locking * considerations. But we should be just as tense as XLogFileInit to avoid * emplacing a bogus file. */static voidXLogFileCopy(uint32 log, uint32 seg, TimeLineID srcTLI, uint32 srclog, uint32 srcseg){ char path[MAXPGPATH]; char tmppath[MAXPGPATH]; char buffer[XLOG_BLCKSZ]; int srcfd; int fd; int nbytes; /* * Open the source file */ XLogFilePath(path, srcTLI, srclog, srcseg); srcfd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0); if (srcfd < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", path))); /* * Copy into a temp file name. */ snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid()); unlink(tmppath); /* do not use XLOG_SYNC_BIT here --- want to fsync only at end of fill */ fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, S_IRUSR | S_IWUSR); if (fd < 0) ereport(ERROR, (errcode_for_file_access(),
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -