📄 md.c
字号:
/*-------------------------------------------------------------------------
 *
 * md.c
 *	  This code manages relations that reside on magnetic disk.
 *
 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.135.2.1 2008/04/18 06:48:50 heikki Exp $
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <unistd.h>
#include <fcntl.h>
#include <sys/file.h>

#include "catalog/catalog.h"
#include "miscadmin.h"
#include "postmaster/bgwriter.h"
#include "storage/fd.h"
#include "storage/bufmgr.h"
#include "storage/smgr.h"
#include "utils/hsearch.h"
#include "utils/memutils.h"


/* interval for calling AbsorbFsyncRequests in mdsync */
#define FSYNCS_PER_ABSORB		10

/* special values for the segno arg to RememberFsyncRequest */
#define FORGET_RELATION_FSYNC	(InvalidBlockNumber)
#define FORGET_DATABASE_FSYNC	(InvalidBlockNumber-1)
#define UNLINK_RELATION_REQUEST (InvalidBlockNumber-2)

/*
 * On Windows, we have to interpret EACCES as possibly meaning the same as
 * ENOENT, because if a file is unlinked-but-not-yet-gone on that platform,
 * that's what you get.  Ugh.  This code is designed so that we don't
 * actually believe these cases are okay without further evidence (namely,
 * a pending fsync request getting revoked ... see mdsync).
 */
#ifndef WIN32
#define FILE_POSSIBLY_DELETED(err)	((err) == ENOENT)
#else
#define FILE_POSSIBLY_DELETED(err)	((err) == ENOENT || (err) == EACCES)
#endif

/*
 *	The magnetic disk storage manager keeps track of open file
 *	descriptors in its own descriptor pool.  This is done to make it
 *	easier to support relations that are larger than the operating
 *	system's file size limit (often 2GBytes).  In order to do that,
 *	we break relations up into "segment" files that are each shorter than
 *	the OS file size limit.  The segment size is set by the RELSEG_SIZE
 *	configuration constant in pg_config_manual.h.
 *
 *	On disk, a relation must consist of consecutively numbered segment
 *	files in the pattern
 *		-- Zero or more full segments of exactly RELSEG_SIZE blocks each
 *		-- Exactly one partial segment of size 0 <= size < RELSEG_SIZE blocks
 *		-- Optionally, any number of inactive segments of size 0 blocks.
 *	The full and partial segments are collectively the "active" segments.
 *	Inactive segments are those that once contained data but are currently
 *	not needed because of an mdtruncate() operation.  The reason for leaving
 *	them present at size zero, rather than unlinking them, is that other
 *	backends and/or the bgwriter might be holding open file references to
 *	such segments.	If the relation expands again after mdtruncate(), such
 *	that a deactivated segment becomes active again, it is important that
 *	such file references still be valid --- else data might get written
 *	out to an unlinked old copy of a segment file that will eventually
 *	disappear.
 *
 *	The file descriptor pointer (md_fd field) stored in the SMgrRelation
 *	cache is, therefore, just the head of a list of MdfdVec objects, one
 *	per segment.  But note the md_fd pointer can be NULL, indicating
 *	relation not open.
 *
 *	Also note that mdfd_chain == NULL does not necessarily mean the relation
 *	doesn't have another segment after this one; we may just not have
 *	opened the next segment yet.  (We could not have "all segments are
 *	in the chain" as an invariant anyway, since another backend could
 *	extend the relation when we weren't looking.)  We do not make chain
 *	entries for inactive segments, however; as soon as we find a partial
 *	segment, we assume that any subsequent segments are inactive.
 *
 *	All MdfdVec objects are palloc'd in the MdCxt memory context.
 *
 *	Defining LET_OS_MANAGE_FILESIZE disables the segmentation logic,
 *	for use on machines that support large files.  Beware that that
 *	code has not been tested in a long time and is probably bit-rotted.
 */

typedef struct _MdfdVec
{
	File		mdfd_vfd;		/* fd number in fd.c's pool */
	BlockNumber mdfd_segno;		/* segment number, from 0 */
#ifndef LET_OS_MANAGE_FILESIZE	/* for large relations */
	struct _MdfdVec *mdfd_chain;	/* next segment, or NULL */
#endif
} MdfdVec;

static MemoryContext MdCxt;		/* context for all md.c allocations */


/*
 * In some contexts (currently, standalone backends and the bgwriter process)
 * we keep track of pending fsync operations: we need to remember all relation
 * segments that have been written since the last checkpoint, so that we can
 * fsync them down to disk before completing the next checkpoint.  This hash
 * table remembers the pending operations.	We use a hash table mostly as
 * a convenient way of eliminating duplicate requests.
 *
 * We use a similar mechanism to remember no-longer-needed files that can
 * be deleted after the next checkpoint, but we use a linked list instead of
 * a hash table, because we don't expect there to be any duplicate requests.
 *
 * (Regular backends do not track pending operations locally, but forward
 * them to the bgwriter.)
 */
typedef struct
{
	RelFileNode rnode;			/* the targeted relation */
	BlockNumber segno;			/* which segment */
} PendingOperationTag;

typedef uint16 CycleCtr;		/* can be any convenient integer size */

typedef struct
{
	PendingOperationTag tag;	/* hash table key (must be first!) */
	bool		canceled;		/* T => request canceled, not yet removed */
	CycleCtr	cycle_ctr;		/* mdsync_cycle_ctr when request was made */
} PendingOperationEntry;

typedef struct
{
	RelFileNode rnode;			/* the dead relation to delete */
	CycleCtr	cycle_ctr;		/* mdckpt_cycle_ctr when request was made */
} PendingUnlinkEntry;

static HTAB *pendingOpsTable = NULL;
static List *pendingUnlinks = NIL;

static CycleCtr mdsync_cycle_ctr = 0;
static CycleCtr mdckpt_cycle_ctr = 0;


typedef enum					/* behavior for mdopen & _mdfd_getseg */
{
	EXTENSION_FAIL,				/* ereport if segment not present */
	EXTENSION_RETURN_NULL,		/* return NULL if not present */
	EXTENSION_CREATE			/* create new segments as needed */
} ExtensionBehavior;

/* local routines */
static MdfdVec *mdopen(SMgrRelation reln, ExtensionBehavior behavior);
static void register_dirty_segment(SMgrRelation reln, MdfdVec *seg);
static void register_unlink(RelFileNode rnode);
static MdfdVec *_fdvec_alloc(void);

#ifndef LET_OS_MANAGE_FILESIZE
static MdfdVec *_mdfd_openseg(SMgrRelation reln, BlockNumber segno,
			  int oflags);
#endif
static MdfdVec *_mdfd_getseg(SMgrRelation reln, BlockNumber blkno,
			 bool isTemp, ExtensionBehavior behavior);
static BlockNumber _mdnblocks(SMgrRelation reln, MdfdVec *seg);


/*
 *	mdinit() -- Initialize private state for magnetic disk storage manager.
 */
void
mdinit(void)
{
	MdCxt = AllocSetContextCreate(TopMemoryContext,
								  "MdSmgr",
								  ALLOCSET_DEFAULT_MINSIZE,
								  ALLOCSET_DEFAULT_INITSIZE,
								  ALLOCSET_DEFAULT_MAXSIZE);

	/*
	 * Create pending-operations hashtable if we need it.  Currently, we need
	 * it if we are standalone (not under a postmaster) OR if we are a
	 * bootstrap-mode subprocess of a postmaster (that is, a startup or
	 * bgwriter process).
	 */
	if (!IsUnderPostmaster || IsBootstrapProcessingMode())
	{
		HASHCTL		hash_ctl;

		MemSet(&hash_ctl, 0, sizeof(hash_ctl));
		hash_ctl.keysize = sizeof(PendingOperationTag);
		hash_ctl.entrysize = sizeof(PendingOperationEntry);
		hash_ctl.hash = tag_hash;
		hash_ctl.hcxt = MdCxt;
		pendingOpsTable = hash_create("Pending Ops Table",
									  100L,
									  &hash_ctl,
								   HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
		pendingUnlinks = NIL;
	}
}

/*
 *	mdcreate() -- Create a new relation on magnetic disk.
 *
 * If isRedo is true, it's okay for the relation to exist already.
 */
void
mdcreate(SMgrRelation reln, bool isRedo)
{
	char	   *path;
	File		fd;

	if (isRedo && reln->md_fd != NULL)
		return;					/* created and opened already... */

	Assert(reln->md_fd == NULL);

	path = relpath(reln->smgr_rnode);

	fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);

	if (fd < 0)
	{
		int			save_errno = errno;

		/*
		 * During bootstrap, there are cases where a system relation will be
		 * accessed (by internal backend processes) before the bootstrap
		 * script nominally creates it.  Therefore, allow the file to exist
		 * already, even if isRedo is not set.	(See also mdopen)
		 */
		if (isRedo || IsBootstrapProcessingMode())
			fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
		if (fd < 0)
		{
			pfree(path);
			/* be sure to report the error reported by create, not open */
			errno = save_errno;
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not create relation %u/%u/%u: %m",
							reln->smgr_rnode.spcNode,
							reln->smgr_rnode.dbNode,
							reln->smgr_rnode.relNode)));
		}
	}

	pfree(path);

	reln->md_fd = _fdvec_alloc();

	reln->md_fd->mdfd_vfd = fd;
	reln->md_fd->mdfd_segno = 0;
#ifndef LET_OS_MANAGE_FILESIZE
	reln->md_fd->mdfd_chain = NULL;
#endif
}

/*
 *	mdunlink() -- Unlink a relation.
 *
 * Note that we're passed a RelFileNode --- by the time this is called,
 * there won't be an SMgrRelation hashtable entry anymore.
 *
 * Actually, we don't unlink the first segment file of the relation, but
 * just truncate it to zero length, and record a request to unlink it after
 * the next checkpoint.  Additional segments can be unlinked immediately,
 * however.  Leaving the empty file in place prevents that relfilenode
 * number from being reused.  The scenario this protects us from is:
 * 1. We delete a relation (and commit, and actually remove its file).
 * 2. We create a new relation, which by chance gets the same relfilenode as
 *	  the just-deleted one (OIDs must've wrapped around for that to happen).
 * 3. We crash before another checkpoint occurs.
 * During replay, we would delete the file and then recreate it, which is fine
 * if the contents of the file were repopulated by subsequent WAL entries.
 * But if we didn't WAL-log insertions, but instead relied on fsyncing the
 * file after populating it (as for instance CLUSTER and CREATE INDEX do),
 * the contents of the file would be lost forever.	By leaving the empty file
 * until after the next checkpoint, we prevent reassignment of the relfilenode
 * number until it's safe, because relfilenode assignment skips over any
 * existing file.
 *
 * If isRedo is true, it's okay for the relation to be already gone.
 * Also, we should remove the file immediately instead of queuing a request
 * for later, since during redo there's no possibility of creating a
 * conflicting relation.
 *
 * Note: any failure should be reported as WARNING not ERROR, because
 * we are usually not in a transaction anymore when this is called.
 */
void
mdunlink(RelFileNode rnode, bool isRedo)
{
	char	   *path;
	int			ret;

	/*
	 * We have to clean out any pending fsync requests for the doomed
	 * relation, else the next mdsync() will fail.
	 */
	ForgetRelationFsyncRequests(rnode);

	path = relpath(rnode);

	/*
	 * Delete or truncate the first segment, or only segment if not doing
	 * segmenting
	 */
	if (isRedo)
		ret = unlink(path);
	else
	{
		/* truncate(2) would be easier here, but Windows hasn't got it */
		int			fd;

		fd = BasicOpenFile(path, O_RDWR | PG_BINARY, 0);
		if (fd >= 0)
		{
			int			save_errno;

			ret = ftruncate(fd, 0);
			/* preserve ftruncate's errno across close() for the report below */
			save_errno = errno;
			close(fd);
			errno = save_errno;
		}
		else
			ret = -1;
	}
	if (ret < 0)
	{
		if (!isRedo || errno != ENOENT)
			ereport(WARNING,
					(errcode_for_file_access(),
					 errmsg("could not remove relation %u/%u/%u: %m",
							rnode.spcNode,
							rnode.dbNode,
							rnode.relNode)));
	}
#ifndef LET_OS_MANAGE_FILESIZE
	/* Delete the additional segments, if any */
	else
	{
		/*
		 * The extra 12 bytes hold the '.' separator, the decimal segment
		 * number (at most 10 digits, assuming a 32-bit unsigned segno), and
		 * the terminating NUL.
		 */
		char	   *segpath = (char *) palloc(strlen(path) + 12);
		BlockNumber segno;

		/*
		 * Note that because we loop until getting ENOENT, we will correctly
		 * remove all inactive segments as well as active ones.
		 */
		for (segno = 1;; segno++)
		{
			sprintf(segpath, "%s.%u", path, segno);
			if (unlink(segpath) < 0)
			{
				/* ENOENT is expected after the last segment... */
				if (errno != ENOENT)
					ereport(WARNING,
							(errcode_for_file_access(),
							 errmsg("could not remove segment %u of relation %u/%u/%u: %m",
									segno,
									rnode.spcNode,
									rnode.dbNode,
									rnode.relNode)));
				break;
			}
		}
		pfree(segpath);
	}
#endif

	pfree(path);

	/* Register request to unlink first segment later */
	if (!isRedo)
		register_unlink(rnode);
}

/*
 *	mdextend() -- Add a block to the specified relation.
 *
 *		The semantics are nearly the same as mdwrite(): write at the
 *		specified position.  However, this is to be used for the case of
 *		extending a relation (i.e., blocknum is at or beyond the current
 *		EOF).  Note that we assume writing a block beyond current EOF
 *		causes intervening file space to become filled with zeroes.
 */
void
mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
{
	long		seekpos;
	int			nbytes;
	MdfdVec    *v;

	/* This assert is too expensive to have on normally ... */
#ifdef CHECK_WRITE_VS_EXTEND
	Assert(blocknum >= mdnblocks(reln));
#endif

	/*
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -