📄 md.c
字号:
/*-------------------------------------------------------------------------
 *
 * md.c
 *	  This code manages relations that reside on magnetic disk.
 *
 * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.118 2005/10/15 02:49:26 momjian Exp $
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <errno.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/file.h>

#include "catalog/catalog.h"
#include "miscadmin.h"
#include "postmaster/bgwriter.h"
#include "storage/fd.h"
#include "storage/smgr.h"
#include "utils/hsearch.h"
#include "utils/memutils.h"


/*
 *	The magnetic disk storage manager keeps track of open file
 *	descriptors in its own descriptor pool.  This is done to make it
 *	easier to support relations that are larger than the operating
 *	system's file size limit (often 2GBytes).  In order to do that,
 *	we break relations up into chunks of < 2GBytes and store one chunk
 *	in each of several files that represent the relation.  See the
 *	BLCKSZ and RELSEG_SIZE configuration constants in pg_config_manual.h.
 *	All chunks except the last MUST have size exactly equal to RELSEG_SIZE
 *	blocks --- see mdnblocks() and mdtruncate().
 *
 *	The file descriptor pointer (md_fd field) stored in the SMgrRelation
 *	cache is, therefore, just the head of a list of MdfdVec objects.
 *	But note the md_fd pointer can be NULL, indicating relation not open.
 *
 *	Note that mdfd_chain == NULL does not necessarily mean the relation
 *	doesn't have another segment after this one; we may just not have
 *	opened the next segment yet.  (We could not have "all segments are
 *	in the chain" as an invariant anyway, since another backend could
 *	extend the relation when we weren't looking.)
 *
 *	All MdfdVec objects are palloc'd in the MdCxt memory context.
 */

typedef struct _MdfdVec
{
	File		mdfd_vfd;		/* fd number in fd.c's pool */
	BlockNumber mdfd_segno;		/* segment number, from 0 */
#ifndef LET_OS_MANAGE_FILESIZE	/* for large relations */
	struct _MdfdVec *mdfd_chain;	/* next segment, or NULL */
#endif
} MdfdVec;

static MemoryContext MdCxt;		/* context for all md.c allocations */


/*
 * In some contexts (currently, standalone backends and the bgwriter process)
 * we keep track of pending fsync operations: we need to remember all relation
 * segments that have been written since the last checkpoint, so that we can
 * fsync them down to disk before completing the next checkpoint.  This hash
 * table remembers the pending operations.	We use a hash table not because
 * we want to look up individual operations, but simply as a convenient way
 * of eliminating duplicate requests.
 *
 * (Regular backends do not track pending operations locally, but forward
 * them to the bgwriter.)
 *
 * XXX for WIN32, may want to expand this to track pending deletes, too.
 */
typedef struct
{
	RelFileNode rnode;			/* the targeted relation */
	BlockNumber segno;			/* which segment */
} PendingOperationEntry;

static HTAB *pendingOpsTable = NULL;


/* local routines */
static MdfdVec *mdopen(SMgrRelation reln, bool allowNotFound);
static bool register_dirty_segment(SMgrRelation reln, MdfdVec *seg);
static MdfdVec *_fdvec_alloc(void);

#ifndef LET_OS_MANAGE_FILESIZE
static MdfdVec *_mdfd_openseg(SMgrRelation reln, BlockNumber segno,
			  int oflags);
#endif
static MdfdVec *_mdfd_getseg(SMgrRelation reln, BlockNumber blkno,
			 bool allowNotFound);
static BlockNumber _mdnblocks(File file, Size blcksz);


/*
 *	mdinit() -- Initialize private state for magnetic disk storage manager.
 *
 * Creates the MdCxt memory context and, when this process needs one, the
 * pending-operations hash table.  Always returns true.
 */
bool
mdinit(void)
{
	MdCxt = AllocSetContextCreate(TopMemoryContext,
								  "MdSmgr",
								  ALLOCSET_DEFAULT_MINSIZE,
								  ALLOCSET_DEFAULT_INITSIZE,
								  ALLOCSET_DEFAULT_MAXSIZE);

	/*
	 * Create pending-operations hashtable if we need it.  Currently, we need
	 * it if we are standalone (not under a postmaster) OR if we are a
	 * bootstrap-mode subprocess of a postmaster (that is, a startup or
	 * bgwriter process).
	 */
	if (!IsUnderPostmaster || IsBootstrapProcessingMode())
	{
		HASHCTL		hash_ctl;

		MemSet(&hash_ctl, 0, sizeof(hash_ctl));
		/* the whole entry is the key: the table only de-duplicates requests */
		hash_ctl.keysize = sizeof(PendingOperationEntry);
		hash_ctl.entrysize = sizeof(PendingOperationEntry);
		hash_ctl.hash = tag_hash;
		hash_ctl.hcxt = MdCxt;
		pendingOpsTable = hash_create("Pending Ops Table",
									  100L,
									  &hash_ctl,
								   HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
	}

	return true;
}

/*
 *	mdcreate() -- Create a new relation on magnetic disk.
 *
 * If isRedo is true, it's okay for the relation to exist already.
 *
 * Returns true on success, leaving reln->md_fd pointing at a freshly
 * allocated MdfdVec for segment 0.  Returns false on failure, with errno
 * set to the error reported by the create attempt.
 */
bool
mdcreate(SMgrRelation reln, bool isRedo)
{
	char	   *path;
	File		fd;

	if (isRedo && reln->md_fd != NULL)
		return true;			/* created and opened already... */

	Assert(reln->md_fd == NULL);

	path = relpath(reln->smgr_rnode);

	fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);

	if (fd < 0)
	{
		int			save_errno = errno;

		/*
		 * During bootstrap, there are cases where a system relation will be
		 * accessed (by internal backend processes) before the bootstrap
		 * script nominally creates it.  Therefore, allow the file to exist
		 * already, even if isRedo is not set.	(See also mdopen)
		 */
		if (isRedo || IsBootstrapProcessingMode())
			fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
		if (fd < 0)
		{
			pfree(path);
			/* be sure to return the error reported by create, not open */
			errno = save_errno;
			return false;
		}
		errno = 0;
	}

	pfree(path);

	reln->md_fd = _fdvec_alloc();

	reln->md_fd->mdfd_vfd = fd;
	reln->md_fd->mdfd_segno = 0;
#ifndef LET_OS_MANAGE_FILESIZE
	reln->md_fd->mdfd_chain = NULL;
#endif

	return true;
}

/*
 *	mdunlink() -- Unlink a relation.
 *
 * Note that we're passed a RelFileNode --- by the time this is called,
 * there won't be an SMgrRelation hashtable entry anymore.
 *
 * If isRedo is true, it's okay for the relation to be already gone.
 *
 * Returns true on success.  On failure returns false with errno set to the
 * error from the unlink() call that failed; on success errno is set to 0.
 */
bool
mdunlink(RelFileNode rnode, bool isRedo)
{
	bool		status = true;
	int			save_errno = 0;
	char	   *path;

	path = relpath(rnode);

	/* Delete the first segment, or only segment if not doing segmenting */
	if (unlink(path) < 0)
	{
		if (!isRedo || errno != ENOENT)
		{
			status = false;
			save_errno = errno;
		}
	}

#ifndef LET_OS_MANAGE_FILESIZE
	/* Get the additional segments, if any */
	if (status)
	{
		/*
		 * The 12 extra bytes cover the '.', a segment number of up to 10
		 * decimal digits, and the terminating NUL.
		 */
		Size		seglen = strlen(path) + 12;
		char	   *segpath = (char *) palloc(seglen);
		BlockNumber segno;

		for (segno = 1;; segno++)
		{
			/* bounded write: never trust the hand-computed size blindly */
			snprintf(segpath, seglen, "%s.%u", path, segno);
			if (unlink(segpath) < 0)
			{
				/* ENOENT is expected after the last segment... */
				if (errno != ENOENT)
				{
					status = false;
					save_errno = errno;
				}
				break;
			}
		}
		pfree(segpath);
	}
#endif

	pfree(path);

	/* report the error of the failing unlink, not of anything done since */
	errno = save_errno;
	return status;
}

/*
 *	mdextend() -- Add a block to the specified relation.
 *
 *		The semantics are basically the same as mdwrite(): write at the
 *		specified position.  However, we are expecting to extend the
 *		relation (ie, blocknum is the current EOF), and so in case of
 *		failure we clean up by truncating.
 *
 *		This routine returns true or false, with errno set as appropriate.
 *
 * Note: this routine used to call mdnblocks() to get the block position
 * to write at, but that's pretty silly since the caller needs to know where
 * the block will be written, and accordingly must have done mdnblocks()
 * already.  Might as well pass in the position and save a seek.
 */
bool
mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
{
	long		seekpos;
	int			nbytes;
	MdfdVec    *v;

	v = _mdfd_getseg(reln, blocknum, false);

#ifndef LET_OS_MANAGE_FILESIZE
	/* byte offset of the block within its own segment file */
	seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
	Assert(seekpos < BLCKSZ * RELSEG_SIZE);
#else
	/*
	 * NOTE(review): the product is formed before the cast to long.  If
	 * BlockNumber is a 32-bit unsigned type (declared outside this file ---
	 * confirm), the multiplication wraps for offsets of 4GB and up.  Any fix
	 * must be applied to every seek computation in this file at once.
	 */
	seekpos = (long) (BLCKSZ * (blocknum));
#endif

	/*
	 * Note: because caller obtained blocknum by calling _mdnblocks, which did
	 * a seek(SEEK_END), this seek is often redundant and will be optimized
	 * away by fd.c.  It's not redundant, however, if there is a partial page
	 * at the end of the file. In that case we want to try to overwrite the
	 * partial page with a full page.  It's also not redundant if bufmgr.c had
	 * to dump another buffer of the same file to make room for the new page's
	 * buffer.
	 */
	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
		return false;

	if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
	{
		if (nbytes > 0)
		{
			int			save_errno = errno;

			/* Remove the partially-written page */
			FileTruncate(v->mdfd_vfd, seekpos);
			FileSeek(v->mdfd_vfd, seekpos, SEEK_SET);
			/* the caller must see the write error, not the cleanup's */
			errno = save_errno;
		}
		return false;
	}

	if (!isTemp)
	{
		if (!register_dirty_segment(reln, v))
			return false;
	}

#ifndef LET_OS_MANAGE_FILESIZE
	Assert(_mdnblocks(v->mdfd_vfd, BLCKSZ) <= ((BlockNumber) RELSEG_SIZE));
#endif

	return true;
}

/*
 *	mdopen() -- Open the specified relation.  ereport's on failure.
 *		(Optionally, can return NULL instead of ereport for ENOENT.)
 *
 * Note we only open the first segment, when there are multiple segments.
 *
 * If the relation is already open, the existing reln->md_fd is returned
 * unchanged.  When NULL is returned for a missing file, errno is ENOENT.
 */
static MdfdVec *
mdopen(SMgrRelation reln, bool allowNotFound)
{
	MdfdVec    *mdfd;
	char	   *path;
	File		fd;

	/* No work if already open */
	if (reln->md_fd)
		return reln->md_fd;

	path = relpath(reln->smgr_rnode);

	fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);

	if (fd < 0)
	{
		/*
		 * During bootstrap, there are cases where a system relation will be
		 * accessed (by internal backend processes) before the bootstrap
		 * script nominally creates it.  Therefore, accept mdopen() as a
		 * substitute for mdcreate() in bootstrap mode only. (See mdcreate)
		 */
		if (IsBootstrapProcessingMode())
			fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
		if (fd < 0)
		{
			/*
			 * Capture errno before calling anything else: both the ENOENT
			 * test and the %m in the error message below must describe the
			 * failed open, not whatever pfree() may have left behind.
			 */
			int			save_errno = errno;

			pfree(path);
			errno = save_errno;

			if (allowNotFound && save_errno == ENOENT)
				return NULL;
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not open relation %u/%u/%u: %m",
							reln->smgr_rnode.spcNode,
							reln->smgr_rnode.dbNode,
							reln->smgr_rnode.relNode)));
		}
	}

	pfree(path);

	reln->md_fd = mdfd = _fdvec_alloc();

	mdfd->mdfd_vfd = fd;
	mdfd->mdfd_segno = 0;
#ifndef LET_OS_MANAGE_FILESIZE
	mdfd->mdfd_chain = NULL;
	Assert(_mdnblocks(fd, BLCKSZ) <= ((BlockNumber) RELSEG_SIZE));
#endif

	return mdfd;
}

/*
 *	mdclose() -- Close the specified relation, if it isn't closed already.
 *
 * Closes every open segment file reachable from reln->md_fd and frees the
 * MdfdVec objects.  As written, this always returns true.
 */
bool
mdclose(SMgrRelation reln)
{
	MdfdVec    *v = reln->md_fd;

	/* No work if already closed */
	if (v == NULL)
		return true;

	reln->md_fd = NULL;			/* prevent dangling pointer after error */

#ifndef LET_OS_MANAGE_FILESIZE
	while (v != NULL)
	{
		MdfdVec    *ov = v;

		/* if not closed already */
		if (v->mdfd_vfd >= 0)
			FileClose(v->mdfd_vfd);
		/* Now free vector; advance first, since ov is about to be freed */
		v = v->mdfd_chain;
		pfree(ov);
	}
#else
	if (v->mdfd_vfd >= 0)
		FileClose(v->mdfd_vfd);
	pfree(v);
#endif

	return true;
}

/*
 *	mdread() -- Read the specified block from a relation.
 *
 * Returns true with BLCKSZ bytes stored in buffer (all zeroes when the
 * block lies at or past EOF, or is a partial block at EOF).  Returns false
 * if the seek fails or the read fails for any other reason.
 */
bool
mdread(SMgrRelation reln, BlockNumber blocknum, char *buffer)
{
	bool		status;
	long		seekpos;
	int			nbytes;
	MdfdVec    *v;

	v = _mdfd_getseg(reln, blocknum, false);

#ifndef LET_OS_MANAGE_FILESIZE
	/* byte offset of the block within its own segment file */
	seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
	Assert(seekpos < BLCKSZ * RELSEG_SIZE);
#else
	/*
	 * NOTE(review): the product is formed before the cast to long.  If
	 * BlockNumber is a 32-bit unsigned type (declared outside this file ---
	 * confirm), the multiplication wraps for offsets of 4GB and up.  Any fix
	 * must be applied to every seek computation in this file at once.
	 */
	seekpos = (long) (BLCKSZ * (blocknum));
#endif

	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
		return false;

	status = true;
	if ((nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
	{
		/*
		 * If we are at or past EOF, return zeroes without complaining. Also
		 * substitute zeroes if we found a partial block at EOF.
		 *
		 * XXX this is really ugly, bad design.  However the current
		 * implementation of hash indexes requires it, because hash index
		 * pages are initialized out-of-order.
		 */
		if (nbytes == 0 ||
			(nbytes > 0 && mdnblocks(reln) == blocknum))
			MemSet(buffer, 0, BLCKSZ);
		else
			status = false;
	}

	return status;
}

/*
 *	mdwrite() -- Write the supplied block at the appropriate location.
 */
bool
mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
{
	long		seekpos;
	MdfdVec    *v;

	v = _mdfd_getseg(reln, blocknum, false);

#ifndef LET_OS_MANAGE_FILESIZE
	seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
	Assert(seekpos < BLCKSZ * RELSEG_SIZE);
#else
	seekpos = (long) (BLCKSZ * (blocknum));
#endif

	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
		return false;

	if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ)
		return false;

	if (!isTemp)
	{
		if (!register_dirty_segment(reln, v))
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -