📄 md.c
字号:
/*-------------------------------------------------------------------------
 *
 * md.c
 *	  This code manages relations that reside on magnetic disk.
 *
 * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $Header: /cvsroot/pgsql/src/backend/storage/smgr/md.c,v 1.98 2003/08/04 02:40:04 momjian Exp $
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <errno.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/file.h>

#include "catalog/catalog.h"
#include "miscadmin.h"
#include "storage/smgr.h"
#include "utils/inval.h"
#include "utils/memutils.h"


#undef DIAGNOSTIC

/*
 *	The magnetic disk storage manager keeps track of open file descriptors
 *	in its own descriptor pool.  This happens for two reasons.  First, at
 *	transaction boundaries, we walk the list of descriptors and flush
 *	anything that we've dirtied in the current transaction.  Second, we want
 *	to support relations larger than the OS' file size limit (often 2GBytes).
 *	In order to do that, we break relations up into chunks of < 2GBytes
 *	and store one chunk in each of several files that represent the relation.
 *	See the BLCKSZ and RELSEG_SIZE configuration constants in include/pg_config.h.
 *
 *	The file descriptor stored in the relation cache (see RelationGetFile())
 *	is actually an index into the Md_fdvec array.  -1 indicates not open.
 *
 *	When a relation is broken into multiple chunks, only the first chunk
 *	has its own entry in the Md_fdvec array; the remaining chunks have
 *	palloc'd MdfdVec objects that are chained onto the first chunk via the
 *	mdfd_chain links.  All chunks except the last MUST have size exactly
 *	equal to RELSEG_SIZE blocks --- see mdnblocks() and mdtruncate().
 */

/*
 * One entry of the descriptor pool.  An entry is either in use (it then
 * describes one open segment file) or linked on the free list headed by
 * Md_Free.
 */
typedef struct _MdfdVec
{
	int			mdfd_vfd;		/* fd number in vfd pool */
	int			mdfd_flags;		/* fd status flags */

/* these are the assigned bits in mdfd_flags: */
#define MDFD_FREE		(1 << 0)	/* unused entry */

	int			mdfd_nextFree;	/* link to next freelist member, if free;
								 * -1 terminates the free list */
#ifndef LET_OS_MANAGE_FILESIZE
	struct _MdfdVec *mdfd_chain;	/* for large relations: next segment of
									 * the same relation, or NULL */
#endif
} MdfdVec;

static int	Nfds = 100;			/* initial/current size of Md_fdvec array */
static MdfdVec *Md_fdvec = (MdfdVec *) NULL;	/* the pool; allocated by
												 * mdinit() */
static int	Md_Free = -1;		/* head of freelist of unused fdvec
								 * entries */
static int	CurFd = 0;			/* first never-used fdvec index */
static MemoryContext MdCxt;		/* context for all my allocations */

/* routines declared here */
static void mdclose_fd(int fd);
static int	_mdfd_getrelnfd(Relation reln);
static MdfdVec *_mdfd_openseg(Relation reln, BlockNumber segno, int oflags);
static MdfdVec *_mdfd_getseg(Relation reln, BlockNumber blkno);
static int	_mdfd_blind_getseg(RelFileNode rnode, BlockNumber blkno);
static int	_fdvec_alloc(void);
static void _fdvec_free(int);
static BlockNumber _mdnblocks(File file, Size blcksz);

/*
 *	mdinit() -- Initialize private state for magnetic disk storage manager.
 *
 *		We keep a private table of all file descriptors.  Whenever we do
 *		a write to one, we mark it dirty in our table.  Whenever we force
 *		changes to disk, we mark the file descriptor clean.  At transaction
 *		commit, we force changes to disk for all dirty file descriptors.
 *		This routine allocates and initializes the table.
 *
 *		NOTE(review): MDFD_FREE is the only mdfd_flags bit defined above, so
 *		the dirty/clean marking described here may be stale -- confirm
 *		against the rest of the file.
 *
 *		Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
*/intmdinit(void){ int i; MdCxt = AllocSetContextCreate(TopMemoryContext, "MdSmgr", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); Md_fdvec = (MdfdVec *) MemoryContextAlloc(MdCxt, Nfds * sizeof(MdfdVec)); MemSet(Md_fdvec, 0, Nfds * sizeof(MdfdVec)); /* Set free list */ for (i = 0; i < Nfds; i++) { Md_fdvec[i].mdfd_nextFree = i + 1; Md_fdvec[i].mdfd_flags = MDFD_FREE; } Md_Free = 0; Md_fdvec[Nfds - 1].mdfd_nextFree = -1; return SM_SUCCESS;}intmdcreate(Relation reln){ char *path; int fd, vfd; Assert(reln->rd_fd < 0); path = relpath(reln->rd_node); fd = FileNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600); if (fd < 0) { int save_errno = errno; /* * During bootstrap, there are cases where a system relation will * be accessed (by internal backend processes) before the * bootstrap script nominally creates it. Therefore, allow the * file to exist already, but in bootstrap mode only. (See also * mdopen) */ if (IsBootstrapProcessingMode()) fd = FileNameOpenFile(path, O_RDWR | PG_BINARY, 0600); if (fd < 0) { pfree(path); /* be sure to return the error reported by create, not open */ errno = save_errno; return -1; } errno = 0; } pfree(path); vfd = _fdvec_alloc(); if (vfd < 0) return -1; Md_fdvec[vfd].mdfd_vfd = fd; Md_fdvec[vfd].mdfd_flags = (uint16) 0;#ifndef LET_OS_MANAGE_FILESIZE Md_fdvec[vfd].mdfd_chain = (MdfdVec *) NULL;#endif return vfd;}/* * mdunlink() -- Unlink a relation. 
*/intmdunlink(RelFileNode rnode){ int status = SM_SUCCESS; int save_errno = 0; char *path; path = relpath(rnode); /* Delete the first segment, or only segment if not doing segmenting */ if (unlink(path) < 0) { status = SM_FAIL; save_errno = errno; }#ifndef LET_OS_MANAGE_FILESIZE /* Get the additional segments, if any */ if (status == SM_SUCCESS) { char *segpath = (char *) palloc(strlen(path) + 12); BlockNumber segno; for (segno = 1;; segno++) { sprintf(segpath, "%s.%u", path, segno); if (unlink(segpath) < 0) { /* ENOENT is expected after the last segment... */ if (errno != ENOENT) { status = SM_FAIL; save_errno = errno; } break; } } pfree(segpath); }#endif pfree(path); errno = save_errno; return status;}/* * mdextend() -- Add a block to the specified relation. * * The semantics are basically the same as mdwrite(): write at the * specified position. However, we are expecting to extend the * relation (ie, blocknum is the current EOF), and so in case of * failure we clean up by truncating. * * This routine returns SM_FAIL or SM_SUCCESS, with errno set as * appropriate. * * Note: this routine used to call mdnblocks() to get the block position * to write at, but that's pretty silly since the caller needs to know where * the block will be written, and accordingly must have done mdnblocks() * already. Might as well pass in the position and save a seek. */intmdextend(Relation reln, BlockNumber blocknum, char *buffer){ long seekpos; int nbytes; MdfdVec *v; v = _mdfd_getseg(reln, blocknum);#ifndef LET_OS_MANAGE_FILESIZE seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));#ifdef DIAGNOSTIC if (seekpos >= BLCKSZ * RELSEG_SIZE) elog(FATAL, "seekpos too big");#endif#else seekpos = (long) (BLCKSZ * (blocknum));#endif /* * Note: because caller obtained blocknum by calling mdnblocks, which * did a seek(SEEK_END), this seek is often redundant and will be * optimized away by fd.c. It's not redundant, however, if there is a * partial page at the end of the file. 
In that case we want to try * to overwrite the partial page with a full page. It's also not * redundant if bufmgr.c had to dump another buffer of the same file * to make room for the new page's buffer. */ if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos) return SM_FAIL; if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ) { if (nbytes > 0) { int save_errno = errno; /* Remove the partially-written page */ FileTruncate(v->mdfd_vfd, seekpos); FileSeek(v->mdfd_vfd, seekpos, SEEK_SET); errno = save_errno; } return SM_FAIL; }#ifndef LET_OS_MANAGE_FILESIZE#ifdef DIAGNOSTIC if (_mdnblocks(v->mdfd_vfd, BLCKSZ) > ((BlockNumber) RELSEG_SIZE)) elog(FATAL, "segment too big");#endif#endif return SM_SUCCESS;}/* * mdopen() -- Open the specified relation. */intmdopen(Relation reln){ char *path; int fd; int vfd; Assert(reln->rd_fd < 0); path = relpath(reln->rd_node); fd = FileNameOpenFile(path, O_RDWR | PG_BINARY, 0600); if (fd < 0) { /* * During bootstrap, there are cases where a system relation will * be accessed (by internal backend processes) before the * bootstrap script nominally creates it. Therefore, accept * mdopen() as a substitute for mdcreate() in bootstrap mode only. * (See mdcreate) */ if (IsBootstrapProcessingMode()) fd = FileNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600); if (fd < 0) { pfree(path); return -1; } } pfree(path); vfd = _fdvec_alloc(); if (vfd < 0) return -1; Md_fdvec[vfd].mdfd_vfd = fd; Md_fdvec[vfd].mdfd_flags = (uint16) 0;#ifndef LET_OS_MANAGE_FILESIZE Md_fdvec[vfd].mdfd_chain = (MdfdVec *) NULL;#ifdef DIAGNOSTIC if (_mdnblocks(fd, BLCKSZ) > ((BlockNumber) RELSEG_SIZE)) elog(FATAL, "segment too big");#endif#endif return vfd;}/* * mdclose() -- Close the specified relation, if it isn't closed already. * * AND FREE fd vector! It may be re-used for other relation! * reln should be flushed from cache after closing !.. * * Returns SM_SUCCESS or SM_FAIL with errno set as appropriate. 
*/intmdclose(Relation reln){ int fd; fd = RelationGetFile(reln); if (fd < 0) return SM_SUCCESS; /* already closed, so no work */ mdclose_fd(fd); reln->rd_fd = -1; return SM_SUCCESS;}static voidmdclose_fd(int fd){ MdfdVec *v;#ifndef LET_OS_MANAGE_FILESIZE for (v = &Md_fdvec[fd]; v != (MdfdVec *) NULL;) { MdfdVec *ov = v; /* if not closed already */ if (v->mdfd_vfd >= 0) FileClose(v->mdfd_vfd); /* Now free vector */ v = v->mdfd_chain; if (ov != &Md_fdvec[fd]) pfree(ov); } Md_fdvec[fd].mdfd_chain = (MdfdVec *) NULL;#else v = &Md_fdvec[fd]; if (v != (MdfdVec *) NULL) { if (v->mdfd_vfd >= 0) FileClose(v->mdfd_vfd); }#endif _fdvec_free(fd);}/* * mdread() -- Read the specified block from a relation. * * Returns SM_SUCCESS or SM_FAIL. */intmdread(Relation reln, BlockNumber blocknum, char *buffer){ int status; long seekpos; int nbytes; MdfdVec *v; v = _mdfd_getseg(reln, blocknum);#ifndef LET_OS_MANAGE_FILESIZE seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));#ifdef DIAGNOSTIC if (seekpos >= BLCKSZ * RELSEG_SIZE) elog(FATAL, "seekpos too big");#endif#else seekpos = (long) (BLCKSZ * (blocknum));#endif if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos) return SM_FAIL; status = SM_SUCCESS; if ((nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ) { /* * If we are at or past EOF, return zeroes without complaining. * Also substitute zeroes if we found a partial block at EOF. * * XXX this is really ugly, bad design. However the current * implementation of hash indexes requires it, because hash index * pages are initialized out-of-order. */ if (nbytes == 0 || (nbytes > 0 && mdnblocks(reln) == blocknum)) MemSet(buffer, 0, BLCKSZ); else status = SM_FAIL; } return status;}/* * mdwrite() -- Write the supplied block at the appropriate location. * * Returns SM_SUCCESS or SM_FAIL. 
*/intmdwrite(Relation reln, BlockNumber blocknum, char *buffer){ long seekpos; MdfdVec *v; v = _mdfd_getseg(reln, blocknum);#ifndef LET_OS_MANAGE_FILESIZE seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));#ifdef DIAGNOSTIC if (seekpos >= BLCKSZ * RELSEG_SIZE) elog(FATAL, "seekpos too big");#endif#else seekpos = (long) (BLCKSZ * (blocknum));#endif if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos) return SM_FAIL; if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ) return SM_FAIL; return SM_SUCCESS;}/* * mdblindwrt() -- Write a block to disk blind. * * We have to be able to do this using only the rnode of the relation * in which the block belongs. Otherwise this is much like mdwrite(). */intmdblindwrt(RelFileNode rnode, BlockNumber blkno, char *buffer){
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -