📄 md.c
字号:
/*------------------------------------------------------------------------- * * md.c * This code manages relations that reside on magnetic disk. * * Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION * $Header: /usr/local/cvsroot/pgsql/src/backend/storage/smgr/md.c,v 1.46.2.2 1999/09/06 20:00:15 tgl Exp $ * *------------------------------------------------------------------------- */#include <unistd.h>#include <fcntl.h>#include <sys/file.h>#include "postgres.h"#include "catalog/catalog.h"#include "miscadmin.h"#include "storage/smgr.h"#undef DIAGNOSTIC/* * The magnetic disk storage manager keeps track of open file descriptors * in its own descriptor pool. This happens for two reasons. First, at * transaction boundaries, we walk the list of descriptors and flush * anything that we've dirtied in the current transaction. Second, we want * to support relations larger than the OS' file size limit (often 2GBytes). * In order to do that, we break relations up into chunks of < 2GBytes * and store one chunk in each of several files that represent the relation. * See the BLCKSZ and RELSEG_SIZE configuration constants in include/config.h. * * The file descriptor stored in the relation cache (see RelationGetFile()) * is actually an index into the Md_fdvec array. -1 indicates not open. * * When a relation is broken into multiple chunks, only the first chunk * has its own entry in the Md_fdvec array; the remaining chunks have * palloc'd MdfdVec objects that are chained onto the first chunk via the * mdfd_chain links. All chunks except the last MUST have size exactly * equal to RELSEG_SIZE blocks --- see mdnblocks() and mdtruncate(). */typedef struct _MdfdVec{ int mdfd_vfd; /* fd number in vfd pool */ uint16 mdfd_flags; /* clean, dirty, free */ int mdfd_lstbcnt; /* most recent block count */ int mdfd_nextFree; /* next free vector */#ifndef LET_OS_MANAGE_FILESIZE struct _MdfdVec *mdfd_chain;/* for large relations */#endif} MdfdVec;static int Nfds = 100; /* initial/current size of Md_fdvec array */static MdfdVec *Md_fdvec = (MdfdVec *) NULL;static int Md_Free = -1; /* head of freelist of unused fdvec entries */static int CurFd = 0; /* first never-used fdvec index */static MemoryContext MdCxt; /* context for all my allocations */#define MDFD_DIRTY (uint16) 0x01#define MDFD_FREE (uint16) 0x02/* routines declared here */static int _mdfd_getrelnfd(Relation reln);static MdfdVec *_mdfd_openseg(Relation reln, int segno, int oflags);static MdfdVec *_mdfd_getseg(Relation reln, int blkno);static int _fdvec_alloc(void);static void _fdvec_free(int);static BlockNumber _mdnblocks(File file, Size blcksz);/* * mdinit() -- Initialize private state for magnetic disk storage manager. * * We keep a private table of all file descriptors. Whenever we do * a write to one, we mark it dirty in our table. Whenever we force * changes to disk, we mark the file descriptor clean. At transaction * commit, we force changes to disk for all dirty file descriptors. * This routine allocates and initializes the table. * * Returns SM_SUCCESS or SM_FAIL with errno set as appropriate. */intmdinit(){ MemoryContext oldcxt; int i; MdCxt = (MemoryContext) CreateGlobalMemory("MdSmgr"); if (MdCxt == (MemoryContext) NULL) return SM_FAIL; oldcxt = MemoryContextSwitchTo(MdCxt); Md_fdvec = (MdfdVec *) palloc(Nfds * sizeof(MdfdVec)); MemoryContextSwitchTo(oldcxt); if (Md_fdvec == (MdfdVec *) NULL) return SM_FAIL; MemSet(Md_fdvec, 0, Nfds * sizeof(MdfdVec)); /* Set free list */ for (i = 0; i < Nfds; i++) { Md_fdvec[i].mdfd_nextFree = i + 1; Md_fdvec[i].mdfd_flags = MDFD_FREE; } Md_Free = 0; Md_fdvec[Nfds - 1].mdfd_nextFree = -1; return SM_SUCCESS;}intmdcreate(Relation reln){ int fd, vfd; char *path; path = relpath(reln->rd_rel->relname.data);#ifndef __CYGWIN32__ fd = FileNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL, 0600);#else fd = FileNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | O_BINARY, 0600);#endif /* * If the file already exists and is empty, we pretend that the create * succeeded. During bootstrap processing, we skip that check, * because pg_time, pg_variable, and pg_log get created before their * .bki file entries are processed. * * As the result of this pretence it was possible to have in pg_class > 1 * records with the same relname. Actually, it should be fixed in * upper levels, too, but... - vadim 05/06/97 */ if (fd < 0) { if (!IsBootstrapProcessingMode()) return -1;#ifndef __CYGWIN32__ fd = FileNameOpenFile(path, O_RDWR, 0600); /* Bootstrap */#else fd = FileNameOpenFile(path, O_RDWR | O_BINARY, 0600); /* Bootstrap */#endif if (fd < 0) return -1; } vfd = _fdvec_alloc(); if (vfd < 0) return -1; Md_fdvec[vfd].mdfd_vfd = fd; Md_fdvec[vfd].mdfd_flags = (uint16) 0;#ifndef LET_OS_MANAGE_FILESIZE Md_fdvec[vfd].mdfd_chain = (MdfdVec *) NULL;#endif Md_fdvec[vfd].mdfd_lstbcnt = 0; return vfd;}/* * mdunlink() -- Unlink a relation. */intmdunlink(Relation reln){ int nblocks; int fd; MdfdVec *v; MemoryContext oldcxt; /* * Force all segments of the relation to be opened, so that we * won't miss deleting any of them. */ nblocks = mdnblocks(reln); /* * Clean out the mdfd vector, letting fd.c unlink the physical files. * * NOTE: We truncate the file(s) before deleting 'em, because if other * backends are holding the files open, the unlink will fail on some * platforms (think Microsoft). Better a zero-size file gets left around * than a big file. Those other backends will be forced to close the * relation by cache invalidation, but that probably hasn't happened yet. */ fd = RelationGetFile(reln); if (fd < 0) /* should not happen */ elog(ERROR, "mdunlink: mdnblocks didn't open relation"); Md_fdvec[fd].mdfd_flags = (uint16) 0; oldcxt = MemoryContextSwitchTo(MdCxt);#ifndef LET_OS_MANAGE_FILESIZE for (v = &Md_fdvec[fd]; v != (MdfdVec *) NULL;) { MdfdVec *ov = v; FileTruncate(v->mdfd_vfd, 0); FileUnlink(v->mdfd_vfd); v = v->mdfd_chain; if (ov != &Md_fdvec[fd]) pfree(ov); } Md_fdvec[fd].mdfd_chain = (MdfdVec *) NULL;#else v = &Md_fdvec[fd]; FileTruncate(v->mdfd_vfd, 0); FileUnlink(v->mdfd_vfd);#endif MemoryContextSwitchTo(oldcxt); _fdvec_free(fd); /* be sure to mark relation closed */ reln->rd_fd = -1; return SM_SUCCESS;}/* * mdextend() -- Add a block to the specified relation. * * This routine returns SM_FAIL or SM_SUCCESS, with errno set as * appropriate. */intmdextend(Relation reln, char *buffer){ long pos; int nblocks; MdfdVec *v; nblocks = mdnblocks(reln); v = _mdfd_getseg(reln, nblocks); if ((pos = FileSeek(v->mdfd_vfd, 0L, SEEK_END)) < 0) return SM_FAIL; if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ) return SM_FAIL; /* remember that we did a write, so we can sync at xact commit */ v->mdfd_flags |= MDFD_DIRTY; /* try to keep the last block count current, though it's just a hint */#ifndef LET_OS_MANAGE_FILESIZE if ((v->mdfd_lstbcnt = (++nblocks % RELSEG_SIZE)) == 0) v->mdfd_lstbcnt = RELSEG_SIZE;#ifdef DIAGNOSTIC if (_mdnblocks(v->mdfd_vfd, BLCKSZ) > RELSEG_SIZE || v->mdfd_lstbcnt > RELSEG_SIZE) elog(FATAL, "segment too big!");#endif#else v->mdfd_lstbcnt = ++nblocks;#endif return SM_SUCCESS;}/* * mdopen() -- Open the specified relation. */intmdopen(Relation reln){ char *path; int fd; int vfd; path = relpath(reln->rd_rel->relname.data);#ifndef __CYGWIN32__ fd = FileNameOpenFile(path, O_RDWR, 0600);#else fd = FileNameOpenFile(path, O_RDWR | O_BINARY, 0600);#endif if (fd < 0) { /* in bootstrap mode, accept mdopen as substitute for mdcreate */ if (IsBootstrapProcessingMode()) {#ifndef __CYGWIN32__ fd = FileNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL, 0600);#else fd = FileNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | O_BINARY, 0600);#endif } if (fd < 0) { elog(ERROR, "mdopen: couldn't open %s: %m", path); return -1; } } vfd = _fdvec_alloc(); if (vfd < 0) return -1; Md_fdvec[vfd].mdfd_vfd = fd; Md_fdvec[vfd].mdfd_flags = (uint16) 0; Md_fdvec[vfd].mdfd_lstbcnt = _mdnblocks(fd, BLCKSZ);#ifndef LET_OS_MANAGE_FILESIZE Md_fdvec[vfd].mdfd_chain = (MdfdVec *) NULL;#ifdef DIAGNOSTIC if (Md_fdvec[vfd].mdfd_lstbcnt > RELSEG_SIZE) elog(FATAL, "segment too big on relopen!");#endif#endif return vfd;}/* * mdclose() -- Close the specified relation, if it isn't closed already. * * AND FREE fd vector! It may be re-used for other relation! * reln should be flushed from cache after closing !.. * * Returns SM_SUCCESS or SM_FAIL with errno set as appropriate. */intmdclose(Relation reln){ int fd; MdfdVec *v; MemoryContext oldcxt; fd = RelationGetFile(reln); if (fd < 0) return SM_SUCCESS; /* already closed, so no work */ oldcxt = MemoryContextSwitchTo(MdCxt);#ifndef LET_OS_MANAGE_FILESIZE for (v = &Md_fdvec[fd]; v != (MdfdVec *) NULL;) { MdfdVec *ov = v; /* if not closed already */ if (v->mdfd_vfd >= 0) { /* * We sync the file descriptor so that we don't need to reopen * it at transaction commit to force changes to disk. */ FileSync(v->mdfd_vfd); FileClose(v->mdfd_vfd); /* mark this file descriptor as clean in our private table */ v->mdfd_flags &= ~MDFD_DIRTY; } /* Now free vector */ v = v->mdfd_chain; if (ov != &Md_fdvec[fd]) pfree(ov); } Md_fdvec[fd].mdfd_chain = (MdfdVec *) NULL;#else v = &Md_fdvec[fd]; if (v != (MdfdVec *) NULL) { if (v->mdfd_vfd >= 0) { /* * We sync the file descriptor so that we don't need to reopen * it at transaction commit to force changes to disk. */ FileSync(v->mdfd_vfd); FileClose(v->mdfd_vfd); /* mark this file descriptor as clean in our private table */ v->mdfd_flags &= ~MDFD_DIRTY; } }#endif MemoryContextSwitchTo(oldcxt); _fdvec_free(fd); /* be sure to mark relation closed */ reln->rd_fd = -1; return SM_SUCCESS;}/* * mdread() -- Read the specified block from a relation. * * Returns SM_SUCCESS or SM_FAIL. */intmdread(Relation reln, BlockNumber blocknum, char *buffer){ int status; long seekpos; int nbytes; MdfdVec *v; v = _mdfd_getseg(reln, blocknum);#ifndef LET_OS_MANAGE_FILESIZE seekpos = (long) (BLCKSZ * (blocknum % RELSEG_SIZE));#ifdef DIAGNOSTIC if (seekpos >= BLCKSZ * RELSEG_SIZE) elog(FATAL, "seekpos too big!");#endif#else seekpos = (long) (BLCKSZ * (blocknum));#endif if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos) return SM_FAIL; status = SM_SUCCESS; if ((nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ) { if (nbytes == 0) MemSet(buffer, 0, BLCKSZ); else status = SM_FAIL; } return status;}/* * mdwrite() -- Write the supplied block at the appropriate location. * * Returns SM_SUCCESS or SM_FAIL. */intmdwrite(Relation reln, BlockNumber blocknum, char *buffer){ int status; long seekpos; MdfdVec *v; v = _mdfd_getseg(reln, blocknum);#ifndef LET_OS_MANAGE_FILESIZE seekpos = (long) (BLCKSZ * (blocknum % RELSEG_SIZE));#ifdef DIAGNOSTIC if (seekpos >= BLCKSZ * RELSEG_SIZE) elog(FATAL, "seekpos too big!");#endif#else seekpos = (long) (BLCKSZ * (blocknum));#endif if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos) return SM_FAIL; status = SM_SUCCESS; if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ) status = SM_FAIL; v->mdfd_flags |= MDFD_DIRTY; return status;}/* * mdflush() -- Synchronously write a block to disk. * * This is exactly like mdwrite(), but doesn't return until the file * system buffer cache has been flushed. */intmdflush(Relation reln, BlockNumber blocknum, char *buffer){ int status; long seekpos; MdfdVec *v; v = _mdfd_getseg(reln, blocknum);#ifndef LET_OS_MANAGE_FILESIZE seekpos = (long) (BLCKSZ * (blocknum % RELSEG_SIZE));#ifdef DIAGNOSTIC if (seekpos >= BLCKSZ * RELSEG_SIZE) elog(FATAL, "seekpos too big!");#endif#else seekpos = (long) (BLCKSZ * (blocknum));#endif if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos) return SM_FAIL; /* write and sync the block */ status = SM_SUCCESS; if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ || FileSync(v->mdfd_vfd) < 0) status = SM_FAIL; /* * By here, the block is written and changes have been forced to * stable storage. Mark the descriptor as clean until the next write, * so we don't sync it again unnecessarily at transaction commit. */ v->mdfd_flags &= ~MDFD_DIRTY; return status;}/* * mdblindwrt() -- Write a block to disk blind. * * We have to be able to do this using only the name and OID of * the database and relation in which the block belongs. This * is a synchronous write. */intmdblindwrt(char *dbstr, char *relstr, Oid dbid, Oid relid, BlockNumber blkno, char *buffer){ int fd; int segno; long seekpos;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -