📄 md.c

📁 关系型数据库 Postgresql 6.5.2
💻 C
📖 第 1 页 / 共 2 页
字号:
12 下一页
/*------------------------------------------------------------------------- * * md.c *	  This code manages relations that reside on magnetic disk. * * Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION *	  $Header: /usr/local/cvsroot/pgsql/src/backend/storage/smgr/md.c,v 1.46.2.2 1999/09/06 20:00:15 tgl Exp $ * *------------------------------------------------------------------------- */#include <unistd.h>#include <fcntl.h>#include <sys/file.h>#include "postgres.h"#include "catalog/catalog.h"#include "miscadmin.h"#include "storage/smgr.h"#undef DIAGNOSTIC/* *	The magnetic disk storage manager keeps track of open file descriptors *	in its own descriptor pool.  This happens for two reasons.	First, at *	transaction boundaries, we walk the list of descriptors and flush *	anything that we've dirtied in the current transaction.  Second, we want *	to support relations larger than the OS' file size limit (often 2GBytes). *	In order to do that, we break relations up into chunks of < 2GBytes *	and store one chunk in each of several files that represent the relation. *	See the BLCKSZ and RELSEG_SIZE configuration constants in include/config.h. * *	The file descriptor stored in the relation cache (see RelationGetFile()) *	is actually an index into the Md_fdvec array.  -1 indicates not open. * *	When a relation is broken into multiple chunks, only the first chunk *	has its own entry in the Md_fdvec array; the remaining chunks have *	palloc'd MdfdVec objects that are chained onto the first chunk via the *	mdfd_chain links.  All chunks except the last MUST have size exactly *	equal to RELSEG_SIZE blocks --- see mdnblocks() and mdtruncate(). 
*/

/*
 *	MdfdVec -- bookkeeping for one open relation file.
 *
 *	Entries of the Md_fdvec array are either in use or linked onto a free
 *	list through mdfd_nextFree; mdinit() builds that list.
 */
typedef struct _MdfdVec
{
	int			mdfd_vfd;		/* fd number in vfd pool */
	uint16		mdfd_flags;		/* MDFD_DIRTY and/or MDFD_FREE; 0 = clean and in use */
	int			mdfd_lstbcnt;	/* most recent block count (only a hint) */
	int			mdfd_nextFree;	/* index of next free entry; -1 ends the list */
#ifndef LET_OS_MANAGE_FILESIZE
	struct _MdfdVec *mdfd_chain;/* next segment of a large relation, or NULL */
#endif
} MdfdVec;

static int	Nfds = 100;			/* initial/current size of Md_fdvec array */
static MdfdVec *Md_fdvec = (MdfdVec *) NULL;	/* descriptor table, allocated by mdinit() */
static int	Md_Free = -1;		/* head of freelist of unused fdvec entries */
static int	CurFd = 0;			/* first never-used fdvec index */
static MemoryContext MdCxt;		/* context for all my allocations */

/* bits stored in MdfdVec.mdfd_flags */
#define MDFD_DIRTY		(uint16) 0x01
#define MDFD_FREE		(uint16) 0x02

/* routines declared here */
static int _mdfd_getrelnfd(Relation reln);
static MdfdVec *_mdfd_openseg(Relation reln, int segno, int oflags);
static MdfdVec *_mdfd_getseg(Relation reln, int blkno);
static int	_fdvec_alloc(void);
static void _fdvec_free(int);
static BlockNumber _mdnblocks(File file, Size blcksz);

/*
 *	mdinit() -- Initialize private state for magnetic disk storage manager.
 *
 *		We keep a private table of all file descriptors.  Whenever we do
 *		a write to one, we mark it dirty in our table.	Whenever we force
 *		changes to disk, we mark the file descriptor clean.  At transaction
 *		commit, we force changes to disk for all dirty file descriptors.
 *		This routine allocates and initializes the table.
 *
 *		Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
*/intmdinit(){	MemoryContext oldcxt;	int			i;	MdCxt = (MemoryContext) CreateGlobalMemory("MdSmgr");	if (MdCxt == (MemoryContext) NULL)		return SM_FAIL;	oldcxt = MemoryContextSwitchTo(MdCxt);	Md_fdvec = (MdfdVec *) palloc(Nfds * sizeof(MdfdVec));	MemoryContextSwitchTo(oldcxt);	if (Md_fdvec == (MdfdVec *) NULL)		return SM_FAIL;	MemSet(Md_fdvec, 0, Nfds * sizeof(MdfdVec));	/* Set free list */	for (i = 0; i < Nfds; i++)	{		Md_fdvec[i].mdfd_nextFree = i + 1;		Md_fdvec[i].mdfd_flags = MDFD_FREE;	}	Md_Free = 0;	Md_fdvec[Nfds - 1].mdfd_nextFree = -1;	return SM_SUCCESS;}intmdcreate(Relation reln){	int			fd,				vfd;	char	   *path;	path = relpath(reln->rd_rel->relname.data);#ifndef __CYGWIN32__	fd = FileNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL, 0600);#else	fd = FileNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | O_BINARY, 0600);#endif	/*	 * If the file already exists and is empty, we pretend that the create	 * succeeded.  During bootstrap processing, we skip that check,	 * because pg_time, pg_variable, and pg_log get created before their	 * .bki file entries are processed.	 *	 * As the result of this pretence it was possible to have in pg_class > 1	 * records with the same relname. Actually, it should be fixed in	 * upper levels, too, but... -	vadim 05/06/97	 */	if (fd < 0)	{		if (!IsBootstrapProcessingMode())			return -1;#ifndef __CYGWIN32__		fd = FileNameOpenFile(path, O_RDWR, 0600);		/* Bootstrap */#else		fd = FileNameOpenFile(path, O_RDWR | O_BINARY, 0600);	/* Bootstrap */#endif		if (fd < 0)			return -1;	}	vfd = _fdvec_alloc();	if (vfd < 0)		return -1;	Md_fdvec[vfd].mdfd_vfd = fd;	Md_fdvec[vfd].mdfd_flags = (uint16) 0;#ifndef LET_OS_MANAGE_FILESIZE	Md_fdvec[vfd].mdfd_chain = (MdfdVec *) NULL;#endif	Md_fdvec[vfd].mdfd_lstbcnt = 0;	return vfd;}/* *	mdunlink() -- Unlink a relation. */intmdunlink(Relation reln){	int			nblocks;	int			fd;	MdfdVec    *v;	MemoryContext oldcxt;	/*	 * Force all segments of the relation to be opened, so that we	 * won't miss deleting any of them.	
 */	nblocks = mdnblocks(reln);	/*	 * Clean out the mdfd vector, letting fd.c unlink the physical files.	 *	 * NOTE: We truncate the file(s) before deleting 'em, because if other	 * backends are holding the files open, the unlink will fail on some	 * platforms (think Microsoft).  Better a zero-size file gets left around	 * than a big file.  Those other backends will be forced to close the	 * relation by cache invalidation, but that probably hasn't happened yet.	 */	fd = RelationGetFile(reln);	if (fd < 0)					/* should not happen */		elog(ERROR, "mdunlink: mdnblocks didn't open relation");	Md_fdvec[fd].mdfd_flags = (uint16) 0;	oldcxt = MemoryContextSwitchTo(MdCxt);#ifndef LET_OS_MANAGE_FILESIZE	for (v = &Md_fdvec[fd]; v != (MdfdVec *) NULL;)	{		MdfdVec    *ov = v;		FileTruncate(v->mdfd_vfd, 0);		FileUnlink(v->mdfd_vfd);		v = v->mdfd_chain;		if (ov != &Md_fdvec[fd])			pfree(ov);	}	Md_fdvec[fd].mdfd_chain = (MdfdVec *) NULL;#else	v = &Md_fdvec[fd];	FileTruncate(v->mdfd_vfd, 0);	FileUnlink(v->mdfd_vfd);#endif	MemoryContextSwitchTo(oldcxt);	_fdvec_free(fd);	/* be sure to mark relation closed */	reln->rd_fd = -1;	return SM_SUCCESS;}/* *	mdextend() -- Add a block to the specified relation. * *		This routine returns SM_FAIL or SM_SUCCESS, with errno set as *		appropriate. 
*/intmdextend(Relation reln, char *buffer){	long		pos;	int			nblocks;	MdfdVec    *v;	nblocks = mdnblocks(reln);	v = _mdfd_getseg(reln, nblocks);	if ((pos = FileSeek(v->mdfd_vfd, 0L, SEEK_END)) < 0)		return SM_FAIL;	if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ)		return SM_FAIL;	/* remember that we did a write, so we can sync at xact commit */	v->mdfd_flags |= MDFD_DIRTY;	/* try to keep the last block count current, though it's just a hint */#ifndef LET_OS_MANAGE_FILESIZE	if ((v->mdfd_lstbcnt = (++nblocks % RELSEG_SIZE)) == 0)		v->mdfd_lstbcnt = RELSEG_SIZE;#ifdef DIAGNOSTIC	if (_mdnblocks(v->mdfd_vfd, BLCKSZ) > RELSEG_SIZE		|| v->mdfd_lstbcnt > RELSEG_SIZE)		elog(FATAL, "segment too big!");#endif#else	v->mdfd_lstbcnt = ++nblocks;#endif	return SM_SUCCESS;}/* *	mdopen() -- Open the specified relation. */intmdopen(Relation reln){	char	   *path;	int			fd;	int			vfd;	path = relpath(reln->rd_rel->relname.data);#ifndef __CYGWIN32__	fd = FileNameOpenFile(path, O_RDWR, 0600);#else	fd = FileNameOpenFile(path, O_RDWR | O_BINARY, 0600);#endif	if (fd < 0)	{		/* in bootstrap mode, accept mdopen as substitute for mdcreate */		if (IsBootstrapProcessingMode())		{#ifndef __CYGWIN32__			fd = FileNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL, 0600);#else			fd = FileNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | O_BINARY, 0600);#endif		}		if (fd < 0)		{			elog(ERROR, "mdopen: couldn't open %s: %m", path);			return -1;		}	}	vfd = _fdvec_alloc();	if (vfd < 0)		return -1;	Md_fdvec[vfd].mdfd_vfd = fd;	Md_fdvec[vfd].mdfd_flags = (uint16) 0;	Md_fdvec[vfd].mdfd_lstbcnt = _mdnblocks(fd, BLCKSZ);#ifndef LET_OS_MANAGE_FILESIZE	Md_fdvec[vfd].mdfd_chain = (MdfdVec *) NULL;#ifdef DIAGNOSTIC	if (Md_fdvec[vfd].mdfd_lstbcnt > RELSEG_SIZE)		elog(FATAL, "segment too big on relopen!");#endif#endif	return vfd;}/* *	mdclose() -- Close the specified relation, if it isn't closed already. * *		AND FREE fd vector! It may be re-used for other relation! 
*		reln should be flushed from cache after closing !.. * *		Returns SM_SUCCESS or SM_FAIL with errno set as appropriate. */intmdclose(Relation reln){	int			fd;	MdfdVec    *v;	MemoryContext oldcxt;	fd = RelationGetFile(reln);	if (fd < 0)		return SM_SUCCESS;		/* already closed, so no work */	oldcxt = MemoryContextSwitchTo(MdCxt);#ifndef LET_OS_MANAGE_FILESIZE	for (v = &Md_fdvec[fd]; v != (MdfdVec *) NULL;)	{		MdfdVec    *ov = v;		/* if not closed already */		if (v->mdfd_vfd >= 0)		{			/*			 * We sync the file descriptor so that we don't need to reopen			 * it at transaction commit to force changes to disk.			 */			FileSync(v->mdfd_vfd);			FileClose(v->mdfd_vfd);			/* mark this file descriptor as clean in our private table */			v->mdfd_flags &= ~MDFD_DIRTY;		}		/* Now free vector */		v = v->mdfd_chain;		if (ov != &Md_fdvec[fd])			pfree(ov);	}	Md_fdvec[fd].mdfd_chain = (MdfdVec *) NULL;#else	v = &Md_fdvec[fd];	if (v != (MdfdVec *) NULL)	{		if (v->mdfd_vfd >= 0)		{			/*			 * We sync the file descriptor so that we don't need to reopen			 * it at transaction commit to force changes to disk.			 */			FileSync(v->mdfd_vfd);			FileClose(v->mdfd_vfd);			/* mark this file descriptor as clean in our private table */			v->mdfd_flags &= ~MDFD_DIRTY;		}	}#endif	MemoryContextSwitchTo(oldcxt);	_fdvec_free(fd);	/* be sure to mark relation closed */	reln->rd_fd = -1;	return SM_SUCCESS;}/* *	mdread() -- Read the specified block from a relation. * *		Returns SM_SUCCESS or SM_FAIL. 
*/intmdread(Relation reln, BlockNumber blocknum, char *buffer){	int			status;	long		seekpos;	int			nbytes;	MdfdVec    *v;	v = _mdfd_getseg(reln, blocknum);#ifndef LET_OS_MANAGE_FILESIZE	seekpos = (long) (BLCKSZ * (blocknum % RELSEG_SIZE));#ifdef DIAGNOSTIC	if (seekpos >= BLCKSZ * RELSEG_SIZE)		elog(FATAL, "seekpos too big!");#endif#else	seekpos = (long) (BLCKSZ * (blocknum));#endif	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)		return SM_FAIL;	status = SM_SUCCESS;	if ((nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)	{		if (nbytes == 0)			MemSet(buffer, 0, BLCKSZ);		else			status = SM_FAIL;	}	return status;}/* *	mdwrite() -- Write the supplied block at the appropriate location. * *		Returns SM_SUCCESS or SM_FAIL. */intmdwrite(Relation reln, BlockNumber blocknum, char *buffer){	int			status;	long		seekpos;	MdfdVec    *v;	v = _mdfd_getseg(reln, blocknum);#ifndef LET_OS_MANAGE_FILESIZE	seekpos = (long) (BLCKSZ * (blocknum % RELSEG_SIZE));#ifdef DIAGNOSTIC	if (seekpos >= BLCKSZ * RELSEG_SIZE)		elog(FATAL, "seekpos too big!");#endif#else	seekpos = (long) (BLCKSZ * (blocknum));#endif	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)		return SM_FAIL;	status = SM_SUCCESS;	if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ)		status = SM_FAIL;	v->mdfd_flags |= MDFD_DIRTY;	return status;}/* *	mdflush() -- Synchronously write a block to disk. * *		This is exactly like mdwrite(), but doesn't return until the file *		system buffer cache has been flushed. 
*/intmdflush(Relation reln, BlockNumber blocknum, char *buffer){	int			status;	long		seekpos;	MdfdVec    *v;	v = _mdfd_getseg(reln, blocknum);#ifndef LET_OS_MANAGE_FILESIZE	seekpos = (long) (BLCKSZ * (blocknum % RELSEG_SIZE));#ifdef DIAGNOSTIC	if (seekpos >= BLCKSZ * RELSEG_SIZE)		elog(FATAL, "seekpos too big!");#endif#else	seekpos = (long) (BLCKSZ * (blocknum));#endif	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)		return SM_FAIL;	/* write and sync the block */	status = SM_SUCCESS;	if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ		|| FileSync(v->mdfd_vfd) < 0)		status = SM_FAIL;	/*	 * By here, the block is written and changes have been forced to	 * stable storage.	Mark the descriptor as clean until the next write,	 * so we don't sync it again unnecessarily at transaction commit.	 */	v->mdfd_flags &= ~MDFD_DIRTY;	return status;}/* *	mdblindwrt() -- Write a block to disk blind. * *		We have to be able to do this using only the name and OID of *		the database and relation in which the block belongs.  This *		is a synchronous write. */intmdblindwrt(char *dbstr,		   char *relstr,		   Oid dbid,		   Oid relid,		   BlockNumber blkno,		   char *buffer){	int			fd;	int			segno;	long		seekpos;
12 下一页
💿 文件大小 6496 K
👤 上传用户 huajunun
📂 所属分类数据库系统
🏷️ 相关标签

#Postgresql #数据库
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -