📄 md.c

📁 postgresql8.3.4源码,开源数据库
💻 C
📖 第 1 页 / 共 4 页
字号:
12 3 4 下一页
/*-------------------------------------------------------------------------
 *
 * md.c
 *	  This code manages relations that reside on magnetic disk.
 *
 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.135.2.1 2008/04/18 06:48:50 heikki Exp $
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <unistd.h>
#include <fcntl.h>
#include <sys/file.h>

#include "catalog/catalog.h"
#include "miscadmin.h"
#include "postmaster/bgwriter.h"
#include "storage/fd.h"
#include "storage/bufmgr.h"
#include "storage/smgr.h"
#include "utils/hsearch.h"
#include "utils/memutils.h"


/* mdsync's interval for calling AbsorbFsyncRequests */
#define FSYNCS_PER_ABSORB		10

/*
 * Reserved values of the segno argument to RememberFsyncRequest; they are
 * taken from the top of the block-number range, starting at
 * InvalidBlockNumber.
 */
#define FORGET_RELATION_FSYNC	(InvalidBlockNumber)
#define FORGET_DATABASE_FSYNC	(InvalidBlockNumber-1)
#define UNLINK_RELATION_REQUEST (InvalidBlockNumber-2)

/*
 * FILE_POSSIBLY_DELETED(err) -- could this errno mean "file is gone"?
 *
 * Everywhere this is ENOENT.  On Windows, EACCES has to be accepted as well,
 * because that is what you get for a file that is unlinked-but-not-yet-gone
 * on that platform.  Ugh.  This code is designed so that we don't actually
 * believe these cases are okay without further evidence (namely, a pending
 * fsync request getting revoked ... see mdsync).
 */
#ifdef WIN32
#define FILE_POSSIBLY_DELETED(err)	((err) == ENOENT || (err) == EACCES)
#else
#define FILE_POSSIBLY_DELETED(err)	((err) == ENOENT)
#endif

/*
 *	The magnetic disk storage manager keeps track of open file
 *	descriptors in its own descriptor pool.  This is done to make it
 *	easier to support relations that are larger than the operating
 *	system's file size limit (often 2GBytes).  In order to do that,
 *	we break relations up into "segment" files that are each shorter than
 *	the OS file size limit.  The segment size is set by the RELSEG_SIZE
 *	configuration constant in pg_config_manual.h.
 *
 *	On disk, a relation must consist of consecutively numbered segment
 *	files in the pattern
 *		-- Zero or more full segments of exactly RELSEG_SIZE blocks each
 *		-- Exactly one partial segment of size 0 <= size < RELSEG_SIZE blocks
 *		-- Optionally, any number of inactive segments of size 0 blocks.
 *	The full and partial segments are collectively the "active" segments.
 *	Inactive segments are those that once contained data but are currently
 *	not needed because of an mdtruncate() operation.  The reason for leaving
 *	them present at size zero, rather than unlinking them, is that other
 *	backends and/or the bgwriter might be holding open file references to
 *	such segments.	If the relation expands again after mdtruncate(), such
 *	that a deactivated segment becomes active again, it is important that
 *	such file references still be valid --- else data might get written
 *	out to an unlinked old copy of a segment file that will eventually
 *	disappear.
 *
 *	The file descriptor pointer (md_fd field) stored in the SMgrRelation
 *	cache is, therefore, just the head of a list of MdfdVec objects, one
 *	per segment.  But note the md_fd pointer can be NULL, indicating
 *	relation not open.
 *
 *	Also note that mdfd_chain == NULL does not necessarily mean the relation
 *	doesn't have another segment after this one; we may just not have
 *	opened the next segment yet.  (We could not have "all segments are
 *	in the chain" as an invariant anyway, since another backend could
 *	extend the relation when we weren't looking.)  We do not make chain
 *	entries for inactive segments, however; as soon as we find a partial
 *	segment, we assume that any subsequent segments are inactive.
 *
 *	All MdfdVec objects are palloc'd in the MdCxt memory context.
 *
 *	Defining LET_OS_MANAGE_FILESIZE disables the segmentation logic,
 *	for use on machines that support large files.
Beware that that *	code has not been tested in a long time and is probably bit-rotted. */typedef struct _MdfdVec{	File		mdfd_vfd;		/* fd number in fd.c's pool */	BlockNumber mdfd_segno;		/* segment number, from 0 */#ifndef LET_OS_MANAGE_FILESIZE	/* for large relations */	struct _MdfdVec *mdfd_chain;	/* next segment, or NULL */#endif} MdfdVec;static MemoryContext MdCxt;		/* context for all md.c allocations *//* * In some contexts (currently, standalone backends and the bgwriter process) * we keep track of pending fsync operations: we need to remember all relation * segments that have been written since the last checkpoint, so that we can * fsync them down to disk before completing the next checkpoint.  This hash * table remembers the pending operations.	We use a hash table mostly as * a convenient way of eliminating duplicate requests. * * We use a similar mechanism to remember no-longer-needed files that can * be deleted after the next checkpoint, but we use a linked list instead of * a hash table, because we don't expect there to be any duplicate requests. * * (Regular backends do not track pending operations locally, but forward * them to the bgwriter.) */typedef struct{	RelFileNode rnode;			/* the targeted relation */	BlockNumber segno;			/* which segment */} PendingOperationTag;typedef uint16 CycleCtr;		/* can be any convenient integer size */typedef struct{	PendingOperationTag tag;	/* hash table key (must be first!) 
*/	bool		canceled;		/* T => request canceled, not yet removed */	CycleCtr	cycle_ctr;		/* mdsync_cycle_ctr when request was made */} PendingOperationEntry;typedef struct{	RelFileNode rnode;			/* the dead relation to delete */	CycleCtr	cycle_ctr;		/* mdckpt_cycle_ctr when request was made */} PendingUnlinkEntry;static HTAB *pendingOpsTable = NULL;static List *pendingUnlinks = NIL;static CycleCtr mdsync_cycle_ctr = 0;static CycleCtr mdckpt_cycle_ctr = 0;typedef enum					/* behavior for mdopen & _mdfd_getseg */{	EXTENSION_FAIL,				/* ereport if segment not present */	EXTENSION_RETURN_NULL,		/* return NULL if not present */	EXTENSION_CREATE			/* create new segments as needed */} ExtensionBehavior;/* local routines */static MdfdVec *mdopen(SMgrRelation reln, ExtensionBehavior behavior);static void register_dirty_segment(SMgrRelation reln, MdfdVec *seg);static void register_unlink(RelFileNode rnode);static MdfdVec *_fdvec_alloc(void);#ifndef LET_OS_MANAGE_FILESIZEstatic MdfdVec *_mdfd_openseg(SMgrRelation reln, BlockNumber segno,			  int oflags);#endifstatic MdfdVec *_mdfd_getseg(SMgrRelation reln, BlockNumber blkno,			 bool isTemp, ExtensionBehavior behavior);static BlockNumber _mdnblocks(SMgrRelation reln, MdfdVec *seg);/* *	mdinit() -- Initialize private state for magnetic disk storage manager. */voidmdinit(void){	MdCxt = AllocSetContextCreate(TopMemoryContext,								  "MdSmgr",								  ALLOCSET_DEFAULT_MINSIZE,								  ALLOCSET_DEFAULT_INITSIZE,								  ALLOCSET_DEFAULT_MAXSIZE);	/*	 * Create pending-operations hashtable if we need it.  Currently, we need	 * it if we are standalone (not under a postmaster) OR if we are a	 * bootstrap-mode subprocess of a postmaster (that is, a startup or	 * bgwriter process).	 
*/	if (!IsUnderPostmaster || IsBootstrapProcessingMode())	{		HASHCTL		hash_ctl;		MemSet(&hash_ctl, 0, sizeof(hash_ctl));		hash_ctl.keysize = sizeof(PendingOperationTag);		hash_ctl.entrysize = sizeof(PendingOperationEntry);		hash_ctl.hash = tag_hash;		hash_ctl.hcxt = MdCxt;		pendingOpsTable = hash_create("Pending Ops Table",									  100L,									  &hash_ctl,								   HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);		pendingUnlinks = NIL;	}}/* *	mdcreate() -- Create a new relation on magnetic disk. * * If isRedo is true, it's okay for the relation to exist already. */voidmdcreate(SMgrRelation reln, bool isRedo){	char	   *path;	File		fd;	if (isRedo && reln->md_fd != NULL)		return;					/* created and opened already... */	Assert(reln->md_fd == NULL);	path = relpath(reln->smgr_rnode);	fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);	if (fd < 0)	{		int			save_errno = errno;		/*		 * During bootstrap, there are cases where a system relation will be		 * accessed (by internal backend processes) before the bootstrap		 * script nominally creates it.  Therefore, allow the file to exist		 * already, even if isRedo is not set.	(See also mdopen)		 */		if (isRedo || IsBootstrapProcessingMode())			fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);		if (fd < 0)		{			pfree(path);			/* be sure to report the error reported by create, not open */			errno = save_errno;			ereport(ERROR,					(errcode_for_file_access(),					 errmsg("could not create relation %u/%u/%u: %m",							reln->smgr_rnode.spcNode,							reln->smgr_rnode.dbNode,							reln->smgr_rnode.relNode)));		}	}	pfree(path);	reln->md_fd = _fdvec_alloc();	reln->md_fd->mdfd_vfd = fd;	reln->md_fd->mdfd_segno = 0;#ifndef LET_OS_MANAGE_FILESIZE	reln->md_fd->mdfd_chain = NULL;#endif}/* *	mdunlink() -- Unlink a relation. * * Note that we're passed a RelFileNode --- by the time this is called, * there won't be an SMgrRelation hashtable entry anymore. 
* * Actually, we don't unlink the first segment file of the relation, but * just truncate it to zero length, and record a request to unlink it after * the next checkpoint.  Additional segments can be unlinked immediately, * however.  Leaving the empty file in place prevents that relfilenode * number from being reused.  The scenario this protects us from is: * 1. We delete a relation (and commit, and actually remove its file). * 2. We create a new relation, which by chance gets the same relfilenode as *	  the just-deleted one (OIDs must've wrapped around for that to happen). * 3. We crash before another checkpoint occurs. * During replay, we would delete the file and then recreate it, which is fine * if the contents of the file were repopulated by subsequent WAL entries. * But if we didn't WAL-log insertions, but instead relied on fsyncing the * file after populating it (as for instance CLUSTER and CREATE INDEX do), * the contents of the file would be lost forever.	By leaving the empty file * until after the next checkpoint, we prevent reassignment of the relfilenode * number until it's safe, because relfilenode assignment skips over any * existing file. * * If isRedo is true, it's okay for the relation to be already gone. * Also, we should remove the file immediately instead of queuing a request * for later, since during redo there's no possibility of creating a * conflicting relation. * * Note: any failure should be reported as WARNING not ERROR, because * we are usually not in a transaction anymore when this is called. */voidmdunlink(RelFileNode rnode, bool isRedo){	char	   *path;	int			ret;	/*	 * We have to clean out any pending fsync requests for the doomed	 * relation, else the next mdsync() will fail.	 
*/	ForgetRelationFsyncRequests(rnode);	path = relpath(rnode);	/*	 * Delete or truncate the first segment, or only segment if not doing	 * segmenting	 */	if (isRedo)		ret = unlink(path);	else	{		/* truncate(2) would be easier here, but Windows hasn't got it */		int			fd;		fd = BasicOpenFile(path, O_RDWR | PG_BINARY, 0);		if (fd >= 0)		{			int			save_errno;			ret = ftruncate(fd, 0);			save_errno = errno;			close(fd);			errno = save_errno;		}		else			ret = -1;	}	if (ret < 0)	{		if (!isRedo || errno != ENOENT)			ereport(WARNING,					(errcode_for_file_access(),					 errmsg("could not remove relation %u/%u/%u: %m",							rnode.spcNode,							rnode.dbNode,							rnode.relNode)));	}#ifndef LET_OS_MANAGE_FILESIZE	/* Delete the additional segments, if any */	else	{		char	   *segpath = (char *) palloc(strlen(path) + 12);		BlockNumber segno;		/*		 * Note that because we loop until getting ENOENT, we will correctly		 * remove all inactive segments as well as active ones.		 */		for (segno = 1;; segno++)		{			sprintf(segpath, "%s.%u", path, segno);			if (unlink(segpath) < 0)			{				/* ENOENT is expected after the last segment... */				if (errno != ENOENT)					ereport(WARNING,							(errcode_for_file_access(),							 errmsg("could not remove segment %u of relation %u/%u/%u: %m",									segno,									rnode.spcNode,									rnode.dbNode,									rnode.relNode)));				break;			}		}		pfree(segpath);	}#endif	pfree(path);	/* Register request to unlink first segment later */	if (!isRedo)		register_unlink(rnode);}/* *	mdextend() -- Add a block to the specified relation. * *		The semantics are nearly the same as mdwrite(): write at the *		specified position.  However, this is to be used for the case of *		extending a relation (i.e., blocknum is at or beyond the current *		EOF).  Note that we assume writing a block beyond current EOF *		causes intervening file space to become filled with zeroes. 
*/voidmdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp){	long		seekpos;	int			nbytes;	MdfdVec    *v;	/* This assert is too expensive to have on normally ... */#ifdef CHECK_WRITE_VS_EXTEND	Assert(blocknum >= mdnblocks(reln));#endif	/*
12 3 4 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -