📄 md.c

📁 postgresql8.3.4源码,开源数据库
💻 C
📖 第 1 页 / 共 4 页
字号:
			/*			 * This segment is no longer active (and has already been unlinked			 * from the mdfd_chain). We truncate the file, but do not delete			 * it, for reasons explained in the header comments.			 */			if (FileTruncate(v->mdfd_vfd, 0) < 0)				ereport(ERROR,						(errcode_for_file_access(),						 errmsg("could not truncate relation %u/%u/%u to %u blocks: %m",								reln->smgr_rnode.spcNode,								reln->smgr_rnode.dbNode,								reln->smgr_rnode.relNode,								nblocks)));			if (!isTemp)				register_dirty_segment(reln, v);			v = v->mdfd_chain;			Assert(ov != reln->md_fd);	/* we never drop the 1st segment */			pfree(ov);		}		else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)		{			/*			 * This is the last segment we want to keep. Truncate the file to			 * the right length, and clear chain link that points to any			 * remaining segments (which we shall zap). NOTE: if nblocks is			 * exactly a multiple K of RELSEG_SIZE, we will truncate the K+1st			 * segment to 0 length but keep it. This adheres to the invariant			 * given in the header comments.			 */			BlockNumber lastsegblocks = nblocks - priorblocks;			if (FileTruncate(v->mdfd_vfd, lastsegblocks * BLCKSZ) < 0)				ereport(ERROR,						(errcode_for_file_access(),						 errmsg("could not truncate relation %u/%u/%u to %u blocks: %m",								reln->smgr_rnode.spcNode,								reln->smgr_rnode.dbNode,								reln->smgr_rnode.relNode,								nblocks)));			if (!isTemp)				register_dirty_segment(reln, v);			v = v->mdfd_chain;			ov->mdfd_chain = NULL;		}		else		{			/*			 * We still need this segment and 0 or more blocks beyond it, so			 * nothing to do here.			 */			v = v->mdfd_chain;		}		priorblocks += RELSEG_SIZE;	}#else	if (FileTruncate(v->mdfd_vfd, nblocks * BLCKSZ) < 0)		ereport(ERROR,				(errcode_for_file_access(),			  errmsg("could not truncate relation %u/%u/%u to %u blocks: %m",					 reln->smgr_rnode.spcNode,					 reln->smgr_rnode.dbNode,					 reln->smgr_rnode.relNode,					 nblocks)));	if (!isTemp)		register_dirty_segment(reln, v);#endif}/* *	mdimmedsync() -- Immediately sync a relation to stable storage. * * Note that only writes already issued are synced; this routine knows * nothing of dirty buffers that may exist inside the buffer manager. */voidmdimmedsync(SMgrRelation reln){	MdfdVec    *v;	BlockNumber curnblk;	/*	 * NOTE: mdnblocks makes sure we have opened all active segments, so that	 * fsync loop will get them all!	 */	curnblk = mdnblocks(reln);	v = mdopen(reln, EXTENSION_FAIL);#ifndef LET_OS_MANAGE_FILESIZE	while (v != NULL)	{		if (FileSync(v->mdfd_vfd) < 0)			ereport(ERROR,					(errcode_for_file_access(),				errmsg("could not fsync segment %u of relation %u/%u/%u: %m",					   v->mdfd_segno,					   reln->smgr_rnode.spcNode,					   reln->smgr_rnode.dbNode,					   reln->smgr_rnode.relNode)));		v = v->mdfd_chain;	}#else	if (FileSync(v->mdfd_vfd) < 0)		ereport(ERROR,				(errcode_for_file_access(),				 errmsg("could not fsync segment %u of relation %u/%u/%u: %m",						v->mdfd_segno,						reln->smgr_rnode.spcNode,						reln->smgr_rnode.dbNode,						reln->smgr_rnode.relNode)));#endif}/* *	mdsync() -- Sync previous writes to stable storage. */voidmdsync(void){	static bool mdsync_in_progress = false;	HASH_SEQ_STATUS hstat;	PendingOperationEntry *entry;	int			absorb_counter;	/*	 * This is only called during checkpoints, and checkpoints should only	 * occur in processes that have created a pendingOpsTable.	 */	if (!pendingOpsTable)		elog(ERROR, "cannot sync without a pendingOpsTable");	/*	 * If we are in the bgwriter, the sync had better include all fsync	 * requests that were queued by backends up to this point.	The tightest	 * race condition that could occur is that a buffer that must be written	 * and fsync'd for the checkpoint could have been dumped by a backend just	 * before it was visited by BufferSync().  We know the backend will have	 * queued an fsync request before clearing the buffer's dirtybit, so we	 * are safe as long as we do an Absorb after completing BufferSync().	 */	AbsorbFsyncRequests();	/*	 * To avoid excess fsync'ing (in the worst case, maybe a never-terminating	 * checkpoint), we want to ignore fsync requests that are entered into the	 * hashtable after this point --- they should be processed next time,	 * instead.  We use mdsync_cycle_ctr to tell old entries apart from new	 * ones: new ones will have cycle_ctr equal to the incremented value of	 * mdsync_cycle_ctr.	 *	 * In normal circumstances, all entries present in the table at this point	 * will have cycle_ctr exactly equal to the current (about to be old)	 * value of mdsync_cycle_ctr.  However, if we fail partway through the	 * fsync'ing loop, then older values of cycle_ctr might remain when we	 * come back here to try again.  Repeated checkpoint failures would	 * eventually wrap the counter around to the point where an old entry	 * might appear new, causing us to skip it, possibly allowing a checkpoint	 * to succeed that should not have.  To forestall wraparound, any time the	 * previous mdsync() failed to complete, run through the table and	 * forcibly set cycle_ctr = mdsync_cycle_ctr.	 *	 * Think not to merge this loop with the main loop, as the problem is	 * exactly that that loop may fail before having visited all the entries.	 * From a performance point of view it doesn't matter anyway, as this path	 * will never be taken in a system that's functioning normally.	 */	if (mdsync_in_progress)	{		/* prior try failed, so update any stale cycle_ctr values */		hash_seq_init(&hstat, pendingOpsTable);		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)		{			entry->cycle_ctr = mdsync_cycle_ctr;		}	}	/* Advance counter so that new hashtable entries are distinguishable */	mdsync_cycle_ctr++;	/* Set flag to detect failure if we don't reach the end of the loop */	mdsync_in_progress = true;	/* Now scan the hashtable for fsync requests to process */	absorb_counter = FSYNCS_PER_ABSORB;	hash_seq_init(&hstat, pendingOpsTable);	while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)	{		/*		 * If the entry is new then don't process it this time.  Note that		 * "continue" bypasses the hash-remove call at the bottom of the loop.		 */		if (entry->cycle_ctr == mdsync_cycle_ctr)			continue;		/* Else assert we haven't missed it */		Assert((CycleCtr) (entry->cycle_ctr + 1) == mdsync_cycle_ctr);		/*		 * If fsync is off then we don't have to bother opening the file at		 * all.  (We delay checking until this point so that changing fsync on		 * the fly behaves sensibly.)  Also, if the entry is marked canceled,		 * fall through to delete it.		 */		if (enableFsync && !entry->canceled)		{			int			failures;			/*			 * If in bgwriter, we want to absorb pending requests every so			 * often to prevent overflow of the fsync request queue.  It is			 * unspecified whether newly-added entries will be visited by			 * hash_seq_search, but we don't care since we don't need to			 * process them anyway.			 */			if (--absorb_counter <= 0)			{				AbsorbFsyncRequests();				absorb_counter = FSYNCS_PER_ABSORB;			}			/*			 * The fsync table could contain requests to fsync segments that			 * have been deleted (unlinked) by the time we get to them. Rather			 * than just hoping an ENOENT (or EACCES on Windows) error can be			 * ignored, what we do on error is absorb pending requests and			 * then retry.	Since mdunlink() queues a "revoke" message before			 * actually unlinking, the fsync request is guaranteed to be			 * marked canceled after the absorb if it really was this case.			 * DROP DATABASE likewise has to tell us to forget fsync requests			 * before it starts deletions.			 */			for (failures = 0;; failures++)		/* loop exits at "break" */			{				SMgrRelation reln;				MdfdVec    *seg;				/*				 * Find or create an smgr hash entry for this relation. This				 * may seem a bit unclean -- md calling smgr?  But it's really				 * the best solution.  It ensures that the open file reference				 * isn't permanently leaked if we get an error here. (You may				 * say "but an unreferenced SMgrRelation is still a leak!" Not				 * really, because the only case in which a checkpoint is done				 * by a process that isn't about to shut down is in the				 * bgwriter, and it will periodically do smgrcloseall(). This				 * fact justifies our not closing the reln in the success path				 * either, which is a good thing since in non-bgwriter cases				 * we couldn't safely do that.)  Furthermore, in many cases				 * the relation will have been dirtied through this same smgr				 * relation, and so we can save a file open/close cycle.				 */				reln = smgropen(entry->tag.rnode);				/*				 * It is possible that the relation has been dropped or				 * truncated since the fsync request was entered.  Therefore,				 * allow ENOENT, but only if we didn't fail already on this				 * file.  This applies both during _mdfd_getseg() and during				 * FileSync, since fd.c might have closed the file behind our				 * back.				 */				seg = _mdfd_getseg(reln,							  entry->tag.segno * ((BlockNumber) RELSEG_SIZE),								   false, EXTENSION_RETURN_NULL);				if (seg != NULL &&					FileSync(seg->mdfd_vfd) >= 0)					break;		/* success; break out of retry loop */				/*				 * XXX is there any point in allowing more than one retry?				 * Don't see one at the moment, but easy to change the test				 * here if so.				 */				if (!FILE_POSSIBLY_DELETED(errno) ||					failures > 0)					ereport(ERROR,							(errcode_for_file_access(),							 errmsg("could not fsync segment %u of relation %u/%u/%u: %m",									entry->tag.segno,									entry->tag.rnode.spcNode,									entry->tag.rnode.dbNode,									entry->tag.rnode.relNode)));				else					ereport(DEBUG1,							(errcode_for_file_access(),							 errmsg("could not fsync segment %u of relation %u/%u/%u, but retrying: %m",									entry->tag.segno,									entry->tag.rnode.spcNode,									entry->tag.rnode.dbNode,									entry->tag.rnode.relNode)));				/*				 * Absorb incoming requests and check to see if canceled.				 */				AbsorbFsyncRequests();				absorb_counter = FSYNCS_PER_ABSORB;		/* might as well... */				if (entry->canceled)					break;			}					/* end retry loop */		}		/*		 * If we get here, either we fsync'd successfully, or we don't have to		 * because enableFsync is off, or the entry is (now) marked canceled.		 * Okay to delete it.		 */		if (hash_search(pendingOpsTable, &entry->tag,						HASH_REMOVE, NULL) == NULL)			elog(ERROR, "pendingOpsTable corrupted");	}							/* end loop over hashtable entries */	/* Flag successful completion of mdsync */	mdsync_in_progress = false;}/* * mdpreckpt() -- Do pre-checkpoint work * * To distinguish unlink requests that arrived before this checkpoint * started from those that arrived during the checkpoint, we use a cycle * counter similar to the one we use for fsync requests. That cycle * counter is incremented here. * * This must be called *before* the checkpoint REDO point is determined. * That ensures that we won't delete files too soon. * * Note that we can't do anything here that depends on the assumption * that the checkpoint will be completed. */voidmdpreckpt(void){	ListCell   *cell;	/*	 * In case the prior checkpoint wasn't completed, stamp all entries in the	 * list with the current cycle counter.  Anything that's in the list at	 * the start of checkpoint can surely be deleted after the checkpoint is	 * finished, regardless of when the request was made.	 */	foreach(cell, pendingUnlinks)	{		PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);		entry->cycle_ctr = mdckpt_cycle_ctr;	}	/*	 * Any unlink requests arriving after this point will be assigned the next	 * cycle counter, and won't be unlinked until next checkpoint.	 */	mdckpt_cycle_ctr++;}/* * mdpostckpt() -- Do post-checkpoint work * * Remove any lingering files that can now be safely removed. */voidmdpostckpt(void){	while (pendingUnlinks != NIL)	{		PendingUnlinkEntry *entry = (PendingUnlinkEntry *) linitial(pendingUnlinks);		char	   *path;		/*		 * New entries are appended to the end, so if the entry is new we've		 * reached the end of old entries.		 */		if (entry->cycle_ctr == mdckpt_cycle_ctr)			break;		/* Else assert we haven't missed it */		Assert((CycleCtr) (entry->cycle_ctr + 1) == mdckpt_cycle_ctr);		/* Unlink the file */		path = relpath(entry->rnode);		if (unlink(path) < 0)		{			/*			 * There's a race condition, when the database is dropped at the			 * same time that we process the pending unlink requests. If the			 * DROP DATABASE deletes the file before we do, we will get ENOENT			 * here. rmtree() also has to ignore ENOENT errors, to deal with			 * the possibility that we delete the file first.			 */			if (errno != ENOENT)				ereport(WARNING,						(errcode_for_file_access(),						 errmsg("could not remove relation %u/%u/%u: %m",								entry->rnode.spcNode,								entry->rnode.dbNode,								entry->rnode.relNode)));		}		pfree(path);		pendingUnlinks = list_delete_first(pendingUnlinks);		pfree(entry);	}}/* * register_dirty_segment() -- Mark a relation segment as needing fsync
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -