📄 md.c
字号:
/* * This segment is no longer active (and has already been unlinked * from the mdfd_chain). We truncate the file, but do not delete * it, for reasons explained in the header comments. */ if (FileTruncate(v->mdfd_vfd, 0) < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not truncate relation %u/%u/%u to %u blocks: %m", reln->smgr_rnode.spcNode, reln->smgr_rnode.dbNode, reln->smgr_rnode.relNode, nblocks))); if (!isTemp) register_dirty_segment(reln, v); v = v->mdfd_chain; Assert(ov != reln->md_fd); /* we never drop the 1st segment */ pfree(ov); } else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks) { /* * This is the last segment we want to keep. Truncate the file to * the right length, and clear chain link that points to any * remaining segments (which we shall zap). NOTE: if nblocks is * exactly a multiple K of RELSEG_SIZE, we will truncate the K+1st * segment to 0 length but keep it. This adheres to the invariant * given in the header comments. */ BlockNumber lastsegblocks = nblocks - priorblocks; if (FileTruncate(v->mdfd_vfd, lastsegblocks * BLCKSZ) < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not truncate relation %u/%u/%u to %u blocks: %m", reln->smgr_rnode.spcNode, reln->smgr_rnode.dbNode, reln->smgr_rnode.relNode, nblocks))); if (!isTemp) register_dirty_segment(reln, v); v = v->mdfd_chain; ov->mdfd_chain = NULL; } else { /* * We still need this segment and 0 or more blocks beyond it, so * nothing to do here. */ v = v->mdfd_chain; } priorblocks += RELSEG_SIZE; }#else if (FileTruncate(v->mdfd_vfd, nblocks * BLCKSZ) < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not truncate relation %u/%u/%u to %u blocks: %m", reln->smgr_rnode.spcNode, reln->smgr_rnode.dbNode, reln->smgr_rnode.relNode, nblocks))); if (!isTemp) register_dirty_segment(reln, v);#endif}/* * mdimmedsync() -- Immediately sync a relation to stable storage. * * Note that only writes already issued are synced; this routine knows * nothing of dirty buffers that may exist inside the buffer manager. */voidmdimmedsync(SMgrRelation reln){ MdfdVec *v; BlockNumber curnblk; /* * NOTE: mdnblocks makes sure we have opened all active segments, so that * fsync loop will get them all! */ curnblk = mdnblocks(reln); v = mdopen(reln, EXTENSION_FAIL);#ifndef LET_OS_MANAGE_FILESIZE while (v != NULL) { if (FileSync(v->mdfd_vfd) < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not fsync segment %u of relation %u/%u/%u: %m", v->mdfd_segno, reln->smgr_rnode.spcNode, reln->smgr_rnode.dbNode, reln->smgr_rnode.relNode))); v = v->mdfd_chain; }#else if (FileSync(v->mdfd_vfd) < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not fsync segment %u of relation %u/%u/%u: %m", v->mdfd_segno, reln->smgr_rnode.spcNode, reln->smgr_rnode.dbNode, reln->smgr_rnode.relNode)));#endif}/* * mdsync() -- Sync previous writes to stable storage. */voidmdsync(void){ static bool mdsync_in_progress = false; HASH_SEQ_STATUS hstat; PendingOperationEntry *entry; int absorb_counter; /* * This is only called during checkpoints, and checkpoints should only * occur in processes that have created a pendingOpsTable. */ if (!pendingOpsTable) elog(ERROR, "cannot sync without a pendingOpsTable"); /* * If we are in the bgwriter, the sync had better include all fsync * requests that were queued by backends up to this point. The tightest * race condition that could occur is that a buffer that must be written * and fsync'd for the checkpoint could have been dumped by a backend just * before it was visited by BufferSync(). We know the backend will have * queued an fsync request before clearing the buffer's dirtybit, so we * are safe as long as we do an Absorb after completing BufferSync(). */ AbsorbFsyncRequests(); /* * To avoid excess fsync'ing (in the worst case, maybe a never-terminating * checkpoint), we want to ignore fsync requests that are entered into the * hashtable after this point --- they should be processed next time, * instead. We use mdsync_cycle_ctr to tell old entries apart from new * ones: new ones will have cycle_ctr equal to the incremented value of * mdsync_cycle_ctr. * * In normal circumstances, all entries present in the table at this point * will have cycle_ctr exactly equal to the current (about to be old) * value of mdsync_cycle_ctr. However, if we fail partway through the * fsync'ing loop, then older values of cycle_ctr might remain when we * come back here to try again. Repeated checkpoint failures would * eventually wrap the counter around to the point where an old entry * might appear new, causing us to skip it, possibly allowing a checkpoint * to succeed that should not have. To forestall wraparound, any time the * previous mdsync() failed to complete, run through the table and * forcibly set cycle_ctr = mdsync_cycle_ctr. * * Think not to merge this loop with the main loop, as the problem is * exactly that that loop may fail before having visited all the entries. * From a performance point of view it doesn't matter anyway, as this path * will never be taken in a system that's functioning normally. */ if (mdsync_in_progress) { /* prior try failed, so update any stale cycle_ctr values */ hash_seq_init(&hstat, pendingOpsTable); while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL) { entry->cycle_ctr = mdsync_cycle_ctr; } } /* Advance counter so that new hashtable entries are distinguishable */ mdsync_cycle_ctr++; /* Set flag to detect failure if we don't reach the end of the loop */ mdsync_in_progress = true; /* Now scan the hashtable for fsync requests to process */ absorb_counter = FSYNCS_PER_ABSORB; hash_seq_init(&hstat, pendingOpsTable); while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL) { /* * If the entry is new then don't process it this time. Note that * "continue" bypasses the hash-remove call at the bottom of the loop. */ if (entry->cycle_ctr == mdsync_cycle_ctr) continue; /* Else assert we haven't missed it */ Assert((CycleCtr) (entry->cycle_ctr + 1) == mdsync_cycle_ctr); /* * If fsync is off then we don't have to bother opening the file at * all. (We delay checking until this point so that changing fsync on * the fly behaves sensibly.) Also, if the entry is marked canceled, * fall through to delete it. */ if (enableFsync && !entry->canceled) { int failures; /* * If in bgwriter, we want to absorb pending requests every so * often to prevent overflow of the fsync request queue. It is * unspecified whether newly-added entries will be visited by * hash_seq_search, but we don't care since we don't need to * process them anyway. */ if (--absorb_counter <= 0) { AbsorbFsyncRequests(); absorb_counter = FSYNCS_PER_ABSORB; } /* * The fsync table could contain requests to fsync segments that * have been deleted (unlinked) by the time we get to them. Rather * than just hoping an ENOENT (or EACCES on Windows) error can be * ignored, what we do on error is absorb pending requests and * then retry. Since mdunlink() queues a "revoke" message before * actually unlinking, the fsync request is guaranteed to be * marked canceled after the absorb if it really was this case. * DROP DATABASE likewise has to tell us to forget fsync requests * before it starts deletions. */ for (failures = 0;; failures++) /* loop exits at "break" */ { SMgrRelation reln; MdfdVec *seg; /* * Find or create an smgr hash entry for this relation. This * may seem a bit unclean -- md calling smgr? But it's really * the best solution. It ensures that the open file reference * isn't permanently leaked if we get an error here. (You may * say "but an unreferenced SMgrRelation is still a leak!" Not * really, because the only case in which a checkpoint is done * by a process that isn't about to shut down is in the * bgwriter, and it will periodically do smgrcloseall(). This * fact justifies our not closing the reln in the success path * either, which is a good thing since in non-bgwriter cases * we couldn't safely do that.) Furthermore, in many cases * the relation will have been dirtied through this same smgr * relation, and so we can save a file open/close cycle. */ reln = smgropen(entry->tag.rnode); /* * It is possible that the relation has been dropped or * truncated since the fsync request was entered. Therefore, * allow ENOENT, but only if we didn't fail already on this * file. This applies both during _mdfd_getseg() and during * FileSync, since fd.c might have closed the file behind our * back. */ seg = _mdfd_getseg(reln, entry->tag.segno * ((BlockNumber) RELSEG_SIZE), false, EXTENSION_RETURN_NULL); if (seg != NULL && FileSync(seg->mdfd_vfd) >= 0) break; /* success; break out of retry loop */ /* * XXX is there any point in allowing more than one retry? * Don't see one at the moment, but easy to change the test * here if so. */ if (!FILE_POSSIBLY_DELETED(errno) || failures > 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not fsync segment %u of relation %u/%u/%u: %m", entry->tag.segno, entry->tag.rnode.spcNode, entry->tag.rnode.dbNode, entry->tag.rnode.relNode))); else ereport(DEBUG1, (errcode_for_file_access(), errmsg("could not fsync segment %u of relation %u/%u/%u, but retrying: %m", entry->tag.segno, entry->tag.rnode.spcNode, entry->tag.rnode.dbNode, entry->tag.rnode.relNode))); /* * Absorb incoming requests and check to see if canceled. */ AbsorbFsyncRequests(); absorb_counter = FSYNCS_PER_ABSORB; /* might as well... */ if (entry->canceled) break; } /* end retry loop */ } /* * If we get here, either we fsync'd successfully, or we don't have to * because enableFsync is off, or the entry is (now) marked canceled. * Okay to delete it. */ if (hash_search(pendingOpsTable, &entry->tag, HASH_REMOVE, NULL) == NULL) elog(ERROR, "pendingOpsTable corrupted"); } /* end loop over hashtable entries */ /* Flag successful completion of mdsync */ mdsync_in_progress = false;}/* * mdpreckpt() -- Do pre-checkpoint work * * To distinguish unlink requests that arrived before this checkpoint * started from those that arrived during the checkpoint, we use a cycle * counter similar to the one we use for fsync requests. That cycle * counter is incremented here. * * This must be called *before* the checkpoint REDO point is determined. * That ensures that we won't delete files too soon. * * Note that we can't do anything here that depends on the assumption * that the checkpoint will be completed. */voidmdpreckpt(void){ ListCell *cell; /* * In case the prior checkpoint wasn't completed, stamp all entries in the * list with the current cycle counter. Anything that's in the list at * the start of checkpoint can surely be deleted after the checkpoint is * finished, regardless of when the request was made. */ foreach(cell, pendingUnlinks) { PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell); entry->cycle_ctr = mdckpt_cycle_ctr; } /* * Any unlink requests arriving after this point will be assigned the next * cycle counter, and won't be unlinked until next checkpoint. */ mdckpt_cycle_ctr++;}/* * mdpostckpt() -- Do post-checkpoint work * * Remove any lingering files that can now be safely removed. */voidmdpostckpt(void){ while (pendingUnlinks != NIL) { PendingUnlinkEntry *entry = (PendingUnlinkEntry *) linitial(pendingUnlinks); char *path; /* * New entries are appended to the end, so if the entry is new we've * reached the end of old entries. */ if (entry->cycle_ctr == mdckpt_cycle_ctr) break; /* Else assert we haven't missed it */ Assert((CycleCtr) (entry->cycle_ctr + 1) == mdckpt_cycle_ctr); /* Unlink the file */ path = relpath(entry->rnode); if (unlink(path) < 0) { /* * There's a race condition, when the database is dropped at the * same time that we process the pending unlink requests. If the * DROP DATABASE deletes the file before we do, we will get ENOENT * here. rmtree() also has to ignore ENOENT errors, to deal with * the possibility that we delete the file first. */ if (errno != ENOENT) ereport(WARNING, (errcode_for_file_access(), errmsg("could not remove relation %u/%u/%u: %m", entry->rnode.spcNode, entry->rnode.dbNode, entry->rnode.relNode))); } pfree(path); pendingUnlinks = list_delete_first(pendingUnlinks); pfree(entry); }}/* * register_dirty_segment() -- Mark a relation segment as needing fsync
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -