📄 md.c
字号:
return false; } return true;}/* * mdnblocks() -- Get the number of blocks stored in a relation. * * Important side effect: all segments of the relation are opened * and added to the mdfd_chain list. If this routine has not been * called, then only segments up to the last one actually touched * are present in the chain... * * Returns # of blocks, or InvalidBlockNumber on error. */BlockNumbermdnblocks(SMgrRelation reln){ MdfdVec *v = mdopen(reln, false);#ifndef LET_OS_MANAGE_FILESIZE BlockNumber nblocks; BlockNumber segno = 0; /* * Skip through any segments that aren't the last one, to avoid redundant * seeks on them. We have previously verified that these segments are * exactly RELSEG_SIZE long, and it's useless to recheck that each time. * (NOTE: this assumption could only be wrong if another backend has * truncated the relation. We rely on higher code levels to handle that * scenario by closing and re-opening the md fd.) */ while (v->mdfd_chain != NULL) { segno++; v = v->mdfd_chain; } for (;;) { nblocks = _mdnblocks(v->mdfd_vfd, BLCKSZ); if (nblocks > ((BlockNumber) RELSEG_SIZE)) elog(FATAL, "segment too big"); if (nblocks < ((BlockNumber) RELSEG_SIZE)) return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks; /* * If segment is exactly RELSEG_SIZE, advance to next one. */ segno++; if (v->mdfd_chain == NULL) { /* * Because we pass O_CREAT, we will create the next segment (with * zero length) immediately, if the last segment is of length * REL_SEGSIZE. This is unnecessary but harmless, and testing for * the case would take more cycles than it seems worth. */ v->mdfd_chain = _mdfd_openseg(reln, segno, O_CREAT); if (v->mdfd_chain == NULL) return InvalidBlockNumber; /* failed? */ } v = v->mdfd_chain; }#else return _mdnblocks(v->mdfd_vfd, BLCKSZ);#endif}/* * mdtruncate() -- Truncate relation to specified number of blocks. * * Returns # of blocks or InvalidBlockNumber on error. */BlockNumbermdtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp){ MdfdVec *v; BlockNumber curnblk;#ifndef LET_OS_MANAGE_FILESIZE BlockNumber priorblocks;#endif /* * NOTE: mdnblocks makes sure we have opened all existing segments, so * that truncate/delete loop will get them all! */ curnblk = mdnblocks(reln); if (curnblk == InvalidBlockNumber) return InvalidBlockNumber; /* mdnblocks failed */ if (nblocks > curnblk) return InvalidBlockNumber; /* bogus request */ if (nblocks == curnblk) return nblocks; /* no work */ v = mdopen(reln, false);#ifndef LET_OS_MANAGE_FILESIZE priorblocks = 0; while (v != NULL) { MdfdVec *ov = v; if (priorblocks > nblocks) { /* * This segment is no longer wanted at all (and has already been * unlinked from the mdfd_chain). We truncate the file before * deleting it because if other backends are holding the file * open, the unlink will fail on some platforms. Better a * zero-size file gets left around than a big file... */ FileTruncate(v->mdfd_vfd, 0); FileUnlink(v->mdfd_vfd); v = v->mdfd_chain; Assert(ov != reln->md_fd); /* we never drop the 1st segment */ pfree(ov); } else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks) { /* * This is the last segment we want to keep. Truncate the file to * the right length, and clear chain link that points to any * remaining segments (which we shall zap). NOTE: if nblocks is * exactly a multiple K of RELSEG_SIZE, we will truncate the K+1st * segment to 0 length but keep it. This is mainly so that the * right thing happens if nblocks==0. */ BlockNumber lastsegblocks = nblocks - priorblocks; if (FileTruncate(v->mdfd_vfd, lastsegblocks * BLCKSZ) < 0) return InvalidBlockNumber; if (!isTemp) { if (!register_dirty_segment(reln, v)) return InvalidBlockNumber; } v = v->mdfd_chain; ov->mdfd_chain = NULL; } else { /* * We still need this segment and 0 or more blocks beyond it, so * nothing to do here. */ v = v->mdfd_chain; } priorblocks += RELSEG_SIZE; }#else if (FileTruncate(v->mdfd_vfd, nblocks * BLCKSZ) < 0) return InvalidBlockNumber; if (!isTemp) { if (!register_dirty_segment(reln, v)) return InvalidBlockNumber; }#endif return nblocks;}/* * mdimmedsync() -- Immediately sync a relation to stable storage. * * Note that only writes already issued are synced; this routine knows * nothing of dirty buffers that may exist inside the buffer manager. */boolmdimmedsync(SMgrRelation reln){ MdfdVec *v; BlockNumber curnblk; /* * NOTE: mdnblocks makes sure we have opened all existing segments, so * that fsync loop will get them all! */ curnblk = mdnblocks(reln); if (curnblk == InvalidBlockNumber) return false; /* mdnblocks failed */ v = mdopen(reln, false);#ifndef LET_OS_MANAGE_FILESIZE while (v != NULL) { if (FileSync(v->mdfd_vfd) < 0) return false; v = v->mdfd_chain; }#else if (FileSync(v->mdfd_vfd) < 0) return false;#endif return true;}/* * mdsync() -- Sync previous writes to stable storage. * * This is only called during checkpoints, and checkpoints should only * occur in processes that have created a pendingOpsTable. */boolmdsync(void){ HASH_SEQ_STATUS hstat; PendingOperationEntry *entry; if (!pendingOpsTable) return false; /* * If we are in the bgwriter, the sync had better include all fsync * requests that were queued by backends before the checkpoint REDO point * was determined. We go that a little better by accepting all requests * queued up to the point where we start fsync'ing. */ AbsorbFsyncRequests(); hash_seq_init(&hstat, pendingOpsTable); while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL) { /* * If fsync is off then we don't have to bother opening the file at * all. (We delay checking until this point so that changing fsync on * the fly behaves sensibly.) */ if (enableFsync) { SMgrRelation reln; MdfdVec *seg; /* * Find or create an smgr hash entry for this relation. This may * seem a bit unclean -- md calling smgr? But it's really the * best solution. It ensures that the open file reference isn't * permanently leaked if we get an error here. (You may say "but * an unreferenced SMgrRelation is still a leak!" Not really, * because the only case in which a checkpoint is done by a * process that isn't about to shut down is in the bgwriter, and * it will periodically do smgrcloseall(). This fact justifies * our not closing the reln in the success path either, which is a * good thing since in non-bgwriter cases we couldn't safely do * that.) Furthermore, in many cases the relation will have been * dirtied through this same smgr relation, and so we can save a * file open/close cycle. */ reln = smgropen(entry->rnode); /* * It is possible that the relation has been dropped or truncated * since the fsync request was entered. Therefore, we have to * allow file-not-found errors. This applies both during * _mdfd_getseg() and during FileSync, since fd.c might have * closed the file behind our back. */ seg = _mdfd_getseg(reln, entry->segno * ((BlockNumber) RELSEG_SIZE), true); if (seg) { if (FileSync(seg->mdfd_vfd) < 0 && errno != ENOENT) { ereport(LOG, (errcode_for_file_access(), errmsg("could not fsync segment %u of relation %u/%u/%u: %m", entry->segno, entry->rnode.spcNode, entry->rnode.dbNode, entry->rnode.relNode))); return false; } } } /* Okay, delete this entry */ if (hash_search(pendingOpsTable, entry, HASH_REMOVE, NULL) == NULL) elog(ERROR, "pendingOpsTable corrupted"); } return true;}/* * register_dirty_segment() -- Mark a relation segment as needing fsync * * If there is a local pending-ops table, just make an entry in it for * mdsync to process later. Otherwise, try to pass off the fsync request * to the background writer process. If that fails, just do the fsync * locally before returning (we expect this will not happen often enough * to be a performance problem). * * A false result implies I/O failure during local fsync. errno will be * valid for error reporting. */static boolregister_dirty_segment(SMgrRelation reln, MdfdVec *seg){ if (pendingOpsTable) { PendingOperationEntry entry; /* ensure any pad bytes in the struct are zeroed */ MemSet(&entry, 0, sizeof(entry)); entry.rnode = reln->smgr_rnode; entry.segno = seg->mdfd_segno; (void) hash_search(pendingOpsTable, &entry, HASH_ENTER, NULL); return true; } else { if (ForwardFsyncRequest(reln->smgr_rnode, seg->mdfd_segno)) return true; } if (FileSync(seg->mdfd_vfd) < 0) return false; return true;}/* * RememberFsyncRequest() -- callback from bgwriter side of fsync request * * We stuff the fsync request into the local hash table for execution * during the bgwriter's next checkpoint. */voidRememberFsyncRequest(RelFileNode rnode, BlockNumber segno){ PendingOperationEntry entry; Assert(pendingOpsTable); /* ensure any pad bytes in the struct are zeroed */ MemSet(&entry, 0, sizeof(entry)); entry.rnode = rnode; entry.segno = segno; (void) hash_search(pendingOpsTable, &entry, HASH_ENTER, NULL);}/* * _fdvec_alloc() -- Make a MdfdVec object. */static MdfdVec *_fdvec_alloc(void){ return (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec));}#ifndef LET_OS_MANAGE_FILESIZE/* * Open the specified segment of the relation, * and make a MdfdVec object for it. Returns NULL on failure. */static MdfdVec *_mdfd_openseg(SMgrRelation reln, BlockNumber segno, int oflags){ MdfdVec *v; int fd; char *path, *fullpath; path = relpath(reln->smgr_rnode); if (segno > 0) { /* be sure we have enough space for the '.segno' */ fullpath = (char *) palloc(strlen(path) + 12); sprintf(fullpath, "%s.%u", path, segno); pfree(path); } else fullpath = path; /* open the file */ fd = PathNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags, 0600); pfree(fullpath); if (fd < 0) return NULL; /* allocate an mdfdvec entry for it */ v = _fdvec_alloc(); /* fill the entry */ v->mdfd_vfd = fd; v->mdfd_segno = segno; v->mdfd_chain = NULL; Assert(_mdnblocks(fd, BLCKSZ) <= ((BlockNumber) RELSEG_SIZE)); /* all done */ return v;}#endif /* LET_OS_MANAGE_FILESIZE *//* * _mdfd_getseg() -- Find the segment of the relation holding the * specified block. ereport's on failure. * (Optionally, can return NULL instead of ereport for ENOENT.) */static MdfdVec *_mdfd_getseg(SMgrRelation reln, BlockNumber blkno, bool allowNotFound){ MdfdVec *v = mdopen(reln, allowNotFound);#ifndef LET_OS_MANAGE_FILESIZE BlockNumber segstogo; BlockNumber nextsegno; if (!v) return NULL; /* only possible if allowNotFound */ for (segstogo = blkno / ((BlockNumber) RELSEG_SIZE), nextsegno = 1; segstogo > 0; nextsegno++, segstogo--) { if (v->mdfd_chain == NULL) { /* * We will create the next segment only if the target block is * within it. This prevents Sorcerer's Apprentice syndrome if a * bug at higher levels causes us to be handed a ridiculously * large blkno --- otherwise we could create many thousands of * empty segment files before reaching the "target" block. We * should never need to create more than one new segment per call, * so this restriction seems reasonable. * * BUT: when doing WAL recovery, disable this logic and create * segments unconditionally. In this case it seems better to * assume the given blkno is good (it presumably came from a * CRC-checked WAL record); furthermore this lets us cope in the * case where we are replaying WAL data that has a write into a * high-numbered segment of a relation that was later deleted. We * want to go ahead and create the segments so we can finish out * the replay. */ v->mdfd_chain = _mdfd_openseg(reln, nextsegno, (segstogo == 1 || InRecovery) ? O_CREAT : 0); if (v->mdfd_chain == NULL) { if (allowNotFound && errno == ENOENT) return NULL; ereport(ERROR, (errcode_for_file_access(), errmsg("could not open segment %u of relation %u/%u/%u (target block %u): %m", nextsegno, reln->smgr_rnode.spcNode, reln->smgr_rnode.dbNode, reln->smgr_rnode.relNode, blkno))); } } v = v->mdfd_chain; }#endif return v;}/* * Get number of blocks present in a single disk file */static BlockNumber_mdnblocks(File file, Size blcksz){ long len; len = FileSeek(file, 0L, SEEK_END); if (len < 0) return 0; /* on failure, assume file is empty */ return (BlockNumber) (len / blcksz);}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -