logtape.c
来自「postgresql8.3.4源码,开源数据库」· C语言 代码 · 共 1,024 行 · 第 1/3 页
C
1,024 行
/*------------------------------------------------------------------------- * * logtape.c * Management of "logical tapes" within temporary files. * * This module exists to support sorting via multiple merge passes (see * tuplesort.c). Merging is an ideal algorithm for tape devices, but if * we implement it on disk by creating a separate file for each "tape", * there is an annoying problem: the peak space usage is at least twice * the volume of actual data to be sorted. (This must be so because each * datum will appear in both the input and output tapes of the final * merge pass. For seven-tape polyphase merge, which is otherwise a * pretty good algorithm, peak usage is more like 4x actual data volume.) * * We can work around this problem by recognizing that any one tape * dataset (with the possible exception of the final output) is written * and read exactly once in a perfectly sequential manner. Therefore, * a datum once read will not be required again, and we can recycle its * space for use by the new tape dataset(s) being generated. In this way, * the total space usage is essentially just the actual data volume, plus * insignificant bookkeeping and start/stop overhead. * * Few OSes allow arbitrary parts of a file to be released back to the OS, * so we have to implement this space-recycling ourselves within a single * logical file. logtape.c exists to perform this bookkeeping and provide * the illusion of N independent tape devices to tuplesort.c. Note that * logtape.c itself depends on buffile.c to provide a "logical file" of * larger size than the underlying OS may support. * * For simplicity, we allocate and release space in the underlying file * in BLCKSZ-size blocks. Space allocation boils down to keeping track * of which blocks in the underlying file belong to which logical tape, * plus any blocks that are free (recycled and not yet reused). * The blocks in each logical tape are remembered using a method borrowed * from the Unix HFS filesystem: we store data block numbers in an * "indirect block". If an indirect block fills up, we write it out to * the underlying file and remember its location in a second-level indirect * block. In the same way second-level blocks are remembered in third- * level blocks, and so on if necessary (of course we're talking huge * amounts of data here). The topmost indirect block of a given logical * tape is never actually written out to the physical file, but all lower- * level indirect blocks will be. * * The initial write pass is guaranteed to fill the underlying file * perfectly sequentially, no matter how data is divided into logical tapes. * Once we begin merge passes, the access pattern becomes considerably * less predictable --- but the seeking involved should be comparable to * what would happen if we kept each logical tape in a separate file, * so there's no serious performance penalty paid to obtain the space * savings of recycling. We try to localize the write accesses by always * writing to the lowest-numbered free block when we have a choice; it's * not clear this helps much, but it can't hurt. (XXX perhaps a LIFO * policy for free blocks would be better?) * * To support the above policy of writing to the lowest free block, * ltsGetFreeBlock sorts the list of free block numbers into decreasing * order each time it is asked for a block and the list isn't currently * sorted. This is an efficient way to handle it because we expect cycles * of releasing many blocks followed by re-using many blocks, due to * tuplesort.c's "preread" behavior. * * Since all the bookkeeping and buffer memory is allocated with palloc(), * and the underlying file(s) are made with OpenTemporaryFile, all resources * for a logical tape set are certain to be cleaned up even if processing * is aborted by ereport(ERROR). To avoid confusion, the caller should take * care that all calls for a single LogicalTapeSet are made in the same * palloc context. * * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION * $PostgreSQL: pgsql/src/backend/utils/sort/logtape.c,v 1.26 2008/01/01 19:45:55 momjian Exp $ * *------------------------------------------------------------------------- */#include "postgres.h"#include "storage/buffile.h"#include "utils/logtape.h"/* * Block indexes are "long"s, so we can fit this many per indirect block. * NB: we assume this is an exact fit! */#define BLOCKS_PER_INDIR_BLOCK ((int) (BLCKSZ / sizeof(long)))/* * We use a struct like this for each active indirection level of each * logical tape. If the indirect block is not the highest level of its * tape, the "nextup" link points to the next higher level. Only the * "ptrs" array is written out if we have to dump the indirect block to * disk. If "ptrs" is not completely full, we store -1L in the first * unused slot at completion of the write phase for the logical tape. */typedef struct IndirectBlock{ int nextSlot; /* next pointer slot to write or read */ struct IndirectBlock *nextup; /* parent indirect level, or NULL if * top */ long ptrs[BLOCKS_PER_INDIR_BLOCK]; /* indexes of contained blocks */} IndirectBlock;/* * This data structure represents a single "logical tape" within the set * of logical tapes stored in the same file. We must keep track of the * current partially-read-or-written data block as well as the active * indirect block level(s). */typedef struct LogicalTape{ IndirectBlock *indirect; /* bottom of my indirect-block hierarchy */ bool writing; /* T while in write phase */ bool frozen; /* T if blocks should not be freed when read */ bool dirty; /* does buffer need to be written? */ /* * The total data volume in the logical tape is numFullBlocks * BLCKSZ + * lastBlockBytes. BUT: we do not update lastBlockBytes during writing, * only at completion of a write phase. */ long numFullBlocks; /* number of complete blocks in log tape */ int lastBlockBytes; /* valid bytes in last (incomplete) block */ /* * Buffer for current data block. Note we don't bother to store the * actual file block number of the data block (during the write phase it * hasn't been assigned yet, and during read we don't care anymore). But * we do need the relative block number so we can detect end-of-tape while * reading. */ char *buffer; /* physical buffer (separately palloc'd) */ long curBlockNumber; /* this block's logical blk# within tape */ int pos; /* next read/write position in buffer */ int nbytes; /* total # of valid bytes in buffer */} LogicalTape;/* * This data structure represents a set of related "logical tapes" sharing * space in a single underlying file. (But that "file" may be multiple files * if needed to escape OS limits on file size; buffile.c handles that for us.) * The number of tapes is fixed at creation. */struct LogicalTapeSet{ BufFile *pfile; /* underlying file for whole tape set */ long nFileBlocks; /* # of blocks used in underlying file */ /* * We store the numbers of recycled-and-available blocks in freeBlocks[]. * When there are no such blocks, we extend the underlying file. * * If forgetFreeSpace is true then any freed blocks are simply forgotten * rather than being remembered in freeBlocks[]. See notes for * LogicalTapeSetForgetFreeSpace(). * * If blocksSorted is true then the block numbers in freeBlocks are in * *decreasing* order, so that removing the last entry gives us the lowest * free block. We re-sort the blocks whenever a block is demanded; this * should be reasonably efficient given the expected usage pattern. */ bool forgetFreeSpace; /* are we remembering free blocks? */ bool blocksSorted; /* is freeBlocks[] currently in order? */ long *freeBlocks; /* resizable array */ int nFreeBlocks; /* # of currently free blocks */ int freeBlocksLen; /* current allocated length of freeBlocks[] */ /* * tapes[] is declared size 1 since C wants a fixed size, but actually it * is of length nTapes. */ int nTapes; /* # of logical tapes in set */ LogicalTape tapes[1]; /* must be last in struct! */};static void ltsWriteBlock(LogicalTapeSet *lts, long blocknum, void *buffer);static void ltsReadBlock(LogicalTapeSet *lts, long blocknum, void *buffer);static long ltsGetFreeBlock(LogicalTapeSet *lts);static void ltsReleaseBlock(LogicalTapeSet *lts, long blocknum);static void ltsRecordBlockNum(LogicalTapeSet *lts, IndirectBlock *indirect, long blocknum);static long ltsRewindIndirectBlock(LogicalTapeSet *lts, IndirectBlock *indirect, bool freezing);static long ltsRewindFrozenIndirectBlock(LogicalTapeSet *lts, IndirectBlock *indirect);static long ltsRecallNextBlockNum(LogicalTapeSet *lts, IndirectBlock *indirect, bool frozen);static long ltsRecallPrevBlockNum(LogicalTapeSet *lts, IndirectBlock *indirect);static void ltsDumpBuffer(LogicalTapeSet *lts, LogicalTape *lt);/* * Write a block-sized buffer to the specified block of the underlying file. * * NB: should not attempt to write beyond current end of file (ie, create * "holes" in file), since BufFile doesn't allow that. The first write pass * must write blocks sequentially. * * No need for an error return convention; we ereport() on any error. */static voidltsWriteBlock(LogicalTapeSet *lts, long blocknum, void *buffer){ if (BufFileSeekBlock(lts->pfile, blocknum) != 0 || BufFileWrite(lts->pfile, buffer, BLCKSZ) != BLCKSZ) ereport(ERROR, /* XXX is it okay to assume errno is correct? */ (errcode_for_file_access(), errmsg("could not write block %ld of temporary file: %m", blocknum), errhint("Perhaps out of disk space?")));}/* * Read a block-sized buffer from the specified block of the underlying file. * * No need for an error return convention; we ereport() on any error. This * module should never attempt to read a block it doesn't know is there. */static voidltsReadBlock(LogicalTapeSet *lts, long blocknum, void *buffer){ if (BufFileSeekBlock(lts->pfile, blocknum) != 0 || BufFileRead(lts->pfile, buffer, BLCKSZ) != BLCKSZ) ereport(ERROR, /* XXX is it okay to assume errno is correct? */ (errcode_for_file_access(), errmsg("could not read block %ld of temporary file: %m", blocknum)));}/* * qsort comparator for sorting freeBlocks[] into decreasing order. */static intfreeBlocks_cmp(const void *a, const void *b){ long ablk = *((const long *) a); long bblk = *((const long *) b); /* can't just subtract because long might be wider than int */ if (ablk < bblk) return 1; if (ablk > bblk) return -1; return 0;}/* * Select a currently unused block for writing to. * * NB: should only be called when writer is ready to write immediately, * to ensure that first write pass is sequential. */static longltsGetFreeBlock(LogicalTapeSet *lts){ /* * If there are multiple free blocks, we select the one appearing last in * freeBlocks[] (after sorting the array if needed). If there are none, * assign the next block at the end of the file. */ if (lts->nFreeBlocks > 0) { if (!lts->blocksSorted) { qsort((void *) lts->freeBlocks, lts->nFreeBlocks, sizeof(long), freeBlocks_cmp); lts->blocksSorted = true; } return lts->freeBlocks[--lts->nFreeBlocks]; } else return lts->nFileBlocks++;}/* * Return a block# to the freelist. */static voidltsReleaseBlock(LogicalTapeSet *lts, long blocknum){ int ndx; /* * Do nothing if we're no longer interested in remembering free space. */ if (lts->forgetFreeSpace) return; /* * Enlarge freeBlocks array if full. */ if (lts->nFreeBlocks >= lts->freeBlocksLen) { lts->freeBlocksLen *= 2; lts->freeBlocks = (long *) repalloc(lts->freeBlocks, lts->freeBlocksLen * sizeof(long)); } /* * Add blocknum to array, and mark the array unsorted if it's no longer in * decreasing order. */ ndx = lts->nFreeBlocks++; lts->freeBlocks[ndx] = blocknum; if (ndx > 0 && lts->freeBlocks[ndx - 1] < blocknum) lts->blocksSorted = false;}/* * These routines manipulate indirect-block hierarchies. All are recursive * so that they don't have any specific limit on the depth of hierarchy. *//* * Record a data block number in a logical tape's lowest indirect block, * or record an indirect block's number in the next higher indirect level. */static voidltsRecordBlockNum(LogicalTapeSet *lts, IndirectBlock *indirect, long blocknum){ if (indirect->nextSlot >= BLOCKS_PER_INDIR_BLOCK) { /* * This indirect block is full, so dump it out and recursively save * its address in the next indirection level. Create a new * indirection level if there wasn't one before. */ long indirblock = ltsGetFreeBlock(lts); ltsWriteBlock(lts, indirblock, (void *) indirect->ptrs); if (indirect->nextup == NULL) { indirect->nextup = (IndirectBlock *) palloc(sizeof(IndirectBlock)); indirect->nextup->nextSlot = 0; indirect->nextup->nextup = NULL;
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?