tidbitmap.c
来自「PostgreSQL 8.1.4的源码 适用于Linux下的开源数据库系统」· C语言 代码 · 共 935 行 · 第 1/2 页
C
935 行
/*------------------------------------------------------------------------- * * tidbitmap.c * PostgreSQL tuple-id (TID) bitmap package * * This module provides bitmap data structures that are spiritually * similar to Bitmapsets, but are specially adapted to store sets of * tuple identifiers (TIDs), or ItemPointers. In particular, the division * of an ItemPointer into BlockNumber and OffsetNumber is catered for. * Also, since we wish to be able to store very large tuple sets in * memory with this data structure, we support "lossy" storage, in which * we no longer remember individual tuple offsets on a page but only the * fact that a particular page needs to be visited. * * The "lossy" storage uses one bit per disk page, so at the standard 8K * BLCKSZ, we can represent all pages in 64Gb of disk space in about 1Mb * of memory. People pushing around tables of that size should have a * couple of Mb to spare, so we don't worry about providing a second level * of lossiness. In theory we could fall back to page ranges at some * point, but for now that seems useless complexity. * * * Copyright (c) 2003-2005, PostgreSQL Global Development Group * * IDENTIFICATION * $PostgreSQL: pgsql/src/backend/nodes/tidbitmap.c,v 1.8 2005/10/15 02:49:19 momjian Exp $ * *------------------------------------------------------------------------- */#include "postgres.h"#include <limits.h>#include "access/htup.h"#include "nodes/tidbitmap.h"#include "utils/hsearch.h"/* * The maximum number of tuples per page is not large (typically 256 with * 8K pages, or 1024 with 32K pages). So there's not much point in making * the per-page bitmaps variable size. We just legislate that the size * is this: */#define MAX_TUPLES_PER_PAGE MaxHeapTuplesPerPage/* * When we have to switch over to lossy storage, we use a data structure * with one bit per page, where all pages having the same number DIV * PAGES_PER_CHUNK are aggregated into one chunk. When a chunk is present * and has the bit set for a given page, there must not be a per-page entry * for that page in the page table. * * We actually store both exact pages and lossy chunks in the same hash * table, using identical data structures. (This is because dynahash.c's * memory management doesn't allow space to be transferred easily from one * hashtable to another.) Therefore it's best if PAGES_PER_CHUNK is the * same as MAX_TUPLES_PER_PAGE, or at least not too different. But we * also want PAGES_PER_CHUNK to be a power of 2 to avoid expensive integer * remainder operations. So, define it like this: */#define PAGES_PER_CHUNK (BLCKSZ / 32)/* The bitmap unit size can be adjusted by changing these declarations: */#define BITS_PER_BITMAPWORD 32typedef uint32 bitmapword; /* must be an unsigned type */#define WORDNUM(x) ((x) / BITS_PER_BITMAPWORD)#define BITNUM(x) ((x) % BITS_PER_BITMAPWORD)/* number of active words for an exact page: */#define WORDS_PER_PAGE ((MAX_TUPLES_PER_PAGE - 1) / BITS_PER_BITMAPWORD + 1)/* number of active words for a lossy chunk: */#define WORDS_PER_CHUNK ((PAGES_PER_CHUNK - 1) / BITS_PER_BITMAPWORD + 1)/* * The hashtable entries are represented by this data structure. For * an exact page, blockno is the page number and bit k of the bitmap * represents tuple offset k+1. For a lossy chunk, blockno is the first * page in the chunk (this must be a multiple of PAGES_PER_CHUNK) and * bit k represents page blockno+k. Note that it is not possible to * have exact storage for the first page of a chunk if we are using * lossy storage for any page in the chunk's range, since the same * hashtable entry has to serve both purposes. */typedef struct PagetableEntry{ BlockNumber blockno; /* page number (hashtable key) */ bool ischunk; /* T = lossy storage, F = exact */ bitmapword words[Max(WORDS_PER_PAGE, WORDS_PER_CHUNK)];} PagetableEntry;/* * dynahash.c is optimized for relatively large, long-lived hash tables. * This is not ideal for TIDBitMap, particularly when we are using a bitmap * scan on the inside of a nestloop join: a bitmap may well live only long * enough to accumulate one entry in such cases. We therefore avoid creating * an actual hashtable until we need two pagetable entries. When just one * pagetable entry is needed, we store it in a fixed field of TIDBitMap. * (NOTE: we don't get rid of the hashtable if the bitmap later shrinks down * to zero or one page again. So, status can be TBM_HASH even when nentries * is zero or one.) */typedef enum{ TBM_EMPTY, /* no hashtable, nentries == 0 */ TBM_ONE_PAGE, /* entry1 contains the single entry */ TBM_HASH /* pagetable is valid, entry1 is not */} TBMStatus;/* * Here is the representation for a whole TIDBitMap: */struct TIDBitmap{ NodeTag type; /* to make it a valid Node */ MemoryContext mcxt; /* memory context containing me */ TBMStatus status; /* see codes above */ HTAB *pagetable; /* hash table of PagetableEntry's */ int nentries; /* number of entries in pagetable */ int maxentries; /* limit on same to meet maxbytes */ int npages; /* number of exact entries in pagetable */ int nchunks; /* number of lossy entries in pagetable */ bool iterating; /* tbm_begin_iterate called? */ PagetableEntry entry1; /* used when status == TBM_ONE_PAGE */ /* the remaining fields are used while producing sorted output: */ PagetableEntry **spages; /* sorted exact-page list, or NULL */ PagetableEntry **schunks; /* sorted lossy-chunk list, or NULL */ int spageptr; /* next spages index */ int schunkptr; /* next schunks index */ int schunkbit; /* next bit to check in current schunk */ TBMIterateResult output; /* MUST BE LAST (because variable-size) */};/* Local function prototypes */static void tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage);static bool tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage, const TIDBitmap *b);static const PagetableEntry *tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno);static PagetableEntry *tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno);static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno);static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno);static void tbm_lossify(TIDBitmap *tbm);static int tbm_comparator(const void *left, const void *right);/* * tbm_create - create an initially-empty bitmap * * The bitmap will live in the memory context that is CurrentMemoryContext * at the time of this call. It will be limited to (approximately) maxbytes * total memory consumption. */TIDBitmap *tbm_create(long maxbytes){ TIDBitmap *tbm; long nbuckets; /* * Create the TIDBitmap struct, with enough trailing space to serve the * needs of the TBMIterateResult sub-struct. */ tbm = (TIDBitmap *) palloc(sizeof(TIDBitmap) + MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber)); /* Zero all the fixed fields */ MemSetAligned(tbm, 0, sizeof(TIDBitmap)); tbm->type = T_TIDBitmap; /* Set NodeTag */ tbm->mcxt = CurrentMemoryContext; tbm->status = TBM_EMPTY; /* * Estimate number of hashtable entries we can have within maxbytes. This * estimates the hash overhead at MAXALIGN(sizeof(HASHELEMENT)) plus a * pointer per hash entry, which is crude but good enough for our purpose. * Also count an extra Pointer per entry for the arrays created during * iteration readout. */ nbuckets = maxbytes / (MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(sizeof(PagetableEntry)) + sizeof(Pointer) + sizeof(Pointer)); nbuckets = Min(nbuckets, INT_MAX - 1); /* safety limit */ nbuckets = Max(nbuckets, 16); /* sanity limit */ tbm->maxentries = (int) nbuckets; return tbm;}/* * Actually create the hashtable. Since this is a moderately expensive * proposition, we don't do it until we have to. */static voidtbm_create_pagetable(TIDBitmap *tbm){ HASHCTL hash_ctl; Assert(tbm->status != TBM_HASH); Assert(tbm->pagetable == NULL); /* Create the hashtable proper */ MemSet(&hash_ctl, 0, sizeof(hash_ctl)); hash_ctl.keysize = sizeof(BlockNumber); hash_ctl.entrysize = sizeof(PagetableEntry); hash_ctl.hash = tag_hash; hash_ctl.hcxt = tbm->mcxt; tbm->pagetable = hash_create("TIDBitmap", 128, /* start small and extend */ &hash_ctl, HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT); /* If entry1 is valid, push it into the hashtable */ if (tbm->status == TBM_ONE_PAGE) { PagetableEntry *page; bool found; page = (PagetableEntry *) hash_search(tbm->pagetable, (void *) &tbm->entry1.blockno, HASH_ENTER, &found); Assert(!found); memcpy(page, &tbm->entry1, sizeof(PagetableEntry)); } tbm->status = TBM_HASH;}/* * tbm_free - free a TIDBitmap */voidtbm_free(TIDBitmap *tbm){ if (tbm->pagetable) hash_destroy(tbm->pagetable); if (tbm->spages) pfree(tbm->spages); if (tbm->schunks) pfree(tbm->schunks); pfree(tbm);}/* * tbm_add_tuples - add some tuple IDs to a TIDBitmap */voidtbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids){ int i; Assert(!tbm->iterating); for (i = 0; i < ntids; i++) { BlockNumber blk = ItemPointerGetBlockNumber(tids + i); OffsetNumber off = ItemPointerGetOffsetNumber(tids + i); PagetableEntry *page; int wordnum, bitnum; /* safety check to ensure we don't overrun bit array bounds */ if (off < 1 || off > MAX_TUPLES_PER_PAGE) elog(ERROR, "tuple offset out of range: %u", off); if (tbm_page_is_lossy(tbm, blk)) continue; /* whole page is already marked */ page = tbm_get_pageentry(tbm, blk); if (page->ischunk) { /* The page is a lossy chunk header, set bit for itself */ wordnum = bitnum = 0; } else { /* Page is exact, so set bit for individual tuple */ wordnum = WORDNUM(off - 1); bitnum = BITNUM(off - 1); } page->words[wordnum] |= ((bitmapword) 1 << bitnum); if (tbm->nentries > tbm->maxentries) tbm_lossify(tbm); }}/* * tbm_union - set union * * a is modified in-place, b is not changed */voidtbm_union(TIDBitmap *a, const TIDBitmap *b){ Assert(!a->iterating); /* Nothing to do if b is empty */ if (b->nentries == 0) return; /* Scan through chunks and pages in b, merge into a */ if (b->status == TBM_ONE_PAGE) tbm_union_page(a, &b->entry1); else { HASH_SEQ_STATUS status; PagetableEntry *bpage; Assert(b->status == TBM_HASH); hash_seq_init(&status, b->pagetable); while ((bpage = (PagetableEntry *) hash_seq_search(&status)) != NULL) tbm_union_page(a, bpage); }}/* Process one page of b during a union op */static voidtbm_union_page(TIDBitmap *a, const PagetableEntry *bpage){ PagetableEntry *apage; int wordnum; if (bpage->ischunk) { /* Scan b's chunk, mark each indicated page lossy in a */ for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++) { bitmapword w = bpage->words[wordnum]; if (w != 0) { BlockNumber pg; pg = bpage->blockno + (wordnum * BITS_PER_BITMAPWORD); while (w != 0) { if (w & 1) tbm_mark_page_lossy(a, pg); pg++; w >>= 1; } } } } else if (tbm_page_is_lossy(a, bpage->blockno)) { /* page is already lossy in a, nothing to do */ return; } else { apage = tbm_get_pageentry(a, bpage->blockno); if (apage->ischunk) { /* The page is a lossy chunk header, set bit for itself */ apage->words[0] |= ((bitmapword) 1 << 0); } else { /* Both pages are exact, merge at the bit level */ for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++) apage->words[wordnum] |= bpage->words[wordnum]; } } if (a->nentries > a->maxentries) tbm_lossify(a);}/* * tbm_intersect - set intersection * * a is modified in-place, b is not changed */voidtbm_intersect(TIDBitmap *a, const TIDBitmap *b){ Assert(!a->iterating); /* Nothing to do if a is empty */ if (a->nentries == 0) return; /* Scan through chunks and pages in a, try to match to b */ if (a->status == TBM_ONE_PAGE) { if (tbm_intersect_page(a, &a->entry1, b)) { /* Page is now empty, remove it from a */ Assert(!a->entry1.ischunk); a->npages--; a->nentries--; Assert(a->nentries == 0); a->status = TBM_EMPTY; } } else { HASH_SEQ_STATUS status; PagetableEntry *apage; Assert(a->status == TBM_HASH); hash_seq_init(&status, a->pagetable); while ((apage = (PagetableEntry *) hash_seq_search(&status)) != NULL) { if (tbm_intersect_page(a, apage, b)) { /* Page or chunk is now empty, remove it from a */ if (apage->ischunk) a->nchunks--; else a->npages--; a->nentries--; if (hash_search(a->pagetable, (void *) &apage->blockno, HASH_REMOVE, NULL) == NULL) elog(ERROR, "hash table corrupted"); } } }}/* * Process one page of a during an intersection op * * Returns TRUE if apage is now empty and should be deleted from a */static booltbm_intersect_page(TIDBitmap *a, PagetableEntry *apage, const TIDBitmap *b){ const PagetableEntry *bpage; int wordnum; if (apage->ischunk) { /* Scan each bit in chunk, try to clear */ bool candelete = true; for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++) { bitmapword w = apage->words[wordnum]; if (w != 0) { bitmapword neww = w; BlockNumber pg; int bitnum; pg = apage->blockno + (wordnum * BITS_PER_BITMAPWORD); bitnum = 0; while (w != 0) { if (w & 1) { if (!tbm_page_is_lossy(b, pg) && tbm_find_pageentry(b, pg) == NULL) { /* Page is not in b at all, lose lossy bit */ neww &= ~((bitmapword) 1 << bitnum); } } pg++; bitnum++; w >>= 1; } apage->words[wordnum] = neww; if (neww != 0) candelete = false; }
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?