📄 hashpage.c
/*
 * _hash_pageinit() -- Initialize a new hash index page.
 */
void
_hash_pageinit(Page page, Size size)
{
    Assert(PageIsNew(page));
    PageInit(page, size, sizeof(HashPageOpaqueData));
}
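/*
 * Illustrative sketch: the typical pattern for preparing a brand-new hash
 * page.  _hash_pageinit() lays out the page header and reserves the special
 * space; the caller then fills in the hash-specific HashPageOpaque fields,
 * just as _hash_splitbucket() does below for the new bucket's primary page.
 * The helper name and the assumption that the buffer already holds a zeroed
 * page are illustrative only, not part of the index code proper.
 */
#ifdef NOT_USED
static void
example_init_bucket_page(Buffer buf, Bucket bucket)
{
    Page page = BufferGetPage(buf);
    HashPageOpaque pageopaque;

    /* page must be brand new (all zeroes); _hash_pageinit() asserts this */
    _hash_pageinit(page, BufferGetPageSize(buf));

    /* fill in the hash-specific fields kept in the page's special space */
    pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
    pageopaque->hasho_prevblkno = InvalidBlockNumber;
    pageopaque->hasho_nextblkno = InvalidBlockNumber;
    pageopaque->hasho_bucket = bucket;
    pageopaque->hasho_flag = LH_BUCKET_PAGE;
    pageopaque->hasho_page_id = HASHO_PAGE_ID;
}
#endif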
/*
 * Attempt to expand the hash table by creating one new bucket.
 *
 * This will silently do nothing if it cannot get the needed locks.
 *
 * The caller should hold no locks on the hash index.
 *
 * The caller must hold a pin, but no lock, on the metapage buffer.
 * The buffer is returned in the same state.
 */
void
_hash_expandtable(Relation rel, Buffer metabuf)
{
    HashMetaPage metap;
    Bucket old_bucket;
    Bucket new_bucket;
    uint32 spare_ndx;
    BlockNumber start_oblkno;
    BlockNumber start_nblkno;
    uint32 maxbucket;
    uint32 highmask;
    uint32 lowmask;

    /*
     * Obtain the page-zero lock to assert the right to begin a split (see
     * README).
     *
     * Note: deadlock should be impossible here. Our own backend could only be
     * holding bucket sharelocks due to stopped indexscans; those will not
     * block other holders of the page-zero lock, who are only interested in
     * acquiring bucket sharelocks themselves.  Exclusive bucket locks are
     * only taken here and in hashbulkdelete, and neither of these operations
     * needs any additional locks to complete.  (If, due to some flaw in this
     * reasoning, we manage to deadlock anyway, it's okay to error out; the
     * index will be left in a consistent state.)
     */
    _hash_getlock(rel, 0, HASH_EXCLUSIVE);

    /* Write-lock the meta page */
    _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);

    _hash_checkpage(rel, metabuf, LH_META_PAGE);
    metap = (HashMetaPage) BufferGetPage(metabuf);

    /*
     * Check to see if split is still needed; someone else might have already
     * done one while we waited for the lock.
     *
     * Make sure this stays in sync with _hash_doinsert()
     */
    if (metap->hashm_ntuples <=
        (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1))
        goto fail;

    /*
     * Can't split anymore if maxbucket has reached its maximum possible
     * value.
     *
     * Ideally we'd allow bucket numbers up to UINT_MAX-1 (no higher because
     * the calculation maxbucket+1 mustn't overflow).  Currently we restrict
     * to half that because of overflow looping in _hash_log2() and
     * insufficient space in hashm_spares[].  It's moot anyway because an
     * index with 2^32 buckets would certainly overflow BlockNumber and hence
     * _hash_alloc_buckets() would fail, but if we supported buckets smaller
     * than a disk block then this would be an independent constraint.
     */
    if (metap->hashm_maxbucket >= (uint32) 0x7FFFFFFE)
        goto fail;

    /*
     * Determine which bucket is to be split, and attempt to lock the old
     * bucket.  If we can't get the lock, give up.
     *
     * The lock protects us against other backends, but not against our own
     * backend.  Must check for active scans separately.
     */
    new_bucket = metap->hashm_maxbucket + 1;

    old_bucket = (new_bucket & metap->hashm_lowmask);

    start_oblkno = BUCKET_TO_BLKNO(metap, old_bucket);

    if (_hash_has_active_scan(rel, old_bucket))
        goto fail;

    if (!_hash_try_getlock(rel, start_oblkno, HASH_EXCLUSIVE))
        goto fail;

    /*
     * Likewise lock the new bucket (should never fail).
     *
     * Note: it is safe to compute the new bucket's blkno here, even though we
     * may still need to update the BUCKET_TO_BLKNO mapping.  This is because
     * the current value of hashm_spares[hashm_ovflpoint] correctly shows
     * where we are going to put a new splitpoint's worth of buckets.
     */
    start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket);

    if (_hash_has_active_scan(rel, new_bucket))
        elog(ERROR, "scan in progress on supposedly new bucket");
    if (!_hash_try_getlock(rel, start_nblkno, HASH_EXCLUSIVE))
        elog(ERROR, "could not get lock on supposedly new bucket");

    /*
     * If the split point is increasing (hashm_maxbucket's log base 2
     * increases), we need to allocate a new batch of bucket pages.
     */
    spare_ndx = _hash_log2(new_bucket + 1);
    if (spare_ndx > metap->hashm_ovflpoint)
    {
        Assert(spare_ndx == metap->hashm_ovflpoint + 1);

        /*
         * The number of buckets in the new splitpoint is equal to the total
         * number already in existence, i.e. new_bucket.  Currently this maps
         * one-to-one to blocks required, but someday we may need a more
         * complicated calculation here.
         */
        if (!_hash_alloc_buckets(rel, start_nblkno, new_bucket))
        {
            /* can't split due to BlockNumber overflow */
            _hash_droplock(rel, start_oblkno, HASH_EXCLUSIVE);
            _hash_droplock(rel, start_nblkno, HASH_EXCLUSIVE);
            goto fail;
        }
    }

    /*
     * Okay to proceed with split.  Update the metapage bucket mapping info.
     *
     * Since we are scribbling on the metapage data right in the shared
     * buffer, any failure in this next little bit leaves us with a big
     * problem: the metapage is effectively corrupt but could get written back
     * to disk.  We don't really expect any failure, but just to be sure,
     * establish a critical section.
     */
    START_CRIT_SECTION();

    metap->hashm_maxbucket = new_bucket;

    if (new_bucket > metap->hashm_highmask)
    {
        /* Starting a new doubling */
        metap->hashm_lowmask = metap->hashm_highmask;
        metap->hashm_highmask = new_bucket | metap->hashm_lowmask;
    }

    /*
     * If the split point is increasing (hashm_maxbucket's log base 2
     * increases), we need to adjust the hashm_spares[] array and
     * hashm_ovflpoint so that future overflow pages will be created beyond
     * this new batch of bucket pages.
     */
    if (spare_ndx > metap->hashm_ovflpoint)
    {
        metap->hashm_spares[spare_ndx] = metap->hashm_spares[metap->hashm_ovflpoint];
        metap->hashm_ovflpoint = spare_ndx;
    }

    /* Done mucking with metapage */
    END_CRIT_SECTION();

    /*
     * Copy bucket mapping info now; this saves re-accessing the meta page
     * inside _hash_splitbucket's inner loop.  Note that once we drop the
     * split lock, other splits could begin, so these values might be out of
     * date before _hash_splitbucket finishes.  That's okay, since all it
     * needs is to tell which of these two buckets to map hashkeys into.
     */
    maxbucket = metap->hashm_maxbucket;
    highmask = metap->hashm_highmask;
    lowmask = metap->hashm_lowmask;

    /* Write out the metapage and drop lock, but keep pin */
    _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);

    /* Release split lock; okay for other splits to occur now */
    _hash_droplock(rel, 0, HASH_EXCLUSIVE);

    /* Relocate records to the new bucket */
    _hash_splitbucket(rel, metabuf, old_bucket, new_bucket,
                      start_oblkno, start_nblkno,
                      maxbucket, highmask, lowmask);

    /* Release bucket locks, allowing others to access them */
    _hash_droplock(rel, start_oblkno, HASH_EXCLUSIVE);
    _hash_droplock(rel, start_nblkno, HASH_EXCLUSIVE);

    return;

    /* Here if decide not to split or fail to acquire old bucket lock */
fail:

    /* We didn't write the metapage, so just drop lock */
    _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);

    /* Release split lock */
    _hash_droplock(rel, 0, HASH_EXCLUSIVE);
}
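/*
 * Illustrative sketch: how the metapage bookkeeping above evolves over a run
 * of consecutive splits.  The update rules are exactly the ones used in
 * _hash_expandtable(); the starting values (a table that has already grown to
 * four buckets: maxbucket = 3, lowmask = 1, highmask = 3) are an assumption
 * for illustration.  With those values, creating bucket 4 starts a new
 * doubling (lowmask becomes 3, highmask becomes 7), after which buckets 4..7
 * are split off from buckets 0..3 in turn.
 */
#ifdef NOT_USED
static void
example_trace_split_bookkeeping(int nsplits)
{
    uint32 maxbucket = 3;       /* assumed starting point: buckets 0..3 exist */
    uint32 lowmask = 1;
    uint32 highmask = 3;
    int i;

    for (i = 0; i < nsplits; i++)
    {
        uint32 new_bucket = maxbucket + 1;
        uint32 old_bucket = new_bucket & lowmask;   /* bucket being split */

        maxbucket = new_bucket;
        if (new_bucket > highmask)
        {
            /* starting a new doubling, as in _hash_expandtable() */
            lowmask = highmask;
            highmask = new_bucket | lowmask;
        }

        elog(DEBUG1, "split bucket %u into %u and %u: lowmask=%X highmask=%X",
             old_bucket, old_bucket, new_bucket, lowmask, highmask);
    }
}
#endif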
/*
 * _hash_alloc_buckets -- allocate a new splitpoint's worth of bucket pages
 *
 * This does not need to initialize the new bucket pages; we'll do that as
 * each one is used by _hash_expandtable().  But we have to extend the logical
 * EOF to the end of the splitpoint; this keeps smgr's idea of the EOF in
 * sync with ours, so that we don't get complaints from smgr.
 *
 * We do this by writing a page of zeroes at the end of the splitpoint range.
 * We expect that the filesystem will ensure that the intervening pages read
 * as zeroes too.  On many filesystems this "hole" will not be allocated
 * immediately, which means that the index file may end up more fragmented
 * than if we forced it all to be allocated now; but since we don't scan
 * hash indexes sequentially anyway, that probably doesn't matter.
 *
 * XXX It's annoying that this code is executed with the metapage lock held.
 * We need to interlock against _hash_getovflpage() adding a new overflow page
 * concurrently, but it'd likely be better to use LockRelationForExtension
 * for the purpose.  OTOH, adding a splitpoint is a very infrequent operation,
 * so it may not be worth worrying about.
 *
 * Returns TRUE if successful, or FALSE if allocation failed due to
 * BlockNumber overflow.
 */
static bool
_hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks)
{
    BlockNumber lastblock;
    char zerobuf[BLCKSZ];

    lastblock = firstblock + nblocks - 1;

    /*
     * Check for overflow in block number calculation; if so, we cannot extend
     * the index anymore.
     */
    if (lastblock < firstblock || lastblock == InvalidBlockNumber)
        return false;

    MemSet(zerobuf, 0, sizeof(zerobuf));

    RelationOpenSmgr(rel);
    smgrextend(rel->rd_smgr, lastblock, zerobuf, rel->rd_istemp);

    return true;
}
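/*
 * Illustrative sketch: how a bucket number is translated into a physical
 * block number under this layout.  Bucket B belongs to splitpoint
 * _hash_log2(B+1); the buckets of a splitpoint are laid out consecutively,
 * after whatever overflow pages were allocated while earlier splitpoints were
 * current, and block 0 holds the metapage.  hashm_spares[] records those
 * cumulative overflow-page counts.  The helper below restates the arithmetic
 * of the BUCKET_TO_BLKNO() macro used above; the function name is
 * illustrative only.
 */
#ifdef NOT_USED
static BlockNumber
example_bucket_to_blkno(HashMetaPage metap, Bucket bucket)
{
    uint32 splitpoint = _hash_log2(bucket + 1);
    uint32 spares_before;

    /* overflow pages allocated before this bucket's splitpoint began */
    spares_before = (bucket == 0) ? 0 : metap->hashm_spares[splitpoint - 1];

    /* skip the metapage (block 0), earlier buckets, and those overflow pages */
    return (BlockNumber) (bucket + spares_before) + 1;
}
#endif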
/*
 * _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket'
 *
 * We are splitting a bucket that consists of a base bucket page and zero
 * or more overflow (bucket chain) pages.  We must relocate tuples that
 * belong in the new bucket, and compress out any free space in the old
 * bucket.
 *
 * The caller must hold exclusive locks on both buckets to ensure that
 * no one else is trying to access them (see README).
 *
 * The caller must hold a pin, but no lock, on the metapage buffer.
 * The buffer is returned in the same state.  (The metapage is only
 * touched if it becomes necessary to add or remove overflow pages.)
 */
static void
_hash_splitbucket(Relation rel,
                  Buffer metabuf,
                  Bucket obucket,
                  Bucket nbucket,
                  BlockNumber start_oblkno,
                  BlockNumber start_nblkno,
                  uint32 maxbucket,
                  uint32 highmask,
                  uint32 lowmask)
{
    Bucket bucket;
    Buffer obuf;
    Buffer nbuf;
    BlockNumber oblkno;
    BlockNumber nblkno;
    bool null;
    Datum datum;
    HashPageOpaque oopaque;
    HashPageOpaque nopaque;
    IndexTuple itup;
    Size itemsz;
    OffsetNumber ooffnum;
    OffsetNumber noffnum;
    OffsetNumber omaxoffnum;
    Page opage;
    Page npage;
    TupleDesc itupdesc = RelationGetDescr(rel);

    /*
     * It should be okay to simultaneously write-lock pages from each bucket,
     * since no one else can be trying to acquire buffer lock on pages of
     * either bucket.
     */
    oblkno = start_oblkno;
    obuf = _hash_getbuf(rel, oblkno, HASH_WRITE, LH_BUCKET_PAGE);
    opage = BufferGetPage(obuf);
    oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);

    nblkno = start_nblkno;
    nbuf = _hash_getnewbuf(rel, nblkno);
    npage = BufferGetPage(nbuf);

    /* initialize the new bucket's primary page */
    nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
    nopaque->hasho_prevblkno = InvalidBlockNumber;
    nopaque->hasho_nextblkno = InvalidBlockNumber;
    nopaque->hasho_bucket = nbucket;
    nopaque->hasho_flag = LH_BUCKET_PAGE;
    nopaque->hasho_page_id = HASHO_PAGE_ID;

    /*
     * Partition the tuples in the old bucket between the old bucket and the
     * new bucket, advancing along the old bucket's overflow bucket chain and
     * adding overflow pages to the new bucket as needed.
     */
    ooffnum = FirstOffsetNumber;
    omaxoffnum = PageGetMaxOffsetNumber(opage);
    for (;;)
    {
        /*
         * at each iteration through this loop, each of these variables should
         * be up-to-date: obuf opage oopaque ooffnum omaxoffnum
         */

        /* check if we're at the end of the page */
        if (ooffnum > omaxoffnum)
        {
            /* at end of page, but check for an(other) overflow page */
            oblkno = oopaque->hasho_nextblkno;
            if (!BlockNumberIsValid(oblkno))
                break;

            /*
             * we ran out of tuples on this particular page, but we have more
             * overflow pages; advance to next page.
             */
            _hash_wrtbuf(rel, obuf);

            obuf = _hash_getbuf(rel, oblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
            opage = BufferGetPage(obuf);
            oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
            ooffnum = FirstOffsetNumber;
            omaxoffnum = PageGetMaxOffsetNumber(opage);
            continue;
        }

        /*
         * Re-hash the tuple to determine which bucket it now belongs in.
         *
         * It is annoying to call the hash function while holding locks, but
         * releasing and relocking the page for each tuple is unappealing too.
         */
        itup = (IndexTuple) PageGetItem(opage, PageGetItemId(opage, ooffnum));
        datum = index_getattr(itup, 1, itupdesc, &null);
        Assert(!null);

        bucket = _hash_hashkey2bucket(_hash_datum2hashkey(rel, datum),
                                      maxbucket, highmask, lowmask);

        if (bucket == nbucket)
        {
            /*
             * insert the tuple into the new bucket.  if it doesn't fit on the
             * current page in the new bucket, we must allocate a new overflow
             * page and place the tuple on that page instead.
             */
            itemsz = IndexTupleDSize(*itup);
            itemsz = MAXALIGN(itemsz);

            if (PageGetFreeSpace(npage) < itemsz)
            {
                /* write out nbuf and drop lock, but keep pin */
                _hash_chgbufaccess(rel, nbuf, HASH_WRITE, HASH_NOLOCK);
                /* chain to a new overflow page */
                nbuf = _hash_addovflpage(rel, metabuf, nbuf);
                npage = BufferGetPage(nbuf);
                /* we don't need nopaque within the loop */
            }

            noffnum = OffsetNumberNext(PageGetMaxOffsetNumber(npage));
            if (PageAddItem(npage, (Item) itup, itemsz, noffnum,
                            false, false) == InvalidOffsetNumber)
                elog(ERROR, "failed to add index item to \"%s\"",
                     RelationGetRelationName(rel));

            /*
             * now delete the tuple from the old bucket.  after this section
             * of code, 'ooffnum' will actually point to the ItemId to which
             * we would point if we had advanced it before the deletion
             * (PageIndexTupleDelete repacks the ItemId array).  this also
             * means that 'omaxoffnum' is exactly one less than it used to be,
             * so we really can just decrement it instead of calling
             * PageGetMaxOffsetNumber.
             */
            PageIndexTupleDelete(opage, ooffnum);
            omaxoffnum = OffsetNumberPrev(omaxoffnum);
        }
        else
        {
            /*
             * the tuple stays on this page.  we didn't move anything, so we
             * didn't delete anything and therefore we don't have to change
             * 'omaxoffnum'.
             */
            Assert(bucket == obucket);
            ooffnum = OffsetNumberNext(ooffnum);
        }
    }

    /*
     * We're at the end of the old bucket chain, so we're done partitioning
     * the tuples.  Before quitting, call _hash_squeezebucket to ensure the
     * tuples remaining in the old bucket (including the overflow pages) are
     * packed as tightly as possible.  The new bucket is already tight.
     */
    _hash_wrtbuf(rel, obuf);
    _hash_wrtbuf(rel, nbuf);

    _hash_squeezebucket(rel, obucket, start_oblkno, NULL);
}
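/*
 * Illustrative sketch: the re-hash step inside the partition loop above calls
 * _hash_hashkey2bucket() to decide whether a tuple stays in 'obucket' or
 * moves to 'nbucket'.  The mapping amounts to the following (a restatement of
 * the logic of _hash_hashkey2bucket() in hashutil.c, renamed here so the
 * sketch stands alone): mask the hash value with highmask, and if that names
 * a bucket that has not been created yet, fall back to lowmask.  For tuples
 * taken from the old bucket, the result is always either obucket or nbucket.
 */
#ifdef NOT_USED
static Bucket
example_hashkey2bucket(uint32 hashkey, uint32 maxbucket,
                       uint32 highmask, uint32 lowmask)
{
    Bucket bucket;

    /* use the bucket bits of the current doubling */
    bucket = hashkey & highmask;

    /* if that bucket doesn't exist yet, it hasn't been split off; use the previous doubling */
    if (bucket > maxbucket)
        bucket = bucket & lowmask;

    return bucket;
}
#endif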