📄 gdk_bbp.mx
字号:
@' The contents of this file are subject to the MonetDB Public License@' Version 1.1 (the "License"); you may not use this file except in@' compliance with the License. You may obtain a copy of the License at@' http://monetdb.cwi.nl/Legal/MonetDBLicense-1.1.html@'@' Software distributed under the License is distributed on an "AS IS"@' basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the@' License for the specific language governing rights and limitations@' under the License.@'@' The Original Code is the MonetDB Database System.@'@' The Initial Developer of the Original Code is CWI.@' Portions created by CWI are Copyright (C) 1997-2007 CWI.@' All Rights Reserved.@f gdk_bbp@a M. L. Kersten, P. Boncz, N. J. Nes@* BAT Buffer Pool (BBP)The BATs created and loaded are collected in a BAT buffer pool. The Bat Buffer Pool has a number of functions:@table @code@item administration and lookupThe BBP is a directory which contains status information about all known BATs. This interface may be used very heavily, by data-intensive applications.To eliminate all overhead, read-only access to the BBP may be done by table-lookups. The integer index type for these lookups is @%bat@, as retrieved by @%BBPcacheid(b)@. The @%bat@ zero is reserved for the nil bat. @item persistenceThe BBP is made persistent by saving it to the dictionary file called @emph{BBP.dir} in the database. The dictionary can always be reconstructed from the @emph{ .desc} files. Its main role is to speed-up system restart, because now it requires just a few IOs instead of a complete directory scan and reading all descriptorsto find all BATs.When the number of BATs rises, having all files in one directorybecomes a bottleneck. The BBP therefore implements a scheme that distributesall BATs in a growing directory tree with at most 64 BATs stored in one node.@item buffer managementThe BBP is responsible for loading and saving of BATs to disk. It also contains routines to unload BATs from memory when memory resources get scarce. For this purpose, it administers BAT memory reference counts (to know which BATs can be unloaded) and BAT usage statistics (it unloads the least recently used BATs).@item recoveryWhen the database is closed or during a run-time syncpoint, the systemtables must be written to disk in a safe way, that is immune for system failures (like disk full). To do so, the BBP implements an atomic commit and recovery protocol: first all files to be overwritten are moved to a BACKUP/dir. If that succeeds, the writes are done. If that also fully succeedsthe BACKUP/ dir is renamed to DELETE_ME/ and subsequently deleted.If not, all files in BACKUP/ are moved back to their original location.@item unloadingBats which have a logical reference (ie. a lrefs > 0) but no memoryreference (refcnt == 0) can be unloaded. Unloading dirty bats means,moving the original (committed version) to the BACKUP/ dir and savingthe bat. This complicates the commit and recovery/abort issues.The commit has to check if the bat is already moved. And The recoveryhas to always move back the files from the BACKUP/ dir.@item reference countingBats use have two kinds of references: logical and physical (pointer) ones.Both are administered with the BBPincref/BBPdecref routines. For backward compatibility, we maintain BBPfix/BBPunfix as shorthandsfor the adjusting the pointer references.@item share countingViews use the heaps of there parent bats. To save guard this, theparent has a shared counter, which is incremented and decrementedusing BBPshare and BBPunshare. These functions make sure theparent is memory resident as required because of the 'pointer' sharing.@end table@{@h#ifndef _GDK_BBP_H_#define _GDK_BBP_H_#define BBPINIT 2048#define BBPMAXSIZE 1024*1024#define BBPLOADED 1 /* set if bat in memory */#define BBPSWAPPED 2 /* set if dirty bat is not in memory */#define BBPTMP 4 /* set if non-persistent bat has image on disk */#define BBPDELETED 16 /* set if bat persistent at last commit is now transient */#define BBPEXISTING 32 /* set if bat was already persistent at end of last commit */#define BBPNEW 64 /* set if bat has become persistent since last commit */#define BBPPERSISTENT 96 /* mask for currently persistent bats */#define BBPSTATUS 127#define BBPDELETING 2048 /* set while we are deleting (special case in module unload) */#define BBPUNSTABLE 2176 /* set while we are unloading */#define BBPUNLOADING 128 /* set while we are unloading */#define BBPLOADING 256 /* set while we are loading */#define BBPSAVING 512 /* set while we are saving */#define BBPWAITING 2944#define BBPRENAMED 1024 /* set when bat is renamed in this transaction */#define BBPTRIM_ALL (((size_t)1) << (sizeof(size_t)*8 - 2)) /* very large positive size_t */#define BBPLASTUSED(x) ((x) & 0x7fffffff) /* stamp is always a positive int */gdk_export int BBPin; /* BATs swapped into BBP */gdk_export int BBPout; /* BATs swapped out of BBP */gdk_export int BBPsize; /* current occupied size of BBP array *//* global calls */gdk_export void BBPinit(void);gdk_export void BBPexit(void);gdk_export int BBPdir(int cnt, bat* subcommit);/* update interface */gdk_export void BBPclear(bat bid);gdk_export bat BBPinsert(BAT *b);gdk_export void BBPcacheit(BAT *b);gdk_export void BBPuncacheit(bat bid);gdk_export int BBPreclaim(BAT *b);gdk_export int BBPsave(BAT *b);gdk_export int BBPrename(bat bid, str nme);gdk_export BAT * BBPrecycle(int ht, int tt, ssize_t cap);gdk_export wrd BBPrecycle_minsize(wrd);/* query interface */gdk_export bat BBPindex(str nme);gdk_export BAT *BBPdescriptor(bat b);/* swapping interface */gdk_export int BBPrecover(void);gdk_export lng BBPdiskscan(void);gdk_export int BBPsync(int cnt, bat* subcommit);gdk_export int BBPincref(bat b, int logical);gdk_export void BBPkeepref(bat i);gdk_export void BBPreleaseref(bat i);gdk_export void BBPreleaselref(bat i);gdk_export int BBPdecref(bat b, int logical);gdk_export void BBPshare(bat b);gdk_export void BBPunshare(bat b);gdk_export void BBPextend(dbl factor, int buildhash);gdk_export void BBPatom_drop(int atom);gdk_export void BBPatom_load(int atom);gdk_export int BBPbackup(BAT *b, bit subcommit);@@c#include "monetdb_config.h"#include "gdk.h"#include "gdk_storage.h"@}@-The BBP has now a fixed address, so re-allocation due to a growing BBP caused by one thread does not disturb reads to the old entries by another.This is implemented using anonymous virtual memory; extensions on the same address are guaranteed because a large non-committed VM area is requested initially. New slots in the BBP are found at O(1) by keeping a freelistthat uses the 'next' field in the BBPrec records.@{@cBBPrec *BBP = NULL; /* fixed base VM address of BBP array */bat BBPmaxsize = BBPMAXSIZE; /* size of non-commited VM BBP array */bat BBPlimit = 0; /* current committed VM BBP array */bat BBPsize = 0; /* current used size of BBP array */bat BBP_free = 0; /* first free slot in BBP array */@}@-The hash index uses a bucket index (int array) of size mask that istuned for perfect hashing (1 lookup). The bucket chain uses the 'next'field in the BBPrec records.@{@h#define BBPnamecheck(s) (((s)[0]=='t' && (s)[1]=='m' && (s)[2]=='p' &&\ (s)[3]=='_')?strtol(s+4,NULL,8):0)#define BBPtmpcheck(s) (((s)[0]=='t' && (s)[1]=='m' && (s)[2]=='p' &&\ (s)[3]=='_')?1:0)@cbat *BBP_hash = NULL; /* BBP logical name hash buckets */bat BBP_mask = 0; /* number of buckets = & mask */static void BBPspin(bat bid, str debug, int event);static int BBPfree(BAT *b, str calledFrom );static int BBPdestroy(BAT *b);static void BBPuncacheit_(bat bid, int unloaddesc);static void BBPinitcache(void);static int BBPprepare(bit subcommit); static int stamp = 0;static INLINE intBBPstamp(void){ return ++stamp;}static voidBBPsetstamp(int newstamp){ stamp = newstamp;}static voidBBP_insert(bat i){ bat idx = (bat) (strHash(BBP_logical(i)) & BBP_mask); BBP_next(i) = BBP_hash[idx]; BBP_hash[idx] = i;}static voidBBP_delete(bat i){ bat *h = BBP_hash; str s = BBP_logical(i); bat idx = (bat) (strHash(s) & BBP_mask); for (h += idx; (i = *h) != 0; h = &BBP_next(i)) { if (strcmp(BBP_logical(i), s) == 0) { *h = BBP_next(i); break; } }}@-other globals@cint BBP_curstamp = 0; /* unique stamp for creation of a bat */MT_Id BBP_notrim = ~((MT_Id) 0); /* avoids BBPtrim when we really do not want it */int BBP_dirty = 0; /* BBP structures modified? */int BBPin = 0; /* bats loaded statistic */int BBPout = 0; /* bats saved statistic */@}@+ BBP Consistency and ConcurrencyWhile GDK provides the basic building blocks for an ACID system, initself it is not such a system, as we this would entail too much overheadthat is often not needed. Hence, some consistency control is left to theuser. The first important user constraint is that if a user updates a BAT, (s)he himself must assure that no-one else accesses this BAT. Concerning buffer management, the BBP carries out a swapping policy.BATs are kept in memory till the memory is full. If the memory is full,the malloc functions initiate BBP trim actions, that unload the coldest BATs that have a zero reference count. The second important user constraint is therefore that a user may only manipulate live BAT data in memory if it is sure that there is at least one reference count to that BAT.The main BBP array is protected by two locks:@table @code@item GDKcacheLock]this lock guards the free slot management in the BBP array. The BBP operations that allocate a new slot for a new BAT (@%BBPinit@,@%BBPcacheit@), delete the slot of a destroyed BAT (@%BBPreclaim@), or rename a BAT (@%BBPrename@), hold this lock. It also protects all BAT (re)naming actions include (read and write) in the hash table with BAT names.@item GDKswapLockthis lock guards the swap (loaded/unloaded) status of the BATs. Hence, allBBP routines that influence the swapping policy, or actually carry out theswapping policy itself, acquire this lock (e.g. @%BBPfix@,@%BBPunfix@).Note that this also means that updates to the BBP\_status indicator arraymust be protected by GDKswapLock. To reduce contention GDKswapLock was split into multiple locks; it is now an array of lock pointers which is accessed by GDKswapLock[ABS(bat)\&BBPLOCKMASK]@end tableRoutines that need both locks should first acquire the locks in the GDKswapLock array (in ascending order) and then GDKcacheLock (and release them in reverse order).To obtain maximum speed, read operations to existing elements in the BBP are unguarded. As said, it is the users responsibility that the BAT that is being read is not being modified. BBP update actions that modify the BBP data structure itself are locked by the BBP functions themselves. Hence, multiple concurrent BBP read operations may be ongoing while at the same time at most one BBP write operation @strong{on a different BAT} is executing. This holds for accesses to the public (quasi-)arrays @%BBPcache@, @%BBPstatus@, @%BBPrefs@, @%BBPlogical@ and @%BBPphysical@. These arrays are called quasi as now they are actually stored together in one big BBPrec array called BBP, that is allocated in anonymous VM space, so we can reallocate this structure without changing the base address (a crucial feature if read actions are to go on unlocked while other entries in the BBP may be modified). @{@h#define BBP_status_set(bid, mode, nme) { \ BBP_status(bid) = mode; \}#define BBP_status_on(bid, flags, nme) \ BBP_status_set(bid, BBP_status(bid) | flags, nme);#define BBP_status_off(bid, flags, nme) \ BBP_status_set(bid, BBP_status(bid) & ~(flags), nme);#define BBP_unload_inc(bid, nme) { \ gdk_set_lock(GDKunloadLock, nme); \ BBPunloadCnt++; \ gdk_unset_lock(GDKunloadLock, nme); }#define BBP_unload_dec(bid, nme) { \ gdk_set_lock(GDKunloadLock, nme); \ if (--BBPunloadCnt == 0) gdk_signal_cond(GDKunloadCond, nme); \ assert(BBPunloadCnt >= 0); \ gdk_unset_lock(GDKunloadLock, nme); }@cstatic MT_Id locked_by = 0;static INLINE MT_IdBBP_getpid(void){ MT_Id x = MT_getpid(); return x;}static int BBPunloadCnt = 0;voidBBPlock(str nme){ int i; /* wait for all pending unloads to finish */ gdk_set_lock(GDKunloadLock, nme); if (BBPunloadCnt > 0) gdk_wait_cond(GDKunloadCond, GDKunloadLock, nme); gdk_set_lock(GDKtrimLock, nme); BBP_notrim = BBP_getpid(); gdk_set_lock(GDKcacheLock, nme); for (i = 0; i <= BBPLOCKMASK; i++) gdk_set_lock(GDKswapLock[i & BBPLOCKMASK], nme); locked_by = BBP_notrim; gdk_unset_lock(GDKunloadLock, nme);}voidBBPunlock(str nme){ int i; for (i = BBPLOCKMASK; i >= 0; i--) gdk_unset_lock(GDKswapLock[i & BBPLOCKMASK], nme); gdk_unset_lock(GDKcacheLock, nme); BBP_notrim = 0; locked_by = 0; gdk_unset_lock(GDKtrimLock, nme);}static voidBBPinithash(void){ bat i = BBPsize; for (BBP_mask = 1; (BBP_mask << 1) <= BBPlimit; BBP_mask <<= 1) ; BBP_hash = (bat *) GDKmalloc(BBP_mask * sizeof(bat)); memset(BBP_hash, 0, BBP_mask * sizeof(bat)); BBP_mask--; while (--i > 0) { str s = BBP_logical(i); if (s) { if (*s != '.' && BBPtmpcheck(BBP_logical(i)) == 0) { BBP_insert(i); } if (BBP_logical(-i)) { BBP_insert(-i); } } else { BBP_next(i) = BBP_free; BBP_free = i; } }}@-BBPextend must take the trimlock, as it is called when other BBP locks areheld and it will allocate memory. This could trigger a BBPtrim, causing deadlock.@cvoidBBPextend(dbl factor, int buildhash){ int newsize = (int) (BBPlimit * factor); size_t maxsize = MAX(newsize * 2, BBPmaxsize) * sizeof(BBPrec); BBP_notrim = BBP_getpid(); BBP = (BBPrec *) GDKvmrealloc(BBP, BBPlimit * sizeof(BBPrec), newsize * sizeof(BBPrec), BBPmaxsize * sizeof(BBPrec), &maxsize, 1); if (BBP == NULL) GDKfatal("BBPextend: failed to extend BAT pool\n"); memset(BBP + BBPlimit, 0, (newsize - BBPlimit) * sizeof(BBPrec)); BBPlimit = newsize; BBPmaxsize = (int) (maxsize / sizeof(BBPrec)); if (buildhash) { GDKfree(BBP_hash); BBP_hash = NULL; BBP_free = 0; BBPinithash(); } BBP_notrim = 0;}static INLINE char *BBPparse(str *cur){ char *base, *c = *cur; for (c++; GDKisspace(*c); c++) ; for (base = c; !(GDKisspace(*c) || *c == ','); c++) ; *c = 0; *cur = c; return base;}static INLINE strBBPtmpname(str s, int len, bat i){ s[--len] = 0; while(i>0) { s[--len] = '0' + (i & 7); i >>= 3; } s[--len] = '_'; s[--len] = 'p'; s[--len] = 'm';
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -