📄 gdk_search.mx

📁 这个是内存数据库中的一个管理工具
💻 MX
📖 第 1 页 / 共 2 页
字号:
12 下一页
@' The contents of this file are subject to the MonetDB Public License@' Version 1.1 (the "License"); you may not use this file except in@' compliance with the License. You may obtain a copy of the License at@' http://monetdb.cwi.nl/Legal/MonetDBLicense-1.1.html@'@' Software distributed under the License is distributed on an "AS IS"@' basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the@' License for the specific language governing rights and limitations@' under the License.@'@' The Original Code is the MonetDB Database System.@'@' The Initial Developer of the Original Code is CWI.@' Portions created by CWI are Copyright (C) 1997-2007 CWI.@' All Rights Reserved.@f gdk_search@a M. L. Kersten, P. Boncz@* Search AcceleratorsWhat sets BATs apart from normal arrays is their built-in ability tosearch on both dimensions of the binary association.  The easiest way toimplement this is simply walk to the whole table and compare against eachelement.  This method is of course highly inefficient, much better performancecan be obtained if the BATs use some kind of index method to speed upsearching.While index methods speed up searching they also have disadvantages.In the first place extra storage is needed for the index. Second,insertion of data or removing old data requires updating of the indexstructure, which takes extra time.This means there is a need for both indexed and non-indexed BAT, thefirst to be used when  little or no searching is needed, thesecond to be used when searching is predominant. Also, there is nobest index method for all cases, different methods have different storageneeds and different performance. Thus, multiple index methods are provided,each suited to particular types of usage.For query-dominant environments it pays to build a search accelerator.The main problems to be solved are:@itemize@itemavoidance of excessive storage requirements, and@itemlimited maintenance overhead.@end itemize@-The idea that query intensive tasks need many different index methods hasbeen proven invalid. The current direction is multiple copies of data, whichcan than be sorted or clustered.@-The BAT library automatically decides when an index becomes costeffective.In situations where an index is expected, a call ismade to @emph{BAThash}.This operation check for indexing on the header.@{@+ Interface Declarations@h#ifndef _GDK_SEARCH_H_#define _GDK_SEARCH_H_#include "gdk.h"@}@+ Hash indexingThis is a highly efficient implementation of simple bucket-chained hashing.In the past, we used integer modulo for hashing, with bucket chains of mean size 4.This was shown to be inferior to direct hashing with integer anding. The new implementationreflects this.@{@hgdk_export Hash *HASHnew(Heap *hp, int tpe, size_t size, hash_t mask);gdk_export hash_t HASHmask(size_t cnt);gdk_export BAT *HASHprint(BAT *b);gdk_export void HASHremove(BAT *b);gdk_export void HASHdestroy(BAT *b);gdk_export int HASHgonebad(BAT *b, ptr v);gdk_export hash_t HASHprobe(Hash *h, ptr v);gdk_export hash_t HASHlist(Hash *h, size_t i);#define mix_sht(X)            (((X)>>7)^(X))#define mix_int(X)            (((X)>>7)^((X)>>13)^((X)>>21)^(X))#define hash_loc(H,V)         hash_any(H,V)#define hash_var(H,V)         hash_any(H,V)#define hash_any(H,V)         (ATOMhash((H)->type, (V)) & (H)->mask)#define hash_bte(H,V)         ((hash_t) (*(unsigned char*) (V)) & (H)->mask)#define hash_chr(H,V)         hash_bte(H,V)#define hash_sht(H,V)         ((hash_t) mix_sht(*((unsigned short*) (V))) & (H)->mask)#define hash_int(H,V)         ((hash_t) mix_int(*((unsigned int*) (V))) & (H)->mask)/* XXX return size_t-sized value for 8-byte oid? */#define hash_lng(H,V)         ((hash_t) mix_int((unsigned int) (*(lng *)(V) ^ (*(lng *)(V) >> 32))) & (H)->mask)@= hashfnd#define HASHfnd_@1(x,y,z)	{					\	hash_t _i;							\	BUN _v;								\	(x) = NULL;							\	if ((y)->hhash || BAThash((y), 0) || GDKfatal("HASHfnd_@1: hash build failed on %s.\n", BATgetId((y)))) \		HASHloop_@1((y), (y)->hhash, _i, (z), _v) {		\			(x) = _v;					\			break;						\		}							\}@h#define HASHfnd_str(x,y,z)	{					\	hash_t _i;							\	(x) = NULL;							\	if ((y)->hhash || BAThash((y), 0) || GDKfatal("HASHfnd_str: hash build failed on %s.\n", BATgetId((y)))) \		HASHloop_str((y), (y)->hhash, _i, (z)) {		\			(x) = BUNptr((y),_i);				\			break;						\		}							\}#define HASHfnd(x,y,z)	{						\	size_t _i;							\	x=NULL;								\	if ((y)->hhash || BAThash((y), 0) || GDKfatal("HASHfnd: hash build failed on %s.\n", BATgetId((y)))) \		HASHloop((y), (y)->hhash, _i, (z)) {			\			(x) = BUNptr((y),_i);				\			break;						\		}							\}@:hashfnd(chr)@@:hashfnd(bte)@@:hashfnd(sht)@@:hashfnd(int)@@:hashfnd(lng)@#if SIZEOF_VOID_P == SIZEOF_INT#define HASHfnd_ptr(x,y,z)	HASHfnd_int(x,y,z)#else /* SIZEOF_VOID_P == SIZEOF_LNG */#define HASHfnd_ptr(x,y,z)	HASHfnd_lng(x,y,z)#endif#define HASHfnd_bit(x,y,z)	HASHfnd_chr(x,y,z)#if SIZEOF_OID == SIZEOF_INT	/* OIDDEPEND */#define HASHfnd_oid(x,y,z)	HASHfnd_int(x,y,z)#else#define HASHfnd_oid(x,y,z)	HASHfnd_lng(x,y,z)#endif#define HASHfnd_flt(x,y,z)	HASHfnd_int(x,y,z)#define HASHfnd_dbl(x,y,z)	HASHfnd_lng(x,y,z)#define HASHfnd_any(x,y,z)	HASHfnd(x,y,z)@-A new entry is added with @%HASHins@ using the BAT, the BUN index,and a pointer to the value to be stored. An entry is removed by @%HASdel@.@= hashins#define HASHins_@1(h, i, v) {			\	hash_t _c = hash_@1(h,v);		\	h->link[i] = h->hash[_c];		\	h->hash[_c] = i;			\}@h#define HASHins_str(h,i,v) {			\	hash_t _c;				\	GDK_STRHASH(v,_c);			\	_c &= (h)->mask;			\	h->link[i] = h->hash[_c];		\	h->hash[_c] = i;			\}#define HASHins_any(h,i,v) {			\	hash_t _c = HASHprobe(h, v);		\	h->link[i] = h->hash[_c];		\	h->hash[_c] = i;			\}/* HASHins now receives a BAT* param and has become adaptive; killing wrongly configured hash tables *//* use HASHins_any or HASHins_<tpe> instead if you know what you're doing or want to keep the hash whatever */#define HASHins(b,i,v) \	do if (((i) & 1023) == 1023 && HASHgonebad((b),(v))) { HASHremove(b); } else  { HASHins_any((b)->hhash,(i),(v)); } while (0)#if SIZEOF_VOID_P == SIZEOF_INT#define HASHins_ptr(h,i,v)	HASHins_int(h,i,v)#else /* SIZEOF_VOID_P == SIZEOF_LNG */#define HASHins_ptr(h,i,v)	HASHins_lng(h,i,v)#endif#define HASHins_bit(h,i,v)	HASHins_chr(h,i,v)#if SIZEOF_OID == SIZEOF_INT	/* OIDDEPEND */#define HASHins_oid(h,i,v)	HASHins_int(h,i,v)#else#define HASHins_oid(h,i,v)	HASHins_lng(h,i,v)#endif#define HASHins_flt(h,i,v)	HASHins_int(h,i,v)#define HASHins_dbl(h,i,v)	HASHins_lng(h,i,v)#define HASHinsvar(h,i,v)	HASHins_any(h,i,v)#define HASHinsloc(h,i,v)	HASHins_any(h,i,v)@:hashins(chr)@@:hashins(bte)@@:hashins(sht)@@:hashins(int)@@:hashins(lng)@#define HASHdel(h, i, v, next) {					\	if (next && h->link[i+1] == i) {				\		h->link[i+1] = h->link[i];				\	} else {							\		size_t _c = HASHprobe(h, v);				\		if (h->hash[_c] == i) {					\			h->hash[_c] = h->link[i];			\		} else {						\			for(_c = h->hash[_c]; _c != HASH_MAX; 		\					_c = h->link[_c]){ 		\				if (h->link[_c] == i) {			\					h->link[_c] = h->link[i];	\					break;				\				}					\			}						\		}							\	} h->link[i] = HASH_MAX;					\}#define HASHmove(h, i, j, v, next) {					\	if (next && h->link[i+1] == i) {				\		h->link[i+1] = j;					\	} else {							\		size_t _c = HASHprobe(h, v);				\		if (h->hash[_c] == i) {					\			h->hash[_c] = j;				\		} else {						\			for(_c = h->hash[_c]; _c != HASH_MAX; 		\					_c = h->link[_c]){ 		\				if (h->link[_c] == i) {			\					h->link[_c] = j;		\					break;				\				}					\			}						\		}							\	} h->link[j] = h->link[i];					\}@}@- Hash Table CreationThe @emph{hash} indexing scheme for BATs reserves a block of memory to maintainthe hash table and a collision list. A one-to-one mapping exists betweenthe BAT and the collision list using the BUN index. NOTE: we alloc thelink list as a parallel array to the BUN array; hence the hash link array hasthe same size as BATcapacity(b) (not BATcount(b)). This allows us in theBUN insert and delete to assume that there is hash space iff there is BUNspace. If there is no BUN space, the BATextend now destroys the hash table.The hash mask size is a power of two, so we can do bitwise AND on thehash (integer) number to quickly find the head of the bucket chain.Clearly, the hash mask size is a crucial parameter. If we know that the columnis unique (hkey), we use direct hashing (mask size ~= BATcount). Otherwisewe dynamically determine the mask size by starting out with mask size = BATcount/64(just 1.5% of memory storage overhead). Then we start building the hash tableon the first 25% of the BAT. As we aim for max-collisions list length of 4,the list on 25% should not exceed length 1. So, if a small number of collisssionsoccurs (mask/2) then we abandon the attempt and restart with a mask that is 4 timeslarger. This converges after three cycles to direct hashing.@{@c#include "monetdb_config.h"#include "gdk.h"hash_tHASHmask(size_t cnt){	size_t m = 8;		/* minumum size */	while (m + m < cnt)		m += m;	if (m + m - cnt < 2 * (cnt - m))		m += m;	return m;}static voidHASHclear(Hash *h){	hash_t *i, *j;	for (i = h->hash, j = i + h->mask; i <= j; i++) {		*i = HASH_MAX;	}}Hash *HASHnew(Heap *hp, int tpe, size_t size, hash_t mask){	Hash *h = NULL;	if (HEAPalloc(hp, mask + size, sizeof(hash_t)) < 0)		return NULL;	h = (Hash*)GDKmalloc(sizeof(Hash));	if (!h)		return h;	h->lim = size;	h->mask = mask - 1;	h->link = (hash_t *) hp->base;	h->hash = h->link + h->lim;	h->type = tpe;	h->heap = hp;	HASHclear(h);		/* zero the mask */	return h;}@= starthash	for (xx = BUNsize(b); r < p; r += xx) {		ptr v = BUN@2(b, r);		hash_t c = hash_@1(h, v);		if (h->hash[c] == HASH_MAX && nslots-- == 0) {			break; /* mask too full */		}		h->link[i] = h->hash[c];		h->hash[c] = (hash_t) i++;	}	break;@= finishhash	for (xx = BUNsize(b); p < q; p += xx) {		ptr v = BUN@2(b, p);		hash_t c = hash_@1(h, v);		h->link[i] = h->hash[c];		h->hash[c] = (hash_t) i++;	}	break;@-The prime routine for the BAT layer is to create a new hash index.Its argument is the element type and the maximum number of BUNsbe stored under the hash function.@cBAT *BAThash(BAT *b, size_t masksize){	gdk_set_lock(GDKhashLock[ABS(b->batCacheid) & BBPLOCKMASK], "BAThash");	if (b->hhash == NULL) {		unsigned int xx, tpe = ATOMstorage(b->htype);		size_t i, cnt = BATcount(b);		hash_t mask;		BUN p = BUNfirst(b), q = BUNlast(b), r;		Hash *h = NULL;		Heap *hp = NULL;		str nme = BBP_physical(b->batCacheid);		/* cnt = 0, hopefully there is a proper capacity from which		 * we can derive enough information */		if (!cnt)			cnt = BATcapacity(b);		if (b->htype == TYPE_void) {			BATDEBUG GDKwarning("BAThash: creating hash-table on void column..\n");			tpe = TYPE_void;		}		/* determine hash mask size */		if (masksize > 0) {			mask = HASHmask(masksize);	/* p = first; so no dynamic scheme */		} else if (b->hkey) {			mask = HASHmask(cnt);	/* p = first; so no dynamic scheme */		} else {			/* dynamic hash: we start with HASHmask(cnt/64); if there are too many collisions			 * we try HASHmask(cnt/16), then HASHmask(cnt/4), and finally HASHmask(cnt).  */			mask = HASHmask(cnt >> 6);			p += (cnt >> 2) * BUNsize(b);	/* try out on first 25% of b */			if (p > q)				p = q;		}                if (mask < 1024) mask = 1024;		do {			size_t nslots = mask >> 3;	/* 1/2 full is too full */			r = BUNfirst(b);			i = BUNindex(b, r);			if (hp) {				HEAPfree(hp);				GDKfree(hp);			}			if (h) 				GDKfree(h);			/* create the hash structures */			hp = (Heap *) GDKmalloc(sizeof(Heap));			hp->filename = GDKmalloc(strlen(nme) + 12);			sprintf(hp->filename, "%s.%chash", nme, b->batCacheid > 0 ? 'h' : 't');			if ((h = HASHnew(hp, ATOMtype(b->htype), BATcapacity(b), (size_t) mask)) == NULL) {				gdk_unset_lock(GDKhashLock[ABS(b->batCacheid) & BBPLOCKMASK], "BAThash");				GDKfree(hp->filename);				GDKfree(hp);				return NULL;			}			if (hp->storage & STORE_MMAP)				MT_mmap_pin(hp->base, hp->maxsize);			switch (tpe) {#ifndef NOEXPAND_CHR			case TYPE_chr:				@:starthash(chr,hloc)@#endif#ifndef NOEXPAND_BTE			case TYPE_bte:				@:starthash(bte,hloc)@#endif#ifndef NOEXPAND_SHT			case TYPE_sht:				@:starthash(sht,hloc)@#endif#if !defined(NOEXPAND_INT) || !defined(NOEXPAND_FLT)			case TYPE_int:			case TYPE_flt:				@:starthash(int,hloc)@#endif#if !defined(NOEXPAND_DBL) || !defined(NOEXPAND_LNG)
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -