📄 gdk_atoms.mx

📁 这个是内存数据库中的一个管理工具
💻 MX
📖 第 1 页 / 共 5 页
字号:
	@:atommem(char,6)@	if (*src == bit_nil) {		strcpy(*dst, "nil");		return 3;	} else if (*src) {		strcpy(*dst, "true");		return 4;	}	strcpy(*dst, "false");	return 5;}bit *bitRead(bit *a, stream *s, size_t cnt){	stream_read(s, (char *) a, 1, cnt);	return stream_errnr(s) ? NULL : a;}intbitWrite(bit *a, stream *s, size_t cnt){	if (stream_write(s, (char *) a, 1, cnt) == (ssize_t) cnt)		return GDK_SUCCEED;	else		return GDK_FAIL;}intbatFromStr(char *src, int *len, bat **dst){	char *s, *t, *r = src;	int c, sign = 1;	bat bid;	@:atommem(bat,sizeof(bat))@	while (GDKisspace(*r))		r++;	if (*r == '<')		r++;	if (*r == '~') {		r++;		sign = -1;	}	t = r;	while ((c = *t) && (c == '_' || GDKisalnum(c)))		t++;	s = (char *) alloca((unsigned) (1 + t - r));	strncpy(s, r, t - r);	s[t - r] = 0;	bid = BBPindex(s);	**dst = bid == 0 ? bat_nil : sign * bid;	return (int) (t + (c == '>') - src);}intbatToStr(char **dst, int *len, bat *src){	bat b = *src;	int i;	str s;	if (b == bat_nil || (s = BBPname(b)) == NULL || *s == 0) {		@:atommem(char,4)@		strcpy(*dst, "nil");		return 3;	}	i = (int) (strlen(s) + 4);	@:atommem(char,i)@	snprintf(*dst, *len, "<%s%s>", b < 0 ? "~" : "", s);	return (int) strlen(*dst);}bat *batRead(bat *a, stream *s, size_t cnt){	stream_readIntArray(s, (int *) a, cnt);	/* bat==int */	return stream_errnr(s) ? NULL : a;}intbatWrite(bat *a, stream *s, size_t cnt){	/* bat==int */	return stream_writeIntArray(s, (int *) a, cnt) ? 
GDK_SUCCEED : GDK_FAIL;}@= numfromstrint@1FromStr(char* src, int * len, @1 **dst){	int sign = 1, error = 0;	@1 base = 0;	str q, p = src;	@:atommem(@1,sizeof(@1))@	while (GDKisspace(*p))		p++; 	if (p[0] == 'n' && p[1] == 'i' && p[2] == 'l') {		base = @1_nil; p += 3;	} else {		if (*p == '-' || *p == '+') {			if (*p++ == '-')				sign = -1;		}		if (!num10(*p)) {			error = 1;			p = src;		}		while (*p == '0')			p++;		for (q = p; num10(*p); p++) {			base = mult10(base) + base10(*p);		}		if (p - q > @2 || (p - q == @2 && strncmp(q, "@3", @2) > 0)) {			error = 1; /* overflow */		}		if (sizeof(@1) == 8 && p[0] == 'L' && p[1] == 'L') {			p += 2;		}	}	**dst = error ? @1_nil : sign * base;	return (int) (p - src);}@c@= atom_io @1 *@1Read(@1 *a, stream *s, size_t cnt){	stream_read@2Array(s, (@3*)a, cnt);	return stream_errnr(s) ? NULL : a;}int @1Write(@1 *a, stream *s, size_t cnt){	return stream_write@2Array(s, (@3*)a, cnt) ? GDK_SUCCEED : GDK_FAIL;}@c@:numfromstr(bte,3,127)@@:atomtostr(bte,"%hhd",bte)@@:atom_io(bte,Bte,bte)@@:numfromstr(sht,5,32767)@@:atomtostr(sht,"%hd",sht)@@:atom_io(sht,Sht,sht)@@:numfromstr(int,10,2147483647)@@:atomtostr(int,"%d",int)@@:atom_io(int,Int,int)@@:numfromstr(lng,19,9223372036854775807)@@:atomtostr(lng,LLFMT,lng)@@c@:atom_io(lng,Lng,lng)@@intlngToStr(char **dst, int *len, lng *src){	char *p;	int l=0;   	@:atommem(char,lngStrlen)@	if (*src == lng_nil) {		strcpy(*dst, "nil");		return 3;	} 	sprintf(*dst, LLFMT, *src);	return strlen(*dst);}@cintptrFromStr(char *src, int *len, ptr **dst){	int error = 0;	size_t base = 0;	str p = src;	@:atommem(ptr,sizeof(ptr))@	while (GDKisspace(*p))		p++;	if (p[0] == 'n' && p[1] == 'i' && p[2] == 'l') {		error = 1;		p += 3;	} else {		if (p[0] == '0' && (p[1] == 'x' || p[1] == 'X')) {			p += 2;		}		if (!num16(*p)) {			error = 1;			p = src;		}		while (error == 0) {			size_t val = mult16(base) + base16(*p);			if (val < base)				error = 1;			base = val;			p++;			if (!num16(*p))				break;		}	}	**dst = error ? 
((ptr) ptr_nil) : ((ptr) base);	return (int) (p - src);}@:atomtostr(ptr,SZFMT,size_t)@#if SIZEOF_VOID_P == SIZEOF_INT@:atom_io(ptr,Int,int)@#else /* SIZEOF_VOID_P == SIZEOF_LNG */@:atom_io(ptr,Lng,lng)@#endifintdblFromStr(char *src, int *len, dbl **dst){	char *p = src;	double d;	/* alloc memory */	@:atommem(dbl,sizeof(dbl))@	/* on overflow, strtod returns HUGE_VAL and sets errno to	   ERANGE; on underflow, it returns 0 and also sets errno to	   ERANGE.  We accept 0, but not HUGE_VAL. */	errno = 0;	d = strtod(src, &p);	if (p == src || (errno == ERANGE && d != 0)) {		**dst = dbl_nil;	/* default return value is nil */		p = src;	} else {		**dst = (dbl) d;	}	return (int) (p - src);}@:atomtostr(dbl,"%.17g",double)@@:atom_io(dbl,Lng,lng)@#if defined(HAVE_STRTOF) && !HAVE_DECL_STRTOFextern float strtof(const char *, char **);#endifintfltFromStr(char *src, int *len, flt **dst){#ifdef HAVE_STRTOF	char *p = src;#endif	int n = 0;	float f;	/* alloc memory */	@:atommem(flt,sizeof(flt))@#ifdef HAVE_STRTOF	/* on overflow, strtof returns HUGE_VALF and sets errno to	   ERANGE; on underflow, it returns 0 and also sets errno to	   ERANGE.  We accept 0, but not HUGE_VALF. */	errno = 0;	f = strtof(src, &p);	n = (int) (p - src);	if (n == 0 || (errno == ERANGE && f != 0)#ifdef INFINITY	    || f == INFINITY#endif#ifdef NAN#ifndef __PGI	    || f == NAN#endif#endif	    )#else /* no strtof, try sscanf */	if (sscanf(src, "%f%n", &f, &n) <= 0 || n <= 0#ifdef INFINITY	    || f == INFINITY#endif#ifdef NAN#ifndef __PGI	    || f == NAN#endif#endif	    )#endif	{		**dst = flt_nil;	/* default return value is nil */		n = 0;	} else {		**dst = (flt) f;	}	return n;}@:atomtostr(flt,"%.9g",float)@@:atom_io(flt,Int,int)@@}@+ String Atom ImplementationThe Built-in type string is partly handled in an atom extensionlibrary. The main reason is to limit the number of built-in typesin the BAT library kernel. Moreover, an extra indirection for a string is lessharmful than for manipulation of, e.g. 
an int.
The internal representation of strings is without escape sequences.
When the string is printed we should add the escapes back into it.
The current escape policy is that single- and double-quote can be preceded by a
backslash. Furthermore, the backslash may be followed by three
octal digits to denote a character.
@- Automatic Double Elimination
Because in many typical situations lots of double string values occur
in tables, the string insertion provides automatic double elimination.
To do this, a GDK_STRHASHTABLE(=1024) bucket hashtable is hidden in the first 4096 (8192 on 64-bit architectures) bytes of the string heap, consisting of integer offsets of the first string hashing to that bucket in the heap. Furthermore, the 4(8) bytes in front of each string in the heap hold an integer offset to the next string hashing to the same number.
However, in many other situations the cardinality of string columns is large,
or the string values might even be unique. In those cases, our fixed-size hash table will start to overflow quickly. Therefore, after the hash table is full
(this is measured very simplistically by checking whether the string heap exceeds a heap size = GDK_ELIMLIMIT -- done this way to keep compatibility with old bat images) we flush the hash table. If one views the string heaps as consecutive chunks
of size GDK_ELIMLIMIT bytes, then all strings within one chunk are double-eliminated.
There is a macro GDK_ELIMBASE(offset) that computes the base of the chunk in which
a certain byte-offset falls.
@-
This is a departure from our previous policy of not looking at the hash tables at all after overflow occurred. The advantage of the new approach is that if we have a value distribution that is skewed (i.e. some values are very frequent), these values will always be double eliminated, saving a considerable amount of space. 
Disadvantage of the approach is that we always have to reserve space for the next pointer (4(8) byte integer offset) that is stored right in front of the string (and consequently have to keep all string chunks and offsets aligned to 4(8)). All this translates into some wasted space. However, if there are that many different strings that the hash table overflows, the strings must be relatively long and the relative storage overhead should be low.@-Notice that this mechanism enables to keep a certain linear storage propertyin the string heaps. This is important if we want to take a BATslice on a BATby simply loading or @strong{mmap()}ping slices of the BAT files on disk into memory.This is relevant in order to process a very large BAT iteratively by taking slicesin order to reduce memory consumption. Notice that if there are few different string values, the hash table has not overflowed, and the string heap size will be small (i.e. < GDK_ELIMLIMIT), so in those cases it is not a problem to load the entire string heap.If the hash table @strong{has} overflowed, we want to be able to only map a slice of the string heap as well. Now, given that the first string in the BAT-slice is called F1 and its heap offset is O1 and the last string in the slice is F2 and its offset is O2, then the slice we should take from the string heap is: @exampleGDK_ELIMBASE(F1) .. MAX(GDK_ELIMBASE(F2)+GDK_ELIMLIMIT), O2+strlen(F2))@end exampleThe routine strElimDoubles() can be used to check whether all strings are still being double-eliminated in the original hash-table.Only then we know that unequal offset-integers in the BUN array meansguaranteed different strings in the heap. This optimization is made at some points in the GDK. Make sure you check GDK_ELIMDOUBLES before assuming this!@{@cintstrElimDoubles(Heap *h){	return GDK_ELIMDOUBLES(h);}@}@- String Comparison, NILs and UTF-8Using the char* type for strings is handy as this is the type of any constant strings in a C/C++ program. 
Therefore, MonetDB uses this definition for @%str@. However, different compilers and platforms use either signed or unsigned characters for the @%char@ type.It is required that string ordering in MonetDB is consistent over platforms though.As for the choice how strings should be ordered, our support for UTF-8 actually imposes that itshould follow `unsigned char' doctrine (like in the AIX native compiler). In this semantics, thoughwe have to take corrective action to ensure that str(nil) is the smallest value of the domain.@{@h#define GDK_STRNIL(s)    ((s) == NULL || *(chr*) (s) == GDK_chr_min)#define GDK_STRLEN(s)    ((GDK_STRNIL(s)?1:strlen(s))+1)#define GDK_STRCMP(l,r)  (GDK_STRNIL(l)?(GDK_STRNIL(r)?0:-1):GDK_STRNIL(r)?1: \                          (*(unsigned char*)(l) < *(unsigned char*)(r))?-1: \                          (*(unsigned char*)(l) > *(unsigned char*)(r))?1: \                          strCmpNoNil((unsigned char*)(l),(unsigned char*)(r)))@cintstrNil(str s){	return GDK_STRNIL(s);}intstrLen(const char *s){	return (int) GDK_STRLEN(s);}intstrCmp(str l, str r){	return GDK_STRCMP(l, r);}intstrCmpNoNil(unsigned char *l, unsigned char *r){	while (*l == *r) {		if (*l == 0)			return 0;		l++;		r++;	}	return (*l < *r) ? -1 : 1;}voidstrHeap(Heap *d, size_t cap){	size_t size;	var_t *h, *e;	cap = MAX(cap, BATTINY);	size = (GDK_STRHASHTABLE + 1) * sizeof(var_t) + MIN(GDK_ELIMLIMIT, cap * 12);	if (HEAPalloc(d, size, 1) >= 0) {		d->free = GDK_STRHASHTABLE * sizeof(var_t);		h = (var_t *) d->base;		for (e = h; e < h + GDK_STRHASHTABLE; e++) {			*e = 0;		}	}}@- Hash Function The string hash function is a very simple hash function that xors and rotates all characters together. 
It is optimized to process 2 charactersat a time (adding 16-bits to the hash value each iteration).@h#define GDK_STRHASH(x,y) {                                             \        str _c = (str) (x);                                            \        for((y)=0; _c[0] && _c[1]; _c+=2) {                            \                 (y) = ((y) << 3) ^ ((y) >> 11) ^ ((y) >> 17) ^ (_c[1] << 8) ^ _c[0];\        }                                                              \        (y) ^= _c[0];                                                  \}@chash_tstrHash(str s){	hash_t res;	GDK_STRHASH(s, res);	return res;}void strCleanHash(Heap *h, int rebuild) {        if (!GDK_ELIMDOUBLES(h)) {                /* flush hash table for security */	        memset(h->base, 0, GDK_STRHASHSIZE);	} else if (rebuild) {                var_t xx, cur=0, end=0;
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -