gdk_setop.mx

来自「这个是内存数据库中的一个管理工具」· MX 代码 · 共 807 行 · 第 1/2 页

MX
807
字号
@' The contents of this file are subject to the MonetDB Public License@' Version 1.1 (the "License"); you may not use this file except in@' compliance with the License. You may obtain a copy of the License at@' http://monetdb.cwi.nl/Legal/MonetDBLicense-1.1.html@'@' Software distributed under the License is distributed on an "AS IS"@' basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the@' License for the specific language governing rights and limitations@' under the License.@'@' The Original Code is the MonetDB Database System.@'@' The Initial Developer of the Original Code is CWI.@' Portions created by CWI are Copyright (C) 1997-2007 CWI.@' All Rights Reserved.@f gdk_setop@a Peter Boncz@* Set OperationsSet operations are provided in two series:@itemize@itemk-@emph{operand}, which look only at the head column.@items-@emph{operand} series, that look at the whole BUN.@end itemizeOperands provided are:@itemize@item [s,k]uniqueproduces a copy of the bat, with double elimination@item [s,k]unionproduces a bat union.@item [s,k]diffproduces bat difference.@item [s,k]intersectionproduce bat intersection.@end itemizeImplementations typically take two forms: if the input relation(s) is/areordered, a merge-algorithm is used. Otherwise, hash-indices are producedon demand for the hash-based versions.The @emph{[k,s]intersect(l,r)} operations result in all BUNs of @emph{l} thatare also in @emph{r}. They do not do double-elimination over the @emph{l} BUNs.The @emph{[k,s]diff(l,r)} operations result in all BUNs of @emph{l} that arenot in @emph{r}. They do not do double-elimination over the @emph{l} BUNs.The @emph{[k,s]union(l,r)} operations result in all BUNs of @emph{l}, plus all BUNs of @emph{r} that are not in @emph{l}. They do not do double-eliminationover the @emph{l} nor @emph{r} BUNs.Operations with double-elimination can be formed by performing @emph{[k,s]unique(l)} on their operands.The @emph{kintersect(l,r)} is used also as implementation for the @emph{semijoin()}.@{@h#ifndef _GDK_SETOP_H#define _GDK_SETOP_H#include "gdk.h"#define HITk(t1,t2)             TRUE#define HITs(t1,t2)             ((*cmp)(t1,t2) == 0)#define TAILCHECKs(l,r)         TYPEerror(BATttype(l),	BATttype(r))#define TAILCHECKk(l,r)         FALSE#define EQUALs(t1,t2)           ((*cmp)(t1,t2) == 0 && (*cmp)(t1,tnil))#define EQUALk(t1,t2)           TRUE#define FLIPs(l,r)              TRUE#define FLIPk(l,r)              FALSE#define HITintersect(h,t)       bunfastins(bn,h,t)#define HITdiff(h,t)#define MISSintersect(h,t)#define MISSdiff(h,t)           bunfastins(bn,h,t)#define ENDintersect(h,t)#define ENDdiff(h,t)            for(;p1<q1;p1+=s1) bunfastins(bn,h,t)#define RALIGNdiff(bn,l,r)      FALSE#define RALIGNintersect(bn,l,r) ((BAThordered(l)&BAThordered(r)&1) && l->hkey\				 && BATcount(bn)==BATcount(r))#endif /* _GDK_SETOP_H */@c#include "monetdb_config.h"#include "gdk.h"#include "gdk_setop.h"@+ Double EliminationComes in two flavors: looking at one column, or at two at-a-time.Implementation is either merge- or hash-based.@= mergeelim	BATloopFast(b, p, q, xx) {		ptr h = BUNh@2(b,p);		ptr t = BUNt@3(b,p);		for (r = p + xx; (r < q) && (@4 == 0); r += xx) {			if (HIT@1(t, BUNt@3(b,r)))				goto next@2@3@5;		}		bunfastins(bn, h, t);  next@2@3@5:;	}@= hashelim	zz = BUNindex(bn, BUNfirst(bn));	if (!bn->hhash){		if (BAThash(bn, BATcapacity(bn)/2) == NULL) {			BBPreclaim(bn);			return NULL;		}	}	BATloopFast(b, p, q, xx) {		ptr h = BUNh@2(b,p);		ptr t = BUNt@3(b,p);		int ins = 1;		hash_t yy;		if (BATprepareHash(bn)) {			BBPreclaim(bn);			return NULL;		}		HASHloop@4(bn, bn->hhash, yy, h, r) {			if (HIT@1(t, BUNt@3(bn,r))) {				ins=0;				break;			}		}		if (ins) {			bunfastins(bn, h, t);			if (bn->hhash)				HASHins@4(bn->hhash, (hash_t) zz, h);			zz++;		}	}@= elim	{		int xx, (*cmp)(ptr, ptr) = BATatoms[b->ttype].atomCmp;		size_t zz;		BUN p, q, r;		if (BAThordered(b)&1) {			if (b->tvarsized) {				@:mergeelim(@1,@2,var,@4,@3)@			} else {				@:mergeelim(@1,@2,loc,@4,@3)@			}		} else if (b->tvarsized) {			@:hashelim(@1,@2,var,@3)@		} else {			@:hashelim(@1,@2,loc,@3)@		}		(void) cmp;		break;	}@= elim_doubles	switch(ATOMstorage(b->htype)) {#ifndef NOEXPAND_CHR	case TYPE_chr:		@:elim(@1,loc,_chr,simple_CMP(h,BUNhloc(b,r),chr))@#endif#ifndef NOEXPAND_BTE	case TYPE_bte:		@:elim(@1,loc,_bte,simple_CMP(h,BUNhloc(b,r),bte))@#endif#ifndef NOEXPAND_SHT	case TYPE_sht:		@:elim(@1,loc,_sht,simple_CMP(h,BUNhloc(b,r),sht))@#endif#ifndef NOEXPAND_INT	case TYPE_int:		@:elim(@1,loc,_int,simple_CMP(h,BUNhloc(b,r),int))@#endif#ifndef NOEXPAND_FLT	case TYPE_flt:		@:elim(@1,loc,_flt,simple_CMP(h,BUNhloc(b,r),flt))@#endif#ifndef NOEXPAND_DBL	case TYPE_dbl:		@:elim(@1,loc,_dbl,simple_CMP(h,BUNhloc(b,r),dbl))@#endif#ifndef NOEXPAND_LNG	case TYPE_lng:		@:elim(@1,loc,_lng,simple_CMP(h,BUNhloc(b,r),lng))@#endif	default:		{			int (*merge)(ptr, ptr)= BATatoms[b->htype].atomCmp;			if (b->hvarsized) {				@:elim(@1,var,var,((*merge)(h,BUNhvar(b,r))))@			} else {				@:elim(@1,loc,loc,((*merge)(h,BUNhloc(b,r))))@			}		}	}@cBAT *BATins_kunique(BAT *bn, BAT *b){	bit unique = FALSE;	BATcheck(b, "BATins_kunique: src BAT");	BATcheck(bn, "BATins_kunique: dst BAT");	unique = (BATcount(bn)==0);	@:elim_doubles(k)@	if (unique && bn->hkey == FALSE) {		/* we inserted unique head-values into an empty BAT;		   hence, the resulting BAT's head is (now) unique/key ... */		BATkey(bn,TRUE);	}	return bn;      bunins_failed:	BBPreclaim(bn);	return NULL;}static BAT *BATins_sunique(BAT *bn, BAT *b){	bit unique = FALSE;	BUN fst1, fst2, last1, last2;	BATcheck(b, "BATins_sunique: src BAT");	BATcheck(bn, "BATins_sunique: dst BAT");	unique = (BATcount(bn)==0);	fst1 = BUNfirst(bn);	fst2 = BUNfirst(b);	last1 = (BUNlast(bn) - BUNsize(bn));	last2 = (BUNlast(b) - BUNsize(b));	if (BATcount(b) && (BAThordered(b) & 1) && ATOMcmp(b->htype, BUNhead(b, fst2), BUNhead(b, last2)) == 0 &&	    (BATcount(bn) == 0 || (ATOMcmp(bn->htype, BUNhead(bn, fst1), BUNhead(b, fst2)) == 0 && (BAThordered(bn) & 1) && ATOMcmp(bn->htype, BUNhead(bn, fst1), BUNhead(bn, last1)) == 0))) {		return BATins_kunique(BATmirror(bn), BATmirror(b));	}	if (BATcount(b) && (BATtordered(b) & 1) && ATOMcmp(b->ttype, BUNtail(b, fst2), BUNtail(b, last2)) == 0 &&	    (BATcount(bn) == 0 || (ATOMcmp(bn->ttype, BUNtail(bn, fst1), BUNtail(b, fst2)) == 0 && (BATtordered(bn) & 1) && ATOMcmp(bn->ttype, BUNtail(bn, fst1), BUNtail(bn, last1)) == 0))) {		return BATins_kunique(bn, b);	}	if ((BATtordered(b) & 1) && ATOMstorage(b->ttype) < TYPE_str) {		bn= BATmirror(bn);		b= BATmirror(b);	}	@:elim_doubles(s)@	if (unique && bn->batSet == FALSE) {		/* we inserted unique BUNs into an empty BAT;		   hence, the resulting BAT is (now) unique/set ... */		BATset(bn,TRUE);	}	return bn;      bunins_failed:	BBPreclaim(bn);	return NULL;}@- UniqueThe routine @`BATsunique@5 removes duplicate BUNs,The routine @`BATkunique@5 removes duplicate head entries.@cBAT *BATkunique(BAT *b){	BAT *bn;	BATcheck(b, "BATkunique:");	if (b->hkey) {		bn = BATcopy(b, b->htype, b->ttype, FALSE);	} else {		size_t cnt = BATcount(b);		if (cnt > 10000) {			BAT *tmp2 = NULL, *tmp1, *tmp0 = VIEWhead_(b, BAT_WRITE);			if (tmp0) {				tmp1 = BATsample(b, 1000);				if (tmp1) {					tmp2 = BATkunique(tmp1);					if (tmp2) {						cnt = (size_t) ((((lng) BATcount(tmp2)) * cnt) / 900);						BBPreclaim(tmp2);					}					BBPreclaim(tmp1);				}				BBPreclaim(tmp0);			}			if (tmp2 == NULL)				return NULL;		}		bn = BATnew(BAThtype(b), BATttype(b), cnt);		if (bn == NULL || BATins_kunique(bn, b) == NULL)			return NULL;	}	/* property management */	if (b->halign == 0) {		b->halign = OIDnew(1);	}	BATkey(bn, TRUE);	/* this we accomplished */	BATkey(BATmirror(bn), b->tkey);	bn->hsorted = BAThordered(b);	bn->tsorted = BATtordered(b);	if (BATcount(bn) == BATcount(b)) {		ALIGNset(bn, b);	}	return bn;}BAT *BATukunique(BAT *b){	BAT *v, *bn;	BATcheck(b, "BATukunique:");	bn = BATkunique(v = VIEWhead(b));	BBPreclaim(v);	return bn;}BAT *BATsunique(BAT *b){	BAT *bn;	BATcheck(b, "BATsnique:");	if (b->hkey || b->tkey || b->batSet) {		bn = BATcopy(b, b->htype, b->ttype, FALSE);	} else {		size_t cnt = BATcount(b);		if (cnt > 10000) {			BAT *tmp2 = NULL, *tmp1 = BATsample(b, 1000);			if (tmp1) {				tmp2 = BATkunique(tmp1);				if (tmp2) {					cnt = BATcount(tmp2) * (cnt / 1000);					BBPreclaim(tmp2);				}				BBPreclaim(tmp1);			}			if (tmp2 == NULL)				return NULL;		}		bn = BATnew(BAThtype(b), BATttype(b), cnt);		if (bn == NULL || BATins_sunique(bn, b) == NULL)			return NULL;	}	/* property management */	BATset(bn, TRUE);	/* this we accomplished */	BATkey(bn, b->hkey);	BATkey(BATmirror(bn), b->tkey);	bn->hsorted = BAThordered(b);	bn->tsorted = BATtordered(b);	if (BATcount(bn) == BATcount(b)) {		ALIGNset(bn, b);	}	return bn;}@}@+ Difference and IntersectDifference and Intersection are handled together. For each routinethere are two versions: @`BATkdiff@5(l,r) and @`BATkintersect@5(l,r) (whichlook at the head column only), versus @`BATsdiff@5(l,r) and@`BATsintersect@5(l,r) (looking at both columns).TODO synced/key case..@{@= mergecheck	BUN p1 = BUNfirst(l), p2 = BUNfirst(r);	BUN q1 = BUNlast(l),  q2 = BUNlast(r);	int s1 = BUNsize(l),  s2 = BUNsize(r);	ALGODEBUG THRprintf(GDKout, "#BATins_@1@2: mergecheck[@1, @2, @3, @4, @5];\n");	if (p2 < q2) BATloopFast(l, p1, q1, s1) {		ptr  h = BUNh@2(l,p1);		ptr  t = BUNtail(l,p1);		ptr h2 = BUNh@2(r,p2);		int c;		while ((c = @4) > 0) {			if ((p2 += s2) >= q2)				goto end@2@3;			h2 = BUNh@2(r,p2);		}		if (c == 0) {			h2 = hnil;			if (@4) { /* check for not-nil (nils don't match anyway) */				BUN pb = p2;				for (;;) {					if (EQUAL@5(t, BUNtail(r,pb))) {						HIT@1(h, t);						break;					}					if ((pb += s2) >= q2) {						MISS@1(h, t);						break;					}					h2 = BUNh@2(r,pb);					if (@4) {						MISS@1(h, t);						break;

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?