gdk_setop.mx
来自「这个是内存数据库中的一个管理工具」· MX 代码 · 共 807 行 · 第 1/2 页
MX
807 行
@' The contents of this file are subject to the MonetDB Public License@' Version 1.1 (the "License"); you may not use this file except in@' compliance with the License. You may obtain a copy of the License at@' http://monetdb.cwi.nl/Legal/MonetDBLicense-1.1.html@'@' Software distributed under the License is distributed on an "AS IS"@' basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the@' License for the specific language governing rights and limitations@' under the License.@'@' The Original Code is the MonetDB Database System.@'@' The Initial Developer of the Original Code is CWI.@' Portions created by CWI are Copyright (C) 1997-2007 CWI.@' All Rights Reserved.@f gdk_setop@a Peter Boncz@* Set OperationsSet operations are provided in two series:@itemize@itemk-@emph{operand}, which look only at the head column.@items-@emph{operand} series, that look at the whole BUN.@end itemizeOperands provided are:@itemize@item [s,k]uniqueproduces a copy of the bat, with double elimination@item [s,k]unionproduces a bat union.@item [s,k]diffproduces bat difference.@item [s,k]intersectionproduce bat intersection.@end itemizeImplementations typically take two forms: if the input relation(s) is/areordered, a merge-algorithm is used. Otherwise, hash-indices are producedon demand for the hash-based versions.The @emph{[k,s]intersect(l,r)} operations result in all BUNs of @emph{l} thatare also in @emph{r}. They do not do double-elimination over the @emph{l} BUNs.The @emph{[k,s]diff(l,r)} operations result in all BUNs of @emph{l} that arenot in @emph{r}. They do not do double-elimination over the @emph{l} BUNs.The @emph{[k,s]union(l,r)} operations result in all BUNs of @emph{l}, plus all BUNs of @emph{r} that are not in @emph{l}. They do not do double-eliminationover the @emph{l} nor @emph{r} BUNs.Operations with double-elimination can be formed by performing @emph{[k,s]unique(l)} on their operands.The @emph{kintersect(l,r)} is used also as implementation for the @emph{semijoin()}.@{@h#ifndef _GDK_SETOP_H#define _GDK_SETOP_H#include "gdk.h"#define HITk(t1,t2) TRUE#define HITs(t1,t2) ((*cmp)(t1,t2) == 0)#define TAILCHECKs(l,r) TYPEerror(BATttype(l), BATttype(r))#define TAILCHECKk(l,r) FALSE#define EQUALs(t1,t2) ((*cmp)(t1,t2) == 0 && (*cmp)(t1,tnil))#define EQUALk(t1,t2) TRUE#define FLIPs(l,r) TRUE#define FLIPk(l,r) FALSE#define HITintersect(h,t) bunfastins(bn,h,t)#define HITdiff(h,t)#define MISSintersect(h,t)#define MISSdiff(h,t) bunfastins(bn,h,t)#define ENDintersect(h,t)#define ENDdiff(h,t) for(;p1<q1;p1+=s1) bunfastins(bn,h,t)#define RALIGNdiff(bn,l,r) FALSE#define RALIGNintersect(bn,l,r) ((BAThordered(l)&BAThordered(r)&1) && l->hkey\ && BATcount(bn)==BATcount(r))#endif /* _GDK_SETOP_H */@c#include "monetdb_config.h"#include "gdk.h"#include "gdk_setop.h"@+ Double EliminationComes in two flavors: looking at one column, or at two at-a-time.Implementation is either merge- or hash-based.@= mergeelim BATloopFast(b, p, q, xx) { ptr h = BUNh@2(b,p); ptr t = BUNt@3(b,p); for (r = p + xx; (r < q) && (@4 == 0); r += xx) { if (HIT@1(t, BUNt@3(b,r))) goto next@2@3@5; } bunfastins(bn, h, t); next@2@3@5:; }@= hashelim zz = BUNindex(bn, BUNfirst(bn)); if (!bn->hhash){ if (BAThash(bn, BATcapacity(bn)/2) == NULL) { BBPreclaim(bn); return NULL; } } BATloopFast(b, p, q, xx) { ptr h = BUNh@2(b,p); ptr t = BUNt@3(b,p); int ins = 1; hash_t yy; if (BATprepareHash(bn)) { BBPreclaim(bn); return NULL; } HASHloop@4(bn, bn->hhash, yy, h, r) { if (HIT@1(t, BUNt@3(bn,r))) { ins=0; break; } } if (ins) { bunfastins(bn, h, t); if (bn->hhash) HASHins@4(bn->hhash, (hash_t) zz, h); zz++; } }@= elim { int xx, (*cmp)(ptr, ptr) = BATatoms[b->ttype].atomCmp; size_t zz; BUN p, q, r; if (BAThordered(b)&1) { if (b->tvarsized) { @:mergeelim(@1,@2,var,@4,@3)@ } else { @:mergeelim(@1,@2,loc,@4,@3)@ } } else if (b->tvarsized) { @:hashelim(@1,@2,var,@3)@ } else { @:hashelim(@1,@2,loc,@3)@ } (void) cmp; break; }@= elim_doubles switch(ATOMstorage(b->htype)) {#ifndef NOEXPAND_CHR case TYPE_chr: @:elim(@1,loc,_chr,simple_CMP(h,BUNhloc(b,r),chr))@#endif#ifndef NOEXPAND_BTE case TYPE_bte: @:elim(@1,loc,_bte,simple_CMP(h,BUNhloc(b,r),bte))@#endif#ifndef NOEXPAND_SHT case TYPE_sht: @:elim(@1,loc,_sht,simple_CMP(h,BUNhloc(b,r),sht))@#endif#ifndef NOEXPAND_INT case TYPE_int: @:elim(@1,loc,_int,simple_CMP(h,BUNhloc(b,r),int))@#endif#ifndef NOEXPAND_FLT case TYPE_flt: @:elim(@1,loc,_flt,simple_CMP(h,BUNhloc(b,r),flt))@#endif#ifndef NOEXPAND_DBL case TYPE_dbl: @:elim(@1,loc,_dbl,simple_CMP(h,BUNhloc(b,r),dbl))@#endif#ifndef NOEXPAND_LNG case TYPE_lng: @:elim(@1,loc,_lng,simple_CMP(h,BUNhloc(b,r),lng))@#endif default: { int (*merge)(ptr, ptr)= BATatoms[b->htype].atomCmp; if (b->hvarsized) { @:elim(@1,var,var,((*merge)(h,BUNhvar(b,r))))@ } else { @:elim(@1,loc,loc,((*merge)(h,BUNhloc(b,r))))@ } } }@cBAT *BATins_kunique(BAT *bn, BAT *b){ bit unique = FALSE; BATcheck(b, "BATins_kunique: src BAT"); BATcheck(bn, "BATins_kunique: dst BAT"); unique = (BATcount(bn)==0); @:elim_doubles(k)@ if (unique && bn->hkey == FALSE) { /* we inserted unique head-values into an empty BAT; hence, the resulting BAT's head is (now) unique/key ... */ BATkey(bn,TRUE); } return bn; bunins_failed: BBPreclaim(bn); return NULL;}static BAT *BATins_sunique(BAT *bn, BAT *b){ bit unique = FALSE; BUN fst1, fst2, last1, last2; BATcheck(b, "BATins_sunique: src BAT"); BATcheck(bn, "BATins_sunique: dst BAT"); unique = (BATcount(bn)==0); fst1 = BUNfirst(bn); fst2 = BUNfirst(b); last1 = (BUNlast(bn) - BUNsize(bn)); last2 = (BUNlast(b) - BUNsize(b)); if (BATcount(b) && (BAThordered(b) & 1) && ATOMcmp(b->htype, BUNhead(b, fst2), BUNhead(b, last2)) == 0 && (BATcount(bn) == 0 || (ATOMcmp(bn->htype, BUNhead(bn, fst1), BUNhead(b, fst2)) == 0 && (BAThordered(bn) & 1) && ATOMcmp(bn->htype, BUNhead(bn, fst1), BUNhead(bn, last1)) == 0))) { return BATins_kunique(BATmirror(bn), BATmirror(b)); } if (BATcount(b) && (BATtordered(b) & 1) && ATOMcmp(b->ttype, BUNtail(b, fst2), BUNtail(b, last2)) == 0 && (BATcount(bn) == 0 || (ATOMcmp(bn->ttype, BUNtail(bn, fst1), BUNtail(b, fst2)) == 0 && (BATtordered(bn) & 1) && ATOMcmp(bn->ttype, BUNtail(bn, fst1), BUNtail(bn, last1)) == 0))) { return BATins_kunique(bn, b); } if ((BATtordered(b) & 1) && ATOMstorage(b->ttype) < TYPE_str) { bn= BATmirror(bn); b= BATmirror(b); } @:elim_doubles(s)@ if (unique && bn->batSet == FALSE) { /* we inserted unique BUNs into an empty BAT; hence, the resulting BAT is (now) unique/set ... */ BATset(bn,TRUE); } return bn; bunins_failed: BBPreclaim(bn); return NULL;}@- UniqueThe routine @`BATsunique@5 removes duplicate BUNs,The routine @`BATkunique@5 removes duplicate head entries.@cBAT *BATkunique(BAT *b){ BAT *bn; BATcheck(b, "BATkunique:"); if (b->hkey) { bn = BATcopy(b, b->htype, b->ttype, FALSE); } else { size_t cnt = BATcount(b); if (cnt > 10000) { BAT *tmp2 = NULL, *tmp1, *tmp0 = VIEWhead_(b, BAT_WRITE); if (tmp0) { tmp1 = BATsample(b, 1000); if (tmp1) { tmp2 = BATkunique(tmp1); if (tmp2) { cnt = (size_t) ((((lng) BATcount(tmp2)) * cnt) / 900); BBPreclaim(tmp2); } BBPreclaim(tmp1); } BBPreclaim(tmp0); } if (tmp2 == NULL) return NULL; } bn = BATnew(BAThtype(b), BATttype(b), cnt); if (bn == NULL || BATins_kunique(bn, b) == NULL) return NULL; } /* property management */ if (b->halign == 0) { b->halign = OIDnew(1); } BATkey(bn, TRUE); /* this we accomplished */ BATkey(BATmirror(bn), b->tkey); bn->hsorted = BAThordered(b); bn->tsorted = BATtordered(b); if (BATcount(bn) == BATcount(b)) { ALIGNset(bn, b); } return bn;}BAT *BATukunique(BAT *b){ BAT *v, *bn; BATcheck(b, "BATukunique:"); bn = BATkunique(v = VIEWhead(b)); BBPreclaim(v); return bn;}BAT *BATsunique(BAT *b){ BAT *bn; BATcheck(b, "BATsnique:"); if (b->hkey || b->tkey || b->batSet) { bn = BATcopy(b, b->htype, b->ttype, FALSE); } else { size_t cnt = BATcount(b); if (cnt > 10000) { BAT *tmp2 = NULL, *tmp1 = BATsample(b, 1000); if (tmp1) { tmp2 = BATkunique(tmp1); if (tmp2) { cnt = BATcount(tmp2) * (cnt / 1000); BBPreclaim(tmp2); } BBPreclaim(tmp1); } if (tmp2 == NULL) return NULL; } bn = BATnew(BAThtype(b), BATttype(b), cnt); if (bn == NULL || BATins_sunique(bn, b) == NULL) return NULL; } /* property management */ BATset(bn, TRUE); /* this we accomplished */ BATkey(bn, b->hkey); BATkey(BATmirror(bn), b->tkey); bn->hsorted = BAThordered(b); bn->tsorted = BATtordered(b); if (BATcount(bn) == BATcount(b)) { ALIGNset(bn, b); } return bn;}@}@+ Difference and IntersectDifference and Intersection are handled together. For each routinethere are two versions: @`BATkdiff@5(l,r) and @`BATkintersect@5(l,r) (whichlook at the head column only), versus @`BATsdiff@5(l,r) and@`BATsintersect@5(l,r) (looking at both columns).TODO synced/key case..@{@= mergecheck BUN p1 = BUNfirst(l), p2 = BUNfirst(r); BUN q1 = BUNlast(l), q2 = BUNlast(r); int s1 = BUNsize(l), s2 = BUNsize(r); ALGODEBUG THRprintf(GDKout, "#BATins_@1@2: mergecheck[@1, @2, @3, @4, @5];\n"); if (p2 < q2) BATloopFast(l, p1, q1, s1) { ptr h = BUNh@2(l,p1); ptr t = BUNtail(l,p1); ptr h2 = BUNh@2(r,p2); int c; while ((c = @4) > 0) { if ((p2 += s2) >= q2) goto end@2@3; h2 = BUNh@2(r,p2); } if (c == 0) { h2 = hnil; if (@4) { /* check for not-nil (nils don't match anyway) */ BUN pb = p2; for (;;) { if (EQUAL@5(t, BUNtail(r,pb))) { HIT@1(h, t); break; } if ((pb += s2) >= q2) { MISS@1(h, t); break; } h2 = BUNh@2(r,pb); if (@4) { MISS@1(h, t); break;
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?