📄 gdk_relop.mx
字号:
dst_t *lim = (dst_t*) (bn->batBuns->base + bn->batBuns->size); size_t li=0, lsz = BUNsize(l)/sizeof(@1), lhi = BATcount(l)*lsz; size_t ri=0, rsz = BUNsize(r)/sizeof(@1), rhi = (MAX(8,BATcount(r))-8)*rsz; @1 *la = (@1*) BUNtloc(l,BUNfirst(l)); @1 *ra = (@1*) BUNhloc(r,BUNfirst(r)); for(ri=li=0; li<lhi; li+=lsz, ri=0) { ssize_t off = (r->ttype == TYPE_void)?((ssize_t) r->tseqbase):r->tloc - r->hloc; ssize_t len = cur - (dst_t*) bn->batBuns->base; @1 v = la[li]; oid o; if (v == @1_nil) continue; /* unroll 8 times, factor out cur->l and memory re-allocation checking */ while(1) { if (cur+8 >= lim) { bn->batBuns->free = ((BUN) cur) - bn->batBuns->base; BATsetcount(bn,bn->batBuns->free/BUNsize(bn)); if (BATextend(bn, 8 + (size_t) (BATcount(bn) * (((dbl) lhi)/(li+1)))) == NULL) return 1; cur = (dst_t*) (bn->batBuns->base + bn->batBuns->free); lim = (dst_t*) (bn->batBuns->base + bn->batBuns->size); } if (ri >= rhi) break; if (r->ttype == TYPE_void) { @:nlthetajoin_unroll8(@3,void)@ } else { @:nlthetajoin_unroll8(@3,oid)@ } } /* do rest in more expensive loop */ if (r->ttype == TYPE_void) { size_t cnt = BATcount(r)*rsz; for(; ri < cnt; ri+=rsz) @:nlthetajoin_void(@3)@ } else { size_t cnt = BATcount(r)*rsz; for(; ri < cnt; ri+=rsz) @:nlthetajoin_oid(@3)@ } /* fill in the left oids for the generated result tuples */ o = *(oid*) BUNhead(l, ((BUN)(la+li))-l->tloc); len -= cur - (dst_t*) bn->batBuns->base; while(len < 0) cur[len++].l = o; bn->batBuns->free = ((BUN) cur) - bn->batBuns->base; } BATsetcount(bn,bn->batBuns->free/BUNsize(bn)); return 0;}@= nlthetajoin_call case TYPE_@1: if (nlthetajoin_@2_@1(bn,l,r)) goto bunins_failed; break;@= nlthetajoin_tpe#ifndef NOEXPAND_CHR @:nlthetajoin_@1(chr,@2,@3)@#endif#ifndef NOEXPAND_BTE @:nlthetajoin_@1(bte,@2,@3)@#endif#ifndef NOEXPAND_SHT @:nlthetajoin_@1(sht,@2,@3)@#endif#ifndef NOEXPAND_INT @:nlthetajoin_@1(int,@2,@3)@#endif#ifndef NOEXPAND_LNG @:nlthetajoin_@1(lng,@2,@3)@#endif#ifndef NOEXPAND_FLT @:nlthetajoin_@1(flt,@2,@3)@#endif#ifndef NOEXPAND_DBL @:nlthetajoin_@1(dbl,@2,@3)@#endif@ctypedef struct { oid l,r; } dst_t;@:nlthetajoin_tpe(impl,gt,>)@@:nlthetajoin_tpe(impl,ge,>=)@@:nlthetajoin_tpe(impl,lt,<)@@:nlthetajoin_tpe(impl,le,<=)@@:nlthetajoin_tpe(impl,eq,==)@BAT *BATnlthetajoin(BAT *l, BAT *r, int op, size_t estimate) { int optimize = (l->htype == TYPE_oid || BAThdense(l)) && (r->ttype == TYPE_oid || BATtdense(r)) \ /* the follwoing might be trivial cases, * but the "optimized" nlthetajoin implementation cannot handle them, yet ... */ && l->ttype != TYPE_void && r->htype != TYPE_void; BAT *bn = BATnew(ATOMtype(l->htype), ATOMtype(r->ttype), estimate+128); int lo = 0, hi = 0; if (bn == NULL) return NULL; if (op == JOIN_GT) { lo = 1; hi = GDK_int_max; if (optimize) switch(ATOMstorage(l->ttype)) { @:nlthetajoin_tpe(call,gt)@ default: optimize=0; } } else if (op == JOIN_GE) { lo = 0; hi = GDK_int_max; if (optimize) switch(ATOMstorage(l->ttype)) { @:nlthetajoin_tpe(call,ge)@ default: optimize=0; } } else if (op == JOIN_LT) { lo = GDK_int_min; hi = -1; if (optimize) switch(ATOMstorage(l->ttype)) { @:nlthetajoin_tpe(call,lt)@ default: optimize=0; } } else if (op == JOIN_LE) { lo = GDK_int_min; hi = 0; if (optimize) switch(ATOMstorage(l->ttype)) { @:nlthetajoin_tpe(call,le)@ default: optimize=0; } } else if (op == JOIN_EQ) { if (optimize) switch(ATOMstorage(l->ttype)) { @:nlthetajoin_tpe(call,eq)@ default: optimize=0; } } if (!optimize) { int lx, rx, (*cmp) (ptr, ptr) = BATatoms[l->ttype].atomCmp; ptr nil = ATOMnilptr(l->ttype); BUN rp, rq, lp, lq; BATloopFast(l, lp, lq, lx) { ptr v = (ptr) BUNtail(l, lp); if ((*cmp) (v, nil) == 0) { continue; } BATloopFast(r, rp, rq, rx) { ptr w = (ptr) BUNhead(r, rp); int c = (*cmp) (v, w); if ((c >= lo) & (c <= hi)) { bunfastins(bn, BUNhead(l, lp), BUNtail(r, rp)); } } } } bn->hsorted = l->hsorted; bn->tsorted = 0; return bn; bunins_failed: BBPreclaim(bn); return NULL;}@}@+ BandjoinA non-equi join of two relations R and S is called a Band-join ifthe join predicate requires the values of R to fall within a given range.This kind of joins is encountered in real world domains, such as thoseinvolved with time and distance.The boundary conditions for the bandjoin are constants or a NULL value.The latter enables encoding of arbitrary theta joins using the moregeneral bandjoin.Incidentally note that c1 = c2 = 0 leads to an equi-join.The straight forward implementation uses a nested loop.The current implementation does not optimize processing, becausethe impact of the choices is not yet clear.The hash indexing routines have been extended with a Band argument.@{@cBAT *BATbandjoin(BAT *l, BAT *r, ptr c1, ptr c2){ BAT *bn; BUN p, q; BUN v, w; @:joincheck(BATbandjoin,l->ttype,r->htype)@ @:joinbat(JOIN_BAND,BATbandjoin(l,r,c1,c2),oid_nil)@ switch (ATOMstorage(r->htype)) { case TYPE_chr: @:bandjoin(chr)@ case TYPE_bte: @:bandjoin(bte)@ case TYPE_sht: @:bandjoin(sht)@ case TYPE_int: @:bandjoin(int)@ case TYPE_flt: @:bandjoin(flt)@ case TYPE_dbl: @:bandjoin(dbl)@ case TYPE_lng: @:bandjoin(lng)@ default: GDKerror("BATbandjoin: type not implemented\n"); return NULL; } /* set sorted flags by hand, because we used BUNfastins() */ bn->hsorted = BAThordered(l); bn->tsorted = FALSE; ESTIDEBUG THRprintf(GDKout, "#BATbandjoin: actual resultsize: " SZFMT "\n", BATcount(bn)); return bn;}@@}@-The easiest case is to implement a nested loop for band operations.Choice point is to determine the status of the NULL values in the finalresult.@{@= bandjoin{ int xx, yy; @1 *x1; @1 *x2; BATloopFast(l, p, q, xx) { x1 = (@1 *) BUNtloc(l, p); BATloopFast(r, v, w, yy) { x2 = (@1 *) BUNhloc(r, v); if ((*x1 >= *x2 - *(@1 *) c1) && (*x1 <= *x2 + *(@1 *) c2)) { if (BUNfastins(bn, BUNhead(l,p), BUNtail(r, v)) == NULL) { BBPreclaim(bn); return NULL; } } } } break;}@@}@+ SemijoinThe @%BATsemijoin@ performs a semijoin over @%l@ and @%r@. It returnsa subset of @%l@ that matches at least one element in @%r@.The result inherits the integrity properties.Various algorithms exist. The main one BATkintersect() residesoutside this file, in the set-operations implementation (gdk_setop).Other variants for the semijoin include the fetch-semijoin(for dense join columns), the reverse semijoin that loops over rinstead of l, and semijoin using binary search in r.@{@= semijoinbat bn = BATnew(BAThtype(@1), BATttype(@1), MAX(BATTINY, MIN(BATcount(l), BATcount(r)))); ESTIDEBUG THRprintf(GDKout, "#%s.semijoinbat: estimated resultsize: " SZFMT "\n",@4,MAX(BATTINY, MIN(BATcount(l), BATcount(r)))); if (bn == NULL) { return bn; } BATkey(bn, BAThkey(@1)); BATkey(BATmirror(bn), BATtkey(@1)); bn->hsorted = @2; bn->tsorted = @3;@}@-In the sorted cases with a low semijoin hit-rate, we do lookup usingprobe-based binary search, instead of a full merge scan.Normal merge-semijoin with a full scan on both is handled by kintersect(default exit) if both relations are large or if their sizes do notdiffer significantly.@{@= binsemijoin{ BUN lp, lq; int xx; ptr nil = ATOMnilptr(l->htype); if (cpy == l) { BATloopFast(l, lp, lq, xx) { ptr v = BUNh@1(l,lp); if (!@3_EQ(v, nil, @2) && SORTfnd_@2(r, v)) { bunfastins(bn, v, BUNtail(l,lp)); } } } else { BATloopFast(l, lp, lq, xx) { BUN rp, rq; ptr v = BUNh@1(l,lp); if (!@3_EQ(v, nil, @2)) { int yy; SORTloop_@2(BATmirror(r), rp, rq, v, v, yy) { bunfastins(bn, v, BUNtail(r,rp)); } } } }}break;@cstatic BAT *BATbinsemijoin(BAT *l, BAT *r, BAT *cpy){ BAT *bn, *del = NULL; int loc, var; @:joincheck(BATbinsemijoin,l->htype,r->htype)@ @:semijoinbat(cpy,TRUE,(l==cpy&&(BATtordered(l)&1)),"BATbinsemijoin")@ if (!(BAThordered(r) & 1)) { del = r = BATsort(r); if (del == NULL) return NULL; } BATmmap_pin(r); switch (loc = var = ATOMstorage(l->htype)) {#ifndef NOEXPAND_CHR case TYPE_chr: @:binsemijoin(loc,chr,simple)@#endif#ifndef NOEXPAND_BTE case TYPE_bte: @:binsemijoin(loc,bte,simple)@#endif#ifndef NOEXPAND_SHT case TYPE_sht: @:binsemijoin(loc,sht,simple)@#endif#ifndef NOEXPAND_INT case TYPE_int: @:binsemijoin(loc,int,simple)@#endif#ifndef NOEXPAND_FLT case TYPE_flt: @:binsemijoin(loc,flt,simple)@#endif#ifndef NOEXPAND_DBL case TYPE_dbl: @:binsemijoin(loc,dbl,simple)@#endif#ifndef NOEXPAND_LNG case TYPE_lng: @:binsemijoin(loc,lng,simple)@#endif default: if (l->hvarsized) { if (r->hvarsized) { @:binsemijoin(var,var,atom)@ } else { @:binsemijoin(var,loc,atom)@ } } else { if (r->hvarsized) { @:binsemijoin(loc,var,atom)@ } else { @:binsemijoin(loc,loc,atom)@ } } } BATmmap_unpin(r); /* propagate properties */ bn->hsorted = l->hsorted; bn->tsorted = 0; if (BATcount(bn) == BATcount(l)) { if (l == cpy) { ALIGNset(bn, l); } else if (BAThkey(l) && BAThkey(r)) { ALIGNsetH(bn, l); } } if (del) BBPreclaim(del); ESTIDEBUG THRprintf(GDKout, "#BATbinsemijoin: actual resultsize: " SZFMT "\n", BATcount(bn)); return bn; bunins_failed: BATmmap_unpin(r); BBPreclaim(bn); return NULL;}@-The reverse semijoin is only better if the other side (r) is muchsmaller than l, and iff you already have the hash table on l. It uses hash tables onboth relations: on r to check that no item is processed twice (not necessary to checkiff BAThkey(r) and one on l to find the matching tuples.@{@= revsemijoin{ int xx; hash_t yy; BUN lp = 0, rp = 0, rq = 0, rr = 0; ptr nil = ATOMnilptr(l->htype); if (merge) { ALGODEBUG THRprintf(GDKout, "#BATrevsemijoin: merge\n"); BATloopFast(r, rp, rq, xx) { ptr v = BUN@3(r,rp); rr = rp + xx; if (rr < rq && @1_EQ(v,BUN@3(r,rr),@2)) continue; if (!@1_EQ(v, nil, @2)) { HASHloop_@2(l, l->hhash, yy, v, lp) bunfastins(bn, v, BUNtail(l, lp)); } } } else if (rdoubles) { ALGODEBUG THRprintf(GDKout, "#BATrevsemijoin: rdoubles\n"); BATloopFast(r, rp, rq, xx) { ptr v = BUN@3(r,rp); HASHloop_@2(r, r->hhash, yy, v, rr) break; if (rr != rp) continue; if (!@1_EQ(v, nil, @2)) { HASHloop_@2(l, l->hhash, yy, v, lp) bunfastins(bn, v, BUNtail(l, lp)); } } } else { BATloopFast(r, rp, rq, xx) { ptr v = BUN@3(r,rp); if (!@1_EQ(v, nil, @2)) { HASHloop_@2(l, l->hhash, yy, v, lp) bunfastins(bn, v, BUNtail(l, lp)); } } }}break;@cstatic BAT *BATrevsemijoin(BAT *l, BAT *r){ int any, rdoubles = (BAThkey(r) == 0), merge = rdoubles & BAThordered(r); BAT *bn; @:joincheck(BATrevsemijoin,l->htype,r->htype)@ @:semijoinbat(l,FALSE,FALSE,"BATrevsemijoin")@ if (BATprepareHash(l)) return NULL; if (rdoubles && BATprepareHash(r)) return NULL; BATmmap_pin(l); switch (any = ATOMstorage(l->htype)) {#ifndef NOEXPAND_CHR case TYPE_chr: @:revsemijoin(simple,chr,hloc)@#endif#ifndef NOEXPAND_BTE case TYPE_bte: @:revsemijoin(simple,bte,hloc)@#endif#ifndef NOEXPAND_SHT case TYPE_sht: @:revsemijoin(simple,sht,hloc)@#endif#if !defined(NOEXPAND_INT) || !defined(NOEXPAND_FLT) case TYPE_int: case TYPE_flt: @:revsemijoin(simple,int,hloc)@#endif#if !defined(NOEXPAND_DBL) || !defined(NOEXPAND_LNG) case TYPE_dbl: case TYPE_lng: @:revsemijoin(simple,lng,hloc)@#endif default: @:revsemijoin(atom,any,head)@
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -