📄 gdk_relop.mx
字号:
if (!rcount || (lcount<<3) > rcount) tpe = TYPE_var; /* insert double-eliminated strings as ints */ } bn = BATnew(TYPE_void, ATOMtype(tpe), estimate); if (bn == NULL) goto ready; ESTIDEBUG THRprintf(GDKout, "#BATfetchjoin: estimated resultsize: " SZFMT "\n", lcount); dst = BUNfirst(bn); zz = BUNsize(bn); if (ATOMstorage(tpe) == TYPE_chr) { @:voidfetchjoin(tloc,chr,SIMPLE)@ } else if (ATOMstorage(tpe) == TYPE_bte) { @:voidfetchjoin(tloc,bte,SIMPLE)@ } else if (ATOMstorage(tpe) == TYPE_sht) { @:voidfetchjoin(tloc,sht,SIMPLE)@ } else if (tpe != TYPE_bat && (ATOMstorage(tpe) == TYPE_int || ATOMstorage(tpe) == TYPE_flt)) { /* ensure use of ATOMput for TYPE_bat */ @:voidfetchjoin(tloc,int,SIMPLE)@ } else if (ATOMstorage(tpe) == TYPE_lng || ATOMstorage(tpe) == TYPE_dbl) { @:voidfetchjoin(tloc,lng,SIMPLE)@ } else if (r->tvarsized) { @:voidfetchjoin(tvar,bn->ttype,ATOM)@ } else { @:voidfetchjoin(tloc,bn->ttype,ATOM)@ } ret = bn; goto bunins_failed; bunins_failed: bn->batBuns->free = dst - bn->batBuns->base; BATsetcount(bn,bn->batBuns->free/zz); if (ret == NULL) { BBPreclaim(bn); goto ready; } /* handle string trick */ if (tpe == TYPE_var && ATOMstorage(r->ttype) == TYPE_str) { BAT *bm = BATmirror(bn); bn->theap = (Heap*)GDKzalloc(sizeof(Heap)); if (bn->theap && r->theap->filename) { char *nme = BBP_physical(bn->batCacheid); bn->theap->filename = (str) GDKmalloc(strlen(nme) + 12); GDKfilepath(bn->theap->filename, NULL, nme, "theap"); } if (HEAPcopy(bn->theap, r->theap) < 0) { BBPreclaim(bn); ret = NULL; goto ready; } bn->ttype = bm->htype = r->ttype; bn->tvarsized = bm->hvarsized = 1; } if (nondense) { bn->hsorted = (BATtordered(l) & BAThordered(r) & 1) ? BAThordered(l) : 0; } else { BATseqbase(bn, seqbase); if (seqbase != oid_nil) BATkey(bn, TRUE); bn->hsorted = GDK_SORTED; } } else if (l->tseqbase != oid_nil) { /* execute using slice */ BAT *v = BATmirror(VIEWhead(BATmirror(r))); oid lo_val = MAX(l->tseqbase, r->hseqbase); oid hi_val = MIN(l->tseqbase + lcount, r->hseqbase + rcount); size_t lo_pos = lo_val - r->hseqbase; size_t hi_pos = hi_val; if (hi_pos > r->hseqbase) hi_pos -= r->hseqbase; else hi_pos = 0; ALGODEBUG THRprintf(GDKout, "#BATfetchjoin: BAThvoid(l) && BATtvoid(l) && l->tseqbase != oid_nil => bn = BATslice(BATmirror(VIEWhead(BATmirror(r))), lo_pos=" SZFMT ", hi_pos=" SZFMT ");\n", lo_pos, hi_pos); bn = BATslice(v, lo_pos, hi_pos); if (seqbase != oid_nil) seqbase += lo_val - l->tseqbase; BATseqbase(bn, seqbase); BBPunfix(v->batCacheid); } else { /* nil join column => empty result */ ALGODEBUG THRprintf(GDKout, "#BATfetchjoin: BAThvoid(l) && BATtvoid(l) && l->tseqbase == oid_nil\n"); bn = BATnew(ATOMtype(l->htype), ATOMtype(r->ttype), 10); if (bn == NULL) goto ready; ESTIDEBUG THRprintf(GDKout, "#BATfetchjoin: estimated resultsize: %d\n", 10); } /* property propagation */ if ( BATcount(bn) == lcount) { ALIGNsetH(bn, l); /* BAThkey(r), remember? */ } else { if (hitalways_check) { GDKerror("BATfetchjoin(%s,%s) does not hit always (|bn|="SZFMT" != "SZFMT"=|l|) => can't use fetchjoin.\n", BATgetId(l), BATgetId(r), BATcount(bn), lcount); } bn->hsorted = l->hsorted; } bn->tsorted = (BATtordered(l) & BATtordered(r) & 1) ? GDK_SORTED : 0; if (BATtkey(l)) { /* if BATtkey(l) elements of r match at most once */ if ((BATtordered(l) & 1) && BATcount(bn) == rcount) { ALIGNsetT(bn, r); } else { BATkey(BATmirror(bn), BATtkey(r)); } } ret = bn; ready: if (l != l_orig) { BBPreclaim(l); /* was created as a temporary (slice) select on l */ } ESTIDEBUG THRprintf(GDKout, "#BATfetchjoin: actual resultsize: " SZFMT "\n", BATcount(bn)); return ret;}BAT *BATfetchjoin(BAT *l, BAT *r, size_t estimate){ /* fetchjoin now implies that you assure no fetch misses (hitalways) */ /* allows swapping of left and right input for faster processing */ return batfetchjoin(l, r, estimate, TRUE, TRUE);}BAT *BATleftfetchjoin(BAT *l, BAT *r, size_t estimate){ /* fetchjoin now implies that you assure no fetch misses (hitalways) */ /* do not swap left and right input, and hence maintain order of left head in result */ return batfetchjoin(l, r, estimate, FALSE, TRUE);}@-This routine does the join optimization. TODO: it should be expressed in MIL.@cstatic BAT *batjoin(BAT *l, BAT *r, size_t estimate, bit swap){ size_t i, lsize, rsize, lcount, rcount; int lfetch, rfetch, must_hash; lng logr, logl; ERRORcheck(l == NULL, "BATjoin: invalid left operand"); ERRORcheck(r == NULL, "BATjoin: invalid right operand"); ERRORcheck(TYPEerror(l->ttype, r->htype), "BATjoin: type conflict\n"); lcount= BATcount(l); rcount= BATcount(r); if ( (lcount == 0) || (rcount == 0) ){ BAT *bn; @:return_empty_join_result(l,r, BATjoin: |l|==0 or |r|==0)@ }@-collect statistics that help us decide what to do@c lsize = l->batBuns->size + (l->hheap ? l->hheap->size : 0) + (l->theap ? l->theap->size : 0); rsize = r->batBuns->size + (r->hheap ? r->hheap->size : 0) + (r->theap ? r->theap->size : 0); for (logr = 4, i = rcount; i > 0; logr++) i >>= 1; for (logl = 4, i = lcount; i > 0; logl++) i >>= 1; rfetch = BAThdense(r); lfetch = BATtdense(l); /* in case of fetchjoin, make sure we propagate a non-join void column */ if (lfetch && rfetch) { if (BAThvoid(l) && !BATtvoid(r)) lfetch = 0; if (swap && BATtvoid(r) && !BAThvoid(l)) rfetch = 0; } must_hash = (swap && (rsize < lsize)) ? l->thash != NULL : r->hhash != NULL;@-In special cases (equal join columns, void join columns, or orderedjoin columns), we take special action.@c if (swap && lfetch && !(rfetch && lcount < rcount)) { ALGODEBUG THRprintf(GDKout, "#BATjoin: BATmirror(BATfetchjoin(BATmirror(r), BATmirror(l), " SZFMT "));\n", estimate); return BATmirror(batfetchjoin(BATmirror(r), BATmirror(l), estimate, TRUE, FALSE)); } else if (rfetch) { ALGODEBUG THRprintf(GDKout, "#BATjoin: BATfetchjoin(l, r, " SZFMT ");\n", estimate); return batfetchjoin(l, r, estimate, swap, FALSE); }@-If both are ordered we do merge-join, or if hash-join is not possible rightaway and one input is ordered and the other is much smaller, we do nestedloop binary search (both implemented by BATmergejoin).@c if ((BATtordered(l) & BAThordered(r) & 1) || (must_hash && (((BATtordered(l) & 1) && ((lng) lcount > logl * (lng) rcount) && swap) || ((BAThordered(r) & 1) && ((lng) rcount > logr * (lng) lcount))))) { ALGODEBUG THRprintf(GDKout, "#BATjoin: BATmergejoin(l,r," SZFMT ");\n", estimate); return batmergejoin(l, r, estimate, swap); }@-hash join: the bread&butter join of monet@c if (swap && (rsize < lsize)) { /* assume largest fits memory */ ALGODEBUG THRprintf(GDKout, "#BATjoin: BATmirror(BAThashjoin(BATmirror(r), BATmirror(l)," SZFMT "));\n", estimate); return BATmirror(BAThashjoin(BATmirror(r), BATmirror(l), estimate)); } ALGODEBUG THRprintf(GDKout, "#BATjoin: BAThashjoin(l,r," SZFMT ");\n", estimate); return BAThashjoin(l, r, estimate);}BAT *BATjoin(BAT *l, BAT *r, size_t estimate){ /* allows swapping of left and right input for faster processing */ BAT *b = batjoin(l, r, estimate, TRUE); /* invest in property check, since we cannot easily derive the result properties, * but later operations might benefit from / depend on them * Disable at runtime via `Mserver --debug=16777216` or * `debugmask(or(debugmask(),16777216));` in MIL. */ JOINPROPCHK if (b) BATpropcheck(b, BATPROPS_ALL); return b;}BAT *BATleftjoin(BAT *l, BAT *r, size_t estimate){ /* do not swap left and right input, and hence maintain order of left head in result */ BAT *b = batjoin(l, r, estimate, FALSE); /* invest in property check, since we cannot easily derive the result properties, * but later operations might benefit from / depend on them * Disable at runtime via `Mserver --debug=16777216` or * `debugmask(or(debugmask(),16777216));` in MIL. */ JOINPROPCHK if (b) BATpropcheck(b, BATPROPS_ALL); return b;}@@}@+ OuterjoinThe left outerjoin between two BAT is also supported. The code isidentical to the hashjoin algorithm with the extension to insert a BUNif no match can be found.@{@= outerjoinloop{ ptr v, nilh = ATOMnilptr(r->htype), nilt = ATOMnilptr(r->ttype); hash_t xx; int yy; BUN p, q, w; BATloopFast(l, p, q, yy) { size_t i = 0; v = (ptr) BUNtail(l, p); if (!@1_EQ(v, nilh, @2)) HASHloop_@2(r, r->hhash, xx, v, w) { bunfastins(bn, BUNhead(l, p), BUNtail(r, w)); i++; } if (i == 0) { bunfastins(bn, BUNhead(l, p), nilt); } }}break;@@-The baseline join algorithm creates a hash on the smallest element andprobes it using the larger one. [TODO]@cBAT *BATouterjoin(BAT *l, BAT *r, size_t estimate){ BAT *bn = NULL; @:joincheck(BATouterjoin,l->ttype,r->htype)@ if (BAThdense(l) && BAThkey(r)) { if (estimate == oid_nil) estimate = BATcount(l); bn = BATnew(TYPE_void, ATOMtype(r->ttype), estimate); if (bn == NULL) return bn; ESTIDEBUG THRprintf(GDKout, "#BATouterjoin: estimated resultsize: " SZFMT "\n", estimate); BATseqbase(bn, l->hseqbase); } if (BAThdense(r) == FALSE && BAThordered(r) & 1) { /* use the merge-join; it takes care of the rest */ ALGODEBUG THRprintf(GDKout, "#BATouterjoin: mergejoin(l, r, bn, ATOMnilptr(r->ttype), estimate);\n"); bn = mergejoin(l, r, bn, ATOMnilptr(r->ttype), estimate); ESTIDEBUG THRprintf(GDKout, "#BATouterjoin: actual resultsize: " SZFMT "\n", BATcount(bn)); /* invest in property check, since we cannot easily derive the result properties, * but later operations might benefit from / depend on them * Disable at runtime via `Mserver --debug=16777216` or * `debugmask(or(debugmask(),16777216));` in MIL. */ JOINPROPCHK if (bn) BATpropcheck(bn, BATPROPS_ALL); return bn; } else if (bn == NULL) { @:joinbat(JOIN_EQ,BATouterjoin(l,r,oid_nil),estimate)@ } BATmmap_pin(r); if (BAThdense(r)) { /* positional algorithm: hash on void column would give error and is stupid */ ptr nilt = ATOMnilptr(r->ttype); bit nonil = TRUE; BUN p, q, w, s = BUNfirst(bn); int yy, xx = BUNsize(bn); BATloopFast(l, p, q, yy) { oid v = *(oid *) BUNtail(l, p); ptr t = nilt; if (v != oid_nil) { BUNfndVOID(w, r, &v); if (w) t = BUNtail(r, w); } nonil &= (t != nilt); bunfastins_nocheck(bn, s, BUNhead(l, p), t, xx); s += xx; } bn->tsorted = ((BATtordered(l) & BATtordered(r) & 1) && nonil) ? GDK_SORTED : 0; } else { /* hash based algorithm (default) */ int any = ATOMstorage(r->htype); if (BATprepareHash(r)) { BBPreclaim(bn); return NULL; } switch (any) {#ifndef NOEXPAND_CHR case TYPE_chr: @:outerjoinloop(simple,chr)@#endif#ifndef NOEXPAND_BTE case TYPE_bte: @:outerjoinloop(simple,bte)@#endif#ifndef NOEXPAND_SHT case TYPE_sht: @:outerjoinloop(simple,sht)@#endif#if !defined(NOEXPAND_INT) || !defined(NOEXPAND_FLT) case TYPE_int: case TYPE_flt: @:outerjoinloop(simple,int)@#endif#if !defined(NOEXPAND_DBL) || !defined(NOEXPAND_LNG) case TYPE_dbl: case TYPE_lng: @:outerjoinloop(simple,lng)@#endif default: @:outerjoinloop(atom,any)@ } bn->tsorted = 0; } BATmmap_unpin(r); /* set sorted flags by hand, because we used BUNfastins() */ if (r->hkey) { ALIGNsetH(bn, l); /* always 1 hit, so columns are equal */ } else { bn->hsorted = BAThordered(l); } ESTIDEBUG THRprintf(GDKout, "#BATouterjoin: actual resultsize: " SZFMT "\n", BATcount(bn)); /* Just to silence compilers (Intel's icc) that otherwise might * complain about "declared but never referenced" labels * (condition should never be true). * (A "dead" goto between the return and the label makes (other) * compilers (Sun) complain about never reached code...) */ if (!bn) goto bunins_failed; /* invest in property check, since we cannot easily derive the result properties, * but later operations might benefit from / depend on them * Disable at runtime via `Mserver --debug=16777216` or * `debugmask(or(debugmask(),16777216));` in MIL. */ JOINPROPCHK if (bn) BATpropcheck(bn, BATPROPS_ALL); return bn; bunins_failed: BATmmap_unpin(r); BBPreclaim(bn); return NULL;}@@}@+ ThetaJoinCurrent predicates supported are: JOIN_EQ, JOIN_LT,JOIN_GE, JOIN_LE and JOIN_GT. The JOIN_EQ will pass the control to thenormal @:BATjoin@ equijoin. The is and index-based join: if an indexis not present, it will be created on the smallest relation.We do lots of code-inlining: first of all on join type (4), andfurthermore on left-tail (equal right-head) type (5), which are thejoin columns. We factor out more by splitting on storage strategy(variable-sized/fixed-size) of both the left-head, and right tailcolumns (2*2).In the end, this results in 4*5*2*2 = 80 different inner loops.@{@cBAT *BATthetajoin(BAT *l, BAT *r, int op, size_t estimate){ size_t _lcount = BATcount(l); size_t _rcount = BATcount(r); size_t _estimate = _lcount * _rcount; @:joincheck(BATthetajoin,l->ttype,r->htype)@ if (estimate < _estimate) _estimate = estimate; if (op == JOIN_EQ) { /* exploit all equi-join optimizations */ ALGODEBUG THRprintf(GDKout, "#BATthetajoin(l,r,JOIN_EQ): BATjoin(l, r);\n"); return BATjoin(l, r, _estimate); } return BATnlthetajoin(l, r, op, _estimate);}BAT *BATleftthetajoin(BAT *l, BAT *r, int op, size_t estimate){ size_t _lcount = BATcount(l); size_t _rcount = BATcount(r); size_t _estimate = _lcount * _rcount; @:joincheck(BATleftthetajoin,l->ttype,r->htype)@ if (estimate < _estimate) _estimate = estimate; if (op == JOIN_EQ) { /* exploit all equi-join optimizations */ ALGODEBUG THRprintf(GDKout, "#BATleftthetajoin(l,r,JOIN_EQ): BATleftjoin(l, r);\n"); return BATleftjoin(l, r, _estimate); } return BATnlthetajoin(l, r, op, _estimate);}/* nested loop join; finally MonetDB can enjoy the virtues of this algorithm as well! */@= nlthetajoin_unroll8 @:nlthetajoin_@2(@1)@ ri += rsz; @:nlthetajoin_@2(@1)@ ri += rsz; @:nlthetajoin_@2(@1)@ ri += rsz; @:nlthetajoin_@2(@1)@ ri += rsz; @:nlthetajoin_@2(@1)@ ri += rsz; @:nlthetajoin_@2(@1)@ ri += rsz; @:nlthetajoin_@2(@1)@ ri += rsz; @:nlthetajoin_@2(@1)@ ri += rsz;@= nlthetajoin_void { cur->r = off++; cur += (v @1 ra[ri]); } @= nlthetajoin_oid { cur->r = *(oid*) (((BUN)(ra+ri))+off); cur += (v @1 ra[ri]); }@= nlthetajoin_implstatic int nlthetajoin_@2_@1(BAT *bn, BAT *l, BAT *r) { dst_t *cur = (dst_t*) (bn->batBuns->base + bn->batBuns->free);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -