📄 gdk_batop.mx
字号:
bunfastins_nocheck(bn, q, BUNt@1(b, p), tl, bs); q += bs; } } else { HASHloop_str(b, b->hhash, i, tl) { p = BUNptr(b, i); if (q < r) bunfastins_nocheck(bn, q, BUNt@1(b, p), tl, bs); q += bs; } }@= hashselect switch(ATOMstorage(b->htype)) {#ifndef NOEXPAND_CHR case TYPE_chr: @:valselect(@1,_chr)@ break;#endif#ifndef NOEXPAND_BTE case TYPE_bte: @:valselect(@1,_bte)@ break;#endif#ifndef NOEXPAND_SHT case TYPE_sht: @:valselect(@1,_sht)@ break;#endif#ifndef NOEXPAND_INT case TYPE_int: @:valselect(@1,_int)@ break;#endif#ifndef NOEXPAND_FLT case TYPE_flt: @:valselect(@1,_flt)@ break;#endif#ifndef NOEXPAND_DBL case TYPE_dbl: @:valselect(@1,_dbl)@ break;#endif#ifndef NOEXPAND_LNG case TYPE_lng: @:valselect(@1,_lng)@ break;#endif#ifndef NOEXPAND_STR case TYPE_str: @:stringselect(@1)@ break;#endif default: if (b->hvarsized) { @:valselect(@1,var)@ } else { @:valselect(@1,loc)@ } break; }@cstatic BAT *BAT_hashselect(BAT *b, BAT *bn, ptr tl){ int ht = bn->htype, tt = bn->ttype; size_t size = BATcount(bn); hash_t i; BATcheck(b, "BAT_hashselect"); b= BATmirror(b); if (BATprepareHash(b)) { bunins_failed: BBPreclaim(bn); return NULL; } while (bn) { BUN p, q = BUNfirst(bn); BUN r = (BUN) ((char *) Bunbase(bn) + bn->batBuns->size); int bs = BUNsize(bn); if (b->tvarsized) { @:hashselect(var)@ } else { @:hashselect(loc)@ } if (q <= r) break; size = (q - BUNfirst(bn)) / bs; BBPreclaim(bn); bn = BATnew(ht, tt, size); } return bn;}@}@- Range SelectionsThe routine @%BATselect@ locates the BAT subset whose tail componentsatisfies the range condition T l <[=] tail <[=] h. Either boundaryis included in the result iff the respective bit parameter "li"/"hi"is TRUE. A nil value in either dimension defines infinity. The valueis set accordingly.Range selections without lower or upper bound use the nil atomto indicate this (this is somewhat confusing). Note, however, thatthrough the definition of MIL we do not want the nils to appear inthe result (as (nil @{<,=,>@} ANY) = bit(nil) != true).@{@cBAT *BAT_select_(BAT *b, ptr tl, ptr th, bit li, bit hi, bit tail, bit preserve_order){ int hval, lval, equi, t, ht, tt; size_t offset, batcnt, estimate = 0; ptr nil; BAT *bn; BUN p, q; BATcheck(b, "BATselect: \n"); BATcheck(tl, "BATselect: tl value required\n");@-Examine type, and values for lower- and higher-bound.@c batcnt = BATcount(b); t = b->ttype; nil = ATOMnilptr(t); lval = ATOMcmp(t, tl, nil) || (th == NULL); equi = ((th == NULL) || (lval && !ATOMcmp(t, tl, th))); if (equi) { if (th == NULL) hi = li; th = tl; hval = 1; /* equi-select */ } else { hval = ATOMcmp(t, th, nil); } /* preliminarily determine result types */ ht = BAThtype(b); tt = tail ? BATttype(b) : TYPE_void; if (hval && ((ATOMcmp(t, tl, th) > 0) || (equi && !(li && hi)))) { /* empty range */ ALGODEBUG THRprintf(GDKout, "#BAT_select_(b=%s): empty range;\n", BATgetId(b)); return BATnew(ht, tt, 10); }@}@- Slice ImplementationsWhen the result is a dense slice of the BAT, we can optimize.A slice does not need to copy the BAT selected on, it can justgive back a 'view' on the memory of the existing BAT. See BATslice().@{@c if (BATtordered(b) & 1) { BAT *v = tail ? b : VIEWhead_(b, b->batRestricted); size_t high = batcnt; size_t low = 0; if (BATtdense(b)) { /* Selections on voids are positional. */ if (hval) { size_t h = (*(oid *) th) + (hi ? 1 : 0); if (h > b->tseqbase) h -= b->tseqbase; else h = 0; if (h < high) high = h; } if (lval) { if (*(oid *)tl != oid_nil) { size_t l = (*(oid *) tl) + (li ? 0 : 1); if (l > b->tseqbase) l -= b->tseqbase; else l = 0; if (l > low) low = l; } else { if (equi) { /* nil-equi select on dense columns is empty */ high = low; } } } } else { /* Use probe-based binary search */ offset = BUNindex(b, BUNfirst(b)); if (lval) { if (li) p = SORTfndfirst(b, tl); else p = SORTfndlast(b, tl); } else { /* No lower bound, we must still exclude nils. They are in * front, so we can still slice, by starting after them. */ p = SORTfndlast(b, nil); } low = BUNindex(b, p); if (low > offset) low -= offset; else low = 0; if (hval) { if (hi) q = SORTfndlast(b, th); else q = SORTfndfirst(b, th); high = BUNindex(b, q); if (high > offset) high -= offset; else high = 0; } } ALGODEBUG THRprintf(GDKout, "#BAT_select_(b=%s): BATslice(v=%s, low=" SZFMT ", high=" SZFMT ");\n", BATgetId(b), BATgetId(v), low, high); bn = BATslice(v, low, high); if (!tail) { BBPreclaim(v); } return bn; }@-Use sampling to determine a good result size, when the bat is large.@c if (BATtkey(b)) { estimate = 1; } else if (batcnt > 100000) { size_t _lo = batcnt / 2, _hi = _lo + 105; BAT *tmp1; ALGODEBUG THRprintf(GDKout, "#BAT_select_(b=%s): sampling: tmp1 = BATslice(b=%s, _lo=" SZFMT ", _hi=" SZFMT ");\n", BATgetId(b), BATgetId(b), _lo, _hi); tmp1 = BATslice(b, _lo, _hi); /* slice keeps all parent properties */ if (tmp1) { BAT *tmp2; ALGODEBUG THRprintf(GDKout, "#BAT_select_(b=%s): sampling: tmp2 = BAT_select_(tmp1=%s, tl, th, tail);\n", BATgetId(b), BATgetId(tmp1)); tmp2 = BAT_select_(tmp1, tl, th, li, hi, tail, FALSE); if (tmp2) { /* reserve 105% of what has been estimated */ estimate = (size_t) ((((lng) BATcount(tmp2)) * (lng) batcnt) / LL_CONSTANT(100)); BBPreclaim(tmp2); } BBPreclaim(tmp1); } } else { estimate = MAX(estimate, BATguess(b)); }@-Create the result BAT and execute the select algorithm.@c if (ht == TYPE_void && tt == TYPE_void) { ht = TYPE_oid; } bn = BATnew(ht, tt, estimate); if (bn) { int nocheck = (estimate >= batcnt); if (!preserve_order && equi && b->thash) { ALGODEBUG THRprintf(GDKout, "#BAT_select_(b=%s): BAT_hashselect(b=%s, bn=%s, tl); (using existing hash-table)\n", BATgetId(b), BATgetId(b), BATgetId(bn)); bn = BAT_hashselect(b, bn, tl); } else if (!preserve_order && equi && ATOMsize(b->ttype) > 1 && estimate * 100 < batcnt && batcnt * 2 * sizeof(int) < (GDK_mem_maxsize / 4)) { /* Build a hash-table on the fly for equi-select if the selectivity is low * and it is not too big */ ALGODEBUG THRprintf(GDKout, "#BAT_select_(b=%s): BAT_hashselect(b=%s, bn=%s, tl); (building hash-table on the fly)\n", BATgetId(b), BATgetId(b), BATgetId(bn)); bn = BAT_hashselect(b, bn, tl); } else { ALGODEBUG THRprintf(GDKout, "#BAT_select_(b=%s): BAT_scanselect(b=%s, bn=%s, tl, th, equi=%d, lval=%d, hval=%d, nocheck=%d);\n", BATgetId(b), BATgetId(b), BATgetId(bn), equi, lval, hval, nocheck); bn = BAT_scanselect(b, bn, tl, th, li, hi, equi, lval, hval, nocheck); } } if (bn == NULL) { return NULL; /* error occurred */ }@-Propagate alignment info. Key properties are inherited from the parent.Hash changes the order; IDX yields ordered tail; scan respects original order.@c if (BATcount(bn)) { BATkey(bn, b->hkey); BATkey(BATmirror(bn), b->tkey); } else { BATkey(bn, TRUE); BATkey(BATmirror(bn), TRUE); } if (equi && tail) { BATsetprop_wrd(bn, GDK_AGGR_CARD, (wrd) (BATcount(bn) > 0)); if (b->ttype == TYPE_bit) { BATsetprop_wrd(bn, GDK_AGGR_SIZE, (*(bit *) tl == TRUE) ? BATcount(bn) : 0); } } if (equi && b->thash) { bn->hsorted = bn->tsorted = FALSE; } else { if (BATcount(bn) == BATcount(b)) ALIGNset(bn, b); bn->hsorted = BAThordered(b); bn->tsorted = BATtordered(b); } ALGODEBUG THRprintf(GDKout, "#BAT_select_(b=%s): %s: hkey=%d, tkey=%d, hsorted=%d, tsorted=%d.\n", BATgetId(b), BATgetId(bn), bn->hkey, bn->tkey, bn->hsorted, bn->tsorted); ESTIDEBUG THRprintf(GDKout, "#BAT_select_(b=%s): resultsize: estimated " SZFMT ", got " SZFMT ".\n", BATgetId(b), estimate, BATcount(bn)); return bn;}BAT *BAT_select(BAT *b, ptr h, ptr t, bit tail){ return BAT_select_(b, h, t, TRUE, TRUE, tail, FALSE);}BAT *BATselect_(BAT *b, ptr h, ptr t, bit li, bit hi){ return BAT_select_(b, h, t, li, hi, TRUE, FALSE);}BAT *BATuselect_(BAT *b, ptr h, ptr t, bit li, bit hi){ return BAT_select_(b, h, t, li, hi, FALSE, FALSE);}BAT *BATselect(BAT *b, ptr h, ptr t){ return BAT_select_(b, h, t, TRUE, TRUE, TRUE, FALSE);}BAT *BATuselect(BAT *b, ptr h, ptr t){ return BAT_select_(b, h, t, TRUE, TRUE, FALSE, FALSE);}@}@- Top-N selectionThe top-N elements can be easily obtained by trimming thespace. The auxiliary index structures are removed.For non-variable size BATs it merely requiresadjustment of the free space labels. Other BATs requirea loop through the tuples to be deleted. [todo]@{@csize_tBATtopN(BAT *b, size_t topN){ BATcheck(b, "BATtopN"); if (topN > BATcount(b)) { GDKerror("BATtopN: not enough tuples in target\n"); } else if (topN * BUNsize(b) > b->batBuns->size) { GDKerror("BATtopN: not enough capacity to keep result\n"); } else if (b->dims.headvarsized || b->dims.tailvarsized) { HASHremove(b); while (BATcount(b) > topN) BUNdelete(b, BUNlast(b), FALSE); } else { HASHremove(b); b->batBuns->free = BUNptr(b, topN) - Bunbase(b); BATsetcount(b, topN); } return 0;}@- Random Selections@cBAT *BATsample(BAT *b, size_t size){ size_t cnt, i, r = 0, n, j; size_t *choice, *dst; BAT *bn; BATcheck(b, "BATsample: source BAT"); cnt = BATcount(b); n = MIN(size, BATcount(b)); bn = BATnew(BAThtype(b), BATttype(b), n); BATcheck(bn, "BATsample: dest BAT"); if (n == 0) return bn; dst = choice = (size_t *) GDKmalloc(n * sizeof(size_t)); if (n * 2 < BATcount(b)) { /* nondense sample */ char *vec = (char *) GDKzalloc(1 + (cnt / 8)); for (j = 0; j < n; j++) { r += rand(); i = r % cnt; for (;;) { int mask = 1 << (i & 7); if (vec[i >> 3] & mask) { if (++i == cnt) i = 0; } else { vec[i >> 3] |= mask; break; } } *dst++ = i; } GDKfree(vec); } else if (cnt < 65536) { unsigned short *vec = (unsigned short *) GDKmalloc(cnt * sizeof(unsigned short)); for (i = 0; i < cnt; i++) vec[i] = (unsigned short) i; for (j = 0; j < n; j++) { r += rand(); i = r % cnt; *dst++ = vec[i]; vec[i] = vec[--cnt]; } GDKfree(vec); } else { size_t *vec = (size_t *) GDKmalloc(cnt * sizeof(size_t)); for (i = 0; i < cnt; i++) vec[i] = i; for (j = 0; j < n; j++) { r += rand(); i = r % cnt; *dst++ = vec[i]; vec[i] = vec[--cnt]; } GDKfree(vec); } /* merge all positions into a sorted list */ qsort((void *) choice, n, sizeof(size_t),#if SIZEOF_SIZE_T == SIZEOF_INT (int (*)(const void *, const void *)) intCmp#else (int (*)(const void *, const void *)) lngCmp#endif ); /* insert the sorted sample */ cnt = BUNindex(b, BUNfirst(b)); for (j = 0; j < n; j++) { BUN p = BUNptr(b, cnt + choice[j]); bunfastins(bn, BUNhead(b, p), BUNtail(b, p)); } GDKfree(choice); /* clean up choice array of BUN positions */ /* set sorted flags by hand, because we used BUNfastins() */ bn->hsorted = BAThordered(b); bn->tsorted = BATtordered(b); BATkey(bn, b->hkey); BATkey(BATmirror(bn), b->tkey); return bn; bunins_failed: BBPreclaim(bn); return NULL;}@- Horizontal Fragmentation@cBAT *BATfragment(BAT *b, ptr hl, ptr hh, ptr tl, ptr th){ BATcheck(b, "BATfragment:BAT required\n"); if ((hl == NULL) && (hh == NULL)) { return BATselect(b, tl, th); } if ((BAThordered(b) & 1) == FALSE && (BATtordered(b) & 1)) { return BATmirror(BATrestrict(BATmirror(b), tl, th, hl, hh)); } return BATrestrict(b, hl, hh, tl, th);}@@}@-The baseline algorithm for fragment location is a two-phase process.First we search on the 1stdimension and collect the qualifying BUNs in a marking on thestack. In the second phase, the tail is analyzed for all itemsalready marked and qualifying associations are copied into the result.An index is exploited when possible.@{@= restrict1 if (BAThordered(b)&1) { int offset; BUN p1, p2; b= BATmirror(b); SORTloop(b, p1, p2, hl, hh, offset) { *m++ = p1; } b= BATmirror(b); } else { int lval = !@1_EQ(ATOMnilptr(t),hl,@2); int hval = !@1_EQ(ATOMnilptr(t),hh,@2); if (hval && lval && @1_GT(hl,hh,@2)) { GDKerror("BATrestrict: illegal head range.\n"); } else { int xx; BATloopFast(b, p, l, xx) { if ((!lval || @1_LE(hl, BUNh@3(b, p),@2)) && (!hval || @1_LE(BUNh@3(b, p), hh,@2))) { *m++ = p; } } } }@@= restrict2 { tl = @1_EQ(ATOMnilptr(t),tl,@2) ? 0 : tl; th = @1_EQ(ATOMnilptr(t),th,@2) ? 0 : th; if (th && tl && @1_GT(tl,th,@2)) { GDKerror("BATrestrict: illegal tail range.\n"); } else { for (; i < m; i++) { ptr v = BUNt@4(b, *i); if ((!tl || @1_LE(tl, v, @2)) && (!th || @1_LE(v, th, @2))) { bunfastins(bn, BUNh@3(b, *i), v); } } } } break;@cBAT *BATrestrict(BAT *b, ptr hl, ptr hh, ptr tl, ptr th){ BAT *bn; BUN p = NULL, l; BUN *mark, *m, *i; size_t s; int t; BATcheck(hl, "BATrestrict:hl is null"); BATcheck(hh, "BATrestrict:hh is null"); BATcheck(tl, "BATrestrict:tl is null"); BATcheck(th, "BATrestrict:th is null"); bn = BATnew(BAThtype(b), BATttype(b), BATguess(b)); ESTIDEBUG THRprintf(GDKout, "#BATrestrict: estimated resultsize: " SZFMT "\n", BATguess(b)); if (bn == NULL) { return NULL; } BATkey(bn, b->hkey); BATkey(BATmirror(bn), b->tkey); bn->hsorted = BAThordered(b); bn->tsorted = BATtordered(b); s = BATcount(b); if (s == 0) { ESTIDEBUG THRprintf(GDKout, "#BATrestrict: actual resultsize: " SZFMT "\n", BATcount(bn)); return bn; } mark = (BUN *) GDKmalloc((unsigned) s * sizeof(BUN)); m = mark; i = mark; switch (ATOMstorage(t = b->htype)) {#ifndef NOEXPAND_CHR case TYPE_chr: @:restrict1(simple,chr,loc)@ break;#endif#ifndef NOEXPAND_BTE case TYPE_bte: @:restrict1(simple,bte,loc)@ break;#endif#ifndef NOEXPAND_SHT case TYPE_sht: @:restrict1(simple,sht,loc)@ break;#endif#ifndef NOEXPAND_INT case TYPE_int: @:restrict1(simple,int,loc)@ break;#endif#ifndef NOEXPAND_FLT case TYPE_flt: @:restrict1(simple,flt,loc)@ break;#endif#ifndef NOEXPAND_DBL case TYPE_dbl: @:restrict1(simple,dbl,loc)@ break;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -