📄 gdk_batop.mx
字号:
#endif#ifndef NOEXPAND_LNG case TYPE_lng: @:restrict1(simple,lng,loc)@ break;#endif default: if (b->hvarsized) { @:restrict1(atom,t,var)@ } else { @:restrict1(atom,t,loc)@ } break; } /* second phase */ if (b->hvarsized) { switch (ATOMstorage(t = b->ttype)) {#ifndef NOEXPAND_CHR case TYPE_chr: @:restrict2(simple,chr,var,loc)@ break;#endif#ifndef NOEXPAND_BTE case TYPE_bte: @:restrict2(simple,bte,var,loc)@ break;#endif#ifndef NOEXPAND_SHT case TYPE_sht: @:restrict2(simple,sht,var,loc)@ break;#endif#ifndef NOEXPAND_INT case TYPE_int: @:restrict2(simple,int,var,loc)@ break;#endif#ifndef NOEXPAND_FLT case TYPE_flt: @:restrict2(simple,flt,var,loc)@ break;#endif#ifndef NOEXPAND_DBL case TYPE_dbl: @:restrict2(simple,dbl,var,loc)@ break;#endif#ifndef NOEXPAND_LNG case TYPE_lng: @:restrict2(simple,lng,var,loc)@ break;#endif default: if (b->tvarsized) { @:restrict2(atom,t,var,var)@ } else { @:restrict2(atom,t,var,loc)@ } break; } } else { switch (ATOMstorage(t = b->ttype)) {#ifndef NOEXPAND_CHR case TYPE_chr: @:restrict2(simple,chr,loc,loc)@ break;#endif#ifndef NOEXPAND_BTE case TYPE_bte: @:restrict2(simple,bte,loc,loc)@ break;#endif#ifndef NOEXPAND_SHT case TYPE_sht: @:restrict2(simple,sht,loc,loc)@ break;#endif#ifndef NOEXPAND_INT case TYPE_int: @:restrict2(simple,int,loc,loc)@ break;#endif#ifndef NOEXPAND_FLT case TYPE_flt: @:restrict2(simple,flt,loc,loc)@ break;#endif#if !defined(NOEXPAND_LNG) || !defined(NOEXPAND_DBL) case TYPE_dbl: case TYPE_lng: @:restrict2(simple,lng,loc,loc)@ break;#endif default: if (b->tvarsized) { @:restrict2(atom,t,loc,var)@ } else { @:restrict2(atom,t,loc,loc)@ } break; } } GDKfree(mark); /* propagate alignment info */ if (BATcount(bn) == BATcount(b)) ALIGNset(bn, b); ESTIDEBUG THRprintf(GDKout, "#BATrestrict: actual resultsize: " SZFMT "\n", BATcount(bn)); return bn; bunins_failed: GDKfree(mark); BBPreclaim(bn); return NULL;}@}@+ BAT Sorting@%BATsort@ returns a sorted copy. 
@%BATorder@ sorts the BAT itself.@{@c#ifdef HAVE_RESTRICT#define __r restrict#else#ifdef HAVE___RESTRICT__#define __r __restrict__#else#define __r#endif#endif@:sort(1,,,_rev,>)@@:sort(GDK_SORTED_REV,_REV,_rev,,<)@ /* the sort template is instantiated twice: ascending (@1=1, @5='>') and descending (@1=GDK_SORTED_REV, @5='<') */ @= sort_tpestatic int/* chk_order<dir>_<tpe>: unrolled check that a 128-element window is ordered in direction @2; deliberately reads col[128] (one past the window) — callers advance in 128-element steps under a strict '<' bound, so that element exists */chk_order@3_@1(@1*__r col){ size_t i, r = 1; for (i = 0; i < 128; i += 4) { r &= (col[i+1] @2= col[i+0]) & (col[i+2] @2= col[i+1]) & (col[i+3] @2= col[i+2]) & (col[i+4] @2= col[i+3]); } return r != 0;}@= sort@:sort_tpe(chr,@5,@3)@@:sort_tpe(bte,@5,@3)@@:sort_tpe(sht,@5,@3)@@:sort_tpe(int,@5,@3)@@:sort_tpe(lng,@5,@3)@@:sort_tpe(flt,@5,@3)@@:sort_tpe(dbl,@5,@3)@static int/* same unrolled check for [oid,oid] BUNs: stride 2 oids per BUN, 256 oids = 128 BUNs per window; reads col[256] under the caller's strict '<' bound */chk_order@3_oid_oid(oid*__r col){ size_t i, r = 1; for (i = 0; i < 256; i += 8) { r &= (col[i+2] @5= col[i+0]) & (col[i+4] @5= col[i+2]) & (col[i+6] @5= col[i+4]) & (col[i+8] @5= col[i+6]); } return r != 0;}/* BATordered<dir>: TRUE iff b's head column is ordered in this direction; records negative evidence in b->H->nosorted<dir> to avoid rescans, and sets the matching hsorted property bit on success */intBATordered@3(BAT* b){ size_t cnt = BATcount(b); if ((BAThordered(b) & @1) == 0 && cnt > 0) { int (*cmp) (ptr, ptr) = BATatoms[b->htype].atomCmp; char *cur = (char *) BUNhloc(b, BUNfirst(b)); char *end = (char *) BUNhloc(b, BUNlast(b)); int xx = BUNsize(b); /* we may have negative information already; this saves a scan */ if (b->H->nosorted@3 > BUNindex(b, cur) && b->H->nosorted@3 < BUNindex(b, end) && cmp(BUNhead(b, BUNptr(b, b->H->nosorted@3 - 1)), BUNhead(b, BUNptr(b, b->H->nosorted@3))) @5 0) { return FALSE; } /* for [tpe,void] and [OID,OID] bats, we have fast 128-at-a-time routines */ if (ATOMstorage(b->htype) == ATOMstorage(TYPE_oid) && BUNsize(b) == 2 * ATOMsize(TYPE_oid)) { while (cur + 256 * sizeof(oid) < end) { if (!chk_order@3_oid_oid((oid *) cur)) break; cur += 256 * sizeof(oid); } } else if (ATOMstorage(b->htype) == TYPE_chr && BUNsize(b) == sizeof(chr)) { while (cur + 128 * sizeof(chr) < end) { if (!chk_order@3_chr((chr *) cur)) break; cur += 128 * sizeof(chr); } } else if (ATOMstorage(b->htype) == TYPE_bte && BUNsize(b) == sizeof(bte)) { while (cur + 128 * sizeof(bte) < end) { if (!chk_order@3_bte((bte *) cur)) break; cur += 128 * sizeof(bte); } } 
else if (ATOMstorage(b->htype) == TYPE_sht && BUNsize(b) == sizeof(sht)) { while (cur + 128 * sizeof(sht) < end) { if (!chk_order@3_sht((sht *) cur)) break; cur += 128 * sizeof(sht); } } else if (ATOMstorage(b->htype) == TYPE_int && BUNsize(b) == sizeof(int)) { while (cur + 128 * sizeof(int) < end) { if (!chk_order@3_int((int *) cur)) break; cur += 128 * sizeof(int); } } else if (ATOMstorage(b->htype) == TYPE_lng && BUNsize(b) == sizeof(lng)) { while (cur + 128 * sizeof(lng) < end) { if (!chk_order@3_lng((lng *) cur)) break; cur += 128 * sizeof(lng); } } else if (ATOMstorage(b->htype) == TYPE_flt && BUNsize(b) == sizeof(flt)) { while (cur + 128 * sizeof(flt) < end) { if (!chk_order@3_flt((flt *) cur)) break; cur += 128 * sizeof(flt); } } else if (ATOMstorage(b->htype) == TYPE_dbl && BUNsize(b) == sizeof(dbl)) { while (cur + 128 * sizeof(dbl) < end) { if (!chk_order@3_dbl((dbl *) cur)) break; cur += 128 * sizeof(dbl); } } /* check sortedness tuple-by-tuple */ if (b->hheap) { BUN base = b->hheap->base; char *prv = base + *(var_t *) cur; cur += xx; while (cur < end) { char *val = base + *(var_t *) cur; if (cmp(prv, val) @5 0) { /* record negative position info */ b->H->nosorted@3 = BUNindex(b, cur); return FALSE; } prv = val; cur += xx; } } else { char *prv = cur; cur += xx; while (cur < end) { if (cmp(prv, cur) @5 0) { /* record negative position info */ b->H->nosorted@3 = BUNindex(b, cur); return FALSE; } prv = cur; cur += xx; } } } /* it is sorted@3! 
set the properties */ if ((b->hsorted & (bit) GDK_SORTED@2) == 0) { b->batDirtydesc = TRUE; } b->hsorted |= (bit) GDK_SORTED@2; return TRUE;}@:sort1(@1,@2,@3,@4,@5,,q)@@:sort1(@1,@2,@3,@4,@5,s,s)@ /* NOTE(review): the two instantiations set @6/@7 to (empty,'q') and ('s','s'), producing BATsort<dir>/BATorder<dir> over GDKqsort and BATssort<dir>/BATsorder<dir> over GDKssort — confirm names against callers */ @= sort1BAT */* BAT<@6>sort<dir>: return a copy of b sorted on head in direction @2; falls back to cheap copy/view or revert when b already has the needed ordering */BAT@6sort@3(BAT *b){ BAT *bn; int tt = b->ttype; BATcheck(b, "BATsort@3: BAT"); if (b->htype == TYPE_void && b->hseqbase == oid_nil) { /* b's head is void-nil, hence we return a read-only copy/view of b */ return BATcopy(b, b->htype, b->ttype, FALSE); } if ((GDK_SORTED@2 == GDK_SORTED && b->htype == TYPE_void) || (b->htype != TYPE_void && BATordered@3(b))) { /* b is already ordered as desired, hence we return a read-only copy/view of b */ return BATcopy(b, b->htype, b->ttype, FALSE); } if (BATcount(b) < 2) { /* with fewer than 2 BUNs, b is ordered, hence we return a read-only copy/view of b */ b->hsorted = (bit) GDK_SORTED@2; if (b->htype == TYPE_oid) { oid h = * (oid *) BUNhloc(b, BUNfirst(b)); if (h != oid_nil) { b->hdense = 1; b->hseqbase = h; } } return BATcopy(b, b->htype, b->ttype, FALSE); } /* a void tail column 0,1,2,3,... 
must be materialized to oid before sorting */ if (tt == TYPE_void && b->tseqbase != oid_nil) { tt = TYPE_oid; } if ((GDK_SORTED@2 == GDK_SORTED_REV && b->htype == TYPE_void) || (b->htype != TYPE_void && (b->hkey || '@7' != 's') && BATordered@4(b))) { /* b is ordered in the opposite direction, hence we return a reverted copy of b */ /* a void head column must be materialized to oid before reverting */ int ht = b->htype; if (ht == TYPE_void && b->hseqbase != oid_nil) { ht = TYPE_oid; } bn = BATrevert(BATcopy(b, ht, tt, TRUE)); if (bn == NULL) return bn; return bn; } bn = BATcopy(b, b->htype, tt, TRUE); if (bn == NULL) return bn; return BAT@6order@3(bn);}BAT */* BAT<@6>order<dir>: sort b itself (in place) on head; same shortcuts as the copying variant, then GDK<@7>sort on the head column and invalidation of hash/alignment info */BAT@6order@3(BAT *b){ BATcheck(b, "BATorder@3: BAT"); if (b->htype == TYPE_void && b->hseqbase == oid_nil) { /* b's head is void-nil, hence we return b as is */ return BATcopy(b, b->htype, b->ttype, FALSE); } if ((GDK_SORTED@2 == GDK_SORTED && b->htype == TYPE_void) || (b->htype != TYPE_void && BATordered@3(b))) { /* b is already ordered as desired, hence we return b as is */ return b; } if (BATcount(b) < 2) { /* with less than 2 BUNs, b is ordered, hence we return b as is */ b->hsorted = (bit) GDK_SORTED@2; return b; } if (b->ttype == TYPE_void && b->tseqbase != oid_nil) { /* materialize void-tail in-place */ b = BATmaterializet(b, BATcount(b)); } if ((GDK_SORTED@2 == GDK_SORTED_REV && b->htype == TYPE_void) || (b->htype != TYPE_void && (b->hkey || '@7' != 's') && BATordered@4(b))) { /* b is ordered in the opposite direction, hence we revert b */ b = BATrevert(b); return b; } GDK@7sort@3(BUNfirst(b), (b->hheap)?b->hheap->base:NULL, BATcount(b), BUNsize(b), b->htype, b->hloc); HASHdestroy(b); ALIGNdel(b, "BATorder@3", FALSE); b->hsorted = (bit) GDK_SORTED@2; b->tsorted = FALSE; b->hdense = FALSE; b->batDirtydesc = b->batDirtybuns = TRUE; return b;}@}@+ Reverse a BAT@%BATrevert@ rearranges a BAT in reverse order on head.@{@c/* BATrevert: reverse the BUN order of b in place; void columns are materialized to oid first */BAT *BATrevert(BAT *b){ size_t xx; char *buf; BUN p, q; BATcheck(b, "BATrevert"); if 
((b->htype == TYPE_void && b->hseqbase != oid_nil) || (b->ttype == TYPE_void && b->tseqbase != oid_nil)) { /* materialize void columns in-place */ b = BATmaterialize(b, BATcount(b)); } ALIGNdel(b, "BATrevert", FALSE); xx = (size_t) BUNsize(b); buf = (char *) GDKmalloc(xx); /* NOTE(review): GDKmalloc result is not checked here — a failed one-BUN scratch allocation would crash in the memcpy below; confirm whether GDKmalloc aborts on OOM in this codebase */ for (p = BUNlast(b) - xx, q = BUNfirst(b); p > q; p -= xx, q += xx) { /* three-way swap of the BUNs at both ends, walking inward */ memcpy(buf, p, xx); memcpy(p, q, xx); memcpy(q, buf, xx); } HASHdestroy(b); /* reversing flips the sortedness direction of both columns */ if (b->hsorted&GDK_SORTED) b->hsorted = GDK_SORTED_REV; else if (b->hsorted&GDK_SORTED_REV) b->hsorted = GDK_SORTED; else b->hsorted = FALSE; if (b->tsorted&GDK_SORTED) b->tsorted = GDK_SORTED_REV; else if (b->tsorted&GDK_SORTED_REV) b->tsorted = GDK_SORTED; else b->tsorted = FALSE; GDKfree(buf); return b;}@@}@+ BAT partitioningFor distributed processing we support hash and rangepartitioning operators: @%BATsplithash@ and @%BATsplitrange@.@{@-The @%part_bat@ function creates a partition BAT.@c/* part_bat: allocate one partition BAT of the given head/tail types, over-allocated by BATMARGIN; key properties are inherited from b, sortedness only when respect_order is set */static BAT *part_bat(BAT *b, int ht, int tt, size_t expected_size, int respect_order){ BAT *bn = BATnew(ht, tt, (size_t) ((double) expected_size * BATMARGIN)); if (bn) { BATkey(bn, b->hkey); BATkey(BATmirror(bn), b->tkey); bn->hsorted = (respect_order && (BAThordered(b) & 1)) ? GDK_SORTED : 0; bn->tsorted = (respect_order && (BATtordered(b) & 1)) ? 
GDK_SORTED : 0; } return bn;}@- hash partitioning@c/* BUNhash(bx,hx,tx): route one BUN to its partition — i = hash(tx) mod n, look i up in metabat bx to get the partition's bat id, then fast-insert (hx,tx) into that partition */#define BUNhash(bx,hx,tx)\ /* assert(n <= 0x40000000); */\ i = (int)(HASHprobe(&h,tx)%n); \ if ((r = BUNfnd(bx, &i)) != NULL){\ bat bid = *(bat*)BUNtloc(bx,r); \ bunfastins(BBPdescriptor(bid), hx, tx);\ }/* BAThashsplit: split b into n partitions by hash of the tail value; returns a [int,bat] metabat mapping partition number to partition bat id (NULL on failure) */BAT *BAThashsplit(BAT *b, size_t n, int unary){ BAT *metabat, *bn, *bf; BUN r; Hash h; size_t cnt; int i = 0; /* assert(n <= 0x40000000); */ BATcheck(b, "BAThashsplit"); if (n > BATcount(b)) { GDKwarning("BAThashsplit: reduced number of ranges (" SZFMT ") to number of tuples (" SZFMT ").", n, BATcount(b)); n = BATcount(b); } if (n < 1) { GDKerror("BAThashsplit: number of ranges must not be less than 1!\n"); return 0; } metabat = BATnew(TYPE_int, TYPE_bat, n); if (metabat == NULL) return NULL; bn = unary ? VIEWhead_(b, b->batRestricted) : b; if (n <= 1) { /* single partition: the (view of) b itself */ if (BUNins(metabat, &i, &bn->batCacheid, FALSE) == NULL) goto bunins_failed; } else { BUN p, q; /* round the hash mask up to the next power of two minus one */ for (i = 2; i < (int) n; i *= 2) ; h.mask = i - 1; h.type = BATttype(b); cnt = (size_t) (BATMARGIN * (double) BATbuncount(b) / (double) n); for (i = 0; i < (int) n; i++) { bf = part_bat(bn, BAThtype(bn), BATttype(bn), cnt, TRUE); if (bf == NULL) { BBPreclaim(metabat); return NULL; } if (BUNins(metabat, &i, &bf->batCacheid, FALSE) == NULL) goto bunins_failed; } @:updateloop(metabat,b,BUNhash)@ /* drop the extra physical refs acquired on the partition BATs */ BATloop(metabat, p, q) { bat bt = *(bat *) BUNtail(metabat, p); BBPunfix(bt); } } return metabat; bunins_failed: BBPreclaim(metabat); return NULL;}@}@-Range partitioning ensures that identical values appear in onepartition only. The routine also tries to deliver partitions ofuniform size.@{@cBAT *BATrangesplit(BAT *b, size_t n, int unary){ BAT *metabat, *slice, *histo, *bf = NULL, *bn, *m; int target, tpe, *sizes; int xx, zz = 0; size_t yy = 0; ptr *seps, nilval; BUN r, s; dbl scale; size_t thorough = (n <= 1 || BATtvoid(b) || (BATtordered(b) & 1)) ? 
1 : 10; BATcheck(b, "BATrangesplit"); if (n > BATcount(b)) { GDKwarning("BATrangesplit: reduced number of ranges (" SZFMT ") to number of tuples (" SZFMT ").", n, BATcount(b)); n = BATcount(b); } if (n < 1) { GDKerror("BAThashsplit: number of ranges must not be less than 1!\n"); return 0; } /* assert(BATcount(b)/n <= 0x7fffffff); */ bn = unary ? VIEWhead_(b, b->batRestricted) : b; m = BATmirror(b); metabat = BATnew(BATttype(b), TYPE_bat, n); BATcheck(metabat, "BATrangesplit 2"); nilval = ATOMnilptr(BATttype(b));@-@}We use sampling to determine bucket sizes.Uniform bucket sizes are the ideal to be achieved.If necessary though, we deliver less than n buckets.@{@c slice = BATsample(b, MIN(MAX(30 * n * thorough, 100 * thorough), BATcount(b))); histo = BAThistogram(slice); target = (int) (BATcount(b) / n); /* see assert above */ scale = ((dbl) BATcount(b)) / ((dbl) BATcount(slice)); BBPreclaim(slice); sizes = (int *) GDKmalloc(2 * n * sizeof(int)); seps = (ptr *) GDKmalloc(2 * n * sizeof(ptr));@-Use the histogram to determine good split boundaries on b.@c BATorder(histo); BATloopFast(histo, r, s, xx) { int cnt = *(int *) BUNtloc(histo, r); int add = (int) (scale * cnt); if (zz + add > target) { if ((zz + add - target) < (target - zz)) { sizes[yy] = zz + add; seps[yy] = ATOMdup(histo->htype, BUNhead(histo, r)); add = 0; } else { sizes[yy] = zz; seps[yy] = ATOMdup(histo->htype, BUNhead(histo, (r - xx))); } zz = 0; yy++; } zz += add; } if (yy) { if ((sizes[yy - 1] + zz - target) > (target - zz)) { sizes[yy] = zz; } else { yy--; /* join with the last */ } } seps[yy] = nilval; BBPreclaim(histo); if (n > 1 && n != yy + 1) { GDKwarning("rangesplit: delivering %lu instead of %lu fragments\n", yy + 1, n); n = yy + 1; }@-CASE 1: just one bucket.This is done without copying b.@c if (n <= 1) { if (BUNins(metabat, nilval, &bn->batCacheid, FALSE) == NULL) goto bunins_failed;@-CASE 2: sorted on fragmentation column.We can again avoid copying, by giving slices (views) on the source 
BAT.Virtual oids (void) is a special subcase with positional lookup insteadof binary search.@c
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -