📄 aggr.mx
字号:
@' The contents of this file are subject to the MonetDB Public License@' Version 1.1 (the "License"); you may not use this file except in@' compliance with the License. You may obtain a copy of the License at@' http://monetdb.cwi.nl/Legal/MonetDBLicense-1.1.html@'@' Software distributed under the License is distributed on an "AS IS"@' basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the@' License for the specific language governing rights and limitations@' under the License.@'@' The Original Code is the MonetDB Database System.@'@' The Initial Developer of the Original Code is CWI.@' Portions created by CWI are Copyright (C) 1997-2007 CWI.@' All Rights Reserved.@f aggr@a S. Manegold @v 1.0@+ Aggregates ModuleThis module contains some efficient aggregate functions that compute their result in one scan, rather than in the iterative manner of the generic MIL aggregrate implementations.The implementation code is derived from the original 'aggr' module. It uses a complete type-specific code expansion to avoid any type-checkingin the inner-most loops.Where feasible, it replaced (expansive) hash-lookup by significantly cheaper positionalvoid-lookups (if the head-column of the group-extend BAT ("e") is "void") or at least by (also positional) array lookups (in case the group-ids span a reasonably small range);In addition to the 2-parameter @{ { } @{ } }(BAT[oid,any::1] b, BAT[oid,any] e)functions, there are now also 3-parameter @{ { } @{ } }(BAT[void,any::1] b,BAT[void,oid] g, BAT[oid,any] e) functions, that require b & g to behead-aligned, and do the fetchjoin(reverse(g),b) on-the-fly;The routines should not be stored in their own module, butadded to the 'group' module. This can be achieved by settingthe module.@{@malmodule aggr;@= sumprod_signaturescommand sum(b:bat[:oid,:@1], e:bat[:oid,:any_1]) :bat[:oid,:@2] address AX3aggrX3_sum_@1_@2comment "Sum over grouped tail sum on @1";command sum(b:bat[:oid,:@1],g:bat[:oid,:oid],e:bat[:oid,:any_1]) :bat[:oid,:@2]address AX3aggrX3_sum3_@1_@2comment "Grouped tail sum on @1";command product(b:bat[:oid,:@1], g:bat[:oid,:oid], e:bat[:oid,:any_1]) :bat[:oid,:@2] address AX3aggrX3_prod3_@1_@2comment "Product over grouped tail on @1";@mal@:sumprod_signatures(sht,sht)@@:sumprod_signatures(sht,int)@@:sumprod_signatures(sht,lng)@@:sumprod_signatures(int,int)@@:sumprod_signatures(oid,int)@@:sumprod_signatures(int,lng)@@:sumprod_signatures(lng,lng)@@:sumprod_signatures(flt,flt)@@:sumprod_signatures(flt,dbl)@@:sumprod_signatures(dbl,dbl)@@= sum_avg_signaturescommand avg(b:bat[:oid,:@1], e:bat[:oid,:any_1]) :bat[:oid,:dbl] address AX3aggrX3_avg_@1comment "Grouped tail average on @1";command avg(b:bat[:oid,:@1], g:bat[:oid,:oid], e:bat[:oid,:any_1]):bat[:oid,:dbl] address AX3aggrX3_avg3_@1comment "Grouped tail average on @1";@-We may have to extend the signatures to all possible {void,oid} combos@mal@:sum_avg_signatures(sht)@@:sum_avg_signatures(int)@@:sum_avg_signatures(lng)@@:sum_avg_signatures(flt)@@:sum_avg_signatures(dbl)@command min(b:bat[:oid,:any_1], e:bat[:oid,:any_2]) :bat[:oid,:any_1] address AX3aggrX3_min;command max(b:bat[:oid,:any_1], e:bat[:oid,:any_2]) :bat[:oid,:any_1] address AX3aggrX3_max;command min(b:bat[:oid,:any_1],g:bat[:oid,:oid],e:bat[:oid,:any_2]):bat[:oid,:any_1]address AX3aggrX3_min3;command max(b:bat[:oid,:any_1], g:bat[:oid,:oid], e:bat[:oid,:any_2]) :bat[:oid,:any_1] address AX3aggrX3_max3;command count(b:bat[:oid,:any_1], e:bat[:oid,:any_2], ignorenils:bit) :bat[:oid,:int] address AX3aggrX3_countcomment "Grouped count";command count(b:bat[:oid,:any_1], g:bat[:oid,:oid], e:bat[:oid,:any_2], nonils:bit) :bat[:void,:int] address AX3aggrX3_count3;command size(b:bat[:void,:bit], e:bat[:void,:any_1]) :bat[:void,:int] address AX3aggrX3_sizecomment "Grouped count of true values";command count(b:bat[:void,:any_1], e:bat[:oid,:any_2]) :bat[:void,:int] address AX3aggrX3_count2Nilscomment "Grouped count";command count(b:bat[:void,:any_1], e:bat[:void,:any_2]) :bat[:void,:int] address AX3aggrX3_count2Nils;command count_no_nil(b:bat[:oid,:any_1],e:bat[:oid,:any_1]):bat[:oid,:int]address AX3count_no_nil2;command count(b:bat[:oid,:any_1], g:bat[:oid,:oid], e:bat[:oid,:any_2]) :bat[:oid,:int] address AX3aggrX3_count3Nilscomment "Grouped count";command count_no_nil(b:bat[:oid,:any_1],g:bat[:oid,:oid],e:bat[:oid,:any_2]) :bat[:oid,:int]address AX3count_no_nil3;@+ ImplementationThese implementations need just one scan and a simple hash-maintained datastructure to compute a group of common aggregates.@c#include "mal_config.h"#include <gdk.h>#include <gdk_scanselect.h> /* for type-specific HT_bunfastins_nocheck_noinc(), until they're moved to gdk.mx *//*with group OIDs spanning a range of less than SMALL_AGGR_MAX (the actualnumber of groups might be even less, in case there are "holes" in the groupOID range), we use a simple array as temporary sum/cnt table on order tobenefit from positional lookups; with size of sum <= 8 bytes and size ofcnt == 4 bytes, we stay below 16 KBytes, i.e., within (almost) any L1 cache*/#define SMALL_AGGR_MAX 1024@-The macro CHKrange is just for array-lookups, analogously to BUNfntVOID &HASHfnd_oid for void- and hash-lookups, respectively@c#define CHKrange(r, bn, h) r = (BUN)(((*(oid*)h >= min) && (*(oid*)h <= max))?h:NULL)@- Result initialization@c/* init_result @1: tail-type: sht/int/lng/flt/dbl / any / void*/@= init_result{ REGISTER BUN _p = BUNlast(bn); REGISTER int _bunsize = BUNsize(bn); REGISTER int _cnt = BATcount(e); bn->tsorted = bn->hsorted = 0; min = max = (oid) 0; ALIGNsetH(bn, e); /* set all sums/avgs/counts to zero; for prod, zero is 1 */ /* where necessary, calculate min/max oid with minimal effort */ if (e->htype == TYPE_void) { oid nil = oid_nil; (void) nil; /* silence compiler about unused variable */ ALGODEBUG THRprintf(GDKout, "#init_result(@1): e->htype == TYPE_void, e->hseqbase=" SZFMT "\n", (size_t) e->hseqbase); BATloopFast(e, p, q, xx) { void@1_bunfastins_nocheck_noinc(bn, _p, &nil, &zero); _p += _bunsize; } BATseqbase(bn,e->hseqbase); } else if (BAThordered(e)&1) { if (_cnt) min = *(oid*)BUNhloc(e, BUNfirst(e)); BATloopFast(e, p, q, xx) { oid@1_bunfastins_nocheck_noinc(bn, _p, BUNhloc(e,p), &zero); _p += _bunsize; } if (_cnt) max = *(oid*)BUNhloc(e, BUNlast(e)-BUNsize(e)); ALGODEBUG THRprintf(GDKout, "#init_result(@1): BAThordered(e)&1, min=" SZFMT ", max=" SZFMT "\n", (size_t) min, (size_t) max); } else { oid i; if (_cnt) min = max = *(oid*)BUNhloc(e, BUNfirst(e)); BATloopFast(e, p, q, xx) { oid@1_bunfastins_nocheck_noinc(bn, _p, BUNhloc(e,p), &zero); _p += _bunsize; i = *(oid*)BUNhloc(e, p); if (i < min) min = i; else if (i > max) max = i; } ALGODEBUG THRprintf(GDKout, "#init_result(@1): min=" SZFMT ", max=" SZFMT "\n", (size_t) min, (size_t) max); } bn->batBuns->free = _p - bn->batBuns->base; BATsetcount(bn, bn->batBuns->free/_bunsize); if (!bn->batDirty) bn->batDirty = TRUE;}@- Sum, Product & Average@c/* aggrX3_sum e-void-head e-oid-head e-oid-head void-lookup hash-lookup array-lookup @1: 0 0 1 use sums-array? @2: 0 1 0 do BATprepareHash? @3: BUNfndVOID HASHfnd_oid CHKrange lookup @4: var loc loc e/bn-head-access @5: "BUNhloc(b,p)" for (oid) b-head-type, b-head access "&bhsb; bhsb++" for (void) b-head-type @6: sht / int / lng / flt / dbl b/bn-tail-type @7: "loc" for fixsized b/bn-tail-type, b/bn-tail-access "var" for varsized b/bn-tail-type (only loc used currently) @8: BUNt@7(bn,r) BUNt@7(bn,r) &sums[(*(oid*)h)-min] *dst: sum in-place or in sums-array ? @9: result type*/@= aggrX3_sum ALGODEBUG THRprintf(GDKout, "#aggrX3_sum(@1,@2,@3,@4,@5,@6,@7,@8,@9);\n"); if (@1) { /* create tmp. sums array */ size_t i; sums = (@9*) GDKmalloc(range*sizeof(@9)); for (i = 0; i < range; i++) sums[i] = zero; } if (@2 && BATprepareHash(bn)) { if (@1) GDKfree(sums); BBPreclaim(bn); return GDK_FAIL; } /* scan b, and add values to sums in-place or in sums-array */ bhsb = b->hseqbase - 1; BATloopFast(b, p, q, xx) { @6 *t = (@6*) BUNt@7(b,p); oid *h = (oid*) @5; @3(r, bn, h); if (r) { @9 *dst = (@9*) @8; if (*dst != @9_nil) { if (*t == @6_nil) { *dst = @9_nil; } else { *dst += *t; } } } } if (@1) { /* copy sums array to final result */ BATloopFast(bn, p, q, xx) { oid h = (*(oid*) BUNh@4(bn,p)) - min; *(@9*)BUNt@7(bn, p) = sums[h]; } GDKfree(sums); }@@c/* aggrX3_prod e-void-head e-oid-head e-oid-head void-lookup hash-lookup array-lookup @1: 0 0 1 use prods-array? @2: 0 1 0 do BATprepareHash? @3: BUNfndVOID HASHfnd_oid CHKrange lookup @4: var loc loc e/bn-head-access @5: "BUNhloc(b,p)" for (oid) b-head-type, b-head access "&bhsb; bhsb++" for (void) b-head-type @6: sht / int / lng / flt / dbl b/bn-tail-type @7: "loc" for fixsized b/bn-tail-type, b/bn-tail-access "var" for varsized b/bn-tail-type (only loc used currently) @8: BUNt@7(bn,r) BUNt@7(bn,r) &prods[(*(oid*)h)-min] *dst: prod in-place or in prods-array ? @9: result type*/@= aggrX3_prod ALGODEBUG THRprintf(GDKout, "#aggrX3_prod(@1,@2,@3,@4,@5,@6,@7,@8,@9);\n"); if (@1) { /* create tmp. prods array */ size_t i; prods = (@9*) GDKmalloc(range*sizeof(@9)); for (i = 0; i < range; i++) prods[i] = zero; } if (@2 && BATprepareHash(bn)) { if (@1) GDKfree(prods); BBPreclaim(bn); return GDK_FAIL; } /* scan b, and mul values to prods in-place or in prods-array */ bhsb = b->hseqbase - 1; BATloopFast(b, p, q, xx) { @6 *t = (@6*) BUNt@7(b,p); oid *h = (oid*) @5; @3(r, bn, h); if (r) { @9 *dst = (@9*) @8; if (*dst != @9_nil) { if (*t == @6_nil) { *dst = @9_nil; } else { *dst *= *t; } } } } if (@1) { /* copy prods array to final result */ BATloopFast(bn, p, q, xx) { oid h = (*(oid*) BUNh@4(bn,p)) - min; *(@9*)BUNt@7(bn, p) = prods[h]; } GDKfree(prods); }@@c/* aggrX3_avg e-void-head e-oid-head e-oid-head void-lookup hash-lookup array-lookup @1: 0 0 1 use sums-array? @2: 0 1 0 do BATprepareHash? @3: BUNfndVOID HASHfnd_oid CHKrange lookup @4: var loc loc e/bn-head-access @5: "BUNhloc(b,p)" for (oid) b-head-type, b-head access "&bhsb; bhsb++" for (void) b-head-type @6: sht / int / lng / flt / dbl b/bn-tail-type @7: "loc" for fixsized b/bn-tail-type, b/bn-tail-access "var" for varsized b/bn-tail-type (only loc used currently) @8: BUNt@7(bn,r) BUNt@7(bn,r) &sums[(*(oid*)h)-min] *dst: sum in-place or in sums-array ? @9: BUNindex(bn,r)-off (*(oid*)h)-min index in cnt array*/@= aggrX3_avg ALGODEBUG THRprintf(GDKout, "#aggrX3_avg(@1,@2,@3,@4,@5,@6,@7,@8,@9);\n"); if (@1) { /* create tmp. sums array */ size_t i; sums = (dbl*) GDKmalloc(range*sizeof(dbl)); for (i = 0; i < range; i++) sums[i] = zero; } if (@2 && BATprepareHash(bn)) { if (@1) GDKfree(sums); BBPreclaim(bn); return GDK_FAIL; } cnt = (size_t*) GDKmalloc(slots*sizeof(cnt[0])); memset(cnt, 0, slots*sizeof(cnt[0])); /* scan b, adding sums, and incrementing counts */ bhsb = b->hseqbase - 1; BATloopFast(b, p, q, xx) { @6 *t = (@6*) BUNt@7(b,p); oid *h = (oid*) @5; @3(r, bn, h); if (r) { dbl *dst = (dbl*) @8; if (*dst != dbl_nil) { if (*t == @6_nil) { *dst = dbl_nil; } else { *dst += *t; } cnt[@9]++; } } } /* postprocess by dividing sums by counts */ if (@1) { /* sums in sums-array */ BATloopFast(bn, p, q, xx) { oid h = (*(oid*) BUNh@4(bn,p)) - min; dbl *dst = (dbl*) BUNt@7(bn, p); if (cnt[h] == 0 || sums[h] == dbl_nil) { *dst = dbl_nil; } else { *dst = (dbl) (sums[h]/cnt[h]); } } GDKfree(sums); } else { /* sums in-place */ size_t yy = 0; BATloopFast(bn, p, q, xx) { dbl *dst = (dbl*) BUNt@7(bn, p); if (cnt[yy] == 0) { *dst = dbl_nil; } else if (*dst != dbl_nil) { *dst = (dbl) (*dst / cnt[yy]); } yy++; } } GDKfree(cnt);@c/* arithsum @6: sht / int / lng / flt / dbl b/bn-tail-type @7: "loc" for fixsized b/bn-tail-type, b/bn-tail-access "var" for varsized b/bn-tail-type (only loc used currently) @9: result type*/@= arithsumintCMDaggrX3_sum_@1_@3(BAT **ret, BAT *b, BAT *e){ BAT *bn = BATnew(e->htype, TYPE_@3, BATcount(e)); @3 zero = (@3) 0, *sums; BUN p, q, r; int xx; size_t range; oid min, max; oid bhsb; if( bn == NULL) return GDK_FAIL; ALGODEBUG THRprintf(GDKout, "#CMDaggrX3_sum_@1_@3[@2](b=%s,e=%s);\n", BATgetId(b), BATgetId(e)); /* init: set all sums to zero and calculate min/max oid */ @:init_result(@3)@ range = max - min + 1; /* scan b, and calculate sums */ if (e->htype == TYPE_void) { /* void lookup */ if (b->htype == TYPE_void) { @:aggrX3_sum(0,0,BUNfndVOID,var,&bhsb;bhsb++,@1,@2,BUNt@2(bn,r),@3)@ } else { @:aggrX3_sum(0,0,BUNfndVOID,var,BUNhloc(b,p),@1,@2,BUNt@2(bn,r),@3)@ } /* e->htype == TYPE_oid */ } else if (range > SMALL_AGGR_MAX) { /* hash lookup */ if (b->htype == TYPE_void) { @:aggrX3_sum(0,1,HASHfnd_oid,loc,&bhsb;bhsb++,@1,@2,BUNt@2(bn,r),@3)@ } else { @:aggrX3_sum(0,1,HASHfnd_oid,loc,BUNhloc(b,p),@1,@2,BUNt@2(bn,r),@3)@ } } else { /* array lookup */ if (b->htype == TYPE_void) { @:aggrX3_sum(1,0,CHKrange,loc,&bhsb;bhsb++,@1,@2,&sums[(*(oid*)h)-min],@3)@
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -