📄 gdk_align.mx
字号:
@' The contents of this file are subject to the MonetDB Public License@' Version 1.1 (the "License"); you may not use this file except in@' compliance with the License. You may obtain a copy of the License at@' http://monetdb.cwi.nl/Legal/MonetDBLicense-1.1.html@'@' Software distributed under the License is distributed on an "AS IS"@' basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the@' License for the specific language governing rights and limitations@' under the License.@'@' The Original Code is the MonetDB Database System.@'@' The Initial Developer of the Original Code is CWI.@' Portions created by CWI are Copyright (C) 1997-2007 CWI.@' All Rights Reserved.@f gdk_align@a Peter Boncz@* BAT Alignment For BATs that result from a n-ary relational scheme it may help toalign the BATs on their head value. In particular, it permitsreplacing a hash-join by a merge-join, which is significantly fasteron large tables. Especially if the BATs involved cause page activityor when you can not afford the large hash structures to speed-up processing.@-For orthogonality, we support alignment between arbitrary columns (head or tail).@-All standard GDK set-calls update the alignment info in their respective ways. For example, the routine @emph{BUNclustercopy} shuffles the first argument, such that the BUNs are in the same order as those in the second argument. This operation will mark both columns of the first @emph{BAT} as synced withthe second (likewise, @emph{BATcopy()}, which makes a copy, instead of in-place shuffling, has the same alignment effect, @emph{BATmark()} marks the tail column as synced with the head of the original @emph{BAT}, and for instance @emph{BATsemijoin()} marks both return columns as aligned with its left parameter).@Each alignment sequence is given a unique identifier, so as to easilydetect this situation. It is retained in the @emph{BAT descriptor}.@+ Alignment Design ConsiderationsAlignment primitives require the right hooks to be inserted in several places in the GDK, apart form this file:@itemize@item @emph{ BUN update operations}. The updated BATs have to be marked as un-aligned.@item @emph{ set operations}. For most relational operations somestatements can be made about the size and order of the BATs they produce. This information can be formalized by indicating alignmentinformation automatically.@item @emph{ transaction operations}. Alignment statuses must be kept consistent under database commits and aborts.@end itemize@As for performance, the most important observation to make is that operations that do not need alignment, will suffer most fromoverheads introduced in the BUN update mechanism. For this reason,the alignment-delete operation has to be very cheap. It iscaptured by the @emph{ALIGNdel} macro, and just zaps one characterfield in the @emph{BAT} record.@@{@+ Alignment ImplementationThe @emph{BAT} record is equipped with an @emph{batAlign} field that keepsboth information about the head and tail column. The leftmost4 bits are for the head, the rightmost 4 for the tail. Thishas been done to make the zap ultra-cheap.Both head and tail column contain an OID in the @emph{halign} and @emph{talign}fields respectively to mark their alignment group. All BATs with thesame OID in this field (and the ALIGN_SYNCED bit on) are guaranteedby the system to have equal head columns. As an exception, theymight also have TYPE_void head columns (a virtual column). In such a case, the tail values correspond to the head valuesthat would have been there in a non-virtual column, continuingthe same head-sequence as the other BATs in the sync group.@c#include "monetdb_config.h"#include "gdk.h"intALIGNcommit(BAT *b){ BATcheck(b, "ALIGNcommit: bat required"); if (!b->halign) { b->halign = OIDnew(1); } if (!b->talign) { b->talign = OIDnew(1); } return 0;}intALIGNundo(BAT *b){ BATcheck(b, "ALIGNundo: bat required"); return 0;}intALIGNsetH(BAT *b1, BAT *b2){ ssize_t diff = BUNindex(b1, BUNfirst(b1)) - BUNindex(b2, BUNfirst(b2)); BATcheck(b1, "ALIGNsetH: bat 1 required"); BATcheck(b2, "ALIGNsetH: bat 2 required"); if (b2->halign == 0) { b2->halign = OIDnew(1); b2->batDirtydesc = TRUE; } else { /* propagate GDK_AGGR information */ BATpropagate(b1, b2, GDK_AGGR_SIZE); BATpropagate(b1, b2, GDK_AGGR_CARD); BATpropagate(b1, b2, GDK_AGGR_HASNIL); } if (BAThvoid(b2)) { /* b2 is either dense or has a void(nil) head */ if (b1->htype != TYPE_void) b1->hdense = TRUE; BATseqbase(b1, b2->hseqbase); } else if (b1->htype != TYPE_void) { /* b2 is not dense, so set b1 not dense */ b1->hdense = FALSE; BATseqbase(b1, oid_nil); } BATkey(b1, b2->hkey != FALSE); b1->hsorted = BAThordered(b2); b1->halign = b2->halign; b1->batDirtydesc = TRUE; b1->H->nosorted_rev = b2->H->nosorted_rev + diff; b1->H->nokey[0] = b2->H->nokey[0] + diff; b1->H->nokey[1] = b2->H->nokey[1] + diff; b1->H->nosorted = b2->H->nosorted + diff; b1->H->nodense = b2->H->nodense + diff; return 0;}@-The routines @emph{ALIGN_synced} and @emph{ALIGN_ordered}allow to simply query the alignment status of the two head columnsof two BATs.@cintALIGNsynced(BAT *b1, BAT *b2){ BATcheck(b1, "ALIGNsynced: bat 1 required"); BATcheck(b2, "ALIGNsynced: bat 2 required"); /* first try to prove head columns are not in sync */ if (BATcount(b1) != BATcount(b2)) return 0; if (ATOMtype(BAThtype(b1)) != ATOMtype(BAThtype(b2))) return 0; if (BAThvoid(b1) && BAThvoid(b2)) return (b1->hseqbase == b2->hseqbase); /* then try that they are */ if (b1->batCacheid == b2->batCacheid) return 1; /* same bat. trivial case */ if (BATcount(b1) == 0) return 1; /* empty bats of same type. trivial case */ if (b1->halign && b1->halign == b2->halign) return 1; /* columns marked as equal by algorithmics */ if (VIEWparentcol(b1) && ALIGNsynced(BBPcache(VIEWparent(b1)), b2)) return 1; /* view on same bat --- left recursive def.. */ if (VIEWparentcol(b2) && ALIGNsynced(b1, BBPcache(VIEWparent(b2)))) return 1; /* view on same bat --- right recursive def.. */ return 0; /* we simply don't know */}@-BATs are related if one pair of columns match. This means thatthere is some 1-1 correspondence in their BUNs.@cintALIGNrelated(BAT *b1, BAT *b2){ BATcheck(b1, "ALIGNrelated: bat 1 required"); BATcheck(b2, "ALIGNrelated: bat 2 required"); return (ALIGNsynced(b1, b2) || ALIGNsynced(BATmirror(b1), b2) || ALIGNsynced(b1, BATmirror(b2)) || ALIGNsynced(BATmirror(b1), BATmirror(b2)));}@+ View BATSThe general routine for getting a 'view' BAT upon another BAT is @emph{VIEWcreate}. On this @emph{#read-only} BAT (there is kernelsupport for this), you can then make vertical slices. Use @emph{VIEWhead} for this. @It is possible to create a view on a writable BAT. Updatesin the parent are then automatically reflected in the VIEW.Note that the VIEW bat itself can never be modified.@Horizontal views should only be given out on a view BAT, butonly if it is dead sure the parent BAT is read-only.This because they cannot physically share the batBuns heapwith the parent, as they need a modified version.@cBAT *VIEWcreate_(BAT *b, int slice_view){ BAT *bn, *recycled = NULL; bat parent; BATcheck(b, "VIEWcreate_: bat required"); bn = BBPrecycle(TYPE_void, TYPE_void, 1); if (bn) { recycled = BATmirror(bn); } else { bn = BATcreatedesc(b->htype, b->ttype, FALSE); } parent = VIEWparent(b); if (parent == 0 || b->batBuns->copied) { parent = b->batCacheid; } bn->batParentid = parent; if (parent < 0) parent = -parent; BBPshare(parent);@-the H and T column descriptors are fully copied. We need copiesbecause in case of a mark, we are going to override a column with a void. Take care to zero the accelerator data, though. @c bn->dims = b->dims; if (recycled) recycled->dims = BATmirror(b)->dims; *bn->H = *b->H; *bn->T = *b->T; assert(bn->U->buns.base == (char*)1 || bn->U->buns.base == NULL); *bn->U = *b->U; bn->H->props = bn->T->props = NULL; BATinit_idents(bn);@-The b->P structure cannot be shared and must be copied individually.@c bn->batSet = b->batSet; bn->batDirty = BATdirty(b); bn->batRestricted = BAT_READ;@-The U record may be shared with the parent; in that case, thesearch accelerators of the parent can be used. If, however, we want to take a horizontal fragment (stable=false), this cannot be done, and we need to put different information in U (so we can't use a copy.@c if (slice_view || VIEWparent(b)) { /* slices are unequal to their parents; cannot use accs */ bn->hhash = bn->thash = NULL; } else { /* equal pointers to parent mean view uses acc of parent */ bn->hhash = b->hhash; bn->thash = b->thash; if (recycled) { recycled->hhash = b->thash; recycled->thash = b->hhash; } } if (recycled == NULL) { BBPcacheit(bn); /* enter in BBP and create mirror */ } return bn;}BAT *VIEWcreate(BAT *b){ return VIEWcreate_(b, FALSE);}@-The @#VIEWhead@ routine effortlessly projects out the tail column.@c BAT *VIEWhead(BAT *b){ BAT *bn = VIEWcreate(b); BAT *bm = BATmirror(bn); BATstore *bs = (BATstore *) bn; if (bn == NULL || bm == NULL) return NULL; bn->T = &bs->T; if (bn->T != bm->H) *bn->T = *bm->H; bm->H = bn->T; bn->ttype = bm->htype = TYPE_void; bn->tvarsized = bm->hvarsized = 1; bn->thash = bm->hhash = NULL; BATseqbase(bm, oid_nil); return bn;}BAT *VIEWhead_(BAT *b, int mode){ BAT *bn = VIEWhead(b); if (bn) bn->batRestricted = mode; return bn;}@-the @#VIEWcombine@ routine effortlessly produces a view with doublevision on the head column.@cBAT *VIEWcombine(BAT *b){ BAT *bn = VIEWcreate(b); BAT *bm = BATmirror(bn); if (bn == NULL || bm == NULL) return NULL; bm->H = bn->T = bn->H; bn->ttype = bm->htype = bn->htype; bn->tloc = bm->hloc = bn->hloc; bn->ttype = bm->htype = bn->htype; bn->tkey = bm->hkey = bn->hkey; bn->tseqbase = bm->hseqbase = bn->hseqbase; bn->tvarsized = bm->hvarsized = bn->hvarsized; bn->thash = bm->hhash = bn->hhash; return bn;}@-The @#BATmaterialize@ routine produces in-place materialized version ofa void bat (which should have been a VIEW) (later we should add the code for VIEWs).@cBAT *BATmaterialize(BAT *b, size_t size){ BAT *bm, *ret = NULL; BATcheck(b, "BATmaterialize: bat required"); { int ht = b->htype, tt = b->ttype, vid = 0; int vv = (!ht && !tt); size_t oldsize = BATcount(b); size_t cnt = MAX(oldsize, size); Heap buns = *b->batBuns, *hh = b->hheap, *th = b->theap; int bunwidth = b->dims.bunwidth, hlc = b->hloc, tlc = b->tloc; int newbw = 0; BUN p = BUNfirst(b), q = BUNlast(b); char *nme = BBP_physical(b->batCacheid); int committed = BBP_status(b->batCacheid) & BBPEXISTING; ALGODEBUG THRprintf(GDKout, "#BATmaterialize(%d," SZFMT ");\n", (int) b->batCacheid, size); if (BAThdense(b) && ht == TYPE_void) { vid |= 1; ht = TYPE_oid; } if (BATtdense(b) && tt == TYPE_void) { vid |= 2; tt = TYPE_oid; } if (vid == 0) { /* no voids */ return b; } /* cleanup possible ACC's */ HASHdestroy(b); /* guess new bunwidth before setting the new dimensions, +3 to allow chr-on-int alignment */ b->batBuns->filename = NULL; if (HEAPalloc(b->batBuns, cnt + 3, 2 * MAX(sizeof(oid), b->dims.bunwidth)) < 0) { *b->batBuns = buns; return NULL; } if (hh) { b->hheap = (Heap*)GDKmalloc(sizeof(Heap)); *b->hheap = *hh; b->hheap->filename = NULL; if (ATOMheap(ht, b->hheap, cnt) < 0) { HEAPfree(b->batBuns); *b->batBuns = buns; b->hheap = hh; return NULL; } } if (th) { b->theap = (Heap*)GDKmalloc(sizeof(Heap)); *b->theap = *th; b->theap->filename = NULL; if (ATOMheap(tt, b->theap, cnt) < 0) { HEAPfree(b->batBuns); HEAPfree(b->hheap); GDKfree(b->hheap); *b->batBuns = buns; b->hheap = hh; b->theap = th; return NULL; } } /* point of no return */ b->htype = ht; b->ttype = tt; BATsetdims(b); DELTAinit(b); newbw = BUNsize(b); b->batDirty = TRUE; b->batDirtydesc = TRUE; b->batDirtybuns = TRUE; bm = BATmirror(b); bm->dims.headtype = b->dims.tailtype; bm->dims.tailtype = b->dims.headtype; bm->dims.headloc = b->dims.tailloc;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -