📄 alg1.cc
字号:
/*heap-lookaheadAuthor: Kendall Willets (kendall@willets.org, http://willets.org )This is a variant of a simple ordered vector merge of subsets, using aheap to hold iterators for each transaction T's subsets. Each T isrepresented once in the heap by an iterator with a "current" subset S,and runs of identical S's are pulled from the heap and counted.Each S is then bumped to the next subset S' of its particular T andreinserted into the heap. The ordering guarantees that all runs ofidentical subsets will be found and counted.This method does vector-merge, using a lookahead of Centries. A run of at least C entries indicates a frequent subset.The lookahead is done with a FIFO (ring buffer) F with C-1 elementspulled in order from the heap H.Optimization 1: in incrementing a subset S from the head of F, subsetsX in F can be skipped, except the tail S2. Reason: if X is in F andS < X < S2, X has a run of fewer than C entries in F, so X is not frequent.(S2 could be frequent if more S2's arrive). So S can "leapfrog" up to orbeyond S2.H - heap of all subsets, except:F - C-1 subsets currently in window (FIFO ring buffer)H/F entries <S,T> will point to transaction T andcurrent subset S of Talgorithm: insert all <S,T> into H move first C entries from H to F loop: remove min(<S,T>) from head of F remove min(<S2,T2>) from head of H if S == S2 S is frequent (save, or increment counter if S already found) increment S to S' > S2 else increment <S,T> to <S',T> until S' >= S2 insert <S', T> into H else if no S', remove <S,T> ( T is done ) put <S2, T2> on tail of F until H is emptyPlanned optimization 1: Result of comparing S and S2 can be used to find S'if position of first mismatch, newmask, etc. are returned. Need to break outa Trans::nxtmask( Trans const &T2, bool & equal ) method.Planned optimization 2: Counting backwards from T to {} will arrive at maximalfrequent subsets first. The 2^|M| - 1 frequent subsets of a maximal frequentsubset M can then hopefully be skipped, possibly by deleting M.Decrementing the bitmap will also save a modulo operation, since we're just--'ing down to 0 rather than up to 2^|T|.*/#include <stdio.h>#include <stdlib.h>#include <sys/types.h>#include <sys/stat.h>#include <fcntl.h>#include <unistd.h>#include <time.h>#include "Trans.h"#include "Heap.h"#include "Fifo.h"// sort routine for integer element vectors// cheesy bubble sortvoid sort( int *p, int n ){int tmp;int ii, jj;for( ii = 0; ii < n; ii ++ ) for( jj = 0; jj < n-1; jj++ ) if( p[ jj ] < p[jj + 1] ) { tmp = p[jj]; p[jj] = p[jj +1]; p[jj+1]=tmp; };}#define DISPLAY_LIMIT 3// hard-coded to read from binary data file// produced by IBM Almaden's synthetic data generator////int main(int argc, char **argv ) {int NX=0, C=0;char * fname = NULL;int fd;clock_t cstart, cend;printf("%d\n", argc);if( argc == 4 ) { fname = argv[1]; NX = atoi( argv[2] ); C = atoi( argv[3] ); };if( argc != 4 || fname == NULL || NX == 0 || C >= NX || (fd = open( fname, O_RDONLY )) < 0 ) { printf( "fname: %s NX: %d C: %d\n", fname, NX, C ); printf( "usage: %s filename trancount minsupport\n", argv[0] ); exit(0); };Heap H(NX);Fifo F(C-1);int i,c,t,n;int done=0, nfound =0;int minitem = 100000;bool readOK;for( i = 0; i < NX; i++ ) { int *pt; readOK = read( fd, &c, 4 ) && // customer ID - not used read( fd, &t, 4 ) && // Transaction ID - not used read( fd, &n, 4 ) && // #items (pt = new int[n]) != NULL && read( fd, pt, 4* n ); // itemset //for( int ii = 0; ii < n; ii++ ) if( pt[ii] < minitem ) minitem = pt[ii]; if( readOK ) { sort( pt, n ); H.ins( new Trans( pt, n ) ); } else break; };printf( "%d/%d records read\n", i, NX );int chck = H.chk();if( chck > 2 ) { printf( "heap check failed %d\n", chck ); H.prt( chck ); H.prt( chck/2 );};//H.heapify( H.nelt());Trans * cur = NULL;int ncur = 1;printf( "start minitem = %d\n", minitem );cstart=clock();for( i=C-1; i; i-- ) F.push(H.delmin());cur = H.delmin();cur->prt(0);printf( "delmin done\n");chck = H.chk();if( chck > 2 ) { printf( "heap check failed %d\n", chck ); H.prt( chck ); H.prt( chck/2 ); exit(0);};while( H.nelt() ) { //cur->prt(); if( *(cur = F.tail()) == *H.peek() ) { if( ncur == 0 ) { nfound++; ncur = C; } else ncur++; } else { // break in run - check if run >= C if( ncur >= C && cur->nitems() > DISPLAY_LIMIT ) cur->prt(0) ; ncur = 0; }; if( cur->nxtsub( *H.peek() ) ) H.swapsert( cur ); else { done++; if( (done % 100) == 0 ) printf( "%d done/ %d found\n", done, nfound ); cur = H.delmin(); }; F.push(cur);};cend = clock();printf( "time: %f\n", ((float) cend - (float) cstart )/(float) CLOCKS_PER_SEC );//printf( "ncmp: %d nintcmp: %d nscan: %d nswap: %d nsift: %d\n", ncmp, nintcmp, nscan, nswap, nsift );printf( "nfound: %d\n", nfound );};
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -