alg1.cc

来自「A C++ library which finds associations w」· CC 代码 · 共 196 行

196 行

/*heap-lookaheadAuthor:  Kendall Willets (kendall@willets.org,  http://willets.org )This is a variant of a simple ordered vector merge of subsets, using aheap to hold iterators for each transaction T's subsets.  Each T isrepresented once in the heap by an iterator with a "current" subset S,and runs of identical S's are pulled from the heap and counted.Each S is then bumped to the next subset S' of its particular T andreinserted into the heap.  The ordering guarantees that all runs ofidentical subsets will be found and counted.This method does vector-merge, using a lookahead of Centries.  A run of at least C entries indicates a frequent subset.The lookahead is done with a FIFO (ring buffer) F with C-1 elementspulled in order from the heap H.Optimization 1:  in incrementing a subset S from the head of F, subsetsX in F can be skipped, except the tail S2.  Reason:  if X is in F andS < X < S2, X has a run of fewer than C entries in F, so X is not frequent.(S2 could be frequent if more S2's arrive).  So S can "leapfrog" up to orbeyond S2.H - heap of all subsets, except:F - C-1 subsets currently in window (FIFO ring buffer)H/F entries <S,T> will point to transaction T andcurrent subset S of Talgorithm:  insert all <S,T> into H  move first C entries from H to F  loop:     remove min(<S,T>) from head of F     remove min(<S2,T2>) from head of H     if S == S2       S is frequent (save, or increment counter if S already found)       increment S to S' > S2     else       increment <S,T> to <S',T> until S' >= S2     insert <S', T> into H       else if no S', remove <S,T>  ( T is done )     put <S2, T2> on tail of F  until H is emptyPlanned optimization 1:  Result of comparing S and S2 can be used to find S'if position of first mismatch, newmask, etc. are returned.  Need to break outa Trans::nxtmask( Trans const &T2, bool & equal ) method.Planned optimization 2:  Counting backwards from T to {} will arrive at maximalfrequent subsets first.  The 2^|M| - 1 frequent subsets of a maximal frequentsubset M can then hopefully be skipped, possibly by deleting M.Decrementing the bitmap will also save a modulo operation, since we're just--'ing down to 0 rather than up to 2^|T|.*/#include <stdio.h>#include <stdlib.h>#include <sys/types.h>#include <sys/stat.h>#include <fcntl.h>#include <unistd.h>#include <time.h>#include "Trans.h"#include "Heap.h"#include "Fifo.h"// sort routine for integer element vectors//  cheesy bubble sortvoid sort( int *p, int n ){int tmp;int ii, jj;for( ii = 0; ii < n; ii ++ )	for( jj = 0; jj < n-1; jj++ )		if( p[ jj ] < p[jj + 1] ) { tmp = p[jj]; p[jj] = p[jj +1]; p[jj+1]=tmp; };}#define DISPLAY_LIMIT 3// hard-coded to read from binary data file// produced by IBM Almaden's synthetic data generator////int main(int argc, char **argv ) {int NX=0, C=0;char * fname = NULL;int fd;clock_t cstart, cend;printf("%d\n", argc);if( argc == 4 ) {	fname = argv[1];	NX = atoi( argv[2] );	C  = atoi( argv[3] );	};if( argc != 4 || fname == NULL || NX == 0 || C >= NX ||    (fd = open( fname, O_RDONLY )) < 0 ) {    	printf( "fname:  %s NX: %d C: %d\n", fname, NX, C );	printf( "usage:  %s filename trancount minsupport\n", argv[0] );	exit(0);	};Heap H(NX);Fifo F(C-1);int i,c,t,n;int done=0, nfound =0;int minitem = 100000;bool readOK;for( i = 0; i < NX; i++ )	{	int *pt;	readOK =		read( fd, &c, 4 ) &&     // customer ID - not used		read( fd, &t, 4 ) &&     // Transaction ID - not used		read( fd, &n, 4 ) &&     // #items		(pt = new int[n]) != NULL &&		read( fd, pt, 4* n );    // itemset	//for( int ii = 0; ii < n; ii++ ) if( pt[ii] < minitem ) minitem = pt[ii];	if( readOK ) {		sort( pt, n );		H.ins( new Trans( pt, n ) );		}	else	break;	};printf( "%d/%d records read\n", i, NX );int chck = H.chk();if( chck > 2 ) { printf( "heap check failed %d\n", chck ); H.prt( chck ); H.prt( chck/2 );};//H.heapify( H.nelt());Trans * cur = NULL;int ncur = 1;printf( "start minitem = %d\n", minitem );cstart=clock();for( i=C-1; i; i-- )	F.push(H.delmin());cur = H.delmin();cur->prt(0);printf( "delmin done\n");chck = H.chk();if( chck > 2 ) { printf( "heap check failed %d\n", chck ); H.prt( chck ); H.prt( chck/2 ); exit(0);};while( H.nelt() ) {	//cur->prt();	if( *(cur = F.tail()) == *H.peek() ) {		if( ncur == 0 ) {			nfound++;			ncur = C;			}		else ncur++;	} else {  // break in run - check if run >= C		if( ncur >= C && cur->nitems() > DISPLAY_LIMIT )			cur->prt(0)			;		ncur = 0;	};	if( cur->nxtsub( *H.peek() ) )		H.swapsert( cur );	else	{		done++;		if( (done % 100) == 0 ) printf( "%d done/ %d found\n", done, nfound );		cur = H.delmin();		};	F.push(cur);};cend = clock();printf( "time:  %f\n", ((float) cend - (float) cstart )/(float) CLOCKS_PER_SEC );//printf( "ncmp:  %d nintcmp: %d nscan: %d nswap: %d nsift: %d\n", ncmp, nintcmp, nscan, nswap, nsift );printf( "nfound: %d\n", nfound );};

alg1.cc - 源码说明

本页面展示了「A C++ library which finds associations within sets of items, using a fast in-memory algorithm」中的 alg1.cc 源码文件，采用 CC 编程语言编写，共 196 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。

虫虫下载站收录了大量与associations相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。

⌨️ 快捷键说明

复制代码Ctrl + C

搜索代码Ctrl + F

全屏模式F11

增大字号Ctrl + =

减小字号Ctrl + -

显示快捷键?