📄 treeminer.cpp,v
字号:
head 1.2;access;symbols;locks zaki:1.2; strict;comment @// @;1.2date 2001.06.24.19.16.01; author zaki; state Exp;branches;next 1.1;1.1date 2001.06.06.00.04.45; author zaki; state Exp;branches;next ;desc@@1.2log@added pruning.@text@#include<iostream>#include <unistd.h>#include <algorithm>#include <stdio.h>#include <stl.h>#include <list>using namespace std;//headers#include "treeminer.h"#include "timetrack.h"#include "calcdb.h"#include "eqclass.h"#include "stats.h"#include "hashtable.h"//global varschar infile[300];Dbase_Ctrl_Blk *DCB;Stats stats;double MINSUP_PER;int MINSUPPORT=-1;int DBASE_MAXITEM;int DBASE_NUM_TRANS;//default flagsbool output = false; //don't print freq subtreesbool output_idlist = false; //don't print idlistbool count_unique = true; //count support only once per treebool use_fullpath = false; //use reduced scope to conserve memsort_vals sort_type = nosort; //default is to sort in increasing orderalg_vals alg_type = treeminer; //default is to find all freq patternsprune_vals prune_type = prune; //pruneFreqHT FK; //to store freq subtrees for pruningvector<int> *ITCNT = NULL; //used for sorting F1bool F1cmp(int x, int y){ bool res = false; if ((*ITCNT)[x] < (*ITCNT)[y]) res = true; if (sort_type == incr) return res; else return !res;} void parse_args(int argc, char **argv){ extern char * optarg; int c; if (argc < 2) cout << "usage: treeminer -i<infile> -s <support>\n"; else{ while ((c=getopt(argc,argv,"a:fi:lop:s:S:uz:"))!=-1){ switch(c){ case 'a': alg_type = (alg_vals) atoi(optarg); break; case 'f': use_fullpath = true; break; case 'i': //input files sprintf(infile,"%s",optarg); break; case 'l': //print idlists along with freq subtrees output=true; output_idlist = true; break; case 'o': //print freq subtrees output = true; break; case 'p': prune_type = (prune_vals) atoi(optarg); break; case 's': //support value for L2 MINSUP_PER = atof(optarg); break; case 'S': //absolute support MINSUPPORT = atoi(optarg); break; case 'u': //count support multiple times per tree count_unique = false; break; case 'z': sort_type = (sort_vals) atoi(optarg); break; } } }}void get_F1(){ TimeTracker tt; double te; int i, j, it; const int arysz = 10; vector<int> itcnt(arysz,0); //count item frequency vector<int> flgs(arysz,-1); tt.Start(); DBASE_MAXITEM=0; DBASE_NUM_TRANS = 0; while(DCB->get_next_trans()) { //cout << "TRANS " << DCB->Cid << " " << DCB->Tid // << " " << DCB->TransSz << " -- "; //for (i=0; i < DCB->TransSz; i++) // cout << " " << DCB->TransAry[i]; //cout << endl; for (i=0; i < DCB->TransSz; i++){ it = DCB->TransAry[i]; if (it == BranchIt) continue; if (it >= DBASE_MAXITEM){ itcnt.resize(it+1,0); flgs.resize(it+1,-1); DBASE_MAXITEM = it+1; //cout << "IT " << DBASE_MAXITEM << endl; } if (count_unique){ if(flgs[it] == DCB->Cid) continue; else flgs[it] = DCB->Cid; } itcnt[it]++; } if (DCB->MaxTransSz < DCB->TransSz) DCB->MaxTransSz = DCB->TransSz; DBASE_NUM_TRANS++; } //for (i=0; i < DCB->TransSz; i++){ // it = DCB->TransAry[i]; // if (it != BranchIt){ // cout << it << " " << itcnt[it] << endl; // } //} //set the value of MINSUPPORT if (MINSUPPORT == -1) MINSUPPORT = (int) (MINSUP_PER*DBASE_NUM_TRANS+0.5); if (MINSUPPORT<1) MINSUPPORT=1; cout<<"DBASE_NUM_TRANS : "<< DBASE_NUM_TRANS << endl; cout<<"DBASE_MAXITEM : "<< DBASE_MAXITEM << endl; cout<<"MINSUPPORT : "<< MINSUPPORT << " (" << MINSUP_PER << ")" << endl; //count number of frequent items DCB->NumF1 = 0; for (i=0; i < DBASE_MAXITEM; i++) if (itcnt[i] >= MINSUPPORT) DCB->NumF1++; int *it_order = new int[DBASE_MAXITEM]; for (i=0; i < DBASE_MAXITEM; i++) it_order[i] = i; if (sort_type != nosort){ ITCNT = &itcnt; sort(&it_order[0], &it_order[DBASE_MAXITEM], F1cmp); } //construct forward and reverse mapping from items to freq items DCB->FreqIdx = new int [DCB->NumF1]; DCB->FreqMap = new int [DBASE_MAXITEM]; for (i=0,j=0; i < DBASE_MAXITEM; i++) { if (itcnt[it_order[i]] >= MINSUPPORT) { if (output) cout << i << " " << it_order[i] << " - " << itcnt[it_order[i]] << endl; DCB->FreqIdx[j] = it_order[i]; DCB->FreqMap[it_order[i]] = j; j++; } else DCB->FreqMap[it_order[i]] = -1; } cout<< "F1 - " << DCB->NumF1 << " " << DBASE_MAXITEM << endl; if (sort_type != nosort){ ITCNT = NULL; delete [] it_order; } te = tt.Stop(); stats.add(DBASE_MAXITEM, DCB->NumF1, te);}list<Eqclass *> * get_F2(){ int i,j; int it1, it2; int scnt; TimeTracker tt; double te; tt.Start(); list<Eqclass *> *F2list = new list<Eqclass *>; //itcnt2 is a matrix of pairs p, p.first is count, p.second is flag int **itcnt2 = new int*[DCB->NumF1]; int **flgs = new int*[DCB->NumF1]; //unsigned int **itcnt2 = new unsigned int *[DCB->NumF1]; for (i=0; i < DCB->NumF1; i++){ itcnt2[i] = new int [DCB->NumF1]; flgs[i] = new int [DCB->NumF1]; //cout << "alloc " << i << " " << itcnt2[i] << endl; for (j=0; j < DCB->NumF1; j++){ itcnt2[i][j] = 0; flgs[i][j] = -1; } } DCB->alloc_idlists(); while(DCB->get_next_trans()) { DCB->get_valid_trans(); DCB->make_vertical(); //DCB->print_trans(); //count a pair only once per cid for (i=0; i < DCB->TransSz; i++){ it1 = DCB->TransAry[i]; if (it1 != BranchIt){ scnt = 0; for (j=i+1; scnt >= 0 && j < DCB->TransSz; j++){ it2 = DCB->TransAry[j]; if (it2 != BranchIt){ scnt++; if (count_unique){ if (flgs[it1][it2] == DCB->Cid) continue; else flgs[it1][it2] = DCB->Cid; } //cout << "cnt " << it1 << " " << it2 << endl; itcnt2[it1][it2]++; } else scnt--; } } } } int F2cnt=0; // count frequent patterns and generate eqclass Eqclass *eq; for (i=0; i < DCB->NumF1; i++) { eq = NULL; for (j=0; j < DCB->NumF1; j++) { //cout << "access " << i << " " << j << endl; if (itcnt2[i][j] >= MINSUPPORT){ F2cnt++; if (eq == NULL){ eq = new Eqclass; eq->prefix().push_back(i); } eq->add_node(j,0); if (output) cout << DCB->FreqIdx[i] << " " << DCB->FreqIdx[j] << " - " << itcnt2[i][j] << endl; } } if (eq != NULL) F2list->push_back(eq); } for (i=0; i < DCB->NumF1; i++) { //cout << "dealloc " << i << " " << itcnt2[i] << endl; delete [] itcnt2[i]; //cout << "dealloc " << i << " " << flgs[i] << endl; delete [] flgs[i]; } delete [] itcnt2; delete [] flgs; cout << "F2 - " << F2cnt << " " << DCB->NumF1 * DCB->NumF1 << endl; te = tt.Stop(); stats.add(DCB->NumF1 * DCB->NumF1, F2cnt, te); return F2list;}static bool notfrequent (Eqnode &n){ //cout << "IN FREQ " << n.sup << endl; if (n.sup >= MINSUPPORT) return false; else return true;}void check_ins(idlist *l1, idlist *l2, Eqnode *ins, int st1, int st2, int en1, int en2){ static idnode *n1, *n2; static ival_type cmpval; bool found_flg = false; int pos1 = st1; //temporary position holder for n1 ival bool next2; //should we go to next n2 ival? //for each ival in n2, find the closest parent in n1 int tpos = ins->tlist.size(); while(st2 < en2){ n1 = &(*l1)[pos1]; n2 = &(*l2)[st2]; next2 = true; //by default we go to next n2 ival cmpval = ival::compare(n1->itscope, n2->itscope); switch (cmpval){ case sup: //n2 was under some n1 if (n1->path_equal(*n2)){ if (en1-st1 > 1 || use_fullpath){ ins->tlist.push_back(idnode(n2->cid, n1->parscope, n1->itscope.lb, n2->itscope)); } else{ ins->tlist.push_back(idnode(n2->cid,n2->itscope)); } if (!count_unique) ins->sup++; found_flg = true; } next2 = false; break; case before: //check next n1 ival for same n2 ival next2 = false; break; } if (next2 || pos1+1 == en1){ //go to next n2 ival pos1 = st1; st2++; } else pos1++; //go to next n1 ival } if (found_flg && count_unique) ins->sup++;} void check_outs(idlist *l1, idlist *l2, Eqnode *outs, int st1, int st2, int en1, int en2){ static idnode *n1, *n2; static ival_type cmpval; bool found_flg = false; bool next2; int pos1 = st1; //for each n2 ival find if there is a sibling to the left int tpos = outs->tlist.size(); while(st2 < en2){ n1 = &(*l1)[pos1]; n2 = &(*l2)[st2]; next2 = true; cmpval = ival::compare(n1->itscope, n2->itscope); switch (cmpval){ case sup: next2 = false; break; case before: //n1 is before n2. Check if n1.par is subset of or equal to n2.par if (n1->path_equal(*n2)){ if (en1 - st1 > 1 || use_fullpath){ outs->tlist.push_back(idnode(n2->cid, n1->parscope, n1->itscope.lb, n2->itscope)); } else{ outs->tlist.push_back(idnode(n2->cid,n2->itscope)); } if (!count_unique) outs->sup++; found_flg = true; } next2 = false; break; } if (next2 || pos1+1 == en1){ //go to next n2 ival pos1 = st1; st2++; } else pos1++; //go to next n1 ival } if (found_flg && count_unique) outs->sup++;}void get_intersect(idlist *l1, idlist *l2, Eqnode *ins, Eqnode *outs){
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -