⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 extl2.cc

📁 关联规则中的频繁项集生成算法genmax
💻 CC
字号:
#include <errno.h>#include <iostream.h>#include <stdio.h>#include <stdlib.h>#include <sys/types.h>#include <unistd.h>#include <sys/stat.h>#include <fcntl.h>#include <sys/mman.h>#include <strings.h>#include <limits.h>#include "extl2.h"#include "partition.h"#include "Graph.h"#include "Util.h"#include "temlist.h"#define isetbufsz 1024#define AVAILMEM (64*MBYTE) unsigned long int DB_size;int isetbuf[isetbufsz];int isetpos=0;int isetfd;unsigned char **cset_sup;unsigned int **set_sup;invdb *invDB;int EXTBLKSZ;extern Graph *F2Graph;extern char tempf[];invdb::invdb(int sz): GArray<GArray<int> *>(sz){   for (int i=0; i < totsize(); i++){      (*this)[i] = NULL;   }   }invdb::~invdb(){   for (int i=0; i < totsize(); i++){      if ((*this)[i]) delete (*this)[i];   }}void invdb::add_db(int tid, int it){   if ((*this)[tid] == NULL){      (*this)[tid] = new GArray<int>((int)DBASE_AVG_TRANS_SZ);   }   (*this)[tid]->add(it);}int cmp2it(const void *a, const void *b){   int *ary = (int *)a;   int *bry = (int *)b;   if (ary[0] < bry[0]) return -1;   else if (ary[0] > bry[0]) return 1;   else{      if (ary[1] < bry[1]) return -1;      else if (ary[1] > bry[1]) return 1;      else return 0;   }}int make_l1_pass(List<int> *F1){   int i;   int supsz;   int idxval = -1;      int maxsup = 0;      F2Graph = new Graph(100);   Graph::numF1 = 0;   for (i=0; i < DBASE_MAXITEM; i++){      idxval = partition_idxval(i);      if (idxval == -1) continue; //item has no tidlist            supsz = partition_get_idxsup(i);      if (diff_input) supsz = DBASE_NUM_TRANS - supsz;            DB_size+=supsz;      if (supsz >= MINSUPPORT){         F1 -> addatback(i);                  F1 ->last() ->set_sup (supsz);                  F2Graph->add_node(i,supsz);         Graph::numF1++;      }      if (supsz > maxsup) maxsup = supsz;   }   //cout << "Graph::numF1 " << Graph::numF1 << endl;      F2Graph->compact();   return maxsup;}void process_cust_invert(int curcnt, int *curit){   int i,j;   int it1, it2;   //for (i=0; i < curcnt; i++)   //   cout << " " << (*F2Graph)[curit[i]]->item();   //cout << endl;   if (use_char_extl2){      for (i=0; i < curcnt; i++){         it1 = curit[i];         if (cset_sup[it1]){            for (j=i+1; j < curcnt; j++){               it2 = curit[j];               if ((++cset_sup[it1][it2-it1-1]) == 0){                  if (isetpos+2 > isetbufsz){                     write(isetfd, (char *)isetbuf, isetpos*ITSZ);                     isetpos = 0;                  }                  isetbuf[isetpos++] = it1;                  isetbuf[isetpos++] = it2;               }            }         }      }   }   else{      for (i=0; i < curcnt; i++){         it1 = curit[i];         if (set_sup[it1]){            for (j=i+1; j < curcnt; j++){               it2 = curit[j];               set_sup[it1][it2-it1-1]++;            }         }      }         }}void process_diff_cust(int curcnt, int *curit){   int i,j;   static GArray<int> trans(100);   trans.reset();      //for (i=0; i < curcnt; i++)   //   cout << " " << curit[i];   //cout << endl;   i = 0; //iterate over numitems   j = 0; //idx into curit      for (j=0; j < curcnt; j++){      while (i < curit[j]){         if (partition_idxval((*F2Graph)[i]->item()) != -1) trans.add(i);         i++;      }      i++; //skip over curit[j]   }   for (; i < Graph::numF1; i++){      if (partition_idxval((*F2Graph)[i]->item()) != -1) trans.add(i);   }   process_cust_invert(trans.size(), trans.garray());   }void process_invert(int pnum){   int i,k;   int minv, maxv;   if (diff_input){      minv = DBASE_MINTRANS;      maxv = DBASE_MAXTRANS;         }   else{            minv = INT_MAX;      maxv = 0;      for (i=0; i < Graph::numF1; i++){         partition_get_minmaxtid(pnum, (*F2Graph)[i]->item(), minv, maxv);      }   }   //cout << "MINMAX " << minv << " " << maxv << endl;   if (invDB->totsize() < maxv-minv+1)      invDB->Realloc(maxv-minv+1);      int supsz;   int ivalsz=0;   int *ival = NULL;   for (i=0; i < Graph::numF1; i++){      supsz = partition_get_lidxsup(pnum, (*F2Graph)[i]->item());      if (ivalsz < supsz){         ivalsz = Util<int>::Realloc(supsz, ITSZ, ival);      }      partition_lclread_item(ival, pnum, (*F2Graph)[i]->item());      int cid;      int midx;      for (int pos=0; pos < supsz; pos++)      {         cid = ival[pos];         midx = cid - minv;         invDB->add_db(midx,i);      }   }   for (k=0; k < maxv-minv+1; k++){      if (diff_input){         if ((*invDB)[k]){            if (Graph::numF1-(*invDB)[k]->size() > 0)               process_diff_cust((*invDB)[k]->size(), (*invDB)[k]->garray());            (*invDB)[k]->reset();         }         else process_diff_cust(0, NULL);      }      else{         if ((*invDB)[k] && (*invDB)[k]->size() > 0){            //cout << "MVAL " << minv+k << " " << (*invDB)[k]->size() << endl;            //for (int x = 0; x < (*invDB)[k]->size(); x++){            //   cout << " " << (*F2Graph)[(*(*invDB)[k])[x]]->item();            //}            //cout << endl;                        process_cust_invert((*invDB)[k]->size(), (*invDB)[k]->garray());            (*invDB)[k]->reset();         }      }   }} void sort_get_l2(int &l2cnt, int fd){   //write 2-itemsets counts to file   int i, j, k, fcnt;   long sortflen;   int *sortary;   sortflen = lseek(fd, 0, SEEK_CUR);   if (sortflen < 0){      perror("SEEK SEQ");      exit(errno);   }   //cout << "SORT " << sortflen << endl;   if (sortflen > 0){#ifdef DEC      sortary = (int *) mmap((char *)NULL, sortflen,                             (PROT_READ|PROT_WRITE),                             (MAP_FILE|MAP_VARIABLE|MAP_PRIVATE), fd, 0);#else      sortary = (int *) mmap((char *)NULL, sortflen,                             (PROT_READ|PROT_WRITE),                             MAP_PRIVATE, fd, 0);#endif      if (sortary == (int *)-1){         perror("SEQFd MMAP ERROR");         exit(errno);      }            qsort(sortary, (sortflen/ITSZ)/2, 2*ITSZ, cmp2it);         }      int numel = sortflen/ITSZ;   i = 0;   fcnt = 0;   for (j=0; j < Graph::numF1;j++){      for (k=j+1; k < Graph::numF1;k++){         fcnt = 0;         if (sortflen > 0 && i < numel){            while (i < numel && j == sortary[i] && k == sortary[i+1]){               fcnt += 256;               i += 2;            }         }         if (cset_sup[j]) fcnt += (int) cset_sup[j][k-j-1];                  if (fcnt >= MINSUPPORT){            //F2Graph->add_adj(j, (*F2Graph)[k]->item(), fcnt);            F2Graph->add_adj(j, k, fcnt);            (*F2Graph)[j]->supsum() += fcnt;            (*F2Graph)[k]->supsum() += fcnt;            //cout << "LARGE " << (*F2Graph)[j]->item() << " " <<            //   (*F2Graph)[k]->item() << " SUPP " << fcnt << endl;            l2cnt++;         }      }   }   if (sortflen > 0) munmap((caddr_t)sortary, sortflen);}void get_l2(int &l2cnt){   int j,k;   int fcnt;      for (j=0; j < Graph::numF1;j++){      if (set_sup[j]){         for (k=j+1; k < Graph::numF1;k++){            fcnt = set_sup[j][k-j-1];            if (fcnt >= MINSUPPORT){               F2Graph->add_adj(j, k, fcnt);               (*F2Graph)[j]->supsum() += fcnt;               (*F2Graph)[k]->supsum() += fcnt;               //cout << "LARGE " << j << " " << k << " " <<               //   (*F2Graph)[j]->item() << " " <<               //   (*F2Graph)[k]->item() << " SUPP " << fcnt << endl;                l2cnt++;            }         }      }   }}void get_ext_l2(int &l2cnt){   int i;   int mem_used=0;   EXTBLKSZ = num_partitions+(DBASE_NUM_TRANS+num_partitions-1)/num_partitions;   int tsz = (int) (DBASE_AVG_TRANS_SZ);   invDB = new invdb(EXTBLKSZ);   mem_used += EXTBLKSZ*ITSZ;   mem_used += (int) (EXTBLKSZ*tsz*ITSZ);   //cout << "CURITSZ " << tsz << " " << EXTBLKSZ << " " << mem_used << endl;      char TMPF[300];   sprintf(TMPF,"%siset",tempf);   if (use_char_extl2){      if ((isetfd = open(TMPF, (O_RDWR|O_CREAT|O_TRUNC), 0666)) < 0){         perror("Can't open out file");         exit (errno);            }      cset_sup = new unsigned char *[Graph::numF1];        // support for 2-itemsets      bzero((void *)cset_sup, Graph::numF1*sizeof(unsigned char *));         mem_used += Graph::numF1*sizeof(unsigned char *);   }   else{      set_sup = new unsigned int *[Graph::numF1];        // support for 2-itemsets      bzero((void *)set_sup, Graph::numF1*sizeof(unsigned int *));         mem_used += Graph::numF1*sizeof(unsigned int *);         }   int low, high;         int itsz;   if (use_char_extl2) itsz = sizeof(unsigned char);   else itsz = sizeof(unsigned int);      for (low = 0; low < Graph::numF1; low = high){      if (use_char_extl2){         isetpos = 0;         lseek(isetfd, 0, SEEK_SET);      }            for (high = low; high < Graph::numF1 &&              (mem_used+(Graph::numF1-high-1)*itsz) < AVAILMEM; high++){         if (Graph::numF1-high-1 > 0){            if (use_char_extl2){               cset_sup[high] = new unsigned char [Graph::numF1-high-1];               bzero((void *)cset_sup[high], (Graph::numF1-high-1)*itsz);            }            else{               set_sup[high] = new unsigned int [Graph::numF1-high-1];               //cout << "ALLOC " << high << " " << set_sup[high] << " "               //     << Graph::numF1-high-1 << endl;               bzero((void *)set_sup[high], (Graph::numF1-high-1)*itsz);            }         }         mem_used += (Graph::numF1-high-1) * itsz;         //cout << "MEMUSEDLOOP " << mem_used << " " << endl;      }      //cout << "MEMUSEDLOOP " << mem_used << endl;        //cout << "LOWHIGH " << high << " " << low << endl;      for (int p=0; p < num_partitions; p++){         process_invert(p);      }            if (use_char_extl2){         if (isetpos > 0){            write(isetfd, (char *)isetbuf, isetpos*ITSZ);            isetpos = 0;         }         sort_get_l2(l2cnt, isetfd);      }      else get_l2(l2cnt);            // reclaim memory      for (i = low; i < high; i++)      {         if (use_char_extl2){            //cout << "i " << i << " " << cset_sup[i] << endl << flush;            if (cset_sup[i]) delete [] cset_sup[i];            cset_sup[i] = NULL;         }         else{            //cout << "DEL " << i << " " << set_sup[i] << endl << flush;            if (set_sup[i]) delete [] set_sup[i];            set_sup[i] = NULL;         }         mem_used -= (Graph::numF1-i-1) * itsz;      }   }      if (use_char_extl2){      close(isetfd);      unlink(TMPF);      delete [] cset_sup;   }   else delete [] set_sup;   delete invDB;}void get_file_l2(char *fname, int &l2cnt){   int *cntary;   int fd = open(fname, O_RDONLY);   if (fd < 1){      perror("can't open l2 file");      exit(errno);   }      int flen = lseek(fd,0,SEEK_END);   if (flen > 0){#ifdef DEC      cntary = (int *) mmap((char *)NULL, flen, PROT_READ,                             (MAP_FILE|MAP_VARIABLE|MAP_PRIVATE), fd, 0);#else      cntary = (int *) mmap((char *)NULL, flen, PROT_READ,                             MAP_PRIVATE, fd, 0);#endif      if (cntary == (int *)-1){         perror("MMAP ERROR:cntary");           exit(errno);      }            // build F2graph -- large 2-itemset relations      int lim = flen/ITSZ;      //for (int i=0; i < lim; i += 3){      int i,j,k,res;      int git[2];      for (j=0, i=0; j < Graph::numF1 && i < lim;j++){         for (k=j+1; k < Graph::numF1 && i < lim; k++){            git[0] = (*F2Graph)[j]->item();            git[1] = (*F2Graph)[k]->item();            //if (git[0] < cntary[i]) res = -1;            //else if (git[0] > cntary[i]) res = 1;            //else{            //   if (git[0] < cntary[i+1]) res = -1;            //   else if (git[0] > cntary[i+1]) res = 1;            //   else res = 0;            //}                        res = cmp2it(&git[0], &cntary[i]);            if (res == 0){               if (cntary[i+2] >= MINSUPPORT){                  //F2Graph->add_adj(j, cntary[i+1], cntary[i+2]);                  F2Graph->add_adj(j, k, cntary[i+2]);                  (*F2Graph)[j]->supsum() += cntary[i+2];                  (*F2Graph)[k]->supsum() += cntary[i+2];                  //cout << "LARGE " << cntary[i] << " " << cntary[i+1]                  //     << " " << cntary[i+2] << endl;                  l2cnt++;               }               i += 3;            }            else if (res > 0){               i += 3;               k--;            }         }      }            munmap((caddr_t)cntary, flen);   }   close(fd);}int make_l2_pass(boolean ext_l2_pass, char *it2f){   int i;   int l2cnt=0;   if (ext_l2_pass) get_ext_l2(l2cnt);   else get_file_l2(it2f, l2cnt);   //cout << "L2 : " << l2cnt << endl;      for (i=0; i < F2Graph->size(); i++){      if ((*F2Graph)[i]->garray() != NULL) (*F2Graph)[i]->compact();      //cout << "NODE " << i << " " << (*F2Graph)[i]->supsum() << " : ";      //if ((*F2Graph)[i]->garray() != NULL) cout << " " << *(*F2Graph)[i];      //cout << endl;   }   //F2Graph->sort();   //for (i=0; i < F2Graph->size(); i++){   //   if ((*F2Graph)[i]->garray() != NULL) (*F2Graph)[i]->compact();   //   //cout << (*F2Graph)[i]->item();   //   //cout << "NODE " << i << " : ";   //   //if ((*F2Graph)[i]->array() != NULL) cout << " " << *(*F2Graph)[i];   //   //cout << endl;   //}   //cout << "GRAPH DENSITY " <<   //   (2.0 *l2cnt)/(F2Graph->size()*(F2Graph->size()-1)) << endl;   return l2cnt;}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -