⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 dci.cc

📁 this will produce the dci program in the src/ directory, or in /path/to/your/installation/bin if y
💻 CC
📖 第 1 页 / 共 3 页
字号:
// Copyright (C) 2003 salvatore orlando <salvatore.orlando@unive.it>//  // This program is free software; you can redistribute it and/or modify// it under the terms of the GNU General Public License as published by// the Free Software Foundation; either version 2 of the License, or// (at your option) any later version.// // This program is distributed in the hope that it will be useful,// but WITHOUT ANY WARRANTY; without even the implied warranty of// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the// GNU General Public License for more details.// // You should have received a copy of the GNU General Public License// along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.#include <iostream>#include <vector>#include <cmath>using namespace std;#include <stdio.h>#include "items.hh"#include "transaction.hh"#include "database.hh"#include "candidates.hh"#include "frequents.hh"#include "utils.hh"#include "direct_count.hh"#include "vertical.hh"#define TEMPORARY_DATASET "dataset.tmp"#define PRUNE_FACTOR_PROJ       0.04#define PRUNE_FACTOR_NEWLIST    0.8#define PRUNE_FACTOR_COMPLEXITY 2char OUTF[128] = ""; // output filebool write_output;   // dump frequent itemsets to file ? int first_scan(Data *d, dci_items& counters, unsigned int& max_trans_len);template <class T, class T1>void following_scans(int max_trans_len, dci_items& counters);template <class T, class T1>set_of_frequents<T,T1> *second_scan(int max_trans_len, dci_items& counters, 				    cand_2_iter<T1>& c, 				    DCI_vertical_dataset<T>& DCI_dataset);template <class T, class T1>void DCP_iter(int iter, int max_trans_len, 	      dci_items& counters, 	      DCP_candidates<T,T1>& c, set_of_frequents<T,T1>& next_freq,	      DCI_vertical_dataset<T>& DCI_dataset);template <class T, class T1>void DCI_iter_diffuse(int iter, dci_items& counters, 		     set_of_frequents<T,T1>& freq, 		     set_of_frequents<T,T1>& next_freq, 		     DCI_vertical_dataset<T>& DCI_dataset);template <class T, class T1>void DCI_iter_diffuse_key(int iter, dci_items& counters, 		     set_of_frequents<T,T1>& freq, 		     set_of_frequents<T,T1>& next_freq, 		     DCI_vertical_dataset<T>& DCI_dataset);template <class T, class T1>void DCI_iter_compact(int iter, dci_items& counters, 		    set_of_frequents<T,T1>& freq, 		    set_of_frequents<T,T1>& next_freq, 		    DCI_vertical_dataset<T>& DCI_dataset);template <class T, class T1>void DCI_iter_compact_key(int iter, dci_items& counters, 		    set_of_frequents<T,T1>& freq, 		    set_of_frequents<T,T1>& next_freq, 		    DCI_vertical_dataset<T>& DCI_dataset);int main(int argc, char ** argv){  Chronos all_time;  all_time.StartChronos();  if(argc < 3 || argc > 4) {    cerr << "Usage: " << argv[0] <<       " input_file min_count [output_file]" << endl;    return 1;  }    unsigned int min_count;  min_count = atoi(argv[2]);  if (min_count <= 0) {    cerr << "Usage: " << argv[0] << " input_file min_count" << endl;    cerr << "        Value of min_count not allowed (min_count > 0)\n";    return 2;  }  dci_items counters(min_count);  // counters for singleton  Data *d;                        // database object used only for the   d = new Data(argv[1]);          // first scan  if (!d->isOpen()) {    cerr << "input file " << argv[1] 	 << " cannot be opened!" << endl;    exit(1);  }      if (argc == 4) {    sprintf(OUTF, "%s", argv[3]);    write_output = true;  }    else    write_output = false;    // **************************************************  // First iteration   // **************************************************  unsigned int max_support;  // the same as nr_of_trans  unsigned int max_trans_len;  max_support = first_scan(d, counters, max_trans_len);  // m1 is the number of items (singleton)   // that are found to be frequent  int m1 = counters.get_m1();  delete d;  if (max_support == 0 || m1 < 2) {    // there is nothing to mine...    cout << "Total time: " << all_time.ReadChronos() << endl;    return 0;  }    // **************************************************  // Second and Following iterations   // since we now know how many distinct items are frequent  // and their maximum support, we can optimize the amount  // of memory used to store itemsets and their counters  // **************************************************    m1++; // we need one extra element to store the key-pattern flag (-1)  if (m1 < 256   &&    max_support < 256)    following_scans<unsigned char, unsigned char>(max_trans_len, counters);  else if (m1 < 256  && max_support < 256*256)    following_scans<unsigned char, unsigned short int>(max_trans_len, counters);    else if (m1 < 256  && max_support >= 256*256)    following_scans<unsigned char, unsigned int>(max_trans_len, counters);    else if (m1 < 256*256  && max_support < 256)    following_scans<unsigned short int, unsigned char>(max_trans_len, counters);    else if (m1 >= 256*256  && max_support < 256)    following_scans<unsigned int, unsigned char>(max_trans_len, counters);    else if (m1 < 256*256  && max_support < 256*256)    following_scans<unsigned short int, unsigned short int>(max_trans_len, counters);  else if (m1 < 256*256  && max_support >= 256*256)    following_scans<unsigned short int, unsigned int>(max_trans_len, counters);  else if (m1 >= 256*256 && max_support < 256*256)    following_scans<unsigned int, unsigned short int>(max_trans_len, counters);  else    following_scans<unsigned int, unsigned int>(max_trans_len, counters);#ifdef VERBOSE	  cout << "Total time: " << all_time.ReadChronos() << endl;#endif  unlink(TEMPORARY_DATASET);  return 0;}// First iteration: counting frequent items	int first_scan(Data *d, dci_items& counters, unsigned int& max_trans_len){  Chronos time;  time.StartChronos();  float min_supp;  bool create=true;  // create a temporary binary representation of the dataset  // on disk. we will keep writing on the same file during   // all subsequent scans  binFSout<unsigned int> oo(TEMPORARY_DATASET, create);  if(!oo.isOpen()) {    cerr << TEMPORARY_DATASET << " could not be opened!" << endl;    exit(1);  }  int totalsize=0;  max_trans_len = 0;  vector<unsigned int> t;  // counting loop  while(d->getNextTransaction(t) != 0) {    // read next transaction t from db    for(unsigned int i=0; i<t.size(); i++)  // for each item in t ...      counters.insert_item(t[i]);           // ... increase corresp. counter    if (t.size() > max_trans_len)           // keep track of max trans length      max_trans_len = t.size();     oo.writeTransaction(t);                 // write t to temp file        totalsize += t.size();                  // temp file size  }    min_supp = 100.0 * (double) counters.min_count /    (double) oo.get_num_of_trans();  counters.set_num_of_trans(oo.get_num_of_trans());  //  remap item identifiers to 1:m1  int first_freq = counters.remap_items(write_output);  if (write_output) { // dump frequents to file     FSout o(OUTF, 1); // Iter=1    if(!o.isOpen()) {      cerr << OUTF << " could not be opened for writing!" << endl;      exit(1);    }    char count_print[32];    // write empty itemset    int num_written = sprintf(count_print, "(%d)\n", oo.get_num_of_trans());    o.printSet(count_print, num_written);    //  output frequent items     for (unsigned int i=first_freq; i < counters.get_m(); i++) {           o.printSet(counters.unmap_ascii[i-first_freq], 		 counters.unmap_ascii_len[i-first_freq]);      num_written = sprintf(count_print, "(%d)\n", 			    (int) counters.get_count(i));      o.printSet(count_print, num_written);  // print for each suffix[i]    }  }    counters.delete_counts();#ifdef VERBOSE    cout << "# Database statistics:\n#\t"        << oo.get_num_of_trans() <<" transactions\n#\t"        << counters.get_m() << " different items\n#\t"       << "Max transaction length = " << max_trans_len <<"\n#\n# ";  cout << "Min support = " << min_supp << "%    "       << "Min count   = " << counters.min_count << endl << endl;  print_statistics("DCP", 1, counters.get_m(), counters.get_m1(), time.ReadChronos());#else  printf("1\n%d\n",counters.get_m1()); // 1 is the empty set#endif  return(counters.get_max_supp());}// second and following iterations: // the second with direct count, the others with DCP or DCItemplate <class T, class T1>void following_scans(int max_trans_len, dci_items& counters){  set_of_frequents<T, T1> *freq;         // frequent itemsets  int k=2;                               // freq. itemset size   DCI_vertical_dataset<T> DCI_dataset;   // vertical dataset  bool DCI = false;                      // DCI or DCP?  unsigned int max_count;                   // Check if the in-core vertical dataset VD can be created  if (DCI_dataset.VD_can_be_allocated(counters.get_m1(),				      counters.get_num_of_trans())) {    DCI_dataset.init_VD();    DCI = true;  }  max_count = counters.get_max_supp();  // ------------------------------------  // Second iteration using a prefix table  // ------------------------------------  cand_2_iter<T1>   *c;                  // candidates for 2nd iteration  c = new cand_2_iter<T1>(counters.get_m1());  freq = second_scan<T, T1>(max_trans_len, counters, *c, DCI_dataset);  if (freq == NULL)    return;    if (freq->get_num_frequents() < 3) {    // there is nothing else to mine ...    delete freq;    return;  }  // ------------------------------------  // third and following iterations  // ------------------------------------   DCP_candidates<T,T1> *cand;  cand = new DCP_candidates<T,T1>(k); // allocate cand the first time  // Iterate with DCP until DCI can start  while (!DCI) {     k++;      // increment iter counter    // Check if the in-core vertical dataset VD can be created    if (DCI_dataset.VD_can_be_allocated(counters.get_mk(), 					counters.get_num_of_trans())) {      // next iter with DCI       DCI_dataset.init_VD();      DCI = true;     }          gen_candidates<T, T1>(*freq, *cand, counters, k, *c);    if (cand->get_num_candidates() == 0) {      cout << "no more candidates !\n";      return;    }    if (k==3)       delete c;          DCP_iter<T, T1>(k, max_trans_len, counters, *cand, *freq, DCI_dataset);    if (freq->get_num_frequents() < k+1) {      // nothing to mine ...      delete cand;      delete freq;      return;    }  }  delete cand; // DCI doesn't use candidates  // ---------------------------------------------  // Iterations with DCI, i.e. using vertical dataset   // and intersections.  // ---------------------------------------------  // check the vertical dataset to choose between   // sparse or dense optimizations  // ALSO reorder the columns of the vertical dataset!!  DCI_dataset.chk_compact_vertical(counters.get_mk());  // initialize key-pattern flags  freq->init_keys();  // create a set of frequent itemsets   set_of_frequents<T, T1> *next_freq = new set_of_frequents<T, T1>(k);   set_of_frequents<T, T1> *tmp_freq; // pointer used for swapping  // decide if it is convenient to use the   // key-pattern optimization or not   bool use_keys = counters.use_key_patterns();  while(1) {    k++;      // increment iter counter    if (use_keys) {       // few key patterns are expected      // it is faster to try to infer itemsets       // supports from non-key patterns      DCI_iter_compact_key<T, T1>(k, counters, *freq, *next_freq, DCI_dataset);    } else {       // too many key patterns are expected      // it is faster to *count* itemset supports      // but we can still check if the dataset       // is compact or not.      if (DCI_dataset.is_compact())	DCI_iter_compact<T, T1>(k, counters, *freq, *next_freq, DCI_dataset);      else	DCI_iter_diffuse<T, T1>(k, counters, *freq, *next_freq, DCI_dataset);    }    if (next_freq->get_num_frequents() < k) {      // nothing to mine      delete freq;      delete next_freq;      return;    }    else {      // swap sets of frequent itemsets       tmp_freq = freq;      freq = next_freq;      next_freq = tmp_freq;    }  }    }template <class T, class T1>set_of_frequents<T,T1> *second_scan(int max_trans_len, dci_items& counters, 				    cand_2_iter<T1>& c, 				    DCI_vertical_dataset<T>& DCI_dataset)  // Second iteration with direct count of 2-itemsets.   // If possible (enough memory) builds VD on the fly  // for third and subsequent iterations{  Chronos time;  time.StartChronos();   binFSin<unsigned int> ii(TEMPORARY_DATASET);  if(!ii.isOpen()) {    cerr << TEMPORARY_DATASET << " could not be opened!" << endl;    exit(1);  }    bool create=false;  binFSout<T> oo(TEMPORARY_DATASET, create);  if(!oo.isOpen()) {    cerr << TEMPORARY_DATASET << " could not be opened!" << endl;    exit(1);  }  int m1 = counters.get_m1();  Transaction<unsigned int> t_in(max_trans_len);  Transaction<T> t_out(max_trans_len);    // 2nd database scan  // direct count of 2-itemsets  unsigned int n_tr = 0;  while(ii.getNextTransaction(t_in)) {    prune_and_map_and_ord_first_iter(t_in, t_out, counters);    if (t_out.length >= 2) {      int x0;      int index_init;      for (int t0=0; t0 < (int) t_out.length-1; t0++) {	x0 = (int) t_out.t[t0];	index_init = direct_position2_init(x0, m1);	for (int t1=t0+1; t1 < (int) t_out.length; t1++)	  c.incr_cand_count(index_init + (int) t_out.t[t1]);      }       if (DCI_dataset.VD_is_allocated()) { 	// write the trans in VD on the fly	DCI_dataset.write_t_in_VD(n_tr, t_out);	n_tr++;      }      else 	// write Dk	oo.writeTransaction(t_out);          }  }      // output frequent 2-itemsets and set global pruning mask

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -