📄 vertical.cc

📁 DCI 算法源代码 DCI: A hybrid Algorithm for Frequent Set Counting
💻 CC
📖 第 1 页 / 共 2 页
字号:
12 下一页
// Copyright (C) 2003 salvatore orlando <salvatore.orlando@unive.it>//  // This program is free software; you can redistribute it and/or modify// it under the terms of the GNU General Public License as published by// the Free Software Foundation; either version 2 of the License, or// (at your option) any later version.// // This program is distributed in the hope that it will be useful,// but WITHOUT ANY WARRANTY; without even the implied warranty of// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the// GNU General Public License for more details.// // You should have received a copy of the GNU General Public License// along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.#include "memory.hh"#include "vertical.hh"template class DCI_vertical_dataset<unsigned int>;template class DCI_vertical_dataset<unsigned short int>;template class DCI_vertical_dataset<unsigned char>;template <class T>inline bool DCI_vertical_dataset<T>::candidate_is_frequent_diffuse(T *cand, 			     int prefix_len,			     int iter, 			     int min_count, 			     int& count, 			     DCI_statistics& stats,			     bool pruning){  unsigned int *pp, *pp1;  unsigned int *curr_cache;  if ((prefix_len == 0) || (prefix_len == 1)) {     // cache content cannot be reused, perform     // first intersection and store it in cache    // remember runs of 0's in and_index    curr_cache = CACHE_tidlist + tid_list_size;    pp = VD + (cand[0] * tid_list_size);    pp1 = VD + (cand[1] * tid_list_size);    and_tid_list_set_index(curr_cache, pp, pp1, and_index, tid_list_size);    stats.n_and_reduced = and_index[1];    if (pruning) {      stats.nn_and++;      stats.num_and += tid_list_size;    }    prefix_len = 2;  }  curr_cache = CACHE_tidlist + (prefix_len * tid_list_size);    if (stats.n_and_reduced < 0.9 * tid_list_size) {    for (int k = prefix_len; k < iter - 1; k++) {      // performs intersections by exploiting       // runs of 0's stored in and_index      pp = VD + (cand[k] * tid_list_size);      and_tid_list_use_index(curr_cache, curr_cache - tid_list_size, pp, 			     and_index, tid_list_size);      curr_cache += tid_list_size;    }    // performs last intersection  by exploiting     // runs of 0's stored in and_index.    // Counts 1's while intersecting    pp = VD + (cand[iter - 1] * tid_list_size);    count = and_tid_list_count_1_use_index(curr_cache, 					   curr_cache - tid_list_size,  					   pp, and_index, tid_list_size);        if (pruning) {      stats.n_and_index += (iter-prefix_len);      stats.num_and += (iter-prefix_len)*stats.n_and_reduced;    }  } else {    for (int k = prefix_len; k < iter - 1; k++) {      // performs intersections by exploiting       // runs of 0's stored in and_index            pp = VD + (cand[k] * tid_list_size);      and_tid_list(curr_cache, curr_cache - tid_list_size, pp, tid_list_size);      curr_cache += tid_list_size;    }    // performs last intersection  by exploiting     // runs of 0's stored in and_index.    // Counts 1's while intersecting    pp = VD + (cand[iter - 1] * tid_list_size);    count = and_tid_list_count_1(curr_cache, curr_cache - tid_list_size,				 pp, tid_list_size);    if (pruning) {      stats.nn_and += (iter-prefix_len);      stats.num_and += (iter-prefix_len)*tid_list_size;    }  }       if (count >= min_count) {    // the candidate is frequent !    // set pruning and reordering masks    //for (int i = 0; i < iter; i++)    //  flag_item[cand[i]] = 1;    //first_items[cand[0]] += 1;    if (pruning) {      if (stats.n_and_reduced < 0.9 * tid_list_size)	or_tid_list_use_index(prune_mask, prune_mask,  			      CACHE_tidlist + ((iter - 1) * tid_list_size), 			      and_index, tid_list_size);      else	or_tid_list(prune_mask, prune_mask,  		    CACHE_tidlist + ((iter - 1) * tid_list_size), 		    tid_list_size);	      stats.n_or_index++;    }    return true;  }  else     return false;}template <class T> inline bool DCI_vertical_dataset<T>::candidate_is_frequent_compact(T *cand, 						     int prefix_len,						     int iter, 						     int min_count, 						     int& count, 						     DCI_statistics& stats){  unsigned int *pp, *pp1;  bool use_section = false;    if (prefix_len == 0) {    if (is_included(cand[0])) {       // all items in the EQ sect !      num_bit_set_init = n_bit_set_eq;      use_section = true;    }    prefix_len++;  }   else {    if (is_included(cand[prefix_len - 1]))      // almost an item in the EQ sect !      use_section = true;  }  unsigned int *curr_cache;  if (prefix_len == 1) {    curr_cache = CACHE_tidlist + tid_list_size;        if (!use_section) {      pp = VD + (cand[0] * tid_list_size);      pp1 = VD + (cand[1] * tid_list_size);      if (!is_included(cand[1])) { 	// both are NOT included in the EQ section: normal intersection	and_tid_list(curr_cache, pp, pp1, tid_list_size);      } else { 	// the second is included in the EQ section	// intersect and count up to sz_sect_eq 	num_bit_set_init = and_tid_list_count_1(curr_cache, pp, pp1, 						sz_sect_eq); 	  	use_section = true;	and_tid_list_use_section(curr_cache, pp, pp1, 				 sz_sect_eq, tid_list_size);      }    } else {      pp = VD + (cand[0] * tid_list_size);      pp1 = VD + (cand[1] * tid_list_size);            and_tid_list_use_section(curr_cache, pp,pp1, sz_sect_eq, tid_list_size);    }    prefix_len++;  }    int k = prefix_len;  curr_cache = CACHE_tidlist + (k * tid_list_size);  if (!use_section) {        for (; k < iter - 1; k++) {      pp = VD + (cand[k] * tid_list_size);          if (!is_included(cand[k])) { 	// cand[k] is not included in the EQ section	and_tid_list(curr_cache, curr_cache - tid_list_size, 		     pp, tid_list_size);	curr_cache += tid_list_size; 	// increment cache pointer      } else { 	// cand[k] is the fisrt included in the EQ section	num_bit_set_init = and_tid_list_count_1(curr_cache, 						curr_cache - tid_list_size, 						pp, sz_sect_eq); 	// partial counting !!!!!!!	use_section = true;	break;      }    }  }  if (use_section) {      for (; k < iter - 1; k++) {	pp = VD + (cand[k] * tid_list_size);	and_tid_list_use_section(curr_cache, curr_cache - tid_list_size, 				 pp, sz_sect_eq, tid_list_size);	curr_cache += tid_list_size; // increment cache pointer      }  }  // last intersection  // Counts 1's while intersecting  pp = VD + (cand[iter - 1] * tid_list_size);  if (use_section) {    count = and_tid_list_count_1_use_section(curr_cache, 					     curr_cache - tid_list_size,					     pp, sz_sect_eq, tid_list_size);    count += num_bit_set_init;   } else {    count = and_tid_list_count_1(curr_cache, curr_cache - tid_list_size,				 pp, tid_list_size);  }      if (count >= min_count)     // cand is frequent !    return true;  else     return false;}template <class T>int DCI_vertical_dataset<T>::chk_compact_vertical(int n_frequent_items)  // heuristic used to determine if in the vertical dataset  // there is a region where tid lists resamble one with the other  //   // the area of the region of similarity must be above a given   // threshold for the dataset to be dense. {  int start, end, cc=0, perc=0;  unsigned int *mask, *last_row, *result;  float perc_eq_limit[] = { 95.0, 90.0, 85.0, 80.0, 75.0, 			    70.0, 65.0, 60.0};  int max_perc_ind = sizeof(perc_eq_limit) / sizeof(float);  int curr_perc = 0;  int curr_perc_OK = 0;    int start_OK, perc_OK;  mask = new unsigned int [tid_list_size];  result = new unsigned int [tid_list_size];  for (unsigned int i=0; i<tid_list_size; i++)    mask[i] = 0xffffffff;    end = n_frequent_items;    last_row = &VD[(end-1) * tid_list_size];  start_OK = end-1;  perc_OK = 0;  start = end-2;   while(start >= 0) {    // ----------------------------------------------------    // find the percentage of identical elements in     // the last start-end tidlists (most frequent items).    float perc_tmp;    int cc_tmp;    bzero(result, tid_list_size*sizeof(unsigned int)); 	    for (unsigned int i=0; i<tid_list_size; i++) {      result[i] = ~(VD[start * tid_list_size + i] ^ last_row[i]);      result[i] = (result[i] & mask[i]);    }    cc_tmp = count_1_bits(result, tid_list_size);    perc_tmp =  100.0 * (float) cc_tmp /       (float) (tid_list_size*sizeof(int)*8);    // ----------------------------------------------------        bool found = false;    // ----------------------------------------------------    // now test all percentage range limits     // given in perc_eq_limit    // ----------------------------------------------------    while (curr_perc < max_perc_ind) {      if (perc_tmp >= perc_eq_limit[curr_perc]) {	// we found the percentage range	// we store the partial result in mask	// and go on with another item	memcpy((void *) mask, (void *) result, tid_list_size*sizeof(int));	cc=cc_tmp;	perc_OK = (int) perc_tmp;	start_OK = start;	found = true;	curr_perc_OK = curr_perc;	start--;	break;      } else if ((end-start) < n_frequent_items/7)	// percentage is lower	// we consider another range if the number of 	// tidlists considered so far is not too big	curr_perc++;      else	// percentage is low and there are no more enough 	// items to consider. 	break;    }
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -