📄 vertical.cc
字号:
// Copyright (C) 2003 salvatore orlando <salvatore.orlando@unive.it>// // This program is free software; you can redistribute it and/or modify// it under the terms of the GNU General Public License as published by// the Free Software Foundation; either version 2 of the License, or// (at your option) any later version.// // This program is distributed in the hope that it will be useful,// but WITHOUT ANY WARRANTY; without even the implied warranty of// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the// GNU General Public License for more details.// // You should have received a copy of the GNU General Public License// along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.#include "memory.hh"#include "vertical.hh"template class DCI_vertical_dataset<unsigned int>;template class DCI_vertical_dataset<unsigned short int>;template class DCI_vertical_dataset<unsigned char>;template <class T>inline bool DCI_vertical_dataset<T>::candidate_is_frequent_diffuse(T *cand, int prefix_len, int iter, int min_count, int& count, DCI_statistics& stats, bool pruning){ unsigned int *pp, *pp1; unsigned int *curr_cache; if ((prefix_len == 0) || (prefix_len == 1)) { // cache content cannot be reused, perform // first intersection and store it in cache // remember runs of 0's in and_index curr_cache = CACHE_tidlist + tid_list_size; pp = VD + (cand[0] * tid_list_size); pp1 = VD + (cand[1] * tid_list_size); and_tid_list_set_index(curr_cache, pp, pp1, and_index, tid_list_size); stats.n_and_reduced = and_index[1]; if (pruning) { stats.nn_and++; stats.num_and += tid_list_size; } prefix_len = 2; } curr_cache = CACHE_tidlist + (prefix_len * tid_list_size); if (stats.n_and_reduced < 0.9 * tid_list_size) { for (int k = prefix_len; k < iter - 1; k++) { // performs intersections by exploiting // runs of 0's stored in and_index pp = VD + (cand[k] * tid_list_size); and_tid_list_use_index(curr_cache, curr_cache - tid_list_size, pp, and_index, tid_list_size); curr_cache += tid_list_size; } // performs last intersection by exploiting // runs of 0's stored in and_index. // Counts 1's while intersecting pp = VD + (cand[iter - 1] * tid_list_size); count = and_tid_list_count_1_use_index(curr_cache, curr_cache - tid_list_size, pp, and_index, tid_list_size); if (pruning) { stats.n_and_index += (iter-prefix_len); stats.num_and += (iter-prefix_len)*stats.n_and_reduced; } } else { for (int k = prefix_len; k < iter - 1; k++) { // performs intersections by exploiting // runs of 0's stored in and_index pp = VD + (cand[k] * tid_list_size); and_tid_list(curr_cache, curr_cache - tid_list_size, pp, tid_list_size); curr_cache += tid_list_size; } // performs last intersection by exploiting // runs of 0's stored in and_index. // Counts 1's while intersecting pp = VD + (cand[iter - 1] * tid_list_size); count = and_tid_list_count_1(curr_cache, curr_cache - tid_list_size, pp, tid_list_size); if (pruning) { stats.nn_and += (iter-prefix_len); stats.num_and += (iter-prefix_len)*tid_list_size; } } if (count >= min_count) { // the candidate is frequent ! // set pruning and reordering masks //for (int i = 0; i < iter; i++) // flag_item[cand[i]] = 1; //first_items[cand[0]] += 1; if (pruning) { if (stats.n_and_reduced < 0.9 * tid_list_size) or_tid_list_use_index(prune_mask, prune_mask, CACHE_tidlist + ((iter - 1) * tid_list_size), and_index, tid_list_size); else or_tid_list(prune_mask, prune_mask, CACHE_tidlist + ((iter - 1) * tid_list_size), tid_list_size); stats.n_or_index++; } return true; } else return false;}template <class T> inline bool DCI_vertical_dataset<T>::candidate_is_frequent_compact(T *cand, int prefix_len, int iter, int min_count, int& count, DCI_statistics& stats){ unsigned int *pp, *pp1; bool use_section = false; if (prefix_len == 0) { if (is_included(cand[0])) { // all items in the EQ sect ! num_bit_set_init = n_bit_set_eq; use_section = true; } prefix_len++; } else { if (is_included(cand[prefix_len - 1])) // almost an item in the EQ sect ! use_section = true; } unsigned int *curr_cache; if (prefix_len == 1) { curr_cache = CACHE_tidlist + tid_list_size; if (!use_section) { pp = VD + (cand[0] * tid_list_size); pp1 = VD + (cand[1] * tid_list_size); if (!is_included(cand[1])) { // both are NOT included in the EQ section: normal intersection and_tid_list(curr_cache, pp, pp1, tid_list_size); } else { // the second is included in the EQ section // intersect and count up to sz_sect_eq num_bit_set_init = and_tid_list_count_1(curr_cache, pp, pp1, sz_sect_eq); use_section = true; and_tid_list_use_section(curr_cache, pp, pp1, sz_sect_eq, tid_list_size); } } else { pp = VD + (cand[0] * tid_list_size); pp1 = VD + (cand[1] * tid_list_size); and_tid_list_use_section(curr_cache, pp,pp1, sz_sect_eq, tid_list_size); } prefix_len++; } int k = prefix_len; curr_cache = CACHE_tidlist + (k * tid_list_size); if (!use_section) { for (; k < iter - 1; k++) { pp = VD + (cand[k] * tid_list_size); if (!is_included(cand[k])) { // cand[k] is not included in the EQ section and_tid_list(curr_cache, curr_cache - tid_list_size, pp, tid_list_size); curr_cache += tid_list_size; // increment cache pointer } else { // cand[k] is the fisrt included in the EQ section num_bit_set_init = and_tid_list_count_1(curr_cache, curr_cache - tid_list_size, pp, sz_sect_eq); // partial counting !!!!!!! use_section = true; break; } } } if (use_section) { for (; k < iter - 1; k++) { pp = VD + (cand[k] * tid_list_size); and_tid_list_use_section(curr_cache, curr_cache - tid_list_size, pp, sz_sect_eq, tid_list_size); curr_cache += tid_list_size; // increment cache pointer } } // last intersection // Counts 1's while intersecting pp = VD + (cand[iter - 1] * tid_list_size); if (use_section) { count = and_tid_list_count_1_use_section(curr_cache, curr_cache - tid_list_size, pp, sz_sect_eq, tid_list_size); count += num_bit_set_init; } else { count = and_tid_list_count_1(curr_cache, curr_cache - tid_list_size, pp, tid_list_size); } if (count >= min_count) // cand is frequent ! return true; else return false;}template <class T>int DCI_vertical_dataset<T>::chk_compact_vertical(int n_frequent_items) // heuristic used to determine if in the vertical dataset // there is a region where tid lists resamble one with the other // // the area of the region of similarity must be above a given // threshold for the dataset to be dense. { int start, end, cc=0, perc=0; unsigned int *mask, *last_row, *result; float perc_eq_limit[] = { 95.0, 90.0, 85.0, 80.0, 75.0, 70.0, 65.0, 60.0}; int max_perc_ind = sizeof(perc_eq_limit) / sizeof(float); int curr_perc = 0; int curr_perc_OK = 0; int start_OK, perc_OK; mask = new unsigned int [tid_list_size]; result = new unsigned int [tid_list_size]; for (unsigned int i=0; i<tid_list_size; i++) mask[i] = 0xffffffff; end = n_frequent_items; last_row = &VD[(end-1) * tid_list_size]; start_OK = end-1; perc_OK = 0; start = end-2; while(start >= 0) { // ---------------------------------------------------- // find the percentage of identical elements in // the last start-end tidlists (most frequent items). float perc_tmp; int cc_tmp; bzero(result, tid_list_size*sizeof(unsigned int)); for (unsigned int i=0; i<tid_list_size; i++) { result[i] = ~(VD[start * tid_list_size + i] ^ last_row[i]); result[i] = (result[i] & mask[i]); } cc_tmp = count_1_bits(result, tid_list_size); perc_tmp = 100.0 * (float) cc_tmp / (float) (tid_list_size*sizeof(int)*8); // ---------------------------------------------------- bool found = false; // ---------------------------------------------------- // now test all percentage range limits // given in perc_eq_limit // ---------------------------------------------------- while (curr_perc < max_perc_ind) { if (perc_tmp >= perc_eq_limit[curr_perc]) { // we found the percentage range // we store the partial result in mask // and go on with another item memcpy((void *) mask, (void *) result, tid_list_size*sizeof(int)); cc=cc_tmp; perc_OK = (int) perc_tmp; start_OK = start; found = true; curr_perc_OK = curr_perc; start--; break; } else if ((end-start) < n_frequent_items/7) // percentage is lower // we consider another range if the number of // tidlists considered so far is not too big curr_perc++; else // percentage is low and there are no more enough // items to consider. break; }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -