📄 ClosureOpt.java
/*
  ClosureOpt.java

  (P)2001 Laurentiu Cristofor
*/

/*
laur.dm.ar - A Java package for association rule mining
Copyright (C) 2002 Laurentiu Cristofor

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
USA

The laur.dm.ar package was written by Laurentiu Cristofor
(laur@cs.umb.edu).
*/

package laur.dm.ar;

import java.util.*;
import java.io.IOException;

/**
   This class implements the Closure algorithm for finding frequent
   itemsets. This implementation is based on Closure.java and it
   contains a few optimizations.<P>

   (see "Galois Connections and Data Mining" by Dana Cristofor,
   Laurentiu Cristofor, and Dan Simovici from UMass/Boston 2000)

   @version 1.0
   @author Laurentiu Cristofor
*/
public class ClosureOpt extends FrequentItemsetsMiner
{
    private static final int INITIAL_CAPACITY = 10000;

    // our collections of itemsets
    private ArrayList candidates;
    private ArrayList k_frequent;
    /// private ArrayList frequent;

    // the counts matrix
    private long[][] counts;

    // the hashtrees associated with candidates and k_frequent
    private HashTree ht_candidates;
    private HashTree ht_k_frequent;

    // the database reader and the cache writer used by findFrequentItemsets
    private DBReader db_reader;
    private DBCacheWriter cache_writer;

    // this remembers the number of passes over the dataset
    private int pass_num;

    // this keeps track of the current step, which also represents
    // the cardinality of the candidate itemsets
    private int step;

    // useful information
    private long num_rows;
    private int num_cols;
    private long min_weight;

    /**
     * Find the frequent itemsets in a database.
     *
     * @param dbReader      the object used to read from the database
     * @param cacheWriter   the object used to write to the cache;
     *                      if this is null, then nothing will be saved,
     *                      which is useful for benchmarking
     * @param minSupport    the minimum support
     * @return the number of passes executed over the database
     */
    public int findFrequentItemsets(DBReader dbReader,
                                    DBCacheWriter cacheWriter,
                                    double minSupport)
    {
        // save the following into member fields
        db_reader = dbReader;
        cache_writer = cacheWriter;

        num_rows = dbReader.getNumRows();
        num_cols = (int)dbReader.getNumColumns();
        min_weight = (long)(num_rows * minSupport);

        // initialize the collections
        candidates = new ArrayList(INITIAL_CAPACITY);
        k_frequent = new ArrayList(INITIAL_CAPACITY);
        /// frequent = new ArrayList(INITIAL_CAPACITY);

        // initialize the hash tree
        ht_k_frequent = new HashTree(k_frequent);

        // the initial candidates are all 1-itemsets
        Itemset is;
        for (int i = 1; i <= num_cols; i++)
        {
            is = new Itemset(1);
            is.add(i);

            candidates.add(is);
        }

        // we start with the first pass
        for (step = 1, pass_num = 1; ; step++, pass_num++)
        {
            // reinitialize the counts matrix
            counts = new long[candidates.size()][num_cols];

            // check for user-requested abort
            checkAbort();

            // compute the weight of each candidate
            if (step == 1)
                // during step 1 we do not need to use a hashtree since it
                // is trivial to determine whether an itemset of size 1
                // is or is not contained in a row
                weighCandidatesWithoutHashTree();
            else
                weighCandidates();

            // check for user-requested abort
            checkAbort();

            // during this step, if candidates are of cardinality k,
            // we will also find all (k+1)-frequent itemsets and place
            // them in k_frequent. Thus we skip one pass over the
            // dataset compared to the Apriori approach.
            evaluateCandidates();

            // we increment the step according to the above optimization
            step++;

            // if the last pass didn't produce any frequent itemsets,
            // then we can stop the algorithm
            if (k_frequent.size() == 0)
                break;

            // if we also examined the top itemset (the one containing
            // all items), then we're done, nothing more to do
            if (step >= db_reader.getNumColumns())
                break;

            // check for user-requested abort
            checkAbort();

            // generate new candidates from frequent itemsets
            generateCandidates();

            // exit if no more candidates
            if (candidates.size() == 0)
                break;
        }

        return pass_num;
    }

    // this procedure performs the first scan of the database
    // and computes the weight of each candidate
    private void weighCandidatesWithoutHashTree()
    {
        try
        {
            Itemset row = db_reader.getFirstRow();
            // we also update the counts table here
            for (int i = 0; i < row.size(); i++)
                for (int j = 0; j < row.size(); j++)
                    counts[row.get(i) - 1][row.get(j) - 1]++;

            while (db_reader.hasMoreRows())
            {
                row = db_reader.getNextRow();
                // we also update the counts table here
                for (int i = 0; i < row.size(); i++)
                    for (int j = 0; j < row.size(); j++)
                        counts[row.get(i) - 1][row.get(j) - 1]++;
            }

            // update weights for candidates according to the
            // information collected in counts[][]
            for (int i = 0; i < candidates.size(); i++)
                ((Itemset)candidates.get(i)).setWeight(counts[i][i]);
        }
        catch (Exception e)
        {
            System.err.println("Error scanning database!!!\n" + e);
        }
    }

    // this procedure scans the database and computes the weight of each
    // candidate
    private void weighCandidates()
    {
        ht_candidates.prepareForDescent();

        try
        {
            Itemset row = db_reader.getFirstRow();
            // we also update the counts table here
            ht_candidates.update(row, counts);

            while (db_reader.hasMoreRows())
            {
                row = db_reader.getNextRow();
                // we also update the counts table here
                ht_candidates.update(row, counts);
            }
        }
        catch (Exception e)
        {
            System.err.println("Error scanning database!!!\n" + e);
        }
    }

    // this procedure checks to see which itemsets are frequent
    private void evaluateCandidates()
    {
        Itemset is, is_cl;

        for (int i = 0; i < candidates.size(); i++)
            // if this is a frequent itemset
            if ((is = (Itemset)candidates.get(i)).getWeight() >= min_weight)
            {
                // compute the support of the itemset
                is.setSupport((double)is.getWeight() / (double)num_rows);

                // write the itemset to the cache
                try
                {
                    if (cache_writer != null)
                        cache_writer.writeItemset(is);
                }
                catch (IOException e)
                {
                    System.err.println("Error writing cache!!!\n" + e);
                }

                // then add it to the frequent collection
                /// frequent.add(is);

                // now look for closures of this itemset: we want to add
                // to this itemset new items that appear at least
                // min_weight times together with it.
                // we start looking for such items from the next item
                // after the last item of this itemset (this is in
                // accordance with the way we generate candidates)
                for (int item = is.get(is.size() - 1) + 1; item <= num_cols; item++)
                    if (counts[i][item - 1] >= min_weight)
                    {
                        // create a new itemset
                        is_cl = new Itemset(is);
                        is_cl.add(item);

                        // set the weight and support of this new itemset
                        is_cl.setWeight(counts[i][item - 1]);
                        is_cl.setSupport((double)is_cl.getWeight() / (double)num_rows);

                        // write the itemset to the cache
                        try
                        {
                            if (cache_writer != null)
                                cache_writer.writeItemset(is_cl);
                        }
                        catch (IOException e)
                        {
                            System.err.println("Error writing cache!!!\n" + e);
                        }

                        // add it to the frequent and k_frequent collections
                        /// frequent.add(is_cl);
                        k_frequent.add(is_cl);
                        ht_k_frequent.add(k_frequent.size() - 1);
                    }
            }

        // reinitialize candidates for the next step
        candidates.clear();
        ht_candidates = new HashTree(candidates);
    }

    // this procedure generates new candidates out of itemsets
    // that are frequent, following the procedure described
    // by Agrawal et al.
    private void generateCandidates()
    {
        ht_k_frequent.prepareForDescent();

        if (k_frequent.size() == 0)
            return;

        for (int i = 0; i < k_frequent.size() - 1; i++)
            for (int j = i + 1; j < k_frequent.size(); j++)
                if (!getCandidate(i, j))
                    break;

        // reinitialize k_frequent for the next step
        k_frequent.clear();
        ht_k_frequent = new HashTree(k_frequent);
    }

    // this procedure tries to combine itemsets i and j and returns
    // true if successful, false if it can't combine them
    private boolean getCandidate(int i, int j)
    {
        Itemset is_i = (Itemset)k_frequent.get(i);
        Itemset is_j = (Itemset)k_frequent.get(j);

        // if we cannot combine element i with j, then we shouldn't
        // waste time on bigger j's. This is because we keep the
        // collections ordered, an important detail in this implementation
        if (!is_i.canCombineWith(is_j))
            return false;
        else
        {
            Itemset is = is_i.combineWith(is_j);

            // a real k-frequent itemset has k (k-1)-frequent subsets
            if (ht_k_frequent.countSubsets(is) == is.size())
            {
                candidates.add(is);
                ht_candidates.add(candidates.size() - 1);
            }

            return true;
        }
    }
}
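For reference, here is a minimal usage sketch showing how the class above might be driven. It relies only on the public method shown in the listing (findFrequentItemsets); how a DBReader and a DBCacheWriter are opened is not part of this file, so the sketch takes them as already-constructed arguments rather than assuming any particular constructor. The class name ClosureOptDemo and the 5% minimum support are illustrative choices, not taken from the package.

// ClosureOptDemo.java - illustrative sketch only; assumes an opened DBReader
// and DBCacheWriter are supplied by the caller.
import laur.dm.ar.*;

public class ClosureOptDemo
{
    // runs ClosureOpt at a 5% minimum support and reports the pass count
    public static int mine(DBReader dbReader, DBCacheWriter cacheWriter)
    {
        ClosureOpt miner = new ClosureOpt();

        // passing null instead of cacheWriter would skip saving the frequent
        // itemsets (useful for benchmarking, per the javadoc above)
        int passes = miner.findFrequentItemsets(dbReader, cacheWriter, 0.05);

        System.out.println("ClosureOpt finished in " + passes
                           + " passes over the database");
        return passes;
    }
}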