apriori.java

来自「一个数据挖掘软件ALPHAMINERR的整个过程的JAVA版源代码」· Java 代码 · 共 999 行 · 第 1/3 页
JAVA
999 行
/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    Apriori.java
 *    Copyright (C) 1999 Eibe Frank,Mark Hall
 *
 */

package weka.associations;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.Reader;
import java.util.Enumeration;
import java.util.Hashtable;

import weka.core.AttributeStats;
import weka.core.FastVector;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.SelectedTag;
import weka.core.Tag;
import weka.core.Utils;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.Remove;

/**
 * Class implementing an Apriori-type algorithm. Iteratively reduces the minimum
 * support until it finds the required number of rules with the given minimum 
 * confidence. <p>
 *
 * Reference: R. Agrawal, R. Srikant (1994). <i>Fast algorithms for
 * mining association rules in large databases </i>. Proc
 * International Conference on Very Large Databases,
 * pp. 478-499. Santiage, Chile: Morgan Kaufmann, Los Altos, CA. <p>
 *
 * Valid options are:<p>
 *   
 * -N required number of rules <br>
 * The required number of rules (default: 10). <p>
 *
 * -T type of metric by which to sort rules <br>
 * 0 = confidence | 1 = lift | 2 = leverage | 3 = Conviction. <p>
 *
 * -C minimum confidence of a rule <br>
 * The minimum confidence of a rule (default: 0.9). <p>
 *
 * -D delta for minimum support <br>
 * The delta by which the minimum support is decreased in
 * each iteration (default: 0.05). <p>
 *
 * -U upper bound for minimum support <br>
 * The upper bound for minimum support. Don't explicitly look for 
 * rules with more than this level of support. <p>
 *
 * -M lower bound for minimum support <br>
 * The lower bound for the minimum support (default = 0.1). <p>
 *
 * -S significance level <br>
 * If used, rules are tested for significance at
 * the given level. Slower (default = no significance testing). <p>
 *
 * -R <br>
 * If set then columns that contain all missing values are removed from
 * the data.
 *
 * -I <br>
 * If set the itemsets found are also output (default = no). <p>
 *
 * @author Eibe Frank (eibe@cs.waikato.ac.nz)
 * @author Mark Hall (mhall@cs.waikato.ac.nz)
 * @version $Revision$ */
public class Apriori extends Associator implements OptionHandler {
  
  /** The minimum support. */
  protected double m_minSupport;

  /** The upper bound on the support */
  protected double m_upperBoundMinSupport;

  /** The lower bound for the minimum support. */
  protected double m_lowerBoundMinSupport;

  /** Metric types. */
  protected static final int CONFIDENCE = 0;
  protected static final int LIFT = 1;
  protected static final int LEVERAGE = 2;
  protected static final int CONVICTION = 3;
  public static final Tag [] TAGS_SELECTION = {
    new Tag(CONFIDENCE, "Confidence"),
    new Tag(LIFT, "Lift"),
    new Tag(LEVERAGE, "Leverage"),
    new Tag(CONVICTION, "Conviction")
      };

  /** The selected metric type. */
  protected int m_metricType = CONFIDENCE;

  /** The minimum metric score. */
  protected double m_minMetric;

  /** The maximum number of rules that are output. */
  protected int m_numRules;

  /** Delta by which m_minSupport is decreased in each iteration. */
  protected double m_delta;

  /** Significance level for optional significance test. */
  protected double m_significanceLevel;

  /** Number of cycles used before required number of rules was one. */
  protected int m_cycles;

  /** The set of all sets of itemsets L. */
  protected FastVector m_Ls;

  /** The same information stored in hash tables. */
  protected FastVector m_hashtables;

  /** The list of all generated rules. */
  protected FastVector[] m_allTheRules;

  /** The instances (transactions) to be used for generating 
      the association rules. */
  protected Instances m_instances;

  /** Output itemsets found? */
  protected boolean m_outputItemSets;

  protected boolean m_removeMissingCols;

  /** Report progress iteratively */
  protected boolean m_verbose;

  /**
   * Returns a string describing this associator
   * @return a description of the evaluator suitable for
   * displaying in the explorer/experimenter gui
   */
  public String globalInfo() {
    return "Finds association rules.";
  }

  /**
   * Constructor that allows to sets default values for the 
   * minimum confidence and the maximum number of rules
   * the minimum confidence.
   */
  public Apriori() {

    resetOptions();
  }

  /**
   * Resets the options to the default values.
   */
  public void resetOptions() {
    
    m_removeMissingCols = false;
    m_verbose = false;
    m_delta = 0.05;
    m_minMetric = 0.90;
    m_numRules = 10;
    m_lowerBoundMinSupport = 0.1;
    m_upperBoundMinSupport = 1.0;
    m_significanceLevel = -1;
    m_outputItemSets = false;
  }

  /**
   * Removes columns that are all missing from the data
   * @param instances the instances
   * @return a new set of instances with all missing columns removed
   */
  private Instances removeMissingColumns(Instances instances) 
    throws Exception {
    int numInstances = instances.numInstances();
    StringBuffer deleteString = new StringBuffer();
    int removeCount = 0;
    boolean first = true;
    int maxCount = 0;
    
    for (int i=0;i<instances.numAttributes();i++) {
      AttributeStats as = instances.attributeStats(i);
      if (m_upperBoundMinSupport == 1.0 && maxCount != numInstances) {
	// see if we can decrease this by looking for the most frequent value
	int [] counts = as.nominalCounts;
	if (counts[Utils.maxIndex(counts)] > maxCount) {
	  maxCount = counts[Utils.maxIndex(counts)];
	}
      }
      if (as.missingCount == numInstances) {
	if (first) {
	  deleteString.append((i+1));
	  first = false;
	} else {
	  deleteString.append(","+(i+1));
	}
	removeCount++;
      }
    }
    if (m_verbose) {
      System.err.println("Removed : "+removeCount+" columns with all missing "
			 +"values.");
    }
    if (m_upperBoundMinSupport == 1.0 && maxCount != numInstances) {
      m_upperBoundMinSupport = (double)maxCount / (double)numInstances;
      if (m_verbose) {
	System.err.println("Setting upper bound min support to : "
			   +m_upperBoundMinSupport);
      }
    }

    if (deleteString.toString().length() > 0) {
      Remove af = new Remove();
      af.setAttributeIndices(deleteString.toString());
      af.setInvertSelection(false);
      af.setInputFormat(instances);
      Instances newInst = Filter.useFilter(instances, af);

      return newInst;
    }
    return instances;
  }

  /**
   * Method that generates all large itemsets with a minimum support, and from
   * these all association rules with a minimum confidence.
   *
   * @param instances the instances to be used for generating the associations
   * @exception Exception if rules can't be built successfully
   */
  public void buildAssociations(Instances instances) throws Exception {

    double[] confidences, supports;
    int[] indices;
    FastVector[] sortedRuleSet;
    int necSupport=0;

    if (instances.checkForStringAttributes()) {
      throw new Exception("Can't handle string attributes!");
    }

    if (m_removeMissingCols) {
      instances = removeMissingColumns(instances);
    }

    // Decrease minimum support until desired number of rules found.
    m_cycles = 0;
    m_minSupport = m_upperBoundMinSupport - m_delta;
    m_minSupport = (m_minSupport < m_lowerBoundMinSupport) 
      ? m_lowerBoundMinSupport 
      : m_minSupport;
    do {

      // Reserve space for variables
      m_Ls = new FastVector();
      m_hashtables = new FastVector();
      m_allTheRules = new FastVector[6];
      m_allTheRules[0] = new FastVector();
      m_allTheRules[1] = new FastVector();
      m_allTheRules[2] = new FastVector();
      if (m_metricType != CONFIDENCE || m_significanceLevel != -1) {
	m_allTheRules[3] = new FastVector();
	m_allTheRules[4] = new FastVector();
	m_allTheRules[5] = new FastVector();
      }
      sortedRuleSet = new FastVector[6];
      sortedRuleSet[0] = new FastVector();
      sortedRuleSet[1] = new FastVector();
      sortedRuleSet[2] = new FastVector();
      if (m_metricType != CONFIDENCE || m_significanceLevel != -1) {
	sortedRuleSet[3] = new FastVector();
	sortedRuleSet[4] = new FastVector();
	sortedRuleSet[5] = new FastVector();
      }

      // Find large itemsets and rules
      findLargeItemSets(instances);
      if (m_significanceLevel != -1 || m_metricType != CONFIDENCE) 
	findRulesBruteForce();
      else
	findRulesQuickly();
      
      // Sort rules according to their support
      supports = new double[m_allTheRules[2].size()];
      for (int i = 0; i < m_allTheRules[2].size(); i++) 
	supports[i] = (double)((ItemSet)m_allTheRules[1].elementAt(i)).support();
      indices = Utils.stableSort(supports);
      for (int i = 0; i < m_allTheRules[2].size(); i++) {
	sortedRuleSet[0].addElement(m_allTheRules[0].elementAt(indices[i]));
	sortedRuleSet[1].addElement(m_allTheRules[1].elementAt(indices[i]));
	sortedRuleSet[2].addElement(m_allTheRules[2].elementAt(indices[i]));
	if (m_metricType != CONFIDENCE || m_significanceLevel != -1) {
	  sortedRuleSet[3].addElement(m_allTheRules[3].elementAt(indices[i]));
	  sortedRuleSet[4].addElement(m_allTheRules[4].elementAt(indices[i]));
	  sortedRuleSet[5].addElement(m_allTheRules[5].elementAt(indices[i]));
	}
      }

      // Sort rules according to their confidence
      m_allTheRules[0].removeAllElements();
      m_allTheRules[1].removeAllElements();
      m_allTheRules[2].removeAllElements();
      if (m_metricType != CONFIDENCE || m_significanceLevel != -1) {
	m_allTheRules[3].removeAllElements();
	m_allTheRules[4].removeAllElements();
	m_allTheRules[5].removeAllElements();
      }
      confidences = new double[sortedRuleSet[2].size()];
      int sortType = 2 + m_metricType;

      for (int i = 0; i < sortedRuleSet[2].size(); i++) 
	confidences[i] = 
	  ((Double)sortedRuleSet[sortType].elementAt(i)).doubleValue();
      indices = Utils.stableSort(confidences);
      for (int i = sortedRuleSet[0].size() - 1; 
	   (i >= (sortedRuleSet[0].size() - m_numRules)) && (i >= 0); i--) {
apriori.java - 源码说明

本页面展示了「一个数据挖掘软件ALPHAMINERR的整个过程的JAVA版源代码」中的 apriori.java 源码文件，采用 Java 编程语言编写，共 999 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与ALPHAMINERR相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?