apriori.java

来自「数据挖掘关联规则算法：Apriori算法源代码」· Java 代码 · 共 1,465 行 · 第 1/4 页
JAVA
1,465 行
/* *    This program is free software; you can redistribute it and/or modify *    it under the terms of the GNU General Public License as published by *    the Free Software Foundation; either version 2 of the License, or *    (at your option) any later version. * *    This program is distributed in the hope that it will be useful, *    but WITHOUT ANY WARRANTY; without even the implied warranty of *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the *    GNU General Public License for more details. * *    You should have received a copy of the GNU General Public License *    alo0ng with this program; if not, write to the Free Software *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//* *    Apriori.java *    Copyright (C) 1999 University of Waikato, Hamilton, New Zealand * */package weka.associations;import weka.core.AttributeStats;import weka.core.Capabilities;import weka.core.FastVector;import weka.core.Instances;import weka.core.Option;import weka.core.OptionHandler;import weka.core.RevisionUtils;import weka.core.SelectedTag;import weka.core.Tag;import weka.core.TechnicalInformation;import weka.core.TechnicalInformationHandler;import weka.core.Utils;import weka.core.Capabilities.Capability;import weka.core.TechnicalInformation.Field;import weka.core.TechnicalInformation.Type;import weka.filters.Filter;import weka.filters.unsupervised.attribute.Remove;import java.util.Enumeration;import java.util.Hashtable;/** <!-- globalinfo-start --> * Class implementing an Apriori-type algorithm. Iteratively reduces the minimum support until it finds the required number of rules with the given minimum confidence.<br/> * The algorithm has an option to mine class association rules. It is adapted as explained in the second reference.<br/> * <br/> * For more information see:<br/> * <br/> * R. Agrawal, R. Srikant: Fast Algorithms for Mining Association Rules in Large Databases. In: 20th International Conference on Very Large Data Bases, 478-499, 1994.<br/> * <br/> * Bing Liu, Wynne Hsu, Yiming Ma: Integrating Classification and Association Rule Mining. In: Fourth International Conference on Knowledge Discovery and Data Mining, 80-86, 1998. * <p/> <!-- globalinfo-end --> * <!-- technical-bibtex-start --> * BibTeX: * <pre> * &#64;inproceedings{Agrawal1994, *    author = {R. Agrawal and R. Srikant}, *    booktitle = {20th International Conference on Very Large Data Bases}, *    pages = {478-499}, *    publisher = {Morgan Kaufmann, Los Altos, CA}, *    title = {Fast Algorithms for Mining Association Rules in Large Databases}, *    year = {1994} * } *  * &#64;inproceedings{Liu1998, *    author = {Bing Liu and Wynne Hsu and Yiming Ma}, *    booktitle = {Fourth International Conference on Knowledge Discovery and Data Mining}, *    pages = {80-86}, *    publisher = {AAAI Press}, *    title = {Integrating Classification and Association Rule Mining}, *    year = {1998} * } * </pre> * <p/> <!-- technical-bibtex-end --> * <!-- options-start --> * Valid options are: <p/> *  * <pre> -N &lt;required number of rules output&gt; *  The required number of rules. (default = 10)</pre> *  * <pre> -T &lt;0=confidence | 1=lift | 2=leverage | 3=Conviction&gt; *  The metric type by which to rank rules. (default = confidence)</pre> *  * <pre> -C &lt;minimum metric score of a rule&gt; *  The minimum confidence of a rule. (default = 0.9)</pre> *  * <pre> -D &lt;delta for minimum support&gt; *  The delta by which the minimum support is decreased in *  each iteration. (default = 0.05)</pre> *  * <pre> -U &lt;upper bound for minimum support&gt; *  Upper bound for minimum support. (default = 1.0)</pre> *  * <pre> -M &lt;lower bound for minimum support&gt; *  The lower bound for the minimum support. (default = 0.1)</pre> *  * <pre> -S &lt;significance level&gt; *  If used, rules are tested for significance at *  the given level. Slower. (default = no significance testing)</pre> *  * <pre> -I *  If set the itemsets found are also output. (default = no)</pre> *  * <pre> -R *  Remove columns that contain all missing values (default = no)</pre> *  * <pre> -V *  Report progress iteratively. (default = no)</pre> *  * <pre> -A *  If set class association rules are mined. (default = no)</pre> *  * <pre> -c &lt;the class index&gt; *  The class index. (default = last)</pre> *  <!-- options-end --> * * @author Eibe Frank (eibe@cs.waikato.ac.nz) * @author Mark Hall (mhall@cs.waikato.ac.nz) * @author Stefan Mutter (mutter@cs.waikato.ac.nz) * @version $Revision: 1.31 $ */public class Apriori   extends AbstractAssociator   implements OptionHandler, CARuleMiner, TechnicalInformationHandler {    /** for serialization */  static final long serialVersionUID = 3277498842319212687L;    /** The minimum support. */  protected double m_minSupport;  /** The upper bound on the support */  protected double m_upperBoundMinSupport;  /** The lower bound for the minimum support. */  protected double m_lowerBoundMinSupport;  /** Metric type: Confidence */  protected static final int CONFIDENCE = 0;  /** Metric type: Lift */  protected static final int LIFT = 1;  /** Metric type: Leverage */  protected static final int LEVERAGE = 2;  /** Metric type: Conviction */  protected static final int CONVICTION = 3;  /** Metric types. */  public static final Tag [] TAGS_SELECTION = {    new Tag(CONFIDENCE, "Confidence"),    new Tag(LIFT, "Lift"),    new Tag(LEVERAGE, "Leverage"),    new Tag(CONVICTION, "Conviction")      };  /** The selected metric type. */  protected int m_metricType = CONFIDENCE;  /** The minimum metric score. */  protected double m_minMetric;  /** The maximum number of rules that are output. */  protected int m_numRules;  /** Delta by which m_minSupport is decreased in each iteration. */  protected double m_delta;  /** Significance level for optional significance test. */  protected double m_significanceLevel;  /** Number of cycles used before required number of rules was one. */  protected int m_cycles;  /** The set of all sets of itemsets L. */  protected FastVector m_Ls;  /** The same information stored in hash tables. */  protected FastVector m_hashtables;  /** The list of all generated rules. */  protected FastVector[] m_allTheRules;  /** The instances (transactions) to be used for generating       the association rules. */  protected Instances m_instances;  /** Output itemsets found? */  protected boolean m_outputItemSets;  /** Remove columns with all missing values */  protected boolean m_removeMissingCols;  /** Report progress iteratively */  protected boolean m_verbose;    /** Only the class attribute of all Instances.*/  protected Instances m_onlyClass;    /** The class index. */    protected int m_classIndex;    /** Flag indicating whether class association rules are mined. */  protected boolean m_car;  /**   * Returns a string describing this associator   * @return a description of the evaluator suitable for   * displaying in the explorer/experimenter gui   */  public String globalInfo() {    return         "Class implementing an Apriori-type algorithm. Iteratively reduces "      + "the minimum support until it finds the required number of rules with "      + "the given minimum confidence.\n"      + "The algorithm has an option to mine class association rules. It is "      + "adapted as explained in the second reference.\n\n"      + "For more information see:\n\n"      + getTechnicalInformation().toString();  }    /**   * Returns an instance of a TechnicalInformation object, containing    * detailed information about the technical background of this class,   * e.g., paper reference or book this class is based on.   *    * @return the technical information about this class   */  public TechnicalInformation getTechnicalInformation() {    TechnicalInformation 	result;    TechnicalInformation 	additional;        result = new TechnicalInformation(Type.INPROCEEDINGS);    result.setValue(Field.AUTHOR, "R. Agrawal and R. Srikant");    result.setValue(Field.TITLE, "Fast Algorithms for Mining Association Rules in Large Databases");    result.setValue(Field.BOOKTITLE, "20th International Conference on Very Large Data Bases");    result.setValue(Field.YEAR, "1994");    result.setValue(Field.PAGES, "478-499");    result.setValue(Field.PUBLISHER, "Morgan Kaufmann, Los Altos, CA");    additional = result.add(Type.INPROCEEDINGS);    additional.setValue(Field.AUTHOR, "Bing Liu and Wynne Hsu and Yiming Ma");    additional.setValue(Field.TITLE, "Integrating Classification and Association Rule Mining");    additional.setValue(Field.BOOKTITLE, "Fourth International Conference on Knowledge Discovery and Data Mining");    additional.setValue(Field.YEAR, "1998");    additional.setValue(Field.PAGES, "80-86");    additional.setValue(Field.PUBLISHER, "AAAI Press");        return result;  }  /**   * Constructor that allows to sets default values for the    * minimum confidence and the maximum number of rules   * the minimum confidence.   */  public Apriori() {    resetOptions();  }  /**   * Resets the options to the default values.   */  public void resetOptions() {        m_removeMissingCols = false;    m_verbose = false;    m_delta = 0.05;    m_minMetric = 0.90;    m_numRules = 10;    m_lowerBoundMinSupport = 0.1;    m_upperBoundMinSupport = 1.0;    m_significanceLevel = -1;    m_outputItemSets = false;    m_car = false;    m_classIndex = -1;  }  /**   * Removes columns that are all missing from the data   * @param instances the instances   * @return a new set of instances with all missing columns removed   * @throws Exception if something goes wrong   */  protected Instances removeMissingColumns(Instances instances)     throws Exception {        int numInstances = instances.numInstances();    StringBuffer deleteString = new StringBuffer();    int removeCount = 0;    boolean first = true;    int maxCount = 0;        for (int i=0;i<instances.numAttributes();i++) {      AttributeStats as = instances.attributeStats(i);      if (m_upperBoundMinSupport == 1.0 && maxCount != numInstances) {	// see if we can decrease this by looking for the most frequent value	int [] counts = as.nominalCounts;	if (counts[Utils.maxIndex(counts)] > maxCount) {	  maxCount = counts[Utils.maxIndex(counts)];	}      }      if (as.missingCount == numInstances) {	if (first) {	  deleteString.append((i+1));	  first = false;	} else {	  deleteString.append(","+(i+1));	}	removeCount++;      }    }    if (m_verbose) {      System.err.println("Removed : "+removeCount+" columns with all missing "			 +"values.");    }    if (m_upperBoundMinSupport == 1.0 && maxCount != numInstances) {      m_upperBoundMinSupport = (double)maxCount / (double)numInstances;      if (m_verbose) {	System.err.println("Setting upper bound min support to : "			   +m_upperBoundMinSupport);      }    }    if (deleteString.toString().length() > 0) {      Remove af = new Remove();      af.setAttributeIndices(deleteString.toString());      af.setInvertSelection(false);      af.setInputFormat(instances);      Instances newInst = Filter.useFilter(instances, af);      return newInst;    }    return instances;  }  /**   * Returns default capabilities of the classifier.   *   * @return      the capabilities of this classifier   */  public Capabilities getCapabilities() {    Capabilities result = super.getCapabilities();    // attributes    result.enable(Capability.NOMINAL_ATTRIBUTES);    result.enable(Capability.MISSING_VALUES);    // class (can handle a nominal class if CAR rules are selected). This    result.enable(Capability.NO_CLASS);    result.enable(Capability.NOMINAL_CLASS);    result.enable(Capability.MISSING_CLASS_VALUES);        return result;  }  /**   * Method that generates all large itemsets with a minimum support, and from   * these all association rules with a minimum confidence.   *
apriori.java - 源码说明

本页面展示了「数据挖掘关联规则算法：Apriori算法源代码」中的 apriori.java 源码文件，采用 Java 编程语言编写，共 1,465 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与Apriori相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?