📄 apriori.java
字号:
/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * alo0ng with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//* * Apriori.java * Copyright (C) 1999 University of Waikato, Hamilton, New Zealand * */package weka.associations;import weka.core.AttributeStats;import weka.core.Capabilities;import weka.core.FastVector;import weka.core.Instances;import weka.core.Option;import weka.core.OptionHandler;import weka.core.RevisionUtils;import weka.core.SelectedTag;import weka.core.Tag;import weka.core.TechnicalInformation;import weka.core.TechnicalInformationHandler;import weka.core.Utils;import weka.core.Capabilities.Capability;import weka.core.TechnicalInformation.Field;import weka.core.TechnicalInformation.Type;import weka.filters.Filter;import weka.filters.unsupervised.attribute.Remove;import java.util.Enumeration;import java.util.Hashtable;/** <!-- globalinfo-start --> * Class implementing an Apriori-type algorithm. Iteratively reduces the minimum support until it finds the required number of rules with the given minimum confidence.<br/> * The algorithm has an option to mine class association rules. It is adapted as explained in the second reference.<br/> * <br/> * For more information see:<br/> * <br/> * R. Agrawal, R. Srikant: Fast Algorithms for Mining Association Rules in Large Databases. In: 20th International Conference on Very Large Data Bases, 478-499, 1994.<br/> * <br/> * Bing Liu, Wynne Hsu, Yiming Ma: Integrating Classification and Association Rule Mining. In: Fourth International Conference on Knowledge Discovery and Data Mining, 80-86, 1998. * <p/> <!-- globalinfo-end --> * <!-- technical-bibtex-start --> * BibTeX: * <pre> * @inproceedings{Agrawal1994, * author = {R. Agrawal and R. Srikant}, * booktitle = {20th International Conference on Very Large Data Bases}, * pages = {478-499}, * publisher = {Morgan Kaufmann, Los Altos, CA}, * title = {Fast Algorithms for Mining Association Rules in Large Databases}, * year = {1994} * } * * @inproceedings{Liu1998, * author = {Bing Liu and Wynne Hsu and Yiming Ma}, * booktitle = {Fourth International Conference on Knowledge Discovery and Data Mining}, * pages = {80-86}, * publisher = {AAAI Press}, * title = {Integrating Classification and Association Rule Mining}, * year = {1998} * } * </pre> * <p/> <!-- technical-bibtex-end --> * <!-- options-start --> * Valid options are: <p/> * * <pre> -N <required number of rules output> * The required number of rules. (default = 10)</pre> * * <pre> -T <0=confidence | 1=lift | 2=leverage | 3=Conviction> * The metric type by which to rank rules. (default = confidence)</pre> * * <pre> -C <minimum metric score of a rule> * The minimum confidence of a rule. (default = 0.9)</pre> * * <pre> -D <delta for minimum support> * The delta by which the minimum support is decreased in * each iteration. (default = 0.05)</pre> * * <pre> -U <upper bound for minimum support> * Upper bound for minimum support. (default = 1.0)</pre> * * <pre> -M <lower bound for minimum support> * The lower bound for the minimum support. (default = 0.1)</pre> * * <pre> -S <significance level> * If used, rules are tested for significance at * the given level. Slower. (default = no significance testing)</pre> * * <pre> -I * If set the itemsets found are also output. (default = no)</pre> * * <pre> -R * Remove columns that contain all missing values (default = no)</pre> * * <pre> -V * Report progress iteratively. (default = no)</pre> * * <pre> -A * If set class association rules are mined. (default = no)</pre> * * <pre> -c <the class index> * The class index. (default = last)</pre> * <!-- options-end --> * * @author Eibe Frank (eibe@cs.waikato.ac.nz) * @author Mark Hall (mhall@cs.waikato.ac.nz) * @author Stefan Mutter (mutter@cs.waikato.ac.nz) * @version $Revision: 1.31 $ */public class Apriori extends AbstractAssociator implements OptionHandler, CARuleMiner, TechnicalInformationHandler { /** for serialization */ static final long serialVersionUID = 3277498842319212687L; /** The minimum support. */ protected double m_minSupport; /** The upper bound on the support */ protected double m_upperBoundMinSupport; /** The lower bound for the minimum support. */ protected double m_lowerBoundMinSupport; /** Metric type: Confidence */ protected static final int CONFIDENCE = 0; /** Metric type: Lift */ protected static final int LIFT = 1; /** Metric type: Leverage */ protected static final int LEVERAGE = 2; /** Metric type: Conviction */ protected static final int CONVICTION = 3; /** Metric types. */ public static final Tag [] TAGS_SELECTION = { new Tag(CONFIDENCE, "Confidence"), new Tag(LIFT, "Lift"), new Tag(LEVERAGE, "Leverage"), new Tag(CONVICTION, "Conviction") }; /** The selected metric type. */ protected int m_metricType = CONFIDENCE; /** The minimum metric score. */ protected double m_minMetric; /** The maximum number of rules that are output. */ protected int m_numRules; /** Delta by which m_minSupport is decreased in each iteration. */ protected double m_delta; /** Significance level for optional significance test. */ protected double m_significanceLevel; /** Number of cycles used before required number of rules was one. */ protected int m_cycles; /** The set of all sets of itemsets L. */ protected FastVector m_Ls; /** The same information stored in hash tables. */ protected FastVector m_hashtables; /** The list of all generated rules. */ protected FastVector[] m_allTheRules; /** The instances (transactions) to be used for generating the association rules. */ protected Instances m_instances; /** Output itemsets found? */ protected boolean m_outputItemSets; /** Remove columns with all missing values */ protected boolean m_removeMissingCols; /** Report progress iteratively */ protected boolean m_verbose; /** Only the class attribute of all Instances.*/ protected Instances m_onlyClass; /** The class index. */ protected int m_classIndex; /** Flag indicating whether class association rules are mined. */ protected boolean m_car; /** * Returns a string describing this associator * @return a description of the evaluator suitable for * displaying in the explorer/experimenter gui */ public String globalInfo() { return "Class implementing an Apriori-type algorithm. Iteratively reduces " + "the minimum support until it finds the required number of rules with " + "the given minimum confidence.\n" + "The algorithm has an option to mine class association rules. It is " + "adapted as explained in the second reference.\n\n" + "For more information see:\n\n" + getTechnicalInformation().toString(); } /** * Returns an instance of a TechnicalInformation object, containing * detailed information about the technical background of this class, * e.g., paper reference or book this class is based on. * * @return the technical information about this class */ public TechnicalInformation getTechnicalInformation() { TechnicalInformation result; TechnicalInformation additional; result = new TechnicalInformation(Type.INPROCEEDINGS); result.setValue(Field.AUTHOR, "R. Agrawal and R. Srikant"); result.setValue(Field.TITLE, "Fast Algorithms for Mining Association Rules in Large Databases"); result.setValue(Field.BOOKTITLE, "20th International Conference on Very Large Data Bases"); result.setValue(Field.YEAR, "1994"); result.setValue(Field.PAGES, "478-499"); result.setValue(Field.PUBLISHER, "Morgan Kaufmann, Los Altos, CA"); additional = result.add(Type.INPROCEEDINGS); additional.setValue(Field.AUTHOR, "Bing Liu and Wynne Hsu and Yiming Ma"); additional.setValue(Field.TITLE, "Integrating Classification and Association Rule Mining"); additional.setValue(Field.BOOKTITLE, "Fourth International Conference on Knowledge Discovery and Data Mining"); additional.setValue(Field.YEAR, "1998"); additional.setValue(Field.PAGES, "80-86"); additional.setValue(Field.PUBLISHER, "AAAI Press"); return result; } /** * Constructor that allows to sets default values for the * minimum confidence and the maximum number of rules * the minimum confidence. */ public Apriori() { resetOptions(); } /** * Resets the options to the default values. */ public void resetOptions() { m_removeMissingCols = false; m_verbose = false; m_delta = 0.05; m_minMetric = 0.90; m_numRules = 10; m_lowerBoundMinSupport = 0.1; m_upperBoundMinSupport = 1.0; m_significanceLevel = -1; m_outputItemSets = false; m_car = false; m_classIndex = -1; } /** * Removes columns that are all missing from the data * @param instances the instances * @return a new set of instances with all missing columns removed * @throws Exception if something goes wrong */ protected Instances removeMissingColumns(Instances instances) throws Exception { int numInstances = instances.numInstances(); StringBuffer deleteString = new StringBuffer(); int removeCount = 0; boolean first = true; int maxCount = 0; for (int i=0;i<instances.numAttributes();i++) { AttributeStats as = instances.attributeStats(i); if (m_upperBoundMinSupport == 1.0 && maxCount != numInstances) { // see if we can decrease this by looking for the most frequent value int [] counts = as.nominalCounts; if (counts[Utils.maxIndex(counts)] > maxCount) { maxCount = counts[Utils.maxIndex(counts)]; } } if (as.missingCount == numInstances) { if (first) { deleteString.append((i+1)); first = false; } else { deleteString.append(","+(i+1)); } removeCount++; } } if (m_verbose) { System.err.println("Removed : "+removeCount+" columns with all missing " +"values."); } if (m_upperBoundMinSupport == 1.0 && maxCount != numInstances) { m_upperBoundMinSupport = (double)maxCount / (double)numInstances; if (m_verbose) { System.err.println("Setting upper bound min support to : " +m_upperBoundMinSupport); } } if (deleteString.toString().length() > 0) { Remove af = new Remove(); af.setAttributeIndices(deleteString.toString()); af.setInvertSelection(false); af.setInputFormat(instances); Instances newInst = Filter.useFilter(instances, af); return newInst; } return instances; } /** * Returns default capabilities of the classifier. * * @return the capabilities of this classifier */ public Capabilities getCapabilities() { Capabilities result = super.getCapabilities(); // attributes result.enable(Capability.NOMINAL_ATTRIBUTES); result.enable(Capability.MISSING_VALUES); // class (can handle a nominal class if CAR rules are selected). This result.enable(Capability.NO_CLASS); result.enable(Capability.NOMINAL_CLASS); result.enable(Capability.MISSING_CLASS_VALUES); return result; } /** * Method that generates all large itemsets with a minimum support, and from * these all association rules with a minimum confidence. *
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -