📄 apriori.java
字号:
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
* Apriori.java
* Copyright (C) 1999 Eibe Frank,Mark Hall
*
*/
package weka.associations;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.Reader;
import java.util.Enumeration;
import java.util.Hashtable;
import weka.core.AttributeStats;
import weka.core.FastVector;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.SelectedTag;
import weka.core.Tag;
import weka.core.Utils;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.Remove;
/**
* Class implementing an Apriori-type algorithm. Iteratively reduces the minimum
* support until it finds the required number of rules with the given minimum
* confidence. <p>
*
* Reference: R. Agrawal, R. Srikant (1994). <i>Fast algorithms for
* mining association rules in large databases </i>. Proc
* International Conference on Very Large Databases,
* pp. 478-499. Santiage, Chile: Morgan Kaufmann, Los Altos, CA. <p>
*
* Valid options are:<p>
*
* -N required number of rules <br>
* The required number of rules (default: 10). <p>
*
* -T type of metric by which to sort rules <br>
* 0 = confidence | 1 = lift | 2 = leverage | 3 = Conviction. <p>
*
* -C minimum confidence of a rule <br>
* The minimum confidence of a rule (default: 0.9). <p>
*
* -D delta for minimum support <br>
* The delta by which the minimum support is decreased in
* each iteration (default: 0.05). <p>
*
* -U upper bound for minimum support <br>
* The upper bound for minimum support. Don't explicitly look for
* rules with more than this level of support. <p>
*
* -M lower bound for minimum support <br>
* The lower bound for the minimum support (default = 0.1). <p>
*
* -S significance level <br>
* If used, rules are tested for significance at
* the given level. Slower (default = no significance testing). <p>
*
* -R <br>
* If set then columns that contain all missing values are removed from
* the data.
*
* -I <br>
* If set the itemsets found are also output (default = no). <p>
*
* @author Eibe Frank (eibe@cs.waikato.ac.nz)
* @author Mark Hall (mhall@cs.waikato.ac.nz)
* @version $Revision$ */
public class Apriori extends Associator implements OptionHandler {
/** The minimum support. */
protected double m_minSupport;
/** The upper bound on the support */
protected double m_upperBoundMinSupport;
/** The lower bound for the minimum support. */
protected double m_lowerBoundMinSupport;
/** Metric types. */
protected static final int CONFIDENCE = 0;
protected static final int LIFT = 1;
protected static final int LEVERAGE = 2;
protected static final int CONVICTION = 3;
public static final Tag [] TAGS_SELECTION = {
new Tag(CONFIDENCE, "Confidence"),
new Tag(LIFT, "Lift"),
new Tag(LEVERAGE, "Leverage"),
new Tag(CONVICTION, "Conviction")
};
/** The selected metric type. */
protected int m_metricType = CONFIDENCE;
/** The minimum metric score. */
protected double m_minMetric;
/** The maximum number of rules that are output. */
protected int m_numRules;
/** Delta by which m_minSupport is decreased in each iteration. */
protected double m_delta;
/** Significance level for optional significance test. */
protected double m_significanceLevel;
/** Number of cycles used before required number of rules was one. */
protected int m_cycles;
/** The set of all sets of itemsets L. */
protected FastVector m_Ls;
/** The same information stored in hash tables. */
protected FastVector m_hashtables;
/** The list of all generated rules. */
protected FastVector[] m_allTheRules;
/** The instances (transactions) to be used for generating
the association rules. */
protected Instances m_instances;
/** Output itemsets found? */
protected boolean m_outputItemSets;
protected boolean m_removeMissingCols;
/** Report progress iteratively */
protected boolean m_verbose;
/**
* Returns a string describing this associator
* @return a description of the evaluator suitable for
* displaying in the explorer/experimenter gui
*/
public String globalInfo() {
return "Finds association rules.";
}
/**
* Constructor that allows to sets default values for the
* minimum confidence and the maximum number of rules
* the minimum confidence.
*/
public Apriori() {
resetOptions();
}
/**
* Resets the options to the default values.
*/
public void resetOptions() {
m_removeMissingCols = false;
m_verbose = false;
m_delta = 0.05;
m_minMetric = 0.90;
m_numRules = 10;
m_lowerBoundMinSupport = 0.1;
m_upperBoundMinSupport = 1.0;
m_significanceLevel = -1;
m_outputItemSets = false;
}
/**
* Removes columns that are all missing from the data
* @param instances the instances
* @return a new set of instances with all missing columns removed
*/
private Instances removeMissingColumns(Instances instances)
throws Exception {
int numInstances = instances.numInstances();
StringBuffer deleteString = new StringBuffer();
int removeCount = 0;
boolean first = true;
int maxCount = 0;
for (int i=0;i<instances.numAttributes();i++) {
AttributeStats as = instances.attributeStats(i);
if (m_upperBoundMinSupport == 1.0 && maxCount != numInstances) {
// see if we can decrease this by looking for the most frequent value
int [] counts = as.nominalCounts;
if (counts[Utils.maxIndex(counts)] > maxCount) {
maxCount = counts[Utils.maxIndex(counts)];
}
}
if (as.missingCount == numInstances) {
if (first) {
deleteString.append((i+1));
first = false;
} else {
deleteString.append(","+(i+1));
}
removeCount++;
}
}
if (m_verbose) {
System.err.println("Removed : "+removeCount+" columns with all missing "
+"values.");
}
if (m_upperBoundMinSupport == 1.0 && maxCount != numInstances) {
m_upperBoundMinSupport = (double)maxCount / (double)numInstances;
if (m_verbose) {
System.err.println("Setting upper bound min support to : "
+m_upperBoundMinSupport);
}
}
if (deleteString.toString().length() > 0) {
Remove af = new Remove();
af.setAttributeIndices(deleteString.toString());
af.setInvertSelection(false);
af.setInputFormat(instances);
Instances newInst = Filter.useFilter(instances, af);
return newInst;
}
return instances;
}
/**
* Method that generates all large itemsets with a minimum support, and from
* these all association rules with a minimum confidence.
*
* @param instances the instances to be used for generating the associations
* @exception Exception if rules can't be built successfully
*/
public void buildAssociations(Instances instances) throws Exception {
double[] confidences, supports;
int[] indices;
FastVector[] sortedRuleSet;
int necSupport=0;
if (instances.checkForStringAttributes()) {
throw new Exception("Can't handle string attributes!");
}
if (m_removeMissingCols) {
instances = removeMissingColumns(instances);
}
// Decrease minimum support until desired number of rules found.
m_cycles = 0;
m_minSupport = m_upperBoundMinSupport - m_delta;
m_minSupport = (m_minSupport < m_lowerBoundMinSupport)
? m_lowerBoundMinSupport
: m_minSupport;
do {
// Reserve space for variables
m_Ls = new FastVector();
m_hashtables = new FastVector();
m_allTheRules = new FastVector[6];
m_allTheRules[0] = new FastVector();
m_allTheRules[1] = new FastVector();
m_allTheRules[2] = new FastVector();
if (m_metricType != CONFIDENCE || m_significanceLevel != -1) {
m_allTheRules[3] = new FastVector();
m_allTheRules[4] = new FastVector();
m_allTheRules[5] = new FastVector();
}
sortedRuleSet = new FastVector[6];
sortedRuleSet[0] = new FastVector();
sortedRuleSet[1] = new FastVector();
sortedRuleSet[2] = new FastVector();
if (m_metricType != CONFIDENCE || m_significanceLevel != -1) {
sortedRuleSet[3] = new FastVector();
sortedRuleSet[4] = new FastVector();
sortedRuleSet[5] = new FastVector();
}
// Find large itemsets and rules
findLargeItemSets(instances);
if (m_significanceLevel != -1 || m_metricType != CONFIDENCE)
findRulesBruteForce();
else
findRulesQuickly();
// Sort rules according to their support
supports = new double[m_allTheRules[2].size()];
for (int i = 0; i < m_allTheRules[2].size(); i++)
supports[i] = (double)((ItemSet)m_allTheRules[1].elementAt(i)).support();
indices = Utils.stableSort(supports);
for (int i = 0; i < m_allTheRules[2].size(); i++) {
sortedRuleSet[0].addElement(m_allTheRules[0].elementAt(indices[i]));
sortedRuleSet[1].addElement(m_allTheRules[1].elementAt(indices[i]));
sortedRuleSet[2].addElement(m_allTheRules[2].elementAt(indices[i]));
if (m_metricType != CONFIDENCE || m_significanceLevel != -1) {
sortedRuleSet[3].addElement(m_allTheRules[3].elementAt(indices[i]));
sortedRuleSet[4].addElement(m_allTheRules[4].elementAt(indices[i]));
sortedRuleSet[5].addElement(m_allTheRules[5].elementAt(indices[i]));
}
}
// Sort rules according to their confidence
m_allTheRules[0].removeAllElements();
m_allTheRules[1].removeAllElements();
m_allTheRules[2].removeAllElements();
if (m_metricType != CONFIDENCE || m_significanceLevel != -1) {
m_allTheRules[3].removeAllElements();
m_allTheRules[4].removeAllElements();
m_allTheRules[5].removeAllElements();
}
confidences = new double[sortedRuleSet[2].size()];
int sortType = 2 + m_metricType;
for (int i = 0; i < sortedRuleSet[2].size(); i++)
confidences[i] =
((Double)sortedRuleSet[sortType].elementAt(i)).doubleValue();
indices = Utils.stableSort(confidences);
for (int i = sortedRuleSet[0].size() - 1;
(i >= (sortedRuleSet[0].size() - m_numRules)) && (i >= 0); i--) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -