/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
* JRip.java
* Copyright (C) 2001 Xin Xu, Eibe Frank
*/
package weka.classifiers.rules;
import java.io.Serializable;
import java.util.Enumeration;
import java.util.Random;
import java.util.Vector;
import weka.classifiers.Classifier;
import weka.classifiers.Evaluation;
import weka.core.AdditionalMeasureProducer;
import weka.core.Attribute;
import weka.core.Copyable;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.UnsupportedAttributeTypeException;
import weka.core.UnsupportedClassTypeException;
import weka.core.Utils;
import weka.core.WeightedInstancesHandler;
import weka.filters.Filter;
import weka.filters.supervised.attribute.ClassOrder;
/**
* This class implements a propositional rule learner, Repeated Incremental
* Pruning to Produce Error Reduction (RIPPER), which was proposed by William
* W. Cohen as an optimized version of IREP. <p>
*
* The algorithm is briefly described as follows: <p>
* Initialize RS = {}, and for each class from the least prevalent one to
* the most frequent one, DO: <p>
*
* 1. Building stage: repeat 1.1 and 1.2 until the description length (DL)
* of the ruleset and examples is 64 bits greater than the smallest DL
* met so far, or there are no positive examples, or the error rate >= 50%.
* <p>
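*
* As an illustrative sketch (not part of the original documentation), the
* stopping test of the building stage can be written as follows, using the
* 64-bit constant MAX_DL_SURPLUS defined in this class; the method name
* stopBuilding is hypothetical:
*
* <code>
* // stop building when the DL surplus, the positives, or the error rate says so
* boolean stopBuilding(double currentDL, double minDL,
* double numPositives, double errorRate) {
* return (currentDL > minDL + MAX_DL_SURPLUS) // DL exceeds smallest DL by 64 bits
* || (numPositives <= 0) // no positive examples left
* || (errorRate >= 0.5); // error rate at least 50%
* }
* </code>
* <p>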
* 1.1. Grow phase:<br>
* Grow one rule by greedily adding antecedents (or conditions) to
* the rule until the rule is perfect (i.e. 100% accurate). The
* procedure tries every possible value of each attribute and selects
* the condition with highest information gain: p(log(p/t)-log(P/T)).
* <p>
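*
* For illustration (a sketch, not the exact code used in this class), the
* gain of a candidate antecedent can be computed as follows; the helper
* name gain is hypothetical:
*
* <code>
* // p, t: positive/total weight covered after adding the antecedent
* // P, T: positive/total weight covered before adding it (log base 2)
* double gain(double p, double t, double P, double T) {
* return p * (Utils.log2(p / t) - Utils.log2(P / T));
* }
* </code>
* <p>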
* 1.2. Prune phase:<br>
* Incrementally prune each rule and allow the pruning of any
* final sequences of the antecedents;<br>
* The pruning metric is (p-n)/(p+n), which is equivalent to
* 2p/(p+n) - 1, so this implementation simply uses p/(p+n)
* (actually (p+1)/(p+n+2), so that if p+n is 0 the metric is 0.5).<p>
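*
* A minimal sketch of the smoothed pruning metric (illustrative only, with
* a hypothetical method name):
*
* <code>
* // p, n: positive/negative weight covered by the rule on the pruning data
* double pruneMetric(double p, double n) {
* return (p + 1.0) / (p + n + 2.0); // 0.5 when p + n == 0
* }
* </code>
* <p>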
*
* 2. Optimization stage: after generating the initial ruleset {Ri},
* generate and prune two variants of each rule Ri from randomized data
* using procedures 1.1 and 1.2. One variant is generated from an
* empty rule, while the other is generated by greedily adding antecedents
* to the original rule. Moreover, the pruning metric used here is
* (TP+TN)/(P+N).<br>
* Then the smallest possible DL for each variant and the original rule
* is computed. The variant with the minimal DL is selected as the final
* representative of Ri in the ruleset. <br>
* After all the rules in {Ri} have been examined, if there are still
* residual positives, more rules are generated based on the residual
* positives using the Building Stage again. <p>
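*
* The variant-pruning metric of the optimization stage is plain accuracy
* on the pruning data; an illustrative sketch (hypothetical method name):
*
* <code>
* // tp, tn: true positive/true negative weight covered by the ruleset
* // P, N: total positive/negative weight in the pruning data
* double optimizeMetric(double tp, double tn, double P, double N) {
* return (tp + tn) / (P + N);
* }
* </code>
* <p>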
*
* 3. Delete any rule from the ruleset whose presence would increase the DL
* of the whole ruleset, and add the resultant ruleset to RS. <p>
*
* ENDDO<p>
*
* Note that there seem to be 2 bugs in the original ripper program that would
* affect the ruleset size and accuracy slightly. This implementation avoids
* these bugs and thus is a little bit different from Cohen's original
* implementation. Even after fixing the bugs, since the order of classes with
* the same frequency is not defined in ripper, there still seems to be
* some trivial difference between this implementation and the original ripper,
* especially for audiology data in UCI repository, where there are lots of
* classes of few instances.<p>
*
* If wrapped by other classes, typical usage of this class is:<br>
*
* <code>JRip rip = new JRip();
* Instances data = ... // Data from somewhere
* double[] orderedClasses = ... // Get the ordered class counts for the data
* double expFPRate = ... // Calculate the expected FP/(FP+FN) rate
* double classIndex = ... // The class index for which the ruleset is built
* // DL of default rule, no theory DL, only data DL
* double defDL = RuleStats.dataDL(expFPRate, 0.0, data.sumOfWeights(),
* 0.0, orderedClasses[(int)classIndex]);
*
* rip.rulesetForOneClass(expFPRate, data, classIndex, defDL);
* RuleStats rulesetStats = rip.getRuleStats(0);
*
* // Can get heaps of information from RuleStats, e.g. combined DL,
* // simpleStats, etc.
* double comDL = rulesetStats.combinedDL(expFPRate, classIndex);
* int whichRule = ... // Want simple stats of which rule?
* double[] simpleStats = rulesetStats.getSimpleStats(whichRule);
* ...
* </code>
*
* For details, please see "Fast Effective Rule Induction", William W. Cohen,
* 'Machine Learning: Proceedings of the Twelfth International Conference'
* (ML95). <p>
*
* PS. We have compared this implementation with the original ripper
* implementation in aspects of accuracy, ruleset size and running time
* on both artificial data "ab+bcd+defg" and UCI datasets. In all these
* aspects it seems to be quite comparable to the original ripper
* implementation. However, we didn't consider memory consumption
* optimization in this implementation.<p>
*
* @author Xin Xu (xx5@cs.waikato.ac.nz)
* @author Eibe Frank (eibe@cs.waikato.ac.nz)
* @version $Revision$
*/
public class JRip extends Classifier
implements OptionHandler,
AdditionalMeasureProducer,
WeightedInstancesHandler{
/** The limit of description length surplus in ruleset generation */
private static final double MAX_DL_SURPLUS = 64.0;
/** The class attribute of the data*/
private Attribute m_Class;
/** The ruleset */
private FastVector m_Ruleset;
/** The predicted class distribution */
private FastVector m_Distributions;
/** Runs of optimizations */
private int m_Optimizations = 2;
/** Random object used in this class */
private Random m_Random = null;
/** # of all the possible conditions in a rule */
private double m_Total = 0;
/** The seed to perform randomization */
private long m_Seed = 1;
/** The number of folds to split data into Grow and Prune for IREP */
private int m_Folds = 3;
/** The minimal number of instance weights within a split*/
private double m_MinNo = 2.0;
/** Whether in a debug mode */
private boolean m_Debug = false;
/** Whether to check for error rate >= 0.5 in the stopping criterion */
private boolean m_CheckErr = true;
/** Whether to use pruning; can be turned off if the data is known to be clean */
private boolean m_UsePruning = true;
/** The filter used to randomize the class order */
private Filter m_Filter = null;
/** The RuleStats for the ruleset of each class value */
private FastVector m_RulesetStats;
/**
* Returns a string describing classifier
* @return a description suitable for
* displaying in the explorer/experimenter gui
*/
public String globalInfo() {
return "This class implements a propositional rule learner, Repeated Incremental "
+ "Pruning to Produce Error Reduction (RIPPER), which was proposed by William "
+ "W. Cohen as an optimized version of IREP. \n\n"
+ "The algorithm is briefly described as follows: \n\n"
+ "Initialize RS = {}, and for each class from the less prevalent one to "
+ "the more frequent one, DO: \n\n"
+ "1. Building stage:\nRepeat 1.1 and 1.2 until the descrition length (DL) "
+ "of the ruleset and examples is 64 bits greater than the smallest DL "
+ "met so far, or there are no positive examples, or the error rate >= 50%. "
+ "\n\n"
+ "1.1. Grow phase:\n"
+ "Grow one rule by greedily adding antecedents (or conditions) to "
+ "the rule until the rule is perfect (i.e. 100% accurate). The "
+ "procedure tries every possible value of each attribute and selects "
+ "the condition with highest information gain: p(log(p/t)-log(P/T))."
+ "\n\n"
+ "1.2. Prune phase:\n"
+ "Incrementally prune each rule and allow the pruning of any "
+ "final sequences of the antecedents;"
+ "The pruning metric is (p-n)/(p+n) -- but it's actually "
+ "2p/(p+n) -1, so in this implementation we simply use p/(p+n) "
+ "(actually (p+1)/(p+n+2), thus if p+n is 0, it's 0.5).\n\n"
+ "2. Optimization stage:\n after generating the initial ruleset {Ri}, "
+ "generate and prune two variants of each rule Ri from randomized data "
+ "using procedure 1.1 and 1.2. But one variant is generated from an "
+ "empty rule while the other is generated by greedily adding antecedents "
+ "to the original rule. Moreover, the pruning metric used here is "
+ "(TP+TN)/(P+N)."
+ "Then the smallest possible DL for each variant and the original rule "
+ "is computed. The variant with the minimal DL is selected as the final "
+ "representative of Ri in the ruleset."
+ "After all the rules in {Ri} have been examined and if there are still "
+ "residual positives, more rules are generated based on the residual "
+ "positives using Building Stage again. \n"
+ "3. Delete the rules from the ruleset that would increase the DL of the "
+ "whole ruleset if it were in it. and add resultant ruleset to RS. \n"
+ "ENDDO\n\n"
+ "Note that there seem to be 2 bugs in the original ripper program that would "
+ "affect the ruleset size and accuracy slightly. This implementation avoids "
+ "these bugs and thus is a little bit different from Cohen's original "
+ "implementation. Even after fixing the bugs, since the order of classes with "
+ "the same frequency is not defined in ripper, there still seems to be "
+ "some trivial difference between this implementation and the original ripper, "
+ "especially for audiology data in UCI repository, where there are lots of "
+ "classes of few instances.\n\n"
+ "Details please see \"Fast Effective Rule Induction\", William W. Cohen, "
+ "'Machine Learning: Proceedings of the Twelfth International Conference'"
+ "(ML95). \n\n"
+ "PS. We have compared this implementation with the original ripper "
+ "implementation in aspects of accuracy, ruleset size and running time "
+ "on both artificial data \"ab+bcd+defg\" and UCI datasets. In all these "
+ "aspects it seems to be quite comparable to the original ripper "
+ "implementation. However, we didn't consider memory consumption "
+ "optimization in this implementation.\n\n";
}
/**
* Returns an enumeration describing the available options.
* Valid options are: <p>
*
* -F number <br>
* The number of folds for reduced error pruning. One fold is
* used as the pruning set. (Default: 3) <p>
*
* -N number <br>
* The minimal weights of instances within a split.
* (Default: 2) <p>
*
* -O number <br>
* Set the number of runs of optimizations. (Default: 2)<p>
*
* -D <br>
* Whether to turn on debug mode. <p>
*
* -S number <br>
* The seed of randomization used in Ripper. (Default: 1)<p>
*
* -E <br>
* Whether NOT to check the error rate >= 0.5 in the stopping criterion.
* (default: check)<p>
*
* -P <br>
* Whether NOT to use pruning. (default: use pruning)<p>
*
* @return an enumeration of all the available options
*/
public Enumeration listOptions() {
Vector newVector = new Vector(7);
newVector.addElement(new Option("\tSet number of folds for REP\n" +
"\tOne fold is used as pruning set.\n" +
"\t(default 3)","F", 1, "-F <number of folds>"));
newVector.addElement(new Option("\tSet the minimal weights of instances\n" +
"\twithin a split.\n" +
"\t(default 2.0)","N", 1, "-N <min. weights>"));
newVector.addElement(new Option("\tSet the number of runs of\n"+
"\toptimizations. (Default: 2)", "O",
1,"-O <number of runs>"));
newVector.addElement(new Option("\tSet whether turn on the\n"+
"\tdebug mode (Default: false)", "D",
0,"-D"));
newVector.addElement(new Option("\tThe seed of randomization\n"+
"\t(Default: 1)", "S",
1,"-S <seed>"));
newVector.addElement(new Option("Whether NOT check the error rate>=0.5\n"
+"\tin stopping criteria "
+"\t(default: check)", "E",
0, "-E"));
newVector.addElement(new Option("Whether NOT use pruning\n"
+"\t(default: use pruning)", "P",
0, "-P"));
return newVector.elements();
}
/**
* Parses a given list of options.
*
* @param options the list of options as an array of strings
* @exception Exception if an option is not supported
*/
public void setOptions(String[] options) throws Exception {
String numFoldsString = Utils.getOption('F', options);
if (numFoldsString.length() != 0)
m_Folds = Integer.parseInt(numFoldsString);
else
m_Folds = 3;
String minNoString = Utils.getOption('N', options);
if (minNoString.length() != 0)
m_MinNo = Double.parseDouble(minNoString);
else
m_MinNo = 2.0;
String seedString = Utils.getOption('S', options);
if (seedString.length() != 0)
m_Seed = Long.parseLong(seedString);
else
m_Seed = 1;
String runString = Utils.getOption('O', options);
if (runString.length() != 0)
m_Optimizations = Integer.parseInt(runString);
else
m_Optimizations = 2;
m_Debug = Utils.getFlag('D', options);
m_CheckErr = !Utils.getFlag('E', options);
m_UsePruning = !Utils.getFlag('P', options);
}
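// Illustrative sketch (not part of the original Weka source): shows how
// the options parsed above can be supplied programmatically. The method
// name exampleConfiguredRip is hypothetical.
private static JRip exampleConfiguredRip() throws Exception {
JRip rip = new JRip();
rip.setOptions(new String[]{"-F", "5", "-O", "3", "-S", "42", "-D"});
// equivalent to: rip.setFolds(5); rip.setOptimizations(3); rip.setSeed(42); debug on
return rip;
}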
/**
* Gets the current settings of the Classifier.
*
* @return an array of strings suitable for passing to setOptions
*/
public String [] getOptions() {
String [] options = new String [11];
int current = 0;
options[current++] = "-F"; options[current++] = "" + m_Folds;
options[current++] = "-N"; options[current++] = "" + m_MinNo;
options[current++] = "-O"; options[current++] = "" + m_Optimizations;
options[current++] = "-S"; options[current++] = "" + m_Seed;
if(m_Debug)
options[current++] = "-D";
if(!m_CheckErr)
options[current++] = "-E";
if(!m_UsePruning)
options[current++] = "-P";
while(current < options.length)
options[current++] = "";
return options;
}
/**
* Returns an enumeration of the additional measure names
* @return an enumeration of the measure names
*/
public Enumeration enumerateMeasures() {
Vector newVector = new Vector(1);
newVector.addElement("measureNumRules");
return newVector.elements();
}
/**
* Returns the value of the named measure
* @param additionalMeasureName the name of the measure to query for its value
* @return the value of the named measure
* @exception IllegalArgumentException if the named measure is not supported
*/
public double getMeasure(String additionalMeasureName) {
if (additionalMeasureName.compareToIgnoreCase("measureNumRules") == 0)
return m_Ruleset.size();
else
throw new IllegalArgumentException(additionalMeasureName+" not supported (RIPPER)");
}
/**
* Returns the tip text for this property
* @return tip text for this property suitable for
* displaying in the explorer/experimenter gui
*/
public String foldsTipText() {
return "Determines the amount of data used for pruning. One fold is used for "
+ "pruning, the rest for growing the rules.";
}
/** Sets the number of folds for REP; one fold is used as the pruning set */
public void setFolds(int fold){ m_Folds = fold; }
/** Gets the number of folds for REP */
public int getFolds(){ return m_Folds; }
/**
* Returns the tip text for this property
* @return tip text for this property suitable for
* displaying in the explorer/experimenter gui
*/
public String minNoTipText() {
return "The minimum total weight of the instances in a rule.";
}
/** Sets the minimum total weight of the instances in a rule */
public void setMinNo(double m){ m_MinNo = m; }
/** Gets the minimum total weight of the instances in a rule */
public double getMinNo(){ return m_MinNo; }
/**
* Returns the tip text for this property
* @return tip text for this property suitable for
* displaying in the explorer/experimenter gui
*/
public String seedTipText() {
return "The seed used for randomizing the data.";
}
/** Sets the seed used for randomizing the data */
public void setSeed(long s){ m_Seed = s; }
/** Gets the seed used for randomizing the data */
public long getSeed(){ return m_Seed; }
/**
* Returns the tip text for this property
* @return tip text for this property suitable for
* displaying in the explorer/experimenter gui
*/
public String optimizationsTipText() {
return "The number of optimization runs.";
}
/** Sets the number of optimization runs */
public void setOptimizations(int run){ m_Optimizations = run; }
/** Gets the number of optimization runs */
public int getOptimizations(){ return m_Optimizations; }