📄 JRip.java
/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    JRip.java
 *    Copyright (C) 2001 Xin Xu, Eibe Frank
 */

package weka.classifiers.rules;

import java.util.Enumeration;
import java.util.Random;
import java.util.Vector;

import weka.core.FastVector;
import weka.core.Instances;
import weka.core.Instance;
import weka.core.Attribute;
import weka.core.AttributeStats;
import weka.core.Utils;
import weka.core.OptionHandler;
import weka.core.Option;
import weka.core.Copyable;
import weka.core.WeightedInstancesHandler;
import weka.core.AdditionalMeasureProducer;
import weka.core.UnsupportedAttributeTypeException;
import weka.core.UnsupportedClassTypeException;
import weka.filters.supervised.attribute.ClassOrder;
import weka.filters.Filter;
import weka.classifiers.DistributionClassifier;
import weka.classifiers.Evaluation;

/**
 * This class implements a propositional rule learner, Repeated Incremental
 * Pruning to Produce Error Reduction (RIPPER), which was proposed by William
 * W. Cohen as an optimized version of IREP. <p>
 *
 * The algorithm is briefly described as follows: <p>
 *
 * Initialize RS = {}, and for each class from the less prevalent one to
 * the more frequent one, DO: <p>
 *
 * 1. Building stage: repeat 1.1 and 1.2 until the description length (DL)
 *    of the ruleset and examples is 64 bits greater than the smallest DL
 *    met so far, or there are no positive examples, or the error rate >= 50%.
 *    <p>
 *
 * 1.1. Grow phase:<br>
 *      Grow one rule by greedily adding antecedents (or conditions) to
 *      the rule until the rule is perfect (i.e. 100% accurate). The
 *      procedure tries every possible value of each attribute and selects
 *      the condition with the highest information gain: p(log(p/t)-log(P/T)).
 *      <p>
 *
 * 1.2. Prune phase:<br>
 *      Incrementally prune each rule and allow the pruning of any
 *      final sequence of the antecedents.<br>
 *      The pruning metric is (p-n)/(p+n) -- but since that is just
 *      2p/(p+n) - 1, this implementation simply uses p/(p+n)
 *      (actually (p+1)/(p+n+2), so that if p+n is 0, the metric is 0.5).<p>
 *
 * 2. Optimization stage: after generating the initial ruleset {Ri},
 *    generate and prune two variants of each rule Ri from randomized data
 *    using procedures 1.1 and 1.2. One variant is generated from an
 *    empty rule, while the other is generated by greedily adding antecedents
 *    to the original rule. Moreover, the pruning metric used here is
 *    (TP+TN)/(P+N).<br>
 *    Then the smallest possible DL for each variant and the original rule
 *    is computed. The variant with the minimal DL is selected as the final
 *    representative of Ri in the ruleset.<br>
 *    After all the rules in {Ri} have been examined, if there are still
 *    residual positives, more rules are generated for the residual
 *    positives using the building stage again. <p>
 *
 * 3. Delete the rules from the ruleset that would increase the DL of the
 *    whole ruleset if they were kept, and add the resultant ruleset to RS.
 *    <p>
 * ENDDO<p>
 *
 * Note that there seem to be two bugs in the original ripper program that
 * affect the ruleset size and accuracy slightly. This implementation avoids
 * these bugs and is thus a little different from Cohen's original
 * implementation. Even after fixing the bugs, since the order of classes
 * with the same frequency is not defined in ripper, there still seem to be
 * some trivial differences between this implementation and the original
 * ripper, especially for the audiology data in the UCI repository, where
 * there are many classes with few instances.<p>
 *
 * If wrapped by other classes, typical usage of this class is:<br>
 *
 * <code>
 * JRip rip = new JRip();
 * Instances data = ...          // Data from somewhere
 * double[] orderedClasses = ... // Get the ordered class counts for the data
 * double expFPRate = ...        // Calculate the expected FP/(FP+FN) rate
 * double classIndex = ...       // The class index for which the ruleset is built
 *
 * // DL of the default rule: no theory DL, only data DL
 * double defDL = RuleStats.dataDL(expFPRate, 0.0, data.sumOfWeights(),
 *                                 0.0, orderedClasses[(int)classIndex]);
 *
 * rip.rulesetForOneClass(expFPRate, data, classIndex, defDL);
 * RuleStats rulesetStats = rip.getRuleStats(0);
 *
 * // Can get heaps of information from RuleStats, e.g. combined DL,
 * // simpleStats, etc.
 * double comDL = rulesetStats.combinedDL(expFPRate, classIndex);
 * int whichRule = ...           // Want simple stats of which rule?
 * double[] simpleStats = rulesetStats.getSimpleStats(whichRule);
 * ...
 * </code><p>
 *
 * For details, please see "Fast Effective Rule Induction", William W. Cohen,
 * 'Machine Learning: Proceedings of the Twelfth International Conference'
 * (ML95). <p>
 *
 * PS. We have compared this implementation with the original ripper
 * implementation with respect to accuracy, ruleset size and running time
 * on both the artificial data "ab+bcd+defg" and UCI datasets. In all these
 * aspects it seems to be quite comparable to the original ripper
 * implementation. However, we didn't consider memory consumption
 * optimization in this implementation.<p>
 *
 * @author Xin Xu (xx5@cs.waikato.ac.nz)
 * @author Eibe Frank (eibe@cs.waikato.ac.nz)
 * @version $Revision: 1.1.1.1 $
 */
public class JRip extends DistributionClassifier
  implements OptionHandler, AdditionalMeasureProducer,
             WeightedInstancesHandler {

  /** The limit of description length surplus in ruleset generation:
   *  building stops once the ruleset's DL is this many bits greater
   *  than the smallest DL found so far (see step 1 above) */
  private static double MAX_DL_SURPLUS = 64.0;

  /** The class attribute of the data */
  private Attribute m_Class;

  /** The ruleset */
  private FastVector m_Ruleset;

  /** The predicted class distribution */
  private FastVector m_Distributions;

  /** Runs of optimizations */
  private int m_Optimizations = 2;

  /** Random object used in this class */
  private Random m_Random = null;

  /** # of all the possible conditions in a rule */
  private double m_Total = 0;

  /** The seed used to perform randomization */
  private long m_Seed = 1;

  /** The number of folds to split data into Grow and Prune for IREP */
  private int m_Folds = 3;

  /** The minimal number of instance weights within a split */
  private double m_MinNo = 2.0;

  /** Whether in debug mode */
  private boolean m_Debug = false;

  /** Whether to check for error rate >= 0.5 in the stopping criterion */
  private boolean m_CheckErr = true;

  /** Whether to use pruning, i.e. whether
      the data is clean or not */
  private boolean m_UsePruning = true;

  /** The filter used to randomize the class order */
  private Filter m_Filter = null;

  /** The RuleStats for the ruleset of each class value */
  private FastVector m_RulesetStats;

  /**
   * Returns an enumeration describing the available options.
   * Valid options are: <p>
   *
   * -F number <br>
   * The number of folds for reduced error pruning. One fold is
   * used as the pruning set. (Default: 3) <p>
   *
   * -N number <br>
   * The minimal weights of instances within a split.
   * (Default: 2) <p>
   *
   * -O number <br>
   * Set the number of runs of optimizations. (Default: 2)<p>
   *
   * -D <br>
   * Whether to turn on debug mode. <p>
   *
   * -S number <br>
   * The seed of randomization used in Ripper. (Default: 1)<p>
   *
   * -E <br>
   * Whether NOT to check for error rate >= 0.5 in the stopping
   * criterion. (Default: check)<p>
   *
   * -P <br>
   * Whether NOT to use pruning. (Default: use pruning)<p>
   *
   * @return an enumeration of all the available options
   */
  public Enumeration listOptions() {
    Vector newVector = new Vector(7);
    newVector.addElement(new Option("\tSet number of folds for REP\n" +
                                    "\tOne fold is used as pruning set.\n" +
                                    "\t(default 3)", "F", 1,
                                    "-F <number of folds>"));
    newVector.addElement(new Option("\tSet the minimal weights of instances\n" +
                                    "\twithin a split.\n" +
                                    "\t(default 2.0)", "N", 1,
                                    "-N <min. weights>"));
    newVector.addElement(new Option("\tSet the number of runs of\n" +
                                    "\toptimizations. (Default: 2)", "O", 1,
                                    "-O <number of runs>"));
    newVector.addElement(new Option("\tSet whether to turn on\n" +
                                    "\tdebug mode (Default: false)", "D", 0,
                                    "-D"));
    newVector.addElement(new Option("\tThe seed of randomization\n" +
                                    "\t(Default: 1)", "S", 1,
                                    "-S <seed>"));
    newVector.addElement(new Option("\tWhether NOT to check the error rate >= 0.5\n" +
                                    "\tin the stopping criterion\n" +
                                    "\t(default: check)", "E", 0,
                                    "-E"));
    newVector.addElement(new Option("\tWhether NOT to use pruning\n" +
                                    "\t(default: use pruning)", "P", 0,
                                    "-P"));
    return newVector.elements();
  }

  /**
   * Parses a given list of options.
   *
   * @param options the list of options as an array of strings
   * @exception Exception if an option is not supported
   */
  public void setOptions(String[] options) throws Exception {
    String numFoldsString = Utils.getOption('F', options);
    if (numFoldsString.length() != 0)
      m_Folds = Integer.parseInt(numFoldsString);
    else
      m_Folds = 3;

    String minNoString = Utils.getOption('N', options);
    if (minNoString.length() != 0)
      m_MinNo = Double.parseDouble(minNoString);
    else
      m_MinNo = 2.0;

    String seedString = Utils.getOption('S', options);
    if (seedString.length() != 0)
      m_Seed = Long.parseLong(seedString);
    else
      m_Seed = 1;

    String runString = Utils.getOption('O', options);
    if (runString.length() != 0)
      m_Optimizations = Integer.parseInt(runString);
    else
      m_Optimizations = 2;

    m_Debug = Utils.getFlag('D', options);
    m_CheckErr = !Utils.getFlag('E', options);
    m_UsePruning = !Utils.getFlag('P', options);
  }

  /**
   * Gets the current settings of the Classifier.
   *
   * @return an array of strings suitable for passing to setOptions
   */
  public String[] getOptions() {
    String[] options = new String[11];
    int current = 0;

    options[current++] = "-F"; options[current++] = "" + m_Folds;
    options[current++] = "-N"; options[current++] = "" + m_MinNo;
    options[current++] = "-O"; options[current++] = "" + m_Optimizations;
    options[current++] = "-S"; options[current++] = "" + m_Seed;

    if (m_Debug)
      options[current++] = "-D";
    if (!m_CheckErr)
      options[current++] = "-E";
    if (!m_UsePruning)
      options[current++] = "-P";

    while (current < options.length)
      options[current++] = "";
    return options;
  }

  /**
   * Returns an enumeration of the additional measure names
   * @return an enumeration of the measure names
   */
  public Enumeration enumerateMeasures() {
    Vector newVector = new Vector(1);
    newVector.addElement("measureNumRules");
    return newVector.elements();
  }

  /**
   * Returns the value of the named measure
   * @param additionalMeasureName the name of the measure to query for its value
   * @return the value of the named measure
   * @exception IllegalArgumentException if the named measure is not supported
   */
  public double getMeasure(String additionalMeasureName) {
    if (additionalMeasureName.compareTo("measureNumRules") == 0)
      return m_Ruleset.size();
    else
      throw new IllegalArgumentException(additionalMeasureName
                                         + " not supported (RIPPER)");
  }

  public void setFolds(int fold) { m_Folds = fold; }
  public int getFolds() { return m_Folds; }

  public void setMinNo(double m) { m_MinNo = m; }
  public double getMinNo() { return m_MinNo; }

  public void setSeed(long s) { m_Seed = s; }
  public long getSeed() { return m_Seed; }

  public void setOptimizations(int run) { m_Optimizations = run; }
  public int getOptimizations() { return m_Optimizations; }

  public void setDebug(boolean d) { m_Debug = d; }
  public boolean getDebug() { return m_Debug; }

  public void setCheckErrorRate(boolean d) { m_CheckErr = d; }
  public boolean getCheckErrorRate() { return m_CheckErr; }

  public void setUsePruning(boolean d) { m_UsePruning = d; }
  public boolean getUsePruning() { return m_UsePruning; }

  /**
   * Get the ruleset generated by Ripper
   *
   * @return the ruleset
   */
  public FastVector getRuleset() { return m_Ruleset; }

  /**
   * Get the statistics of the ruleset in the given position
   *
   * @param pos the position of the stats, assumed to be valid
   */
  public RuleStats getRuleStats(int pos) {
    return (RuleStats) m_RulesetStats.elementAt(pos);
  }

  /**
   * The single antecedent in the rule, which is composed of an attribute and
   * the corresponding value. There are two derived classes, namely
   * NumericAntd and NominalAntd, in which the attributes are numeric and
   * nominal respectively.
   */
  private abstract class Antd implements WeightedInstancesHandler, Copyable {

    /* The attribute of the antecedent */
    protected Attribute att;

    /* The attribute value of the antecedent.
       For a numeric attribute, value is either 0 (1st bag) or 1 (2nd bag) */
    protected double value;

    /* The maximum infoGain achieved by this antecedent test
     * in the growing data */
    protected double maxInfoGain;

    /* The accurate rate of this antecedent test on the growing data */
    protected double accuRate;

    /* The coverage of this antecedent in the growing data */
    protected double cover;

    /* The accurate data for this antecedent in the growing data */
    protected double accu;

    /* Constructor */
    public Antd(Attribute a) {
      att = a;
      value = Double.NaN;
      maxInfoGain = 0;
      accuRate = Double.NaN;
      cover = Double.NaN;
      accu = Double.NaN;
    }

    /* The abstract members for inheritance */
    public abstract Instances[] splitData(Instances data, double defAcRt,
                                          double cla);
    public abstract boolean covers(Instance inst);
    public abstract String toString();

    /** Implements Copyable */
    public abstract Object copy();
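
The grow- and prune-phase metrics described in the class javadoc are simple enough to check by hand. Below is a minimal, self-contained sketch of those formulas; the class and method names (RipperMetricsDemo, infoGain, pruneMetric, optimizationMetric) are illustrative only and are not part of JRip's API. The logs are taken base 2 here; the javadoc writes log without a base, and a fixed base only rescales the gains without changing which condition ranks highest.

/**
 * A minimal, self-contained sketch of the metrics described in the JRip
 * class javadoc. All names here are illustrative, not part of JRip's API.
 */
public class RipperMetricsDemo {

  /* Grow phase: information gain p * (log2(p/t) - log2(P/T)), where p/t
   * are the positive/total counts covered after adding a condition and
   * P/T the counts covered before. */
  static double infoGain(double p, double t, double P, double T) {
    return p * (log2(p / t) - log2(P / T));
  }

  /* Prune phase: (p+1)/(p+n+2), the smoothed form of p/(p+n) used in this
   * implementation; it evaluates to 0.5 when p + n == 0. */
  static double pruneMetric(double p, double n) {
    return (p + 1.0) / (p + n + 2.0);
  }

  /* Optimization stage: accuracy (TP+TN)/(P+N) of the whole ruleset. */
  static double optimizationMetric(double tp, double tn, double P, double N) {
    return (tp + tn) / (P + N);
  }

  static double log2(double x) {
    return Math.log(x) / Math.log(2.0);
  }

  public static void main(String[] args) {
    // A candidate condition narrows coverage from P=50 of T=100 instances
    // to p=20 of t=25: gain = 20 * (log2(0.8) - log2(0.5)) ~= 13.56.
    System.out.println(infoGain(20, 25, 50, 100));
    // A pruned rule covering p=20 positives and n=5 negatives scores
    // 21/27 ~= 0.778.
    System.out.println(pruneMetric(20, 5));
    // A ruleset with TP=60 and TN=30 on P=70 positives and N=30 negatives
    // scores 90/100 = 0.9.
    System.out.println(optimizationMetric(60, 30, 70, 30));
  }
}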
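Since the class implements OptionHandler, it can also be configured through the option strings documented in listOptions() rather than the individual setters. The following is a minimal sketch, assuming a Weka build containing this JRip is on the classpath; the class name JRipOptionsDemo is illustrative.

import weka.classifiers.rules.JRip;

public class JRipOptionsDemo {

  public static void main(String[] args) throws Exception {
    JRip rip = new JRip();

    // Equivalent to setFolds(4), setOptimizations(3), setMinNo(2.5) and
    // setSeed(42); see the -F, -O, -N and -S flags in listOptions().
    rip.setOptions(new String[] {"-F", "4", "-O", "3", "-N", "2.5", "-S", "42"});

    // getOptions() echoes the settings back, padded with empty strings
    // (see the while loop at the end of getOptions() above).
    for (String opt : rip.getOptions()) {
      if (opt.length() > 0) {
        System.out.print(opt + " ");
      }
    }
    System.out.println(); // prints: -F 4 -N 2.5 -O 3 -S 42
  }
}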