/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
* JRip.java
* Copyright (C) 2001 Xin Xu, Eibe Frank
*/
package weka.classifiers.rules;
import java.io.Serializable;
import java.util.Enumeration;
import java.util.Random;
import java.util.Vector;
import weka.classifiers.Classifier;
import weka.classifiers.Evaluation;
import weka.core.AdditionalMeasureProducer;
import weka.core.Attribute;
import weka.core.Copyable;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.UnsupportedAttributeTypeException;
import weka.core.UnsupportedClassTypeException;
import weka.core.Utils;
import weka.core.WeightedInstancesHandler;
import weka.filters.Filter;
import weka.filters.supervised.attribute.ClassOrder;
/**
* This class implements a propositional rule learner, Repeated Incremental
* Pruning to Produce Error Reduction (RIPPER), which was proposed by William
* W. Cohen as an optimized version of IREP. <p>
*
* The algorithm is briefly described as follows: <p>
* Initialize RS = {}, and for each class from the least prevalent one to
* the most frequent one, DO: <p>
*
* 1. Building stage: repeat 1.1 and 1.2 until the description length (DL)
* of the ruleset and examples is 64 bits greater than the smallest DL
* met so far, or there are no positive examples, or the error rate >= 50%.
* <p>
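*
* As an illustrative sketch (not part of the original documentation), the
* stopping test of the building stage can be written as follows, using the
* 64-bit constant MAX_DL_SURPLUS defined in this class; the method name
* stopBuilding is hypothetical:
*
* <code>
* // stop building when the DL surplus, the positives, or the error rate says so
* boolean stopBuilding(double currentDL, double minDL,
* double numPositives, double errorRate) {
* return (currentDL > minDL + MAX_DL_SURPLUS) // DL exceeds smallest DL by 64 bits
* || (numPositives <= 0) // no positive examples left
* || (errorRate >= 0.5); // error rate at least 50%
* }
* </code>
* <p>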
* 1.1. Grow phase:<br>
* Grow one rule by greedily adding antecedents (or conditions) to
* the rule until the rule is perfect (i.e. 100% accurate). The
* procedure tries every possible value of each attribute and selects
* the condition with highest information gain: p(log(p/t)-log(P/T)).
* <p>
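*
* For illustration (a sketch, not the exact code used in this class), the
* gain of a candidate antecedent can be computed as follows; the helper
* name gain is hypothetical:
*
* <code>
* // p, t: positive/total weight covered after adding the antecedent
* // P, T: positive/total weight covered before adding it (log base 2)
* double gain(double p, double t, double P, double T) {
* return p * (Utils.log2(p / t) - Utils.log2(P / T));
* }
* </code>
* <p>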
* 1.2. Prune phase:<br>
* Incrementally prune each rule and allow the pruning of any
* final sequences of the antecedents;<br>
* The pruning metric is (p-n)/(p+n), which is equivalent to
* 2p/(p+n) - 1, so this implementation simply uses p/(p+n)
* (actually (p+1)/(p+n+2), so that if p+n is 0 the metric is 0.5).<p>
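*
* A minimal sketch of the smoothed pruning metric (illustrative only, with
* a hypothetical method name):
*
* <code>
* // p, n: positive/negative weight covered by the rule on the pruning data
* double pruneMetric(double p, double n) {
* return (p + 1.0) / (p + n + 2.0); // 0.5 when p + n == 0
* }
* </code>
* <p>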
*
* 2. Optimization stage: after generating the initial ruleset {Ri},
* generate and prune two variants of each rule Ri from randomized data
* using procedures 1.1 and 1.2. One variant is generated from an
* empty rule, while the other is generated by greedily adding antecedents
* to the original rule. Moreover, the pruning metric used here is
* (TP+TN)/(P+N).<br>
* Then the smallest possible DL for each variant and the original rule
* is computed. The variant with the minimal DL is selected as the final
* representative of Ri in the ruleset. <br>
* After all the rules in {Ri} have been examined, if there are still
* residual positives, more rules are generated based on the residual
* positives using the Building Stage again. <p>
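*
* The variant-pruning metric of the optimization stage is plain accuracy
* on the pruning data; an illustrative sketch (hypothetical method name):
*
* <code>
* // tp, tn: true positive/true negative weight covered by the ruleset
* // P, N: total positive/negative weight in the pruning data
* double optimizeMetric(double tp, double tn, double P, double N) {
* return (tp + tn) / (P + N);
* }
* </code>
* <p>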
*
* 3. Delete any rule from the ruleset whose presence would increase the DL
* of the whole ruleset, and add the resultant ruleset to RS. <p>
*
* ENDDO<p>
*
* Note that there seem to be 2 bugs in the original ripper program that would
* affect the ruleset size and accuracy slightly. This implementation avoids
* these bugs and thus is a little bit different from Cohen's original
* implementation. Even after fixing the bugs, since the order of classes with
* the same frequency is not defined in ripper, there still seems to be
* some trivial difference between this implementation and the original ripper,
* especially for audiology data in UCI repository, where there are lots of
* classes of few instances.<p>
*
* If wrapped by other classes, typical usage of this class is:<br>
*
* <code>JRip rip = new JRip();
* Instances data = ... // Data from somewhere
* double[] orderedClasses = ... // Get the ordered class counts for the data
* double expFPRate = ... // Calculate the expected FP/(FP+FN) rate
* double classIndex = ... // The class index for which the ruleset is built
* // DL of default rule, no theory DL, only data DL
* double defDL = RuleStats.dataDL(expFPRate, 0.0, data.sumOfWeights(),
* 0.0, orderedClasses[(int)classIndex]);
*
* rip.rulesetForOneClass(expFPRate, data, classIndex, defDL);
* RuleStats rulesetStats = rip.getRuleStats(0);
*
* // Can get heaps of information from RuleStats, e.g. combined DL,
* // simpleStats, etc.
* double comDL = rulesetStats.combinedDL(expFPRate, classIndex);
* int whichRule = ... // Want simple stats of which rule?
* double[] simpleStats = rulesetStats.getSimpleStats(whichRule);
* ...
* </code>
*
* For details, please see "Fast Effective Rule Induction", William W. Cohen,
* 'Machine Learning: Proceedings of the Twelfth International Conference'
* (ML95). <p>
*
* PS. We have compared this implementation with the original ripper
* implementation in aspects of accuracy, ruleset size and running time
* on both artificial data "ab+bcd+defg" and UCI datasets. In all these
* aspects it seems to be quite comparable to the original ripper
* implementation. However, we didn't consider memory consumption
* optimization in this implementation.<p>
*
* @author Xin Xu (xx5@cs.waikato.ac.nz)
* @author Eibe Frank (eibe@cs.waikato.ac.nz)
* @version $Revision$
*/
public class JRip extends Classifier
implements OptionHandler,
AdditionalMeasureProducer,
WeightedInstancesHandler{
/** The limit of description length surplus in ruleset generation */
private static final double MAX_DL_SURPLUS = 64.0;
/** The class attribute of the data*/
private Attribute m_Class;
/** The ruleset */
private FastVector m_Ruleset;
/** The predicted class distribution */
private FastVector m_Distributions;
/** Runs of optimizations */
private int m_Optimizations = 2;
/** Random object used in this class */
private Random m_Random = null;
/** # of all the possible conditions in a rule */
private double m_Total = 0;
/** The seed to perform randomization */
private long m_Seed = 1;
/** The number of folds to split data into Grow and Prune for IREP */
private int m_Folds = 3;
/** The minimal number of instance weights within a split*/
private double m_MinNo = 2.0;
/** Whether in a debug mode */
private boolean m_Debug = false;
/** Whether to check for error rate >= 0.5 in the stopping criterion */
private boolean m_CheckErr = true;
/** Whether to use pruning; can be turned off if the data is known to be clean */
private boolean m_UsePruning = true;
/** The filter used to randomize the class order */
private Filter m_Filter = null;
/** The RuleStats for the ruleset of each class value */
private FastVector m_RulesetStats;
/**
* Returns a string describing classifier
* @return a description suitable for
* displaying in the explorer/experimenter gui
*/
public String globalInfo() {
return "This class implements a propositional rule learner, Repeated Incremental "
+ "Pruning to Produce Error Reduction (RIPPER), which was proposed by William "
+ "W. Cohen as an optimized version of IREP. \n\n"
+ "The algorithm is briefly described as follows: \n\n"
+ "Initialize RS = {}, and for each class from the less prevalent one to "
+ "the more frequent one, DO: \n\n"
+ "1. Building stage:\nRepeat 1.1 and 1.2 until the descrition length (DL) "
+ "of the ruleset and examples is 64 bits greater than the smallest DL "
+ "met so far, or there are no positive examples, or the error rate >= 50%. "
+ "\n\n"
+ "1.1. Grow phase:\n"
+ "Grow one rule by greedily adding antecedents (or conditions) to "
+ "the rule until the rule is perfect (i.e. 100% accurate). The "
+ "procedure tries every possible value of each attribute and selects "
+ "the condition with highest information gain: p(log(p/t)-log(P/T))."
+ "\n\n"
+ "1.2. Prune phase:\n"
+ "Incrementally prune each rule and allow the pruning of any "
+ "final sequences of the antecedents;"
+ "The pruning metric is (p-n)/(p+n) -- but it's actually "
+ "2p/(p+n) -1, so in this implementation we simply use p/(p+n) "
+ "(actually (p+1)/(p+n+2), thus if p+n is 0, it's 0.5).\n\n"
+ "2. Optimization stage:\n after generating the initial ruleset {Ri}, "
+ "generate and prune two variants of each rule Ri from randomized data "
+ "using procedure 1.1 and 1.2. But one variant is generated from an "
+ "empty rule while the other is generated by greedily adding antecedents "
+ "to the original rule. Moreover, the pruning metric used here is "
+ "(TP+TN)/(P+N)."
+ "Then the smallest possible DL for each variant and the original rule "
+ "is computed. The variant with the minimal DL is selected as the final "
+ "representative of Ri in the ruleset."
+ "After all the rules in {Ri} have been examined and if there are still "
+ "residual positives, more rules are generated based on the residual "
+ "positives using Building Stage again. \n"
+ "3. Delete the rules from the ruleset that would increase the DL of the "
+ "whole ruleset if it were in it. and add resultant ruleset to RS. \n"
+ "ENDDO\n\n"
+ "Note that there seem to be 2 bugs in the original ripper program that would "
+ "affect the ruleset size and accuracy slightly. This implementation avoids "
+ "these bugs and thus is a little bit different from Cohen's original "
+ "implementation. Even after fixing the bugs, since the order of classes with "
+ "the same frequency is not defined in ripper, there still seems to be "
+ "some trivial difference between this implementation and the original ripper, "
+ "especially for audiology data in UCI repository, where there are lots of "
+ "classes of few instances.\n\n"
+ "Details please see \"Fast Effective Rule Induction\", William W. Cohen, "
+ "'Machine Learning: Proceedings of the Twelfth International Conference'"
+ "(ML95). \n\n"
+ "PS. We have compared this implementation with the original ripper "
+ "implementation in aspects of accuracy, ruleset size and running time "
+ "on both artificial data \"ab+bcd+defg\" and UCI datasets. In all these "
+ "aspects it seems to be quite comparable to the original ripper "
+ "implementation. However, we didn't consider memory consumption "
+ "optimization in this implementation.\n\n";
}
/**
* Returns an enumeration describing the available options.
* Valid options are: <p>
*
* -F number <br>
* The number of folds for reduced error pruning. One fold is
* used as the pruning set. (Default: 3) <p>
*
* -N number <br>
* The minimal weights of instances within a split.
* (Default: 2) <p>
*
* -O number <br>
* Set the number of runs of optimizations. (Default: 2)<p>
*
* -D <br>
* Whether to turn on debug mode. <p>
*
* -S number <br>
* The seed of randomization used in Ripper. (Default: 1)<p>
*
* -E <br>
* Whether NOT to check the error rate >= 0.5 in the stopping criterion.
* (default: check)<p>
*
* -P <br>
* Whether NOT to use pruning. (default: use pruning)<p>
*
* @return an enumeration of all the available options
*/
public Enumeration listOptions() {
Vector newVector = new Vector(7);
newVector.addElement(new Option("\tSet number of folds for REP\n" +
"\tOne fold is used as pruning set.\n" +
"\t(default 3)","F", 1, "-F <number of folds>"));
newVector.addElement(new Option("\tSet the minimal weights of instances\n" +
"\twithin a split.\n" +
"\t(default 2.0)","N", 1, "-N <min. weights>"));
newVector.addElement(new Option("\tSet the number of runs of\n"+
"\toptimizations. (Default: 2)", "O",
1,"-O <number of runs>"));
newVector.addElement(new Option("\tSet whether turn on the\n"+
"\tdebug mode (Default: false)", "D",
0,"-D"));
newVector.addElement(new Option("\tThe seed of randomization\n"+
"\t(Default: 1)", "S",
1,"-S <seed>"));
newVector.addElement(new Option("Whether NOT check the error rate>=0.5\n"
+"\tin stopping criteria "
+"\t(default: check)", "E",
0, "-E"));
newVector.addElement(new Option("Whether NOT use pruning\n"
+"\t(default: use pruning)", "P",
0, "-P"));
return newVector.elements();
}
/**
* Parses a given list of options.
*
* @param options the list of options as an array of strings
* @exception Exception if an option is not supported
*/
public void setOptions(String[] options) throws Exception {
String numFoldsString = Utils.getOption('F', options);
if (numFoldsString.length() != 0)
m_Folds = Integer.parseInt(numFoldsString);
else
m_Folds = 3;
String minNoString = Utils.getOption('N', options);
if (minNoString.length() != 0)
m_MinNo = Double.parseDouble(minNoString);
else
m_MinNo = 2.0;
String seedString = Utils.getOption('S', options);
if (seedString.length() != 0)
m_Seed = Long.parseLong(seedString);
else
m_Seed = 1;
String runString = Utils.getOption('O', options);
if (runString.length() != 0)
m_Optimizations = Integer.parseInt(runString);
else
m_Optimizations = 2;
m_Debug = Utils.getFlag('D', options);
m_CheckErr = !Utils.getFlag('E', options);
m_UsePruning = !Utils.getFlag('P', options);
}
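// Illustrative sketch (not part of the original Weka source): shows how
// the options parsed above can be supplied programmatically. The method
// name exampleConfiguredRip is hypothetical.
private static JRip exampleConfiguredRip() throws Exception {
JRip rip = new JRip();
rip.setOptions(new String[]{"-F", "5", "-O", "3", "-S", "42", "-D"});
// equivalent to: rip.setFolds(5); rip.setOptimizations(3); rip.setSeed(42); debug on
return rip;
}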
/**
* Gets the current settings of the Classifier.
*
* @return an array of strings suitable for passing to setOptions
*/
public String [] getOptions() {
String [] options = new String [11];
int current = 0;
options[current++] = "-F"; options[current++] = "" + m_Folds;
options[current++] = "-N"; options[current++] = "" + m_MinNo;
options[current++] = "-O"; options[current++] = "" + m_Optimizations;
options[current++] = "-S"; options[current++] = "" + m_Seed;
if(m_Debug)
options[current++] = "-D";
if(!m_CheckErr)
options[current++] = "-E";
if(!m_UsePruning)
options[current++] = "-P";
while(current < options.length)
options[current++] = "";
return options;
}
/**
* Returns an enumeration of the additional measure names
* @return an enumeration of the measure names
*/
public Enumeration enumerateMeasures() {
Vector newVector = new Vector(1);
newVector.addElement("measureNumRules");
return newVector.elements();
}
/**
* Returns the value of the named measure
* @param additionalMeasureName the name of the measure to query for its value
* @return the value of the named measure
* @exception IllegalArgumentException if the named measure is not supported
*/
public double getMeasure(String additionalMeasureName) {
if (additionalMeasureName.compareToIgnoreCase("measureNumRules") == 0)
return m_Ruleset.size();
else
throw new IllegalArgumentException(additionalMeasureName+" not supported (RIPPER)");
}
/**
* Returns the tip text for this property
* @return tip text for this property suitable for
* displaying in the explorer/experimenter gui
*/
public String foldsTipText() {
return "Determines the amount of data used for pruning. One fold is used for "
+ "pruning, the rest for growing the rules.";
}
/** Sets the number of folds for REP; one fold is used as the pruning set */
public void setFolds(int fold){ m_Folds = fold; }
/** Gets the number of folds for REP */
public int getFolds(){ return m_Folds; }
/**
* Returns the tip text for this property
* @return tip text for this property suitable for
* displaying in the explorer/experimenter gui
*/
public String minNoTipText() {
return "The minimum total weight of the instances in a rule.";
}
/** Sets the minimum total weight of the instances in a rule */
public void setMinNo(double m){ m_MinNo = m; }
/** Gets the minimum total weight of the instances in a rule */
public double getMinNo(){ return m_MinNo; }
/**
* Returns the tip text for this property
* @return tip text for this property suitable for
* displaying in the explorer/experimenter gui
*/
public String seedTipText() {
return "The seed used for randomizing the data.";
}
/** Sets the seed used for randomizing the data */
public void setSeed(long s){ m_Seed = s; }
/** Gets the seed used for randomizing the data */
public long getSeed(){ return m_Seed; }
/**
* Returns the tip text for this property
* @return tip text for this property suitable for
* displaying in the explorer/experimenter gui
*/
public String optimizationsTipText() {
return "The number of optimization runs.";
}
/** Sets the number of optimization runs */
public void setOptimizations(int run){ m_Optimizations = run; }
/** Gets the number of optimization runs */
public int getOptimizations(){ return m_Optimizations; }