racesearch.java

来自「一个数据挖掘软件ALPHAMINERR的整个过程的JAVA版源代码」· Java 代码 · 共 1,662 行 · 第 1/4 页
JAVA
1,662 行
/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    RaceSearch.java
 *    Copyright (C) 2000 Mark Hall
 *
 */

package  weka.attributeSelection;

import java.util.BitSet;
import java.util.Enumeration;
import java.util.Random;
import java.util.Vector;

import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.SelectedTag;
import weka.core.Statistics;
import weka.core.Tag;
import weka.core.Utils;
import weka.experiment.PairedStats;
import weka.experiment.Stats;

/** 
 * Class for performing a racing search. <p>
 *
 * For more information see: <br>
 * Moore, A. W. and Lee, M. S. (1994). Efficient algorithms for minimising
 * cross validation error. Proceedings of the Eleventh International
 * Conference on Machine Learning. pp 190--198. <p>
 *
 * Valid options are:<p>
 *
 * -R <race type><br>
 * 0 = forward, 1 = backward, 2 = schemata, 3 = rank. <p>
 * 
 * -L <significance level> <br>
 * significance level to use for t-tests. <p>
 *
 * -T <threshold> <br>
 * threshold for considering mean errors of two subsets the same <p>
 *
 * -F <xval type> <br>
 * 0 = 10 fold, 1 = leave-one-out (selected automatically for schemata race
 * <p>
 *
 * -A <attribute evaluator> <br>
 * the attribute evaluator to use when doing a rank search <p>
 *
 * -Q <br>
 * produce a ranked list of attributes. Selecting this option forces
 * the race type to be forward. Racing continues until *all* attributes
 * have been selected, thus producing a ranked list of attributes. <p>
 *
 * -N <number to retain> <br>
 * Specify the number of attributes to retain. Overides any threshold. 
 * Use in conjunction with -Q. <p>
 * 
 * -J <threshold> <br>
 * Specify a threshold by which the AttributeSelection module can discard
 * attributes. Use in conjunction with -Q. <p>
 *
 * -Z <br>
 * Turn on verbose output for monitoring the search <p>
 *
 * @author Mark Hall (mhall@cs.waikato.ac.nz)
 * @version $Revision$
 */
public class RaceSearch extends ASSearch implements RankedOutputSearch, 
						    OptionHandler {

  /* the training instances */
  private Instances m_Instances = null;

  /** search types */
  private static final int FORWARD_RACE = 0;
  private static final int BACKWARD_RACE = 1;
  private static final int SCHEMATA_RACE = 2;
  private static final int RANK_RACE = 3;
  public static final Tag [] TAGS_SELECTION = {
    new Tag(FORWARD_RACE, "Forward selection race"),
    new Tag(BACKWARD_RACE, "Backward elimination race"),
    new Tag(SCHEMATA_RACE, "Schemata race"),
    new Tag(RANK_RACE, "Rank race")
      };
  
  /** the selected search type */
  private int m_raceType = FORWARD_RACE;
  
  /** xval types */
  private static final int TEN_FOLD = 0;
  private static final int LEAVE_ONE_OUT = 1;
  public static final Tag [] XVALTAGS_SELECTION = {
    new Tag(TEN_FOLD, "10 Fold"),
    new Tag(LEAVE_ONE_OUT, "Leave-one-out"),
      };

  /** the selected xval type */
  private int m_xvalType = TEN_FOLD;
  
  /** the class index */
  private int m_classIndex;

  /** the number of attributes in the data */
  private int m_numAttribs;

  /** the total number of partially/fully evaluated subsets */
  private int m_totalEvals;

  /** holds the merit of the best subset found */
  private double m_bestMerit = -Double.MAX_VALUE;

  /** the subset evaluator to use */
  private HoldOutSubsetEvaluator m_theEvaluator = null;

  /** the significance level for comparisons */
  private double m_sigLevel = 0.001;

  /** threshold for comparisons */
  private double m_delta = 0.001;

  /** the number of samples above which to begin testing for similarity
      between competing subsets */
  private int m_samples = 20;

  /** number of cross validation folds---equal to the number of instances
      for leave-one-out cv */
  private int m_numFolds = 10;

  /** the attribute evaluator to generate the initial ranking when
      doing a rank race */
  private ASEvaluation m_ASEval = new GainRatioAttributeEval();

  /** will hold the attribute ranking produced by the above attribute
      evaluator if doing a rank search */
  private int [] m_Ranking;

  /** verbose output for monitoring the search and debugging */
  private boolean m_debug = false;

  /** If true then produce a ranked list of attributes by fully traversing
      a forward hillclimb race */
  private boolean m_rankingRequested = false;

  /** The ranked list of attributes produced if m_rankingRequested is true */
  private double [][] m_rankedAtts;

  /** The number of attributes ranked so far (if ranking is requested) */
  private int m_rankedSoFar;

  /** The number of attributes to retain if a ranking is requested. -1
      indicates that all attributes are to be retained. Has precedence over
      m_threshold */
  private int m_numToSelect = -1;

  private int m_calculatedNumToSelect = -1;

  /** the threshold for removing attributes if ranking is requested */
  private double m_threshold = -Double.MAX_VALUE;

  /**
   * Returns a string describing this search method
   * @return a description of the search method suitable for
   * displaying in the explorer/experimenter gui
   */
  public String globalInfo() {
    return "Races the cross validation error of competing "
      +"attribute subsets. Use in conjuction with a ClassifierSubsetEval. "
      +"RaceSearch has four modes:\n\nforward selection "
      +"races all single attribute additions to a base set (initially "
      +" no attributes), selects the winner to become the new base set "
      +"and then iterates until there is no improvement over the base set. "
      +"\n\nBackward elimination is similar but the initial base set has all "
      +"attributes included and races all single attribute deletions. "
      +"\n\nSchemata search is a bit different. Each iteration a series of "
      +"races are run in parallel. Each race in a set determines whether "
      +"a particular attribute should be included or not---ie the race is "
      +"between the attribute being \"in\" or \"out\". The other attributes "
      +"for this race are included or excluded randomly at each point in the "
      +"evaluation. As soon as one race "
      +"has a clear winner (ie it has been decided whether a particular "
      +"attribute should be inor not) then the next set of races begins, "
      +"using the result of the winning race from the previous iteration as "
      +"new base set.\n\nRank race first ranks the attributes using an "
      +"attribute evaluator and then races the ranking. The race includes "
      +"no attributes, the top ranked attribute, the top two attributes, the "
      +"top three attributes, etc.\n\nIt is also possible to generate a "
      +"raked list of attributes through the forward racing process. "
      +"If generateRanking is set to true then a complete forward race will "
      +"be run---that is, racing continues until all attributes have been "
      +"selected. The order that they are added in determines a complete "
      +"ranking of all the attributes.\n\nRacing uses paired and unpaired "
      +"t-tests on cross-validation errors of competing subsets. When there "
      +"is a significant difference between the means of the errors of two "
      +"competing subsets then the poorer of the two can be eliminated from "
      +"the race. Similarly, if there is no significant difference between "
      +"the mean errors of two competing subsets and they are within some "
      +"threshold of each other, then one can be eliminated from the race. ";
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String raceTypeTipText() {
    return "Set the type of search.";
  }

  /**
   * Set the race type
   *
   * @param d the type of race
   */
  public void setRaceType (SelectedTag d) {
    
    if (d.getTags() == TAGS_SELECTION) {
      m_raceType = d.getSelectedTag().getID();
    }
    if (m_raceType == SCHEMATA_RACE && !m_rankingRequested) {
      try {
	setFoldsType(new SelectedTag(LEAVE_ONE_OUT,
				     XVALTAGS_SELECTION));
	setSignificanceLevel(0.01);
      } catch (Exception ex) {
      }
    } else {
      try {
	setFoldsType(new SelectedTag(TEN_FOLD,
				     XVALTAGS_SELECTION));
	setSignificanceLevel(0.001);
      } catch (Exception ex) {
      }
    }
  }

  /**
   * Get the race type
   *
   * @return the type of race
   */
  public SelectedTag getRaceType() {
    return new SelectedTag(m_raceType, TAGS_SELECTION);
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String significanceLevelTipText() {
    return "Set the significance level to use for t-test comparisons.";
  }

  /**
   * Sets the significance level to use
   * @param sig the significance level
   */
  public void setSignificanceLevel(double sig) {
    m_sigLevel = sig;
  }

  /**
   * Get the significance level
   * @return the current significance level
   */
  public double getSignificanceLevel() {
    return m_sigLevel;
  }
  
  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String thresholdTipText() {
    return "Set the error threshold by which to consider two subsets "
      +"equivalent.";
  }

  /**
   * Sets the threshold for comparisons
   * @param t the threshold to use
   */
  public void setThreshold(double t) {
    m_delta = t;
  }

  /**
   * Get the threshold
   * @return the current threshold
   */
  public double getThreshold() {
    return m_delta;
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String foldsTipText() {
    return "Set the number of folds to use for x-val error estimation. "
      +"Leave-one-out is selected automatically for schemata search.";
  }

  /**
   * Set the xfold type
   *
   * @param d the type of xval
   */
  public void setFoldsType (SelectedTag d) {
    
    if (d.getTags() == XVALTAGS_SELECTION) {
      m_xvalType = d.getSelectedTag().getID();
    }
  }

  /**
   * Get the xfold type
   *
   * @return the type of xval
   */
  public SelectedTag getFoldsType () {
    return new SelectedTag(m_xvalType, XVALTAGS_SELECTION);
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String debugTipText() {
    return "Turn on verbose output for monitoring the search's progress.";
  }

  /**
   * Set whether verbose output should be generated.
   * @param d true if output is to be verbose.
   */
  public void setDebug(boolean d) {
    m_debug = d;
  }

  /**
   * Get whether output is to be verbose
   * @return true if output will be verbose
   */
  public boolean getDebug() {
    return m_debug;
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String attributeEvaluatorTipText() {
    return "Attribute evaluator to use for generating an initial ranking. "
      +"Use in conjunction with a rank race";    
  }

  /**
   * Set the attribute evaluator to use for generating the ranking.
   * @param newEvaluator the attribute evaluator to use.
   */
  public void setAttributeEvaluator(ASEvaluation newEvaluator) {
    m_ASEval = newEvaluator;
  }

  /**
   * Get the attribute evaluator used to generate the ranking.
   * @return the evaluator used to generate the ranking.
   */
  public ASEvaluation getAttributeEvaluator() {
    return m_ASEval;
  }

    /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String generateRankingTipText() {
    return "Use the racing process to generate a ranked list of attributes. "
      +"Using this mode forces the race to be a forward type and then races "
      +"until all attributes have been added, thus giving a ranked list";
  }
  
  /**
   * Records whether the user has requested a ranked list of attributes.
   * @param doRank true if ranking is requested
   */
  public void setGenerateRanking(boolean doRank) {
    m_rankingRequested = doRank;
    if (m_rankingRequested) {
      try {
	setRaceType(new SelectedTag(FORWARD_RACE,
				    TAGS_SELECTION));
racesearch.java - 源码说明

本页面展示了「一个数据挖掘软件ALPHAMINERR的整个过程的JAVA版源代码」中的 racesearch.java 源码文件，采用 Java 编程语言编写，共 1,662 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与ALPHAMINERR相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?