📄 BFTree.java
/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    BFTree.java
 *    Copyright (C) 2007 Haijian Shi
 *
 */

package weka.classifiers.trees;

import weka.classifiers.Evaluation;
import weka.classifiers.RandomizableClassifier;
import weka.core.AdditionalMeasureProducer;
import weka.core.Attribute;
import weka.core.Capabilities;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.SelectedTag;
import weka.core.Tag;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
import weka.core.Capabilities.Capability;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.core.matrix.Matrix;

import java.util.Arrays;
import java.util.Enumeration;
import java.util.Random;
import java.util.Vector;

/**
 <!-- globalinfo-start -->
 * Class for building a best-first decision tree classifier. This class uses
 * binary split for both nominal and numeric attributes. For missing values,
 * the method of 'fractional' instances is used.<br/>
 * <br/>
 * For more information, see:<br/>
 * <br/>
 * Haijian Shi (2007). Best-first decision tree learning. Hamilton, NZ.<br/>
 * <br/>
 * Jerome Friedman, Trevor Hastie, Robert Tibshirani (2000). Additive logistic
 * regression : A statistical view of boosting. Annals of statistics.
 * 28(2):337-407.
 * <p/>
 <!-- globalinfo-end -->
 *
 <!-- technical-bibtex-start -->
 * BibTeX:
 * <pre>
 * @mastersthesis{Shi2007,
 *    address = {Hamilton, NZ},
 *    author = {Haijian Shi},
 *    note = {COMP594},
 *    school = {University of Waikato},
 *    title = {Best-first decision tree learning},
 *    year = {2007}
 * }
 *
 * @article{Friedman2000,
 *    author = {Jerome Friedman and Trevor Hastie and Robert Tibshirani},
 *    journal = {Annals of statistics},
 *    number = {2},
 *    pages = {337-407},
 *    title = {Additive logistic regression : A statistical view of boosting},
 *    volume = {28},
 *    year = {2000},
 *    ISSN = {0090-5364}
 * }
 * </pre>
 * <p/>
 <!-- technical-bibtex-end -->
 *
 <!-- options-start -->
 * Valid options are: <p/>
 *
 * <pre> -S <num>
 *  Random number seed.
 *  (default 1)</pre>
 *
 * <pre> -D
 *  If set, classifier is run in debug mode and
 *  may output additional info to the console</pre>
 *
 * <pre> -P <UNPRUNED|POSTPRUNED|PREPRUNED>
 *  The pruning strategy.
 *  (default: POSTPRUNED)</pre>
 *
 * <pre> -M <min no>
 *  The minimal number of instances at the terminal nodes.
 *  (default 2)</pre>
 *
 * <pre> -N <num folds>
 *  The number of folds used in the pruning.
 *  (default 5)</pre>
 *
 * <pre> -H
 *  Don't use heuristic search for nominal attributes in multi-class problems
 *  (default yes).</pre>
 *
 * <pre> -G
 *  Don't use Gini index for splitting (default yes);
 *  if not, information gain is used.</pre>
 *
 * <pre> -R
 *  Don't use error rate in internal cross-validation (default yes),
 *  but root mean squared error.</pre>
 *
 * <pre> -A
 *  Use the 1 SE rule to make pruning decision.
 *  (default no).</pre>
 *
 * <pre> -C
 *  Percentage of training data size (0-1]
 *  (default 1).</pre>
 <!-- options-end -->
 *
 * @author  Haijian Shi (hs69@cs.waikato.ac.nz)
 * @version $Revision: 1.3 $
 */
public class BFTree
  extends RandomizableClassifier
  implements AdditionalMeasureProducer, TechnicalInformationHandler {

  /** For serialization. */
  private static final long serialVersionUID = -7035607375962528217L;

  /** pruning strategy: un-pruned */
  public static final int PRUNING_UNPRUNED = 0;
  /** pruning strategy: post-pruning */
  public static final int PRUNING_POSTPRUNING = 1;
  /** pruning strategy: pre-pruning */
  public static final int PRUNING_PREPRUNING = 2;
  /** pruning strategy */
  public static final Tag[] TAGS_PRUNING = {
    new Tag(PRUNING_UNPRUNED, "unpruned", "Un-pruned"),
    new Tag(PRUNING_POSTPRUNING, "postpruned", "Post-pruning"),
    new Tag(PRUNING_PREPRUNING, "prepruned", "Pre-pruning")
  };

  /** the pruning strategy */
  protected int m_PruningStrategy = PRUNING_POSTPRUNING;

  /** Successor nodes. */
  protected BFTree[] m_Successors;

  /** Attribute used for splitting. */
  protected Attribute m_Attribute;

  /** Split point (for numeric attributes). */
  protected double m_SplitValue;

  /** Split subset (for nominal attributes). */
  protected String m_SplitString;

  /** Class value for a node. */
  protected double m_ClassValue;

  /** Class attribute of a dataset. */
  protected Attribute m_ClassAttribute;

  /** Minimum number of instances at leaf nodes. */
  protected int m_minNumObj = 2;

  /** Number of folds for the pruning. */
  protected int m_numFoldsPruning = 5;

  /** If the node is a leaf node. */
  protected boolean m_isLeaf;

  /** Number of expansions. */
  protected static int m_Expansion;

  /** Fixed number of expansions (if no pruning method is used, its value is -1.
   *  Otherwise, its value is obtained from internal cross-validation). */
  protected int m_FixedExpansion = -1;

  /** If heuristic search is used for binary splits (default true). Note that even if
   *  its value is true, it is only used when the number of values of a nominal
   *  attribute is larger than 4. */
  protected boolean m_Heuristic = true;

  /** If the Gini index is used as the splitting criterion - default
   *  (if not, information gain is used). */
  protected boolean m_UseGini = true;

  /** If the error rate is used in internal cross-validation to fix the number of
   *  expansions - default (if not, root mean squared error is used). */
  protected boolean m_UseErrorRate = true;

  /** If the 1SE rule is used to make the pruning decision. */
  protected boolean m_UseOneSE = false;

  /** Class distributions. */
  protected double[] m_Distribution;

  /** Branch proportions. */
  protected double[] m_Props;

  /** Sorted indices. */
  protected int[][] m_SortedIndices;

  /** Sorted weights. */
  protected double[][] m_Weights;

  /** Distributions of each attribute for two successor nodes. */
  protected double[][][] m_Dists;

  /** Class probabilities. */
  protected double[] m_ClassProbs;

  /** Total weights. */
  protected double m_TotalWeight;

  /** The training data size (0-1). Default 1. */
  protected double m_SizePer = 1;

  /**
   * Returns a string describing the classifier.
   *
   * @return a description suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String globalInfo() {
    return
        "Class for building a best-first decision tree classifier. "
      + "This class uses binary split for both nominal and numeric attributes. "
      + "For missing values, the method of 'fractional' instances is used.\n\n"
      + "For more information, see:\n\n"
      + getTechnicalInformation().toString();
  }

  /**
   * Returns an instance of a TechnicalInformation object, containing
   * detailed information about the technical background of this class,
   * e.g., paper reference or book this class is based on.
   *
   * @return the technical information about this class
   */
  public TechnicalInformation getTechnicalInformation() {
    TechnicalInformation result;
    TechnicalInformation additional;

    result = new TechnicalInformation(Type.MASTERSTHESIS);
    result.setValue(Field.AUTHOR, "Haijian Shi");
    result.setValue(Field.YEAR, "2007");
    result.setValue(Field.TITLE, "Best-first decision tree learning");
    result.setValue(Field.SCHOOL, "University of Waikato");
    result.setValue(Field.ADDRESS, "Hamilton, NZ");
    result.setValue(Field.NOTE, "COMP594");

    additional = result.add(Type.ARTICLE);
    additional.setValue(Field.AUTHOR, "Jerome Friedman and Trevor Hastie and Robert Tibshirani");
    additional.setValue(Field.YEAR, "2000");
    additional.setValue(Field.TITLE, "Additive logistic regression : A statistical view of boosting");
    additional.setValue(Field.JOURNAL, "Annals of statistics");
    additional.setValue(Field.VOLUME, "28");
    additional.setValue(Field.NUMBER, "2");
    additional.setValue(Field.PAGES, "337-407");
    additional.setValue(Field.ISSN, "0090-5364");

    return result;
  }

  /**
   * Returns default capabilities of the classifier.
   *
   * @return the capabilities of this classifier
   */
  public Capabilities getCapabilities() {
    Capabilities result = super.getCapabilities();

    // attributes
    result.enable(Capability.NOMINAL_ATTRIBUTES);
    result.enable(Capability.NUMERIC_ATTRIBUTES);
    result.enable(Capability.MISSING_VALUES);

    // class
    result.enable(Capability.NOMINAL_CLASS);

    return result;
  }

  /**
   * Method for building a BestFirst decision tree classifier.
   *
   * @param data        set of instances serving as training data
   * @throws Exception  if decision tree cannot be built successfully
   */
  public void buildClassifier(Instances data) throws Exception {

    getCapabilities().testWithFail(data);
    data = new Instances(data);
    data.deleteWithMissingClass();

    // build an unpruned tree
    if (m_PruningStrategy == PRUNING_UNPRUNED) {

      // calculate sorted indices, weights and initial class probabilities
      int[][] sortedIndices = new int[data.numAttributes()][0];
      double[][] weights = new double[data.numAttributes()][0];
      double[] classProbs = new double[data.numClasses()];
      double totalWeight = computeSortedInfo(data, sortedIndices, weights, classProbs);

      // Compute information of the best split for this node (including split attribute,
      // split value and Gini gain (or information gain)). At the same time, compute
      // the variables dists, props and totalSubsetWeights.
      double[][][] dists = new double[data.numAttributes()][2][data.numClasses()];
      double[][] props = new double[data.numAttributes()][2];
      double[][] totalSubsetWeights = new double[data.numAttributes()][2];
      FastVector nodeInfo = computeSplitInfo(this, data, sortedIndices, weights,
          dists, props, totalSubsetWeights, m_Heuristic, m_UseGini);

      // add the node (with all split info) into BestFirstElements
      FastVector BestFirstElements = new FastVector();
      BestFirstElements.addElement(nodeInfo);

      // Make the best-first decision tree.
      int attIndex = ((Attribute) nodeInfo.elementAt(1)).index();
      m_Expansion = 0;
      makeTree(BestFirstElements, data, sortedIndices, weights, dists, classProbs,
          totalWeight, props[attIndex], m_minNumObj, m_Heuristic, m_UseGini,
          m_FixedExpansion);

      return;
    }

    // the following code is for pre-pruning and post-pruning methods

    // Compute train data, test data, sorted indices, sorted weights, total weights,
    // class probabilities, class distributions, branch proportions and total subset
    // weights for root nodes of each fold for prepruning and postpruning.
    int expansion = 0;

    Random random = new Random(m_Seed);
    Instances cvData = new Instances(data);
    cvData.randomize(random);
    cvData = new Instances(cvData, 0, (int) (cvData.numInstances() * m_SizePer) - 1);
    cvData.stratify(m_numFoldsPruning);

    Instances[] train = new Instances[m_numFoldsPruning];
    Instances[] test = new Instances[m_numFoldsPruning];
    FastVector[] parallelBFElements = new FastVector[m_numFoldsPruning];
    BFTree[] m_roots = new BFTree[m_numFoldsPruning];

    int[][][] sortedIndices = new int[m_numFoldsPruning][data.numAttributes()][0];
    double[][][] weights = new double[m_numFoldsPruning][data.numAttributes()][0];
    double[][] classProbs = new double[m_numFoldsPruning][data.numClasses()];
    double[] totalWeight = new double[m_numFoldsPruning];

    double[][][][] dists =
        new double[m_numFoldsPruning][data.numAttributes()][2][data.numClasses()];
    double[][][] props =
        new double[m_numFoldsPruning][data.numAttributes()][2];
    double[][][] totalSubsetWeights =
        new double[m_numFoldsPruning][data.numAttributes()][2];
    FastVector[] nodeInfo = new FastVector[m_numFoldsPruning];
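The listing above is cut off partway through buildClassifier(). Up to that point, the unpruned branch seeds the BestFirstElements list with the root's best split and then calls makeTree, which grows the tree best-first, always expanding the candidate node whose split promises the largest gain, up to m_FixedExpansion expansions. The standalone sketch below illustrates only that expansion strategy in miniature; the Node class, the fabricated child gains and the PriorityQueue are hypothetical stand-ins, not the data structures BFTree itself uses (it keeps candidates in a FastVector and computes Gini gain or information gain from the class distributions).

// Illustrative sketch of best-first expansion, not part of BFTree.java.
import java.util.Comparator;
import java.util.PriorityQueue;

class BestFirstSketch {

  /** A hypothetical tree node carrying the gain of its best split. */
  static class Node {
    double gain;       // quality of the best split at this node
    Node left, right;  // successors, filled in when the node is expanded
    Node(double gain) { this.gain = gain; }
  }

  /** Expand at most maxExpansions nodes, always taking the largest-gain node first. */
  static void grow(Node root, int maxExpansions) {
    PriorityQueue<Node> frontier =
        new PriorityQueue<>(Comparator.comparingDouble((Node n) -> n.gain).reversed());
    frontier.add(root);
    int expansions = 0;
    while (!frontier.isEmpty() && expansions < maxExpansions) {
      Node best = frontier.poll();
      if (best.gain <= 0) break;     // nothing useful left to split
      // The real classifier would apply the split to the data here and compute
      // the children's best splits; this toy version just fabricates smaller gains.
      best.left = new Node(best.gain / 2);
      best.right = new Node(best.gain / 3);
      frontier.add(best.left);
      frontier.add(best.right);
      expansions++;                  // mirrors the role of m_Expansion / m_FixedExpansion
    }
  }

  public static void main(String[] args) {
    grow(new Node(1.0), 5);
    System.out.println("expanded a toy tree best-first");
  }
}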
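For completeness, here is a minimal usage sketch, assuming the standard Weka 3.x API. The ARFF file name is a hypothetical placeholder, and the options string simply spells out the defaults documented in the Javadoc above (post-pruning with 5 folds and at least 2 instances per leaf).

// Minimal usage sketch, not part of BFTree.java.
import weka.classifiers.trees.BFTree;
import weka.core.Instances;
import weka.core.Utils;
import weka.core.converters.ConverterUtils.DataSource;

public class BFTreeUsageSketch {
  public static void main(String[] args) throws Exception {
    Instances data = DataSource.read("training.arff");  // hypothetical file name
    data.setClassIndex(data.numAttributes() - 1);        // BFTree requires a nominal class

    BFTree tree = new BFTree();
    // Post-pruning, 5 folds for internal cross-validation, minimum 2 instances per leaf.
    tree.setOptions(Utils.splitOptions("-P POSTPRUNED -N 5 -M 2"));
    tree.buildClassifier(data);

    double pred = tree.classifyInstance(data.instance(0));
    System.out.println("first instance classified as: "
        + data.classAttribute().value((int) pred));
  }
}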