📄 BFTree.java
/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    BFTree.java
 *    Copyright (C) 2007 Haijian Shi
 *
 */

package weka.classifiers.trees;

import weka.classifiers.Evaluation;
import weka.classifiers.RandomizableClassifier;
import weka.core.AdditionalMeasureProducer;
import weka.core.Attribute;
import weka.core.Capabilities;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.SelectedTag;
import weka.core.Tag;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
import weka.core.Capabilities.Capability;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.core.matrix.Matrix;

import java.util.Arrays;
import java.util.Enumeration;
import java.util.Random;
import java.util.Vector;

/**
 <!-- globalinfo-start -->
 * Class for building a best-first decision tree classifier. This class uses
 * binary split for both nominal and numeric attributes. For missing values,
 * the method of 'fractional' instances is used.<br/>
 * <br/>
 * For more information, see:<br/>
 * <br/>
 * Haijian Shi (2007). Best-first decision tree learning. Hamilton, NZ.<br/>
 * <br/>
 * Jerome Friedman, Trevor Hastie, Robert Tibshirani (2000). Additive logistic
 * regression : A statistical view of boosting. Annals of statistics.
 * 28(2):337-407.
 * <p/>
 <!-- globalinfo-end -->
 *
 <!-- technical-bibtex-start -->
 * BibTeX:
 * <pre>
 * @mastersthesis{Shi2007,
 *    address = {Hamilton, NZ},
 *    author = {Haijian Shi},
 *    note = {COMP594},
 *    school = {University of Waikato},
 *    title = {Best-first decision tree learning},
 *    year = {2007}
 * }
 *
 * @article{Friedman2000,
 *    author = {Jerome Friedman and Trevor Hastie and Robert Tibshirani},
 *    journal = {Annals of statistics},
 *    number = {2},
 *    pages = {337-407},
 *    title = {Additive logistic regression : A statistical view of boosting},
 *    volume = {28},
 *    year = {2000},
 *    ISSN = {0090-5364}
 * }
 * </pre>
 * <p/>
 <!-- technical-bibtex-end -->
 *
 <!-- options-start -->
 * Valid options are: <p/>
 *
 * <pre> -S <num>
 *  Random number seed.
 *  (default 1)</pre>
 *
 * <pre> -D
 *  If set, classifier is run in debug mode and
 *  may output additional info to the console</pre>
 *
 * <pre> -P <UNPRUNED|POSTPRUNED|PREPRUNED>
 *  The pruning strategy.
 *  (default: POSTPRUNED)</pre>
 *
 * <pre> -M <min no>
 *  The minimal number of instances at the terminal nodes.
 *  (default 2)</pre>
 *
 * <pre> -N <num folds>
 *  The number of folds used in the pruning.
 *  (default 5)</pre>
 *
 * <pre> -H
 *  Don't use heuristic search for nominal attributes in multi-class problems
 *  (default yes).</pre>
 *
 * <pre> -G
 *  Don't use Gini index for splitting (default yes);
 *  if not, information gain is used.</pre>
 *
 * <pre> -R
 *  Don't use error rate in internal cross-validation (default yes),
 *  but root mean squared error.</pre>
 *
 * <pre> -A
 *  Use the 1 SE rule to make pruning decision.
 *  (default no).</pre>
 *
 * <pre> -C
 *  Percentage of training data size (0-1]
 *  (default 1).</pre>
 <!-- options-end -->
 *
 * @author  Haijian Shi (hs69@cs.waikato.ac.nz)
 * @version $Revision: 1.3 $
 */
public class BFTree
  extends RandomizableClassifier
  implements AdditionalMeasureProducer, TechnicalInformationHandler {

  /** For serialization. */
  private static final long serialVersionUID = -7035607375962528217L;

  /** pruning strategy: un-pruned */
  public static final int PRUNING_UNPRUNED = 0;
  /** pruning strategy: post-pruning */
  public static final int PRUNING_POSTPRUNING = 1;
  /** pruning strategy: pre-pruning */
  public static final int PRUNING_PREPRUNING = 2;
  /** pruning strategy */
  public static final Tag[] TAGS_PRUNING = {
    new Tag(PRUNING_UNPRUNED, "unpruned", "Un-pruned"),
    new Tag(PRUNING_POSTPRUNING, "postpruned", "Post-pruning"),
    new Tag(PRUNING_PREPRUNING, "prepruned", "Pre-pruning")
  };

  /** the pruning strategy */
  protected int m_PruningStrategy = PRUNING_POSTPRUNING;

  /** Successor nodes. */
  protected BFTree[] m_Successors;

  /** Attribute used for splitting. */
  protected Attribute m_Attribute;

  /** Split point (for numeric attributes). */
  protected double m_SplitValue;

  /** Split subset (for nominal attributes). */
  protected String m_SplitString;

  /** Class value for a node. */
  protected double m_ClassValue;

  /** Class attribute of a dataset. */
  protected Attribute m_ClassAttribute;

  /** Minimum number of instances at leaf nodes. */
  protected int m_minNumObj = 2;

  /** Number of folds for the pruning. */
  protected int m_numFoldsPruning = 5;

  /** If the node is a leaf node. */
  protected boolean m_isLeaf;

  /** Number of expansions. */
  protected static int m_Expansion;

  /** Fixed number of expansions (if no pruning method is used, its value is -1.
   *  Otherwise, its value is obtained from internal cross-validation). */
  protected int m_FixedExpansion = -1;

  /** If heuristic search is used for binary splits (default true). Note that even if
   *  its value is true, it is only used when the number of values of a nominal
   *  attribute is larger than 4. */
  protected boolean m_Heuristic = true;

  /** If the Gini index is used as the splitting criterion - default
   *  (if not, information gain is used). */
  protected boolean m_UseGini = true;

  /** If the error rate is used in internal cross-validation to fix the number of
   *  expansions - default (if not, root mean squared error is used). */
  protected boolean m_UseErrorRate = true;

  /** If the 1SE rule is used to make the pruning decision. */
  protected boolean m_UseOneSE = false;

  /** Class distributions. */
  protected double[] m_Distribution;

  /** Branch proportions. */
  protected double[] m_Props;

  /** Sorted indices. */
  protected int[][] m_SortedIndices;

  /** Sorted weights. */
  protected double[][] m_Weights;

  /** Distributions of each attribute for two successor nodes. */
  protected double[][][] m_Dists;

  /** Class probabilities. */
  protected double[] m_ClassProbs;

  /** Total weights. */
  protected double m_TotalWeight;

  /** The training data size (0-1). Default 1. */
  protected double m_SizePer = 1;

  /**
   * Returns a string describing the classifier.
   *
   * @return a description suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String globalInfo() {
    return
        "Class for building a best-first decision tree classifier. "
      + "This class uses binary split for both nominal and numeric attributes. "
      + "For missing values, the method of 'fractional' instances is used.\n\n"
      + "For more information, see:\n\n"
      + getTechnicalInformation().toString();
  }

  /**
   * Returns an instance of a TechnicalInformation object, containing
   * detailed information about the technical background of this class,
   * e.g., paper reference or book this class is based on.
   *
   * @return the technical information about this class
   */
  public TechnicalInformation getTechnicalInformation() {
    TechnicalInformation result;
    TechnicalInformation additional;

    result = new TechnicalInformation(Type.MASTERSTHESIS);
    result.setValue(Field.AUTHOR, "Haijian Shi");
    result.setValue(Field.YEAR, "2007");
    result.setValue(Field.TITLE, "Best-first decision tree learning");
    result.setValue(Field.SCHOOL, "University of Waikato");
    result.setValue(Field.ADDRESS, "Hamilton, NZ");
    result.setValue(Field.NOTE, "COMP594");

    additional = result.add(Type.ARTICLE);
    additional.setValue(Field.AUTHOR, "Jerome Friedman and Trevor Hastie and Robert Tibshirani");
    additional.setValue(Field.YEAR, "2000");
    additional.setValue(Field.TITLE, "Additive logistic regression : A statistical view of boosting");
    additional.setValue(Field.JOURNAL, "Annals of statistics");
    additional.setValue(Field.VOLUME, "28");
    additional.setValue(Field.NUMBER, "2");
    additional.setValue(Field.PAGES, "337-407");
    additional.setValue(Field.ISSN, "0090-5364");

    return result;
  }

  /**
   * Returns default capabilities of the classifier.
   *
   * @return the capabilities of this classifier
   */
  public Capabilities getCapabilities() {
    Capabilities result = super.getCapabilities();

    // attributes
    result.enable(Capability.NOMINAL_ATTRIBUTES);
    result.enable(Capability.NUMERIC_ATTRIBUTES);
    result.enable(Capability.MISSING_VALUES);

    // class
    result.enable(Capability.NOMINAL_CLASS);

    return result;
  }

  /**
   * Method for building a BestFirst decision tree classifier.
   *
   * @param data        set of instances serving as training data
   * @throws Exception  if decision tree cannot be built successfully
   */
  public void buildClassifier(Instances data) throws Exception {

    getCapabilities().testWithFail(data);
    data = new Instances(data);
    data.deleteWithMissingClass();

    // build an unpruned tree
    if (m_PruningStrategy == PRUNING_UNPRUNED) {

      // calculate sorted indices, weights and initial class probabilities
      int[][] sortedIndices = new int[data.numAttributes()][0];
      double[][] weights = new double[data.numAttributes()][0];
      double[] classProbs = new double[data.numClasses()];
      double totalWeight = computeSortedInfo(data, sortedIndices, weights, classProbs);

      // Compute information of the best split for this node (including split attribute,
      // split value and Gini gain (or information gain)). At the same time, compute
      // the variables dists, props and totalSubsetWeights.
      double[][][] dists = new double[data.numAttributes()][2][data.numClasses()];
      double[][] props = new double[data.numAttributes()][2];
      double[][] totalSubsetWeights = new double[data.numAttributes()][2];
      FastVector nodeInfo = computeSplitInfo(this, data, sortedIndices, weights,
          dists, props, totalSubsetWeights, m_Heuristic, m_UseGini);

      // add the node (with all split info) into BestFirstElements
      FastVector BestFirstElements = new FastVector();
      BestFirstElements.addElement(nodeInfo);

      // Make the best-first decision tree.
      int attIndex = ((Attribute) nodeInfo.elementAt(1)).index();
      m_Expansion = 0;
      makeTree(BestFirstElements, data, sortedIndices, weights, dists, classProbs,
          totalWeight, props[attIndex], m_minNumObj, m_Heuristic, m_UseGini,
          m_FixedExpansion);

      return;
    }

    // the following code is for pre-pruning and post-pruning methods

    // Compute train data, test data, sorted indices, sorted weights, total weights,
    // class probabilities, class distributions, branch proportions and total subset
    // weights for root nodes of each fold for prepruning and postpruning.
    int expansion = 0;

    Random random = new Random(m_Seed);
    Instances cvData = new Instances(data);
    cvData.randomize(random);
    cvData = new Instances(cvData, 0, (int) (cvData.numInstances() * m_SizePer) - 1);
    cvData.stratify(m_numFoldsPruning);

    Instances[] train = new Instances[m_numFoldsPruning];
    Instances[] test = new Instances[m_numFoldsPruning];
    FastVector[] parallelBFElements = new FastVector[m_numFoldsPruning];
    BFTree[] m_roots = new BFTree[m_numFoldsPruning];

    int[][][] sortedIndices = new int[m_numFoldsPruning][data.numAttributes()][0];
    double[][][] weights = new double[m_numFoldsPruning][data.numAttributes()][0];
    double[][] classProbs = new double[m_numFoldsPruning][data.numClasses()];
    double[] totalWeight = new double[m_numFoldsPruning];

    double[][][][] dists =
        new double[m_numFoldsPruning][data.numAttributes()][2][data.numClasses()];
    double[][][] props =
        new double[m_numFoldsPruning][data.numAttributes()][2];
    double[][][] totalSubsetWeights =
        new double[m_numFoldsPruning][data.numAttributes()][2];
    FastVector[] nodeInfo = new FastVector[m_numFoldsPruning];
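The listing above is cut off partway through buildClassifier(). Up to that point, the unpruned branch seeds the BestFirstElements list with the root's best split and then calls makeTree, which grows the tree best-first, always expanding the candidate node whose split promises the largest gain, up to m_FixedExpansion expansions. The standalone sketch below illustrates only that expansion strategy in miniature; the Node class, the fabricated child gains and the PriorityQueue are hypothetical stand-ins, not the data structures BFTree itself uses (it keeps candidates in a FastVector and computes Gini gain or information gain from the class distributions).

// Illustrative sketch of best-first expansion, not part of BFTree.java.
import java.util.Comparator;
import java.util.PriorityQueue;

class BestFirstSketch {

  /** A hypothetical tree node carrying the gain of its best split. */
  static class Node {
    double gain;       // quality of the best split at this node
    Node left, right;  // successors, filled in when the node is expanded
    Node(double gain) { this.gain = gain; }
  }

  /** Expand at most maxExpansions nodes, always taking the largest-gain node first. */
  static void grow(Node root, int maxExpansions) {
    PriorityQueue<Node> frontier =
        new PriorityQueue<>(Comparator.comparingDouble((Node n) -> n.gain).reversed());
    frontier.add(root);
    int expansions = 0;
    while (!frontier.isEmpty() && expansions < maxExpansions) {
      Node best = frontier.poll();
      if (best.gain <= 0) break;     // nothing useful left to split
      // The real classifier would apply the split to the data here and compute
      // the children's best splits; this toy version just fabricates smaller gains.
      best.left = new Node(best.gain / 2);
      best.right = new Node(best.gain / 3);
      frontier.add(best.left);
      frontier.add(best.right);
      expansions++;                  // mirrors the role of m_Expansion / m_FixedExpansion
    }
  }

  public static void main(String[] args) {
    grow(new Node(1.0), 5);
    System.out.println("expanded a toy tree best-first");
  }
}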
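For completeness, here is a minimal usage sketch, assuming the standard Weka 3.x API. The ARFF file name is a hypothetical placeholder, and the options string simply spells out the defaults documented in the Javadoc above (post-pruning with 5 folds and at least 2 instances per leaf).

// Minimal usage sketch, not part of BFTree.java.
import weka.classifiers.trees.BFTree;
import weka.core.Instances;
import weka.core.Utils;
import weka.core.converters.ConverterUtils.DataSource;

public class BFTreeUsageSketch {
  public static void main(String[] args) throws Exception {
    Instances data = DataSource.read("training.arff");  // hypothetical file name
    data.setClassIndex(data.numAttributes() - 1);        // BFTree requires a nominal class

    BFTree tree = new BFTree();
    // Post-pruning, 5 folds for internal cross-validation, minimum 2 instances per leaf.
    tree.setOptions(Utils.splitOptions("-P POSTPRUNED -N 5 -M 2"));
    tree.buildClassifier(data);

    double pred = tree.classifyInstance(data.instance(0));
    System.out.println("first instance classified as: "
        + data.classAttribute().value((int) pred));
  }
}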