📄 ensembleselection.java
字号:
/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//* * EnsembleSelection.java * Copyright (C) 2006 David Michael * */package weka.classifiers.meta;import weka.classifiers.Evaluation;import weka.classifiers.RandomizableClassifier;import weka.classifiers.meta.ensembleSelection.EnsembleMetricHelper;import weka.classifiers.meta.ensembleSelection.EnsembleSelectionLibrary;import weka.classifiers.meta.ensembleSelection.EnsembleSelectionLibraryModel;import weka.classifiers.meta.ensembleSelection.ModelBag;import weka.classifiers.trees.REPTree;import weka.classifiers.xml.XMLClassifier;import weka.core.Capabilities;import weka.core.Instance;import weka.core.Instances;import weka.core.Option;import weka.core.SelectedTag;import weka.core.Tag;import weka.core.TechnicalInformation;import weka.core.TechnicalInformationHandler;import weka.core.Utils;import weka.core.Capabilities.Capability;import weka.core.TechnicalInformation.Field;import weka.core.TechnicalInformation.Type;import weka.core.xml.KOML;import weka.core.xml.XMLOptions;import weka.core.xml.XMLSerialization;import java.io.BufferedInputStream;import java.io.BufferedOutputStream;import java.io.BufferedReader;import java.io.File;import java.io.FileInputStream;import java.io.FileOutputStream;import java.io.FileReader;import java.io.InputStream;import java.io.ObjectInputStream;import 
java.io.ObjectOutputStream;import java.io.OutputStream;import java.io.Serializable;import java.util.Date;import java.util.Enumeration;import java.util.HashMap;import java.util.Iterator;import java.util.Map;import java.util.Random;import java.util.Set;import java.util.Vector;import java.util.zip.GZIPInputStream;import java.util.zip.GZIPOutputStream;/** <!-- globalinfo-start --> * Combines several classifiers using the ensemble selection method. For more information, see: Caruana, Rich, Niculescu, Alex, Crew, Geoff, and Ksikes, Alex, Ensemble Selection from Libraries of Models, The International Conference on Machine Learning (ICML'04), 2004. Implemented in Weka by Bob Jung and David Michael. * <p/> <!-- globalinfo-end --> * <!-- technical-bibtex-start --> * BibTeX: * <pre> * @inproceedings{RichCaruana2004, * author = {Rich Caruana, Alex Niculescu, Geoff Crew, and Alex Ksikes}, * booktitle = {21st International Conference on Machine Learning}, * title = {Ensemble Selection from Libraries of Models}, * year = {2004} * } * </pre> * <p/> <!-- technical-bibtex-end --> * * Our implementation of ensemble selection is a bit different from the other * classifiers because we assume that the list of models to be trained is too * large to fit in memory and that our base classifiers will need to be * serialized to the file system (in the directory listed in the "workingDirectory" * option). We have adopted the term "model library" for this large set of * classifiers keeping in line with the original paper. * <p/> * * If you are planning to use this classifier, we highly recommend you take a * quick look at our FAQ/tutorial on the WIKI. There are a few things that * are unique to this classifier that could trip you up. Otherwise, this * method is a great way to get really great classifier performance without * having to do too much parameter tuning. What is nice is that in the worst * case you get a nice summary of how a large number of diverse models * performed on your data set. 
* <p/> * * This class relies on the package weka.classifiers.meta.ensembleSelection. * <p/> * * When run from the Explorer or another GUI, the classifier depends on the * package weka.gui.libraryEditor. * <p/> * <!-- options-start --> * Valid options are: <p/> * * <pre> -L </path/to/modelLibrary> * Specifies the Model Library File, continuing the list of all models.</pre> * * <pre> -W </path/to/working/directory> * Specifies the Working Directory, where all models will be stored.</pre> * * <pre> -B <numModelBags> * Set the number of bags, i.e., number of iterations to run * the ensemble selection algorithm.</pre> * * <pre> -E <modelRatio> * Set the ratio of library models that will be randomly chosen * to populate each bag of models.</pre> * * <pre> -V <validationRatio> * Set the ratio of the training data set that will be reserved * for validation.</pre> * * <pre> -H <hillClimbIterations> * Set the number of hillclimbing iterations to be performed * on each model bag.</pre> * * <pre> -I <sortInitialization> * Set the the ratio of the ensemble library that the sort * initialization algorithm will be able to choose from while * initializing the ensemble for each model bag</pre> * * <pre> -X <numFolds> * Sets the number of cross-validation folds.</pre> * * <pre> -P <hillclimbMettric> * Specify the metric that will be used for model selection * during the hillclimbing algorithm. * Valid metrics are: * accuracy, rmse, roc, precision, recall, fscore, all</pre> * * <pre> -A <algorithm> * Specifies the algorithm to be used for ensemble selection. * Valid algorithms are: * "forward" (default) for forward selection. * "backward" for backward elimination. * "both" for both forward and backward elimination. 
* "best" to simply print out top performer from the * ensemble library * "library" to only train the models in the ensemble * library</pre> * * <pre> -R * Flag whether or not models can be selected more than once * for an ensemble.</pre> * * <pre> -G * Whether sort initialization greedily stops adding models * when performance degrades.</pre> * * <pre> -O * Flag for verbose output. Prints out performance of all * selected models.</pre> * * <pre> -S <num> * Random number seed. * (default 1)</pre> * * <pre> -D * If set, classifier is run in debug mode and * may output additional info to the console</pre> * <!-- options-end --> * * @author Robert Jung * @author David Michael * @version $Revision: 1.2 $ */public class EnsembleSelection extends RandomizableClassifier implements Serializable, TechnicalInformationHandler { /** for serialization */ private static final long serialVersionUID = -1744155148765058511L; /** * The Library of models, from which we can select our ensemble. Usually * loaded from a model list file (.mlf or .model.xml) using the -L * command-line option. */ protected EnsembleSelectionLibrary m_library = new EnsembleSelectionLibrary(); /** * List of models chosen by EnsembleSelection. Populated by buildClassifier. */ protected EnsembleSelectionLibraryModel[] m_chosen_models = null; /** * An array of weights for the chosen models. Elements are parallel to those * in m_chosen_models. That is, m_chosen_model_weights[i] is the weight * associated with the model at m_chosen_models[i]. */ protected int[] m_chosen_model_weights = null; /** Total weight of all chosen models. */ protected int m_total_weight = 0; /** * ratio of library models that will be randomly chosen to be used for each * model bag */ protected double m_modelRatio = 0.5; /** * Indicates the fraction of the given training set that should be used for * hillclimbing/validation. This fraction is set aside and not used for * training. 
It is assumed that any loaded models were also not trained on
   * set-aside data. (If the same percentage and random seed were used
   * previously to train the models in the library, this will work as expected -
   * i.e., those models will be valid)
   */
  protected double m_validationRatio = 0.25;

  /** defines metrics that can be chosen for hillclimbing */
  public static final Tag[] TAGS_METRIC = {
    new Tag(EnsembleMetricHelper.METRIC_ACCURACY, "Optimize with Accuracy"),
    new Tag(EnsembleMetricHelper.METRIC_RMSE, "Optimize with RMSE"),
    new Tag(EnsembleMetricHelper.METRIC_ROC, "Optimize with ROC"),
    new Tag(EnsembleMetricHelper.METRIC_PRECISION, "Optimize with precision"),
    new Tag(EnsembleMetricHelper.METRIC_RECALL, "Optimize with recall"),
    new Tag(EnsembleMetricHelper.METRIC_FSCORE, "Optimize with fscore"),
    new Tag(EnsembleMetricHelper.METRIC_ALL, "Optimize with all metrics"), };

  /**
   * The "enumeration" of the algorithms we can use; each constant below is
   * paired with a display label in TAGS_ALGORITHM. This one stands for
   * forward selection and is the initial value of m_algorithm.
   */
  public static final int ALGORITHM_FORWARD = 0;

  /** algorithm constant for backward elimination */
  public static final int ALGORITHM_BACKWARD = 1;

  /** algorithm constant for forward selection combined with backward elimination */
  public static final int ALGORITHM_FORWARD_BACKWARD = 2;

  /** algorithm constant labelled "Best model" in TAGS_ALGORITHM */
  public static final int ALGORITHM_BEST = 3;

  /** algorithm constant labelled "Build Library Only" in TAGS_ALGORITHM */
  public static final int ALGORITHM_BUILD_LIBRARY = 4;

  /** defines algorithms that can be chosen for ensemble selection */
  public static final Tag[] TAGS_ALGORITHM = {
    new Tag(ALGORITHM_FORWARD, "Forward selection"),
    new Tag(ALGORITHM_BACKWARD, "Backward elimation"),
    new Tag(ALGORITHM_FORWARD_BACKWARD, "Forward Selection + Backward Elimination"),
    new Tag(ALGORITHM_BEST, "Best model"),
    new Tag(ALGORITHM_BUILD_LIBRARY, "Build Library Only") };

  /**
   * this specifies the number of "Ensembl-X" directories that are allowed to
   * be created in the user's home directory where X is the number of the
   * ensemble
   */
  private static final int MAX_DEFAULT_DIRECTORIES = 1000;

  /**
   * The name of the Model Library File (if one is specified) which lists
   * models from which ensemble selection will choose. This is only used when
   * run from the command-line, as otherwise m_library is responsible for
   * this.
   */
  protected String m_modelLibraryFileName = null;

  /**
   * The number of "model bags". Using 1 is equivalent to no bagging at all.
   */
  protected int m_numModelBags = 10;

  /** The metric for which the ensemble will be optimized. */
  protected int m_hillclimbMetric = EnsembleMetricHelper.METRIC_RMSE;

  /** The algorithm used for ensemble selection (one of the ALGORITHM_* constants). */
  protected int m_algorithm = ALGORITHM_FORWARD;

  /**
   * number of hillclimbing iterations for the ensemble selection algorithm
   */
  protected int m_hillclimbIterations = 100;

  /** ratio of library models to be used for sort initialization */
  protected double m_sortInitializationRatio = 1.0;

  /**
   * specifies whether or not the ensemble algorithm is allowed to include a
   * specific model in the library more than once in each ensemble
   */
  protected boolean m_replacement = true;

  /**
   * specifies whether we use "greedy" sort initialization. If false, we
   * simply add the best m_sortInitializationRatio models of the bag blindly.
   * If true, we add the best models in order up to m_sortInitializationRatio
   * until adding the next model would not help performance.
   */
  protected boolean m_greedySortInitialization = true;

  /**
   * Specifies whether or not we will output metrics for all models
   */
  protected boolean m_verboseOutput = false;

  /**
   * Hash map of cached predictions. The key is a stringified Instance. Each
   * entry is a 2d array, first indexed by classifier index (i.e., the one
   * used in m_chosen_models). The second index is the usual "distribution"
   * index across classes.
   */
  protected Map m_cachedPredictions = null;

  /**
   * The working directory where all models, temporary prediction values,
   * and modellist logs are to be built and stored. Initialized from
   * getDefaultWorkingDirectory().
   */
  protected File m_workingDirectory = new File(getDefaultWorkingDirectory());

  /**
   * Indicates the number of folds for cross-validation.
A value of 1
   * indicates there is no cross-validation. Cross validation is done in the
   * "embedded" fashion described by Caruana, Niculescu, and Munson
   * (unpublished work - tech report forthcoming)
   */
  protected int m_NumFolds = 1;

  /**
   * Supplies the descriptive text for this classifier.
   *
   * @return a description suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String globalInfo() {
    // All three pieces are compile-time constants, so the returned value is
    // still a single constant string.
    final String summary = "Combines several classifiers using the ensemble "
      + "selection method. For more information, see: ";
    final String citation = "Caruana, Rich, Niculescu, Alex, Crew, Geoff, and Ksikes, Alex, "
      + "Ensemble Selection from Libraries of Models, "
      + "The International Conference on Machine Learning (ICML'04), 2004. ";
    final String credits = "Implemented in Weka by Bob Jung and David Michael.";
    return summary + citation + credits;
  }

  /**
   * Lists the command-line options of this classifier: first the options
   * declared here, then every option reported by the superclass.
   *
   * @return an enumeration of all the available options.
   */
  public Enumeration listOptions() {
    Vector options = new Vector();

    // -L: model library file
    final String libraryHelp =
      "\tSpecifies the Model Library File, continuing the list of all models.";
    options.add(new Option(libraryHelp, "L", 1, "-L </path/to/modelLibrary>"));

    // -W: working directory
    final String workingDirHelp =
      "\tSpecifies the Working Directory, where all models will be stored.";
    options.add(new Option(workingDirHelp, "W", 1, "-W </path/to/working/directory>"));

    // -B: number of model bags
    final String bagsHelp =
      "\tSet the number of bags, i.e., number of iterations to run \n"
      + "\tthe ensemble selection algorithm.";
    options.add(new Option(bagsHelp, "B", 1, "-B <numModelBags>"));

    // -E: model ratio per bag
    final String modelRatioHelp =
      "\tSet the ratio of library models that will be randomly chosen \n"
      + "\tto populate each bag of models.";
    options.add(new Option(modelRatioHelp, "E", 1, "-E <modelRatio>"));

    // -V: validation ratio
    final String validationHelp =
      "\tSet the ratio of the training data set that will be reserved \n"
      + "\tfor validation.";
    options.add(new Option(validationHelp, "V", 1, "-V <validationRatio>"));

    // -H: hillclimbing iterations
    final String hillclimbHelp =
      "\tSet the number of hillclimbing iterations to be performed \n"
      + "\ton each model bag.";
    options.add(new Option(hillclimbHelp, "H", 1, "-H <hillClimbIterations>"));

    // -I: sort initialization ratio
    final String sortInitHelp =
      "\tSet the the ratio of the ensemble library that the sort \n"
      + "\tinitialization algorithm will be able to choose from while \n"
      + "\tinitializing the ensemble for each model bag";
    options.add(new Option(sortInitHelp, "I", 1, "-I <sortInitialization>"));

    // -X: cross-validation folds
    final String foldsHelp = "\tSets the number of cross-validation folds.";
    options.add(new Option(foldsHelp, "X", 1, "-X <numFolds>"));

    // -P: hillclimbing metric
    final String metricHelp =
      "\tSpecify the metric that will be used for model selection \n"
      + "\tduring the hillclimbing algorithm.\n"
      + "\tValid metrics are: \n"
      + "\t\taccuracy, rmse, roc, precision, recall, fscore, all";
    options.add(new Option(metricHelp, "P", 1, "-P <hillclimbMettric>"));

    // -A: ensemble selection algorithm
    final String algorithmHelp =
      "\tSpecifies the algorithm to be used for ensemble selection. \n"
      + "\tValid algorithms are:\n"
      + "\t\t\"forward\" (default) for forward selection.\n"
      + "\t\t\"backward\" for backward elimination.\n"
      + "\t\t\"both\" for both forward and backward elimination.\n"
      + "\t\t\"best\" to simply print out top performer from the \n"
      + "\t\t ensemble library\n"
      + "\t\t\"library\" to only train the models in the ensemble \n"
      + "\t\t library";
    options.add(new Option(algorithmHelp, "A", 1, "-A <algorithm>"));

    // -R: selection with replacement (flag)
    final String replacementHelp =
      "\tFlag whether or not models can be selected more than once \n"
      + "\tfor an ensemble.";
    options.add(new Option(replacementHelp, "R", 0, "-R"));

    // -G: greedy sort initialization (flag)
    final String greedyHelp =
      "\tWhether sort initialization greedily stops adding models \n"
      + "\twhen performance degrades.";
    options.add(new Option(greedyHelp, "G", 0, "-G"));

    // -O: verbose output (flag)
    final String verboseHelp =
      "\tFlag for verbose output. Prints out performance of all \n"
      + "\tselected models.";
    options.add(new Option(verboseHelp, "O", 0, "-O"));

    // TODO - Add more options here

    // Append the superclass options after our own, in their original order.
    for (Enumeration inherited = super.listOptions(); inherited.hasMoreElements();) {
      options.add(inherited.nextElement());
    }

    return options.elements();
  }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -