📄 SemiSupEM.java
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

package weka.classifiers.bayes;

import weka.classifiers.*;
import weka.classifiers.sparse.*;
import java.io.*;
import java.util.*;
import weka.core.*;

/**
 * Semi-supervised learner that uses EM initialized with labeled data and then
 * runs EM iterations on the unlabeled data to improve the model.
 *
 * See: Kamal Nigam, Andrew McCallum, Sebastian Thrun and Tom
 * Mitchell. Text Classification from Labeled and Unlabeled Documents
 * using EM. Machine Learning, 39(2/3), pp. 103-134. 2000.
 *
 * Assumes use of a base classifier that is a SoftClassifier that
 * accepts training data with a soft class distribution rather than
 * a hard assignment, i.e. SoftClassifiedInstances. Sample soft
 * classifiers are NaiveBayesSimpleSoft and NaiveBayesSimpleSparseSoft.
 *
 * @author Ray Mooney (mooney@cs.utexas.edu)
 */
public class SemiSupEM extends DistributionClassifier
  implements SemiSupClassifier, OptionHandler {

  /** Original set of unlabeled Instances */
  protected Instances m_UnlabeledData;

  /** Soft labeled version of unlabeled data */
  protected SoftClassifiedInstances m_UnlabeledInstances;

  /** Hard labeled data */
  protected Instances m_LabeledInstances;

  /** Complete set of labeled and unlabeled instances for EM */
  protected SoftClassifiedInstances m_AllInstances;

  /** Base classifier that supports soft classified instances */
  protected SoftClassifier m_Classifier = new NaiveBayesSimpleSoft();

  /** Weight of unlabeled examples during EM training versus labeled
      examples (see Nigam et al.) */
  protected double m_Lambda = 1.0;

  /** Random numbers and seed */
  protected Random m_Random;
  protected int m_rseed;

  /** Maximum iterations to perform */
  protected int m_max_iterations;

  /** Create soft labeled seed for unseen classes */
  protected boolean m_seedUnseenClasses;

  /** Verbose? */
  protected boolean m_verbose;

  /** Minimum increase in log likelihood required for EM to keep iterating */
  protected static double m_minLogLikelihoodIncr = 1e-6;

  /** The minimum values for numeric attributes. */
  protected double[] m_MinArray;

  /** The maximum values for numeric attributes. */
  protected double[] m_MaxArray;

  /**
   * Returns a string describing this classifier.
   *
   * @return a description of the classifier suitable for
   * displaying in the explorer/experimenter gui
   */
  public String globalInfo() {
    return "Classifier trained using both labeled and unlabeled data using EM";
  }

  /**
   * Returns an enumeration describing the available options. <p>
   *
   * Valid options are:<p>
   *
   * -W <class name> <br>
   * Full name of the base SoftClassifier to use. <p>
   *
   * -L <num> <br>
   * Lambda weight for unlabeled data. <p>
   *
   * -I <max iterations> <br>
   * Terminate after this many iterations if EM has not converged. <p>
   *
   * -S <seed> <br>
   * Specify random number seed. <p>
   *
   * -V <br>
   * Verbose. <p>
   *
   * -U <br>
   * Create soft seeds for unseen classes. <p>
   *
   * @return an enumeration of all the available options.
   */
  public Enumeration listOptions() {
    Vector newVector = new Vector(7);
    newVector.addElement(new Option(
        "\tFull name of base SoftClassifier to use.\n"
        + "\teg: weka.classifiers.bayes.NaiveBayesSimpleSoft",
        "W", 1, "-W <class name>"));
    newVector.addElement(new Option(
        "\tLambda weight for unlabeled data.\n(default 1)",
        "L", 1, "-L <num>"));
    newVector.addElement(new Option(
        "\tmax iterations.\n(default 100)",
        "I", 1, "-I <num>"));
    newVector.addElement(new Option(
        "\trandom number seed.\n(default 100)",
        "S", 1, "-S <num>"));
    newVector.addElement(new Option("\tverbose.", "V", 0, "-V"));
    newVector.addElement(new Option("\tSeed unseen classes.", "U", 0, "-U"));

    if ((m_Classifier != null) && (m_Classifier instanceof OptionHandler)) {
      newVector.addElement(new Option(
          "", "", 0,
          "\nOptions specific to classifier "
          + m_Classifier.getClass().getName() + ":"));
      Enumeration enu = ((OptionHandler) m_Classifier).listOptions();
      while (enu.hasMoreElements()) {
        newVector.addElement(enu.nextElement());
      }
    }
    return newVector.elements();
  }

  /**
   * Parses a given list of options.
   *
   * @param options the list of options as an array of strings
   * @exception Exception if an option is not supported
   */
  public void setOptions(String[] options) throws Exception {
    resetOptions();
    String classifierName = Utils.getOption('W', options);
    if (classifierName.length() == 0) {
      throw new Exception("A classifier must be specified with"
                          + " the -W option.");
    }
    setClassifier((SoftClassifier) Classifier.forName(classifierName,
        Utils.partitionOptions(options)));
    setDebug(Utils.getFlag('V', options));
    setSeedUnseenClasses(Utils.getFlag('U', options));
    String optionString = Utils.getOption('I', options);
    if (optionString.length() != 0) {
      setMaxIterations(Integer.parseInt(optionString));
    }
    optionString = Utils.getOption('S', options);
    if (optionString.length() != 0) {
      setSeed(Integer.parseInt(optionString));
    }
    optionString = Utils.getOption('L', options);
    if (optionString.length() != 0) {
      setLambda(Double.parseDouble(optionString));
    }
  }

  /**
   * Reset to default options
   */
  protected void resetOptions() {
    m_max_iterations = 100;
    m_rseed = 100;
    m_verbose = false;
    m_seedUnseenClasses = false;
    m_Classifier = new NaiveBayesSimpleSoft();
    m_Lambda = 1.0;
  }

  /**
   * Returns the tip text for this property
   *
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String seedTipText() {
    return "random number seed";
  }

  /**
   * Set the random number seed
   *
   * @param s the seed
   */
  public void setSeed(int s) {
    m_rseed = s;
  }

  /**
   * Get the random number seed
   *
   * @return the seed
   */
  public int getSeed() {
    return m_rseed;
  }

  /**
   * Returns the tip text for this property
   *
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String maxIterationsTipText() {
    return "maximum number of EM iterations";
  }

  /**
   * Set the maximum number of iterations to perform
   *
   * @param i the number of iterations
   * @exception Exception if i is less than 1
   */
  public void setMaxIterations(int i) throws Exception {
    if (i < 1) {
      throw new Exception("Maximum number of iterations must be > 0!");
    }
    m_max_iterations = i;
  }

  /**
   * Get the maximum number of iterations
   *
   * @return the number of iterations
   */
  public int getMaxIterations() {
    return m_max_iterations;
  }

  /**
   * Set debug mode - verbose output
   *
   * @param v true for verbose output
   */
  public void setDebug(boolean v) {
    m_verbose = v;
  }

  /**
   * Get debug mode
   *
   * @return true if debug mode is set
   */
  public boolean getDebug() {
    return m_verbose;
  }

  /**
   * Set whether to create soft-labeled seeds for unseen classes
   *
   * @param v true to seed unseen classes
   */
  public void setSeedUnseenClasses(boolean v) {
    m_seedUnseenClasses = v;
  }

  /**
   * Get whether unseen classes are seeded
   *
   * @return true if unseen classes are seeded
   */
  public boolean getSeedUnseenClasses() {
    return m_seedUnseenClasses;
  }

  public String seedUnseenClassesTipText() {
    return "create soft seeds for unseen classes using farthest-first";
  }

  /**
   * Set the lambda weight of unlabeled examples relative to labeled examples
   *
   * @param v the lambda weight
   */
  public void setLambda(double v) {
    m_Lambda = v;
  }

  /**
   * Get the lambda weight of unlabeled examples
   *
   * @return the lambda weight
   */
  public double getLambda() {
    return m_Lambda;
  }

  public String lambdaTipText() {
    return "set weight of unlabeled examples vs. labeled";
  }

  /**
   * Set the base classifier.
   *
   * @param newClassifier the SoftClassifier to use.
   */
  public void setClassifier(SoftClassifier newClassifier) {
    m_Classifier = newClassifier;
  }

  /**
   * Get the base classifier.
   *
   * @return the SoftClassifier used as the base classifier
   */
  public SoftClassifier getClassifier() {
    return m_Classifier;
  }

  public String classifierTipText() {
    return "Base SoftClassifier to use for underlying probabilistic classification";
  }

  /**
   * Gets the current settings of SemiSupEM.
   *
   * @return an array of strings suitable for passing to setOptions()
   */
  public String[] getOptions() {
    String[] classifierOptions = new String[0];
    if ((m_Classifier != null) && (m_Classifier instanceof OptionHandler)) {
      classifierOptions = ((OptionHandler) m_Classifier).getOptions();
    }
    // Room for up to 11 scheme-specific entries plus the base classifier's options.
    String[] options = new String[classifierOptions.length + 11];
    int current = 0;
    if (m_verbose) {
      options[current++] = "-V";
    }
    if (m_seedUnseenClasses) {
      options[current++] = "-U";
    }
    options[current++] = "-I";
    options[current++] = "" + m_max_iterations;
    options[current++] = "-S";
    options[current++] = "" + m_rseed;
    options[current++] = "-L";
    options[current++] = "" + m_Lambda;
    if (getClassifier() != null) {
      options[current++] = "-W";
      options[current++] = getClassifier().getClass().getName();
    }
    options[current++] = "--";
    System.arraycopy(classifierOptions, 0, options, current,
                     classifierOptions.length);
    current += classifierOptions.length;
    while (current < options.length) {
      options[current++] = "";
    }
    return options;
  }

  /**
   * Provide unlabeled data to the classifier.
   *
   * @param unlabeled the unlabeled Instances
   */
  public void setUnlabeled(Instances unlabeled) {
    m_UnlabeledData = unlabeled;
  }

  /** Simple constructor, must set options using command line or GUI */
  public SemiSupEM() {
    resetOptions();
  }
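The listing above is truncated after the constructor; the EM training loop itself, and the buildClassifier()/distributionForInstance() methods a DistributionClassifier must provide, are not shown. Purely as orientation, the sketch below shows one plausible way to drive this class from code, mirroring the -W/-L/-I/-S options handled by setOptions(). It assumes the UT Austin Weka extensions (SoftClassifier, NaiveBayesSimpleSoft, SemiSupClassifier) are on the classpath and that buildClassifier() is implemented in the part of the file not shown; the ARFF file names and class SemiSupEMDemo are placeholders.

// Hypothetical usage sketch -- not part of SemiSupEM.java itself.
import java.io.BufferedReader;
import java.io.FileReader;
import weka.core.Instances;
import weka.classifiers.bayes.SemiSupEM;
import weka.classifiers.bayes.NaiveBayesSimpleSoft;

public class SemiSupEMDemo {
  public static void main(String[] args) throws Exception {
    // Placeholder data sets: a small labeled set and a larger unlabeled pool.
    Instances labeled =
      new Instances(new BufferedReader(new FileReader("labeled.arff")));
    labeled.setClassIndex(labeled.numAttributes() - 1);
    Instances unlabeled =
      new Instances(new BufferedReader(new FileReader("unlabeled.arff")));
    unlabeled.setClassIndex(unlabeled.numAttributes() - 1);

    SemiSupEM ssem = new SemiSupEM();
    // Roughly equivalent to the option string:
    //   -W weka.classifiers.bayes.NaiveBayesSimpleSoft -L 0.5 -I 50 -S 1
    ssem.setClassifier(new NaiveBayesSimpleSoft());
    ssem.setLambda(0.5);        // down-weight unlabeled examples (see Nigam et al.)
    ssem.setMaxIterations(50);
    ssem.setSeed(1);

    ssem.setUnlabeled(unlabeled);   // register the unlabeled pool first
    ssem.buildClassifier(labeled);  // assumed entry point: runs EM internally
  }
}

Setting lambda below 1 discounts the unlabeled examples relative to the labeled ones, which Nigam et al. report is useful when the unlabeled pool is much larger than the labeled set.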