⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 em.java

📁 数据挖掘中聚类的算法
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
/* *    This program is free software; you can redistribute it and/or modify *    it under the terms of the GNU General Public License as published by *    the Free Software Foundation; either version 2 of the License, or *    (at your option) any later version. * *    This program is distributed in the hope that it will be useful, *    but WITHOUT ANY WARRANTY; without even the implied warranty of *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the *    GNU General Public License for more details. * *    You should have received a copy of the GNU General Public License *    along with this program; if not, write to the Free Software *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//* *    EM.java *    Copyright (C) 1999 University of Waikato, Hamilton, New Zealand * */package  weka.clusterers;import weka.core.Capabilities;import weka.core.Instance;import weka.core.Instances;import weka.core.Option;import weka.core.Utils;import weka.core.WeightedInstancesHandler;import weka.estimators.DiscreteEstimator;import weka.estimators.Estimator;import weka.filters.unsupervised.attribute.ReplaceMissingValues;import java.util.Enumeration;import java.util.Random;import java.util.Vector;/** <!-- globalinfo-start --> * Simple EM (expectation maximisation) class.<br/> * <br/> * EM assigns a probability distribution to each instance which indicates the probability of it belonging to each of the clusters. EM can decide how many clusters to create by cross validation, or you may specify apriori how many clusters to generate.<br/> * <br/> * The cross validation performed to determine the number of clusters is done in the following steps:<br/> * 1. the number of clusters is set to 1<br/> * 2. the training set is split randomly into 10 folds.<br/> * 3. EM is performed 10 times using the 10 folds the usual CV way.<br/> * 4. the loglikelihood is averaged over all 10 results.<br/> * 5. if loglikelihood has increased the number of clusters is increased by 1 and the program continues at step 2. <br/> * <br/> * The number of folds is fixed to 10, as long as the number of instances in the training set is not smaller 10. If this is the case the number of folds is set equal to the number of instances. * <p/> <!-- globalinfo-end --> * <!-- options-start --> * Valid options are: <p/> *  * <pre> -N &lt;num&gt; *  number of clusters. If omitted or -1 specified, then  *  cross validation is used to select the number of clusters.</pre> *  * <pre> -I &lt;num&gt; *  max iterations. * (default 100)</pre> *  * <pre> -V *  verbose.</pre> *  * <pre> -M &lt;num&gt; *  minimum allowable standard deviation for normal density *  computation *  (default 1e-6)</pre> *  * <pre> -S &lt;num&gt; *  Random number seed. *  (default 1)</pre> *  <!-- options-end --> * * @author Mark Hall (mhall@cs.waikato.ac.nz) * @author Eibe Frank (eibe@cs.waikato.ac.nz) * @version $Revision: 1.40 $ */public class EM  extends RandomizableDensityBasedClusterer  implements NumberOfClustersRequestable, WeightedInstancesHandler {  /** for serialization */  static final long serialVersionUID = 8348181483812829475L;    /** hold the discrete estimators for each cluster */  private Estimator m_model[][];  /** hold the normal estimators for each cluster */  private double m_modelNormal[][][];  /** default minimum standard deviation */  private double m_minStdDev = 1e-6;  private double [] m_minStdDevPerAtt;  /** hold the weights of each instance for each cluster */  private double m_weights[][];  /** the prior probabilities for clusters */  private double m_priors[];  /** the loglikelihood of the data */  private double m_loglikely;  /** training instances */  private Instances m_theInstances = null;  /** number of clusters selected by the user or cross validation */  private int m_num_clusters;  /** the initial number of clusters requested by the user--- -1 if      xval is to be used to find the number of clusters */  private int m_initialNumClusters;  /** number of attributes */  private int m_num_attribs;  /** number of training instances */  private int m_num_instances;  /** maximum iterations to perform */  private int m_max_iterations;  /** attribute min values */  private double [] m_minValues;  /** attribute max values */  private double [] m_maxValues;  /** random number generator */  private Random m_rr;  /** Verbose? */  private boolean m_verbose; /** globally replace missing values */  private ReplaceMissingValues m_replaceMissing;  /**   * Returns a string describing this clusterer   * @return a description of the evaluator suitable for   * displaying in the explorer/experimenter gui   */  public String globalInfo() {    return        "Simple EM (expectation maximisation) class.\n\n"      + "EM assigns a probability distribution to each instance which "      + "indicates the probability of it belonging to each of the clusters. "      + "EM can decide how many clusters to create by cross validation, or you "      + "may specify apriori how many clusters to generate.\n\n"      + "The cross validation performed to determine the number of clusters "      + "is done in the following steps:\n"      + "1. the number of clusters is set to 1\n"      + "2. the training set is split randomly into 10 folds.\n"      + "3. EM is performed 10 times using the 10 folds the usual CV way.\n"      + "4. the loglikelihood is averaged over all 10 results.\n"      + "5. if loglikelihood has increased the number of clusters is increased "      + "by 1 and the program continues at step 2. \n\n"      + "The number of folds is fixed to 10, as long as the number of "      + "instances in the training set is not smaller 10. If this is the case "      + "the number of folds is set equal to the number of instances.";  }  /**   * Returns an enumeration describing the available options.   *   * @return an enumeration of all the available options.   */  public Enumeration listOptions () {    Vector result = new Vector();        result.addElement(new Option(	"\tnumber of clusters. If omitted or -1 specified, then \n"	+ "\tcross validation is used to select the number of clusters.", 	"N", 1, "-N <num>"));    result.addElement(new Option(	"\tmax iterations."	+ "\n(default 100)", 	"I", 1, "-I <num>"));        result.addElement(new Option(	"\tverbose.",	"V", 0, "-V"));        result.addElement(new Option(	"\tminimum allowable standard deviation for normal density\n"	+ "\tcomputation\n"	+ "\t(default 1e-6)",	"M",1,"-M <num>"));    Enumeration en = super.listOptions();    while (en.hasMoreElements())      result.addElement(en.nextElement());        return  result.elements();  }  /**   * Parses a given list of options. <p/>   *    <!-- options-start -->   * Valid options are: <p/>   *    * <pre> -N &lt;num&gt;   *  number of clusters. If omitted or -1 specified, then    *  cross validation is used to select the number of clusters.</pre>   *    * <pre> -I &lt;num&gt;   *  max iterations.   * (default 100)</pre>   *    * <pre> -V   *  verbose.</pre>   *    * <pre> -M &lt;num&gt;   *  minimum allowable standard deviation for normal density   *  computation   *  (default 1e-6)</pre>   *    * <pre> -S &lt;num&gt;   *  Random number seed.   *  (default 1)</pre>   *    <!-- options-end -->   *    * @param options the list of options as an array of strings   * @throws Exception if an option is not supported   */  public void setOptions (String[] options)    throws Exception {    resetOptions();    setDebug(Utils.getFlag('V', options));    String optionString = Utils.getOption('I', options);    if (optionString.length() != 0) {      setMaxIterations(Integer.parseInt(optionString));    }    optionString = Utils.getOption('N', options);    if (optionString.length() != 0) {      setNumClusters(Integer.parseInt(optionString));    }    optionString = Utils.getOption('M', options);    if (optionString.length() != 0) {      setMinStdDev((new Double(optionString)).doubleValue());    }        super.setOptions(options);  }  /**   * Returns the tip text for this property   * @return tip text for this property suitable for   * displaying in the explorer/experimenter gui   */  public String minStdDevTipText() {    return "set minimum allowable standard deviation";  }  /**   * Set the minimum value for standard deviation when calculating   * normal density. Reducing this value can help prevent arithmetic   * overflow resulting from multiplying large densities (arising from small   * standard deviations) when there are many singleton or near singleton   * values.   * @param m minimum value for standard deviation   */  public void setMinStdDev(double m) {    m_minStdDev = m;  }  public void setMinStdDevPerAtt(double [] m) {    m_minStdDevPerAtt = m;  }  /**   * Get the minimum allowable standard deviation.   * @return the minumum allowable standard deviation   */  public double getMinStdDev() {    return m_minStdDev;  }  /**   * Returns the tip text for this property   * @return tip text for this property suitable for   * displaying in the explorer/experimenter gui   */  public String numClustersTipText() {    return "set number of clusters. -1 to select number of clusters "      +"automatically by cross validation.";  }  /**   * Set the number of clusters (-1 to select by CV).   *   * @param n the number of clusters   * @throws Exception if n is 0   */  public void setNumClusters (int n)    throws Exception {        if (n == 0) {      throw  new Exception("Number of clusters must be > 0. (or -1 to " 			   + "select by cross validation).");    }    if (n < 0) {      m_num_clusters = -1;      m_initialNumClusters = -1;    }    else {      m_num_clusters = n;      m_initialNumClusters = n;    }  }  /**   * Get the number of clusters   *   * @return the number of clusters.   */  public int getNumClusters () {    return  m_initialNumClusters;  }  /**   * Returns the tip text for this property   * @return tip text for this property suitable for   * displaying in the explorer/experimenter gui   */  public String maxIterationsTipText() {    return "maximum number of iterations";  }  /**   * Set the maximum number of iterations to perform   *   * @param i the number of iterations   * @throws Exception if i is less than 1   */  public void setMaxIterations (int i)    throws Exception {    if (i < 1) {      throw  new Exception("Maximum number of iterations must be > 0!");    }    m_max_iterations = i;  }  /**   * Get the maximum number of iterations   *   * @return the number of iterations   */  public int getMaxIterations () {    return  m_max_iterations;  }    /**   * Returns the tip text for this property   * @return tip text for this property suitable for   * displaying in the explorer/experimenter gui   */  public String debugTipText() {    return "If set to true, clusterer may output additional info to " +      "the console.";  }  /**   * Set debug mode - verbose output   *   * @param v true for verbose output   */  public void setDebug (boolean v) {    m_verbose = v;  }  /**   * Get debug mode   *   * @return true if debug mode is set   */  public boolean getDebug () {    return  m_verbose;  }

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -