
📄 xmeans.java

📁 A collection of data mining algorithms written in Java, including clustering, classification, preprocessing, and more
💻 JAVA
📖 Page 1 of 5
/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    XMeans.java
 *    Copyright (C) 2000 Mark Hall, Malcolm Ware, Gabi Schmidberger
 *
 */

package weka.clusterers;

import weka.core.AlgVector;
import weka.core.Capabilities;
import weka.core.EuclideanDistance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.KDTree;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformation.Type;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
import weka.core.Capabilities.Capability;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.ReplaceMissingValues;

import java.io.BufferedReader;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.PrintWriter;
import java.io.Reader;
import java.util.Enumeration;
import java.util.Random;
import java.util.Vector;

/**
 <!-- globalinfo-start -->
 * Cluster data using the X-means algorithm.<br/>
 * <br/>
 * X-Means is K-Means extended by an Improve-Structure part. In this part of the algorithm the centers are attempted to be split in their region. The decision between the children of each center and itself is done by comparing the BIC-values of the two structures.<br/>
 * <br/>
 * For more information see:<br/>
 * <br/>
 * Dan Pelleg, Andrew W. Moore: X-means: Extending K-means with Efficient Estimation of the Number of Clusters. In: Seventeenth International Conference on Machine Learning, 727-734, 2000.
 * <p/>
 <!-- globalinfo-end -->
 *
 <!-- technical-bibtex-start -->
 * BibTeX:
 * <pre>
 * &#64;inproceedings{Pelleg2000,
 *    author = {Dan Pelleg and Andrew W. Moore},
 *    booktitle = {Seventeenth International Conference on Machine Learning},
 *    pages = {727-734},
 *    publisher = {Morgan Kaufmann},
 *    title = {X-means: Extending K-means with Efficient Estimation of the Number of Clusters},
 *    year = {2000}
 * }
 * </pre>
 * <p/>
 <!-- technical-bibtex-end -->
 *
 <!-- options-start -->
 * Valid options are: <p/>
 *
 * <pre> -I &lt;num&gt;
 *  maximum number of overall iterations
 *  (default = 1).</pre>
 *
 * <pre> -M &lt;num&gt;
 *  maximum number of iterations in the kMeans loop in
 *  the Improve-Parameter part
 *  (default = 1000).</pre>
 *
 * <pre> -J &lt;num&gt;
 *  maximum number of iterations in the kMeans loop
 *  for the splitted centroids in the Improve-Structure part
 *  (default = 1000).</pre>
 *
 * <pre> -L &lt;num&gt;
 *  minimum number of clusters
 *  (default = 2).</pre>
 *
 * <pre> -H &lt;num&gt;
 *  maximum number of clusters
 *  (default = 4).</pre>
 *
 * <pre> -V &lt;value&gt;
 *  distance value for binary attributes
 *  (default = 1.0).</pre>
 *
 * <pre> -K &lt;KDTree class specification&gt;
 *  Full class name of KDTree class to use, followed
 *  by scheme options.
*  eg: "weka.core.KDTree -P" *  (default = no KDTree class used).</pre> *  * <pre> -C &lt;value&gt; *  cutoff factor, takes the given percentage of the splitted  *  centroids if none of the children win *  (default = 0.0).</pre> *  * <pre> -K &lt;distance function class specification&gt; *  Full class name of Distance function class to use, followed *  by scheme options. *  eg: "weka.core.MahalanobisDistance" *  (default = weka.core.EuclideanDistance).</pre> *  * <pre> -N &lt;file name&gt; *  file to read starting centers from (ARFF format).</pre> *  * <pre> -O &lt;file name&gt; *  file to write centers to (ARFF format).</pre> *  * <pre> -S &lt;num&gt; *  random number seed (default 10).</pre> *  <!-- options-end --> * * @author Gabi Schmidberger (gabi@cs.waikato.ac.nz) * @author Mark Hall (mhall@cs.waikato.ac.nz) * @author Malcolm Ware (mfw4@cs.waikato.ac.nz) * @version $Revision: 1.14 $ * @see Clusterer * @see OptionHandler */public class XMeans   extends Clusterer   implements OptionHandler, TechnicalInformationHandler {  /*   * major TODOS:   *   * make BIC-Score replaceable by other scores   */  /** for serialization */  static final long serialVersionUID = -7941793078404132616L;    /** training instances */  private Instances m_Instances = null;  /** model information, should increase readability */  private Instances m_Model = null;    /** replace missing values in training instances */  private ReplaceMissingValues m_ReplaceMissingFilter;  /**   * Distance value between true and false of binary attributes and    * "same" and "different" of nominal attributes (default = 1.0).   */  private double m_BinValue = 1.0;  /** BIC-Score of the current model */  double m_Bic = Double.MIN_VALUE;  /** Distortion  */  double [] m_Mle = null;  /** maximum overall iterations */  private int m_MaxIterations = 1;  /**   * maximum iterations to perform Kmeans part    * if negative, iterations are not checked   */  private int m_MaxKMeans = 1000;  /** see above, but for kMeans of splitted clusters   */  private int m_MaxKMeansForChildren = 1000;  /** The actual number of clusters */  private int m_NumClusters = 2;  /** min number of clusters to generate */  private int m_MinNumClusters = 2;  /** max number of clusters to generate */  private int m_MaxNumClusters = 4;  /** the distance function used */  private EuclideanDistance m_DistanceF = new EuclideanDistance();  /** cluster centers */  private Instances m_ClusterCenters;  /** file name of the output file for the cluster centers */  String m_InputCenterFile = null;  /* --> DebugVektors - USED FOR DEBUGGING */  /** input file for the random vektors --> USED FOR DEBUGGING */  Reader m_DebugVektorsInput = null;  int m_DebugVektorsIndex = 0;  Instances m_DebugVektors = null;  /** file name of the input file for the random vektors */  String m_DebugVektorsFile = null;  /** input file for the cluster centers */  Reader m_CenterInput = null;      /** file name of the output file for the cluster centers */  String m_OutputCenterFile = null;    /** output file for the cluster centers */  PrintWriter m_CenterOutput = null;      /**   * temporary variable holding cluster assignments while iterating   */  private int [] m_ClusterAssignments;  /** cutoff factor - percentage of splits done in Improve-Structure part     only relevant, if all children lost */   double m_CutOffFactor = 0.5;  /**   * random seed   */  private int m_Seed = 10;  /**   * Index in ranges for LOW and HIGH and WIDTH   */  public static int R_LOW = 0;  public static int R_HIGH = 1;  
  public static int R_WIDTH = 2;

  /**
   * KDTrees class if KDTrees are used
   */
  private KDTree m_KDTree = null;

  /** counts iterations done in main loop */
  private int m_IterationCount = 0;

  /** counter to say how often kMeans was stopped by loop counter */
  private int m_KMeansStopped = 0;

  /** Number of splits prepared */
  private int m_NumSplits = 0;

  /** Number of splits accepted (including cutoff factor decisions) */
  private int m_NumSplitsDone = 0;

  /** Number of splits accepted just because of cutoff factor */
  private int m_NumSplitsStillDone = 0;

  /**
   * level of debug output, 0 is no output.
   */
  private int m_DebugLevel = 0;

  /** print the centers */
  public static int D_PRINTCENTERS = 1;

  /** follows the splitting of the centers */
  public static int D_FOLLOWSPLIT = 2;

  /** have a closer look at converge children */
  public static int D_CONVCHCLOSER = 3;

  /** check on random vektors */
  public static int D_RANDOMVEKTOR = 4;

  /** check on kdtree */
  public static int D_KDTREE = 5;

  /** follow iterations */
  public static int D_ITERCOUNT = 6;

  /** functions were maybe misused */
  public static int D_METH_MISUSE = 80;

  /** for current debug */
  public static int D_CURR = 88;

  /** general debug output */
  public static int D_GENERAL = 99;

  /** Flag: I'm debugging */
  public boolean m_CurrDebugFlag = true;

  /**
   * Returns a string describing this clusterer
   * @return a description of the evaluator suitable for
   * displaying in the explorer/experimenter gui
   */
  public String globalInfo() {
    return
        "Cluster data using the X-means algorithm.\n\n"
      + "X-Means is K-Means extended by an Improve-Structure part. In this "
      + "part of the algorithm the centers are attempted to be split in "
      + "their region. The decision between the children of each center and "
      + "itself is done by comparing the BIC-values of the two structures.\n\n"
      + "For more information see:\n\n"
      + getTechnicalInformation().toString();
  }

  /**
   * Returns an instance of a TechnicalInformation object, containing
   * detailed information about the technical background of this class,
   * e.g., paper reference or book this class is based on.
   *
   * @return the technical information about this class
   */
  public TechnicalInformation getTechnicalInformation() {
    TechnicalInformation result;

    result = new TechnicalInformation(Type.INPROCEEDINGS);
    result.setValue(Field.AUTHOR, "Dan Pelleg and Andrew W. Moore");
    result.setValue(Field.TITLE, "X-means: Extending K-means with Efficient Estimation of the Number of Clusters");
    result.setValue(Field.BOOKTITLE, "Seventeenth International Conference on Machine Learning");
    result.setValue(Field.YEAR, "2000");
    result.setValue(Field.PAGES, "727-734");
    result.setValue(Field.PUBLISHER, "Morgan Kaufmann");

    return result;
  }

  /**
   * Returns default capabilities of the clusterer.
   *
   * @return      the capabilities of this clusterer
   */
  public Capabilities getCapabilities() {
    Capabilities result = super.getCapabilities();

    // attributes
    result.enable(Capability.NUMERIC_ATTRIBUTES);
    result.enable(Capability.DATE_ATTRIBUTES);
    result.enable(Capability.MISSING_VALUES);

    return result;
  }

  /**
   * Generates the X-Means clusterer.
   *
   * @param data set of instances serving as training data
   * @throws Exception if the clusterer has not been
   * generated successfully
   */
  public void buildClusterer(Instances data) throws Exception {
    // can clusterer handle the data?
    getCapabilities().testWithFail(data);

    m_NumSplits = 0;
    m_NumSplitsDone = 0;
    m_NumSplitsStillDone = 0;

    // replace missing values
    m_ReplaceMissingFilter = new ReplaceMissingValues();
    m_ReplaceMissingFilter.setInputFormat(data);
    m_Instances = Filter.useFilter(data, m_ReplaceMissingFilter);

    // initialize random function
    Random random0 = new Random(m_Seed);

    // num of clusters to start with
    m_NumClusters = m_MinNumClusters;

    // set distance function to default
    if (m_DistanceF == null) {
      m_DistanceF = new EuclideanDistance();
    }
    m_DistanceF.setInstances(m_Instances);
    checkInstances();

    if (m_DebugVektorsFile != null)
      initDebugVektorsInput();

    // make list of indexes for m_Instances
    int [] allInstList = new int[m_Instances.numInstances()];
    for (int i = 0; i < m_Instances.numInstances(); i++) {
      allInstList[i] = i;
    }

    // set model used (just for convenience)
    m_Model = new Instances(m_Instances, 0);

    // produce the starting centers
    if (m_CenterInput != null) {
      // read centers from file
      m_ClusterCenters = new Instances(m_CenterInput);
      m_NumClusters = m_ClusterCenters.numInstances();
    }
    else
      // makes the first centers randomly
      m_ClusterCenters = makeCentersRandomly(random0,
                                             m_Instances, m_NumClusters);
    PFD(D_FOLLOWSPLIT, "\n*** Starting centers ");
    for (int k = 0; k < m_ClusterCenters.numInstances(); k++) {
      PFD(D_FOLLOWSPLIT, "Center " + k + ": " + m_ClusterCenters.instance(k));
    }
    PrCentersFD(D_PRINTCENTERS);

    boolean finished = false;
    Instances children;

    // builds up a KDTree
    if (m_KDTree != null) {
      m_KDTree.setInstances(m_Instances);
    }

    // loop counter of main loop
    m_IterationCount = 0;

    /**
     * "finished" does get true as soon as:
     * 1. number of clusters gets >= m_MaxClusters,
     * 2. in the last round, none of the centers have been split
     *
     * if number of clusters is already >= m_MaxClusters
     * part 1 (= Improve-Params) is done at least once.
     */
    while (!finished &&
           !stopIteration(m_IterationCount, m_MaxIterations)) {

      /* ====================================================================
       * 1. Improve-Params
       *    conventional K-means
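The listing above is only the first of five pages of XMeans.java, but the class javadoc already documents the public options (-L/-H for the cluster-count range, -S for the random seed, and so on). Below is a minimal, hypothetical usage sketch that drives the clusterer through Weka's standard Clusterer/OptionHandler API. The ARFF file name and the option values are placeholders, and only methods that appear in the listing or are inherited from weka.clusterers.Clusterer and weka.core.OptionHandler are used.

import java.io.BufferedReader;
import java.io.FileReader;

import weka.clusterers.XMeans;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Utils;

/**
 * Minimal usage sketch for the XMeans clusterer shown above.
 * The file name "iris.arff" and the option values are illustrative only.
 */
public class XMeansExample {

  public static void main(String[] args) throws Exception {
    // load a numeric data set in ARFF format (the path is an assumption)
    Instances data = new Instances(
        new BufferedReader(new FileReader("iris.arff")));

    XMeans clusterer = new XMeans();

    // -L / -H bound the number of clusters searched,
    // -S sets the random seed (see the options in the class javadoc)
    clusterer.setOptions(Utils.splitOptions("-L 2 -H 6 -S 10"));

    // run the Improve-Params / Improve-Structure loop
    clusterer.buildClusterer(data);

    System.out.println("clusters found: " + clusterer.numberOfClusters());

    // assign each training instance to one of the resulting centers
    for (int i = 0; i < data.numInstances(); i++) {
      Instance inst = data.instance(i);
      System.out.println(i + " -> cluster " + clusterer.clusterInstance(inst));
    }
  }
}

Passing the options through setOptions mirrors the command line documented in the javadoc, so the same -L/-H/-S values can be used from code or from the Weka command-line interface.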
