⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 xmeans.java

📁 wekaUT是 university texas austin 开发的基于weka的半指导学习(semi supervised learning)的分类器
💻 JAVA
📖 第 1 页 / 共 5 页
字号:
/* *    This program is free software; you can redistribute it and/or modify *    it under the terms of the GNU General Public License as published by *    the Free Software Foundation; either version 2 of the License, or *    (at your option) any later version. * *    This program is distributed in the hope that it will be useful, *    but WITHOUT ANY WARRANTY; without even the implied warranty of *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the *    GNU General Public License for more details. * *    You should have received a copy of the GNU General Public License *    along with this program; if not, write to the Free Software *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//* *    XMeans.java *    Copyright (C) 2000 Mark Hall, Malcolm Ware, Gabi Schmidberger * */package weka.clusterers;import java.io.*;import java.util.*;import weka.core.AlgVector;import weka.core.AttributeStats;import weka.core.KDTree;import weka.core.DistanceFunction;import weka.core.EuclideanDistance;import weka.core.Instances;import weka.core.Instance;import weka.core.Attribute;import weka.core.Utils;import weka.core.Option;import weka.core.OptionHandler;import weka.filters.Filter;import weka.filters.unsupervised.attribute.ReplaceMissingValues;/** * XMeans clustering class. * * X-Means is K-Means extended by an Improve-Structure part In this part of  * the algorithm the centers are attempted to be split in its region.  * The decision between the children of  * each center and itself is done comparing the BIC-values of  * the two structures. * See also D. Pelleg and A. Moore's paper 'X-means: Extending  * K-means with Efficient Estimation of the Number of Clusters'. <p> * * Valid options are:<p> * * -I <max iterations> <br> * Maximum number of iterations in the overall loop (default = 1). <p> *  * -M <max iterations> <br> * Maximum number of iterations in the kMeans loop in <br> * the Improve-Parameter part (default = 1000).<p> * * -J <max iterations> <br> * Maximum number of iterations in the kMeans loop for the splitted <br> * centroids in the Improve-Structure part (default = 1000).<p> * * -L <minimal number of clusters> <br> * Specify the number of clusters to start with.<p> * * -H <maximal number of clusters> <br> * Specify the maximal number of clusters.<p> * * -B <value> <br> * Distance value between true and false of binary attributes and <br> * "same" and "different" of nominal attributes (default = 1.0).<p> * * -K <kdtree class><br> * KDTrees class and its options (can only use the same distance function * as XMeans).<p> * * -C <cutoff factor> <br> * If none of the children are better, percentage of the best splits<br> * to be taken.<p> *  * -D <distance function class> * Distance function class to be used (default = Euclidean distance) *  * -N <file name> <br> * Input starting cluster centers from file (ARFF-format). <p> * * -O <file name> <br> * Output cluster centers to file (ARFF-format). <p> * * -S <seed> <br> * Specify random number seed. <p> * * -U <debuglevel> <br> * Set debuglevel. <p> * * -Y <file name> <br> * Used for debugging: Input random vektors from file. <p> * * major TODOS: * * make BIC-Score replaceable by other scores * * @author Gabi Schmidberger <gabi@cs.waikato.ac.nz) * @author Mark Hall (mhall@cs.waikato.ac.nz) * @author Malcolm Ware <mfw4@cs.waikato.ac.nz) * @version $Revision: 1.1.1.1 $ * @see Clusterer * @see OptionHandler */public class XMeans extends Clusterer implements OptionHandler {  private AlgVector algv; // TODO just a trick  /* training instances */  private Instances m_Instances = null;  /* model information, should increase readability */  private Instances m_Model = null;    /* replace missing values in training instances */  private ReplaceMissingValues m_ReplaceMissingFilter;  /**   * Distance value between true and false of binary attributes and    * "same" and "different" of nominal attributes (default = 1.0).   */  private double m_BinValue = 1.0;  /* BIC-Score of the current model */  double m_Bic = Double.MIN_VALUE;  /* Distortion  */  double [] m_Mle = null;  /* maximum overall iterations */  private int m_MaxIterations = 1;  /**   * maximum iterations to perform Kmeans part    * if negative, iterations are not checked   */  private int m_MaxKMeans = 1000;  /* see above, but for kMeans of splitted clusters   */  private int m_MaxKMeansForChildren = 1000;  /* The actual number of clusters */  private int m_NumClusters = 2;  /* min number of clusters to generate */  private int m_MinNumClusters = 2;  /* max number of clusters to generate */  private int m_MaxNumClusters = 4;  /** the distance function used */  private DistanceFunction m_DistanceF = null;  /* cluster centers */  private Instances m_ClusterCenters;  /* file name of the output file for the cluster centers */  String m_InputCenterFile = null;  /*--> DebugVektors - USED FOR DEBUGGING */  /* input file for the random vektors --> USED FOR DEBUGGING */  Reader m_DebugVektorsInput = null;  int m_DebugVektorsIndex = 0;  Instances m_DebugVektors = null;  /* file name of the input file for the random vektors */  String m_DebugVektorsFile = null;  /* input file for the cluster centers */  Reader m_CenterInput = null;      /* file name of the output file for the cluster centers */  String m_OutputCenterFile = null;    /* output file for the cluster centers */  PrintWriter m_CenterOutput = null;      /**   * temporary variable holding cluster assignments while iterating   */  private int [] m_ClusterAssignments;  /* cutoff factor - percentage of splits done in Improve-Structure part     only relevant, if all children lost */   double m_CutOffFactor = 0.5;  /**   * random seed   */  private int m_Seed = 10;  /**   * Ranges of the universe of data, lowest value, highest value and width   */  double [][] m_Ranges;  /**   * Index in ranges for LOW and HIGH and WIDTH   */  public static int R_LOW = 0;  public static int R_HIGH = 1;  public static int R_WIDTH = 2;  /**   * KDTrees class if KDTrees are used   */  private KDTree m_KDTree = null;  /* counts iterations done in main loop */  private int m_IterationCount = 0;  /* counter to say how often kMeans was stopped by loop counter */  private int m_KMeansStopped = 0;  /* Number of splits prepared */  private int m_NumSplits = 0;  /* Number of splits accepted (including cutoff factor decisions) */  private int m_NumSplitsDone = 0;  /* Number of splits accepted just because of cutoff factor */  private int m_NumSplitsStillDone = 0;  /**   * level of debug output, 0 is no output.   */  private int m_DebugLevel = 0;  public static int D_PRINTCENTERS = 1;  // follows the splitting of the centers  public static int D_FOLLOWSPLIT = 2;  // have a closer look at converge children  public static int D_CONVCHCLOSER = 3;  // check on random vektors  public static int D_RANDOMVEKTOR = 4;  // check on kdtree  public static int D_KDTREE = 5;  // follow iterations  public static int D_ITERCOUNT = 6;  // functions were maybe misused   public static int D_METH_MISUSE = 80;   // for current debug   public static int D_CURR = 88;  public static int D_GENERAL = 99;  // Flag: I'm debugging  public boolean m_CurrDebugFlag = true;   /**   * Returns a string describing this clusterer   * @return a description of the evaluator suitable for   * displaying in the explorer/experimenter gui   */  public String globalInfo() {    return "Cluster data using the X-means algorithm" +      ", as described in D. Pelleg and A. Moore's paper 'X-means: Extending" +      " K-means with Efficient Estimation of the Number of Clusters'.";  }    /**   * Function should be in the Instances class!!   *   * Initializes the minimum and maximum values   * based on all instances.   *   * @param instList list of indexes    */  public static double [][] initializeRanges(Instances instances,                                             int[] instList) {    int numAtt = instances.numAttributes();        double [][] ranges = new double [numAtt][3];        // initialize ranges using the first instance    updateRangesFirst(instances.instance(instList[0]), numAtt,		      ranges);    // update ranges, starting from the second    for (int i = 1; i < instList.length; i++) {      updateRanges(instances.instance(instList[i]), numAtt,		   ranges);    }    return ranges;  }  /**   * Function should be in the Instances class!!   *   * Prints a range.   *   * @param ranges the ranges to print   */  public static void printRanges(Instances model,				 double[][] ranges) {    System.out.println("printRanges");    for (int j = 0; j < model.numAttributes(); j++) {      System.out.print("Attribute "+ j +" LOW: " + ranges[j][R_LOW]);      System.out.print(" HIGH: " + ranges[j][R_HIGH]);      System.out.print(" WIDTH: " + ranges[j][R_WIDTH]);      System.out.println(" ");    }  }  /**   * Function should be in the Instances class!!   *   * Used to initialize the ranges. For this the values   * of the first instance is used to save time.   * Sets low and high to the values of the first instance and   * width to zero.   *   * @param instance the new instance   * @param numAtt number of attributes in the model   */  public static void updateRangesFirst(Instance instance, int numAtt,				       double[][] ranges) {      for (int j = 0; j < numAtt; j++) {      if (!instance.isMissing(j)) {	  ranges[j][R_LOW] = instance.value(j);	  ranges[j][R_HIGH] = instance.value(j);	  ranges[j][R_WIDTH] = 0.0;	}       else { // if value was missing	  ranges[j][R_LOW] = Double.MIN_VALUE;	  ranges[j][R_HIGH] = Double.MAX_VALUE;	  ranges[j][R_WIDTH] = 0.0; //todo??	}    }  }   /**   * Function should be in the Instances class!!   *   * Updates the minimum and maximum and width values for all the attributes   * based on a new instance.   *   * @param instance the new instance   * @param numAtt number of attributes in the model   * @param ranges low, high and width values for all attributes   */  public static void updateRanges(Instance instance, int numAtt,				  double [][] ranges) {      // updateRangesFirst must have been called on ranges    for (int j = 0; j < numAtt; j++) {      double value = instance.value(j);      if (!instance.isMissing(j)) {	if (value < ranges[j][R_LOW]) {	  ranges[j][R_LOW] = value;	  ranges[j][R_WIDTH] = ranges[j][R_HIGH] - ranges[j][R_LOW];	} else {	  if (instance.value(j) > ranges[j][R_HIGH]) {	    ranges[j][R_HIGH] = value;	    ranges[j][R_WIDTH] = ranges[j][R_HIGH] - ranges[j][R_LOW];	  }	}      }    }  }   /**   * Generates the X-Means clusterer.    *   * @param data set of instances serving as training data    * @exception Exception if the clusterer has not been    * generated successfully   */  public void buildClusterer(Instances data) throws Exception {    if (data.checkForStringAttributes()) {      throw  new Exception("Can't handle string attributes!");    }    // replace missing values    m_ReplaceMissingFilter = new ReplaceMissingValues();    m_ReplaceMissingFilter.setInputFormat(data);    data = Filter.useFilter(data, m_ReplaceMissingFilter);    m_Instances = data;        // initialize random function    Random random0 = new Random(m_Seed);    // num of clusters to start with    m_NumClusters =  m_MinNumClusters;    // set distance function to default    if (m_DistanceF == null) {      m_DistanceF = new EuclideanDistance(data);      checkInstances();    }    if (m_DistanceF != null) {      checkInstances();    }    //     if (m_DebugVektorsFile != null)      initDebugVektorsInput();    // make list of indexes for m_Instances    int [] allInstList = new int[m_Instances.numInstances()];     for (int i = 0; i < m_Instances.numInstances(); i++) {      allInstList[i] = i;    }    // prepare the min and max value        m_Ranges = m_Instances.initializeRanges(allInstList);    // set model used (just for convenience)    m_Model = new Instances(m_Instances, 0);    // produce the starting centers    if (m_CenterInput != null) {      // read centers from file      m_ClusterCenters = new Instances(m_CenterInput);      m_NumClusters = m_ClusterCenters.numInstances();    }    else      // makes the first centers randomly      m_ClusterCenters = makeCentersRandomly(random0,					     m_Instances, m_NumClusters);    PFD(D_FOLLOWSPLIT, "\n*** Starting centers ");    for (int k = 0; k < m_ClusterCenters.numInstances(); k++) {      PFD(D_FOLLOWSPLIT, "Center " + k + ": " + m_ClusterCenters.instance(k));    }    PrCentersFD(D_PRINTCENTERS);    boolean finished = false;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -