📄 xmeans.java
字号:
/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//* * XMeans.java * Copyright (C) 2000 Mark Hall, Malcolm Ware, Gabi Schmidberger * */package weka.clusterers;import java.io.*;import java.util.*;import weka.core.AlgVector;import weka.core.AttributeStats;import weka.core.KDTree;import weka.core.DistanceFunction;import weka.core.EuclideanDistance;import weka.core.Instances;import weka.core.Instance;import weka.core.Attribute;import weka.core.Utils;import weka.core.Option;import weka.core.OptionHandler;import weka.filters.Filter;import weka.filters.unsupervised.attribute.ReplaceMissingValues;/** * XMeans clustering class. * * X-Means is K-Means extended by an Improve-Structure part In this part of * the algorithm the centers are attempted to be split in its region. * The decision between the children of * each center and itself is done comparing the BIC-values of * the two structures. * See also D. Pelleg and A. Moore's paper 'X-means: Extending * K-means with Efficient Estimation of the Number of Clusters'. <p> * * Valid options are:<p> * * -I <max iterations> <br> * Maximum number of iterations in the overall loop (default = 1). <p> * * -M <max iterations> <br> * Maximum number of iterations in the kMeans loop in <br> * the Improve-Parameter part (default = 1000).<p> * * -J <max iterations> <br> * Maximum number of iterations in the kMeans loop for the splitted <br> * centroids in the Improve-Structure part (default = 1000).<p> * * -L <minimal number of clusters> <br> * Specify the number of clusters to start with.<p> * * -H <maximal number of clusters> <br> * Specify the maximal number of clusters.<p> * * -B <value> <br> * Distance value between true and false of binary attributes and <br> * "same" and "different" of nominal attributes (default = 1.0).<p> * * -K <kdtree class><br> * KDTrees class and its options (can only use the same distance function * as XMeans).<p> * * -C <cutoff factor> <br> * If none of the children are better, percentage of the best splits<br> * to be taken.<p> * * -D <distance function class> * Distance function class to be used (default = Euclidean distance) * * -N <file name> <br> * Input starting cluster centers from file (ARFF-format). <p> * * -O <file name> <br> * Output cluster centers to file (ARFF-format). <p> * * -S <seed> <br> * Specify random number seed. <p> * * -U <debuglevel> <br> * Set debuglevel. <p> * * -Y <file name> <br> * Used for debugging: Input random vektors from file. <p> * * major TODOS: * * make BIC-Score replaceable by other scores * * @author Gabi Schmidberger <gabi@cs.waikato.ac.nz) * @author Mark Hall (mhall@cs.waikato.ac.nz) * @author Malcolm Ware <mfw4@cs.waikato.ac.nz) * @version $Revision: 1.1.1.1 $ * @see Clusterer * @see OptionHandler */public class XMeans extends Clusterer implements OptionHandler { private AlgVector algv; // TODO just a trick /* training instances */ private Instances m_Instances = null; /* model information, should increase readability */ private Instances m_Model = null; /* replace missing values in training instances */ private ReplaceMissingValues m_ReplaceMissingFilter; /** * Distance value between true and false of binary attributes and * "same" and "different" of nominal attributes (default = 1.0). */ private double m_BinValue = 1.0; /* BIC-Score of the current model */ double m_Bic = Double.MIN_VALUE; /* Distortion */ double [] m_Mle = null; /* maximum overall iterations */ private int m_MaxIterations = 1; /** * maximum iterations to perform Kmeans part * if negative, iterations are not checked */ private int m_MaxKMeans = 1000; /* see above, but for kMeans of splitted clusters */ private int m_MaxKMeansForChildren = 1000; /* The actual number of clusters */ private int m_NumClusters = 2; /* min number of clusters to generate */ private int m_MinNumClusters = 2; /* max number of clusters to generate */ private int m_MaxNumClusters = 4; /** the distance function used */ private DistanceFunction m_DistanceF = null; /* cluster centers */ private Instances m_ClusterCenters; /* file name of the output file for the cluster centers */ String m_InputCenterFile = null; /*--> DebugVektors - USED FOR DEBUGGING */ /* input file for the random vektors --> USED FOR DEBUGGING */ Reader m_DebugVektorsInput = null; int m_DebugVektorsIndex = 0; Instances m_DebugVektors = null; /* file name of the input file for the random vektors */ String m_DebugVektorsFile = null; /* input file for the cluster centers */ Reader m_CenterInput = null; /* file name of the output file for the cluster centers */ String m_OutputCenterFile = null; /* output file for the cluster centers */ PrintWriter m_CenterOutput = null; /** * temporary variable holding cluster assignments while iterating */ private int [] m_ClusterAssignments; /* cutoff factor - percentage of splits done in Improve-Structure part only relevant, if all children lost */ double m_CutOffFactor = 0.5; /** * random seed */ private int m_Seed = 10; /** * Ranges of the universe of data, lowest value, highest value and width */ double [][] m_Ranges; /** * Index in ranges for LOW and HIGH and WIDTH */ public static int R_LOW = 0; public static int R_HIGH = 1; public static int R_WIDTH = 2; /** * KDTrees class if KDTrees are used */ private KDTree m_KDTree = null; /* counts iterations done in main loop */ private int m_IterationCount = 0; /* counter to say how often kMeans was stopped by loop counter */ private int m_KMeansStopped = 0; /* Number of splits prepared */ private int m_NumSplits = 0; /* Number of splits accepted (including cutoff factor decisions) */ private int m_NumSplitsDone = 0; /* Number of splits accepted just because of cutoff factor */ private int m_NumSplitsStillDone = 0; /** * level of debug output, 0 is no output. */ private int m_DebugLevel = 0; public static int D_PRINTCENTERS = 1; // follows the splitting of the centers public static int D_FOLLOWSPLIT = 2; // have a closer look at converge children public static int D_CONVCHCLOSER = 3; // check on random vektors public static int D_RANDOMVEKTOR = 4; // check on kdtree public static int D_KDTREE = 5; // follow iterations public static int D_ITERCOUNT = 6; // functions were maybe misused public static int D_METH_MISUSE = 80; // for current debug public static int D_CURR = 88; public static int D_GENERAL = 99; // Flag: I'm debugging public boolean m_CurrDebugFlag = true; /** * Returns a string describing this clusterer * @return a description of the evaluator suitable for * displaying in the explorer/experimenter gui */ public String globalInfo() { return "Cluster data using the X-means algorithm" + ", as described in D. Pelleg and A. Moore's paper 'X-means: Extending" + " K-means with Efficient Estimation of the Number of Clusters'."; } /** * Function should be in the Instances class!! * * Initializes the minimum and maximum values * based on all instances. * * @param instList list of indexes */ public static double [][] initializeRanges(Instances instances, int[] instList) { int numAtt = instances.numAttributes(); double [][] ranges = new double [numAtt][3]; // initialize ranges using the first instance updateRangesFirst(instances.instance(instList[0]), numAtt, ranges); // update ranges, starting from the second for (int i = 1; i < instList.length; i++) { updateRanges(instances.instance(instList[i]), numAtt, ranges); } return ranges; } /** * Function should be in the Instances class!! * * Prints a range. * * @param ranges the ranges to print */ public static void printRanges(Instances model, double[][] ranges) { System.out.println("printRanges"); for (int j = 0; j < model.numAttributes(); j++) { System.out.print("Attribute "+ j +" LOW: " + ranges[j][R_LOW]); System.out.print(" HIGH: " + ranges[j][R_HIGH]); System.out.print(" WIDTH: " + ranges[j][R_WIDTH]); System.out.println(" "); } } /** * Function should be in the Instances class!! * * Used to initialize the ranges. For this the values * of the first instance is used to save time. * Sets low and high to the values of the first instance and * width to zero. * * @param instance the new instance * @param numAtt number of attributes in the model */ public static void updateRangesFirst(Instance instance, int numAtt, double[][] ranges) { for (int j = 0; j < numAtt; j++) { if (!instance.isMissing(j)) { ranges[j][R_LOW] = instance.value(j); ranges[j][R_HIGH] = instance.value(j); ranges[j][R_WIDTH] = 0.0; } else { // if value was missing ranges[j][R_LOW] = Double.MIN_VALUE; ranges[j][R_HIGH] = Double.MAX_VALUE; ranges[j][R_WIDTH] = 0.0; //todo?? } } } /** * Function should be in the Instances class!! * * Updates the minimum and maximum and width values for all the attributes * based on a new instance. * * @param instance the new instance * @param numAtt number of attributes in the model * @param ranges low, high and width values for all attributes */ public static void updateRanges(Instance instance, int numAtt, double [][] ranges) { // updateRangesFirst must have been called on ranges for (int j = 0; j < numAtt; j++) { double value = instance.value(j); if (!instance.isMissing(j)) { if (value < ranges[j][R_LOW]) { ranges[j][R_LOW] = value; ranges[j][R_WIDTH] = ranges[j][R_HIGH] - ranges[j][R_LOW]; } else { if (instance.value(j) > ranges[j][R_HIGH]) { ranges[j][R_HIGH] = value; ranges[j][R_WIDTH] = ranges[j][R_HIGH] - ranges[j][R_LOW]; } } } } } /** * Generates the X-Means clusterer. * * @param data set of instances serving as training data * @exception Exception if the clusterer has not been * generated successfully */ public void buildClusterer(Instances data) throws Exception { if (data.checkForStringAttributes()) { throw new Exception("Can't handle string attributes!"); } // replace missing values m_ReplaceMissingFilter = new ReplaceMissingValues(); m_ReplaceMissingFilter.setInputFormat(data); data = Filter.useFilter(data, m_ReplaceMissingFilter); m_Instances = data; // initialize random function Random random0 = new Random(m_Seed); // num of clusters to start with m_NumClusters = m_MinNumClusters; // set distance function to default if (m_DistanceF == null) { m_DistanceF = new EuclideanDistance(data); checkInstances(); } if (m_DistanceF != null) { checkInstances(); } // if (m_DebugVektorsFile != null) initDebugVektorsInput(); // make list of indexes for m_Instances int [] allInstList = new int[m_Instances.numInstances()]; for (int i = 0; i < m_Instances.numInstances(); i++) { allInstList[i] = i; } // prepare the min and max value m_Ranges = m_Instances.initializeRanges(allInstList); // set model used (just for convenience) m_Model = new Instances(m_Instances, 0); // produce the starting centers if (m_CenterInput != null) { // read centers from file m_ClusterCenters = new Instances(m_CenterInput); m_NumClusters = m_ClusterCenters.numInstances(); } else // makes the first centers randomly m_ClusterCenters = makeCentersRandomly(random0, m_Instances, m_NumClusters); PFD(D_FOLLOWSPLIT, "\n*** Starting centers "); for (int k = 0; k < m_ClusterCenters.numInstances(); k++) { PFD(D_FOLLOWSPLIT, "Center " + k + ": " + m_ClusterCenters.instance(k)); } PrCentersFD(D_PRINTCENTERS); boolean finished = false;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -