📄 xmeans.java
字号:
/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//* * XMeans.java * Copyright (C) 2000 University of Waikato, Hamilton, New Zealand * */package weka.clusterers;import weka.core.AlgVector;import weka.core.Capabilities;import weka.core.DistanceFunction;import weka.core.EuclideanDistance;import weka.core.Instance;import weka.core.Instances;import weka.core.neighboursearch.KDTree;import weka.core.Option;import weka.core.OptionHandler;import weka.core.TechnicalInformation;import weka.core.TechnicalInformationHandler;import weka.core.Utils;import weka.core.Capabilities.Capability;import weka.core.TechnicalInformation.Field;import weka.core.TechnicalInformation.Type;import weka.filters.Filter;import weka.filters.unsupervised.attribute.ReplaceMissingValues;import java.io.BufferedReader;import java.io.File;import java.io.FileOutputStream;import java.io.FileReader;import java.io.PrintWriter;import java.io.Reader;import java.util.Enumeration;import java.util.Random;import java.util.Vector;/** <!-- globalinfo-start --> * Cluster data using the X-means algorithm.<br/> * <br/> * X-Means is K-Means extended by an Improve-Structure part In this part of the algorithm the centers are attempted to be split in its region. The decision between the children of each center and itself is done comparing the BIC-values of the two structures.<br/> * <br/> * For more information see:<br/> * <br/> * Dan Pelleg, Andrew W. Moore: X-means: Extending K-means with Efficient Estimation of the Number of Clusters. In: Seventeenth International Conference on Machine Learning, 727-734, 2000. * <p/> <!-- globalinfo-end --> * <!-- technical-bibtex-start --> * BibTeX: * <pre> * @inproceedings{Pelleg2000, * author = {Dan Pelleg and Andrew W. Moore}, * booktitle = {Seventeenth International Conference on Machine Learning}, * pages = {727-734}, * publisher = {Morgan Kaufmann}, * title = {X-means: Extending K-means with Efficient Estimation of the Number of Clusters}, * year = {2000} * } * </pre> * <p/> <!-- technical-bibtex-end --> * <!-- options-start --> * Valid options are: <p/> * * <pre> -I <num> * maximum number of overall iterations * (default 1).</pre> * * <pre> -M <num> * maximum number of iterations in the kMeans loop in * the Improve-Parameter part * (default 1000).</pre> * * <pre> -J <num> * maximum number of iterations in the kMeans loop * for the splitted centroids in the Improve-Structure part * (default 1000).</pre> * * <pre> -L <num> * minimum number of clusters * (default 2).</pre> * * <pre> -H <num> * maximum number of clusters * (default 4).</pre> * * <pre> -B <value> * distance value for binary attributes * (default 1.0).</pre> * * <pre> -use-kdtree * Uses the KDTree internally * (default no).</pre> * * <pre> -K <KDTree class specification> * Full class name of KDTree class to use, followed * by scheme options. * eg: "weka.core.neighboursearch.kdtrees.KDTree -P" * (default no KDTree class used).</pre> * * <pre> -C <value> * cutoff factor, takes the given percentage of the splitted * centroids if none of the children win * (default 0.0).</pre> * * <pre> -D <distance function class specification> * Full class name of Distance function class to use, followed * by scheme options. * (default weka.core.EuclideanDistance).</pre> * * <pre> -N <file name> * file to read starting centers from (ARFF format).</pre> * * <pre> -O <file name> * file to write centers to (ARFF format).</pre> * * <pre> -U <int> * The debug level. * (default 0)</pre> * * <pre> -Y <file name> * The debug vectors file.</pre> * * <pre> -S <num> * Random number seed. * (default 10)</pre> * <!-- options-end --> * * @author Gabi Schmidberger (gabi@cs.waikato.ac.nz) * @author Mark Hall (mhall@cs.waikato.ac.nz) * @author Malcolm Ware (mfw4@cs.waikato.ac.nz) * @version $Revision: 1.23 $ * @see RandomizableClusterer */public class XMeans extends RandomizableClusterer implements TechnicalInformationHandler { /* * major TODOS: * * make BIC-Score replaceable by other scores */ /** for serialization. */ private static final long serialVersionUID = -7941793078404132616L; /** training instances. */ protected Instances m_Instances = null; /** model information, should increase readability. */ protected Instances m_Model = null; /** replace missing values in training instances. */ protected ReplaceMissingValues m_ReplaceMissingFilter; /** * Distance value between true and false of binary attributes and * "same" and "different" of nominal attributes (default = 1.0). */ protected double m_BinValue = 1.0; /** BIC-Score of the current model. */ protected double m_Bic = Double.MIN_VALUE; /** Distortion. */ protected double[] m_Mle = null; /** maximum overall iterations. */ protected int m_MaxIterations = 1; /** * maximum iterations to perform Kmeans part * if negative, iterations are not checked. */ protected int m_MaxKMeans = 1000; /** see above, but for kMeans of splitted clusters. */ protected int m_MaxKMeansForChildren = 1000; /** The actual number of clusters. */ protected int m_NumClusters = 2; /** min number of clusters to generate. */ protected int m_MinNumClusters = 2; /** max number of clusters to generate. */ protected int m_MaxNumClusters = 4; /** the distance function used. */ protected DistanceFunction m_DistanceF = new EuclideanDistance(); /** cluster centers. */ protected Instances m_ClusterCenters; /** file name of the output file for the cluster centers. */ protected File m_InputCenterFile = new File(System.getProperty("user.dir")); /* --> DebugVectors - USED FOR DEBUGGING */ /** input file for the random vectors --> USED FOR DEBUGGING. */ protected Reader m_DebugVectorsInput = null; /** the index for the current debug vector. */ protected int m_DebugVectorsIndex = 0; /** all the debug vectors. */ protected Instances m_DebugVectors = null; /** file name of the input file for the random vectors. */ protected File m_DebugVectorsFile = new File(System.getProperty("user.dir")); /** input file for the cluster centers. */ protected Reader m_CenterInput = null; /** file name of the output file for the cluster centers. */ protected File m_OutputCenterFile = new File(System.getProperty("user.dir")); /** output file for the cluster centers. */ protected PrintWriter m_CenterOutput = null; /** * temporary variable holding cluster assignments while iterating. */ protected int[] m_ClusterAssignments; /** cutoff factor - percentage of splits done in Improve-Structure part only relevant, if all children lost. */ protected double m_CutOffFactor = 0.5; /** Index in ranges for LOW. */ public static int R_LOW = 0; /** Index in ranges for HIGH. */ public static int R_HIGH = 1; /** Index in ranges for WIDTH. */ public static int R_WIDTH = 2; /** * KDTrees class if KDTrees are used. */ protected KDTree m_KDTree = new KDTree(); /** whether to use the KDTree (the KDTree is only initialized to be * configurable from the GUI). */ protected boolean m_UseKDTree = false; /** counts iterations done in main loop. */ protected int m_IterationCount = 0; /** counter to say how often kMeans was stopped by loop counter. */ protected int m_KMeansStopped = 0; /** Number of splits prepared. */ protected int m_NumSplits = 0; /** Number of splits accepted (including cutoff factor decisions). */ protected int m_NumSplitsDone = 0; /** Number of splits accepted just because of cutoff factor. */ protected int m_NumSplitsStillDone = 0; /** * level of debug output, 0 is no output. */ protected int m_DebugLevel = 0; /** print the centers. */ public static int D_PRINTCENTERS = 1; /** follows the splitting of the centers. */ public static int D_FOLLOWSPLIT = 2; /** have a closer look at converge children. */ public static int D_CONVCHCLOSER = 3; /** check on random vectors. */ public static int D_RANDOMVECTOR = 4; /** check on kdtree. */ public static int D_KDTREE = 5; /** follow iterations. */ public static int D_ITERCOUNT = 6; /** functions were maybe misused. */ public static int D_METH_MISUSE = 80; /** for current debug. */ public static int D_CURR = 88; /** general debugging. */ public static int D_GENERAL = 99; /** Flag: I'm debugging. */ public boolean m_CurrDebugFlag = true; /** * the default constructor. */ public XMeans() { super(); m_SeedDefault = 10; setSeed(m_SeedDefault); } /** * Returns a string describing this clusterer. * @return a description of the evaluator suitable for * displaying in the explorer/experimenter gui */ public String globalInfo() { return "Cluster data using the X-means algorithm.\n\n" + "X-Means is K-Means extended by an Improve-Structure part In this " + "part of the algorithm the centers are attempted to be split in " + "its region. The decision between the children of each center and " + "itself is done comparing the BIC-values of the two structures.\n\n" + "For more information see:\n\n" + getTechnicalInformation().toString(); } /** * Returns an instance of a TechnicalInformation object, containing * detailed information about the technical background of this class, * e.g., paper reference or book this class is based on. * * @return the technical information about this class */ public TechnicalInformation getTechnicalInformation() { TechnicalInformation result; result = new TechnicalInformation(Type.INPROCEEDINGS); result.setValue(Field.AUTHOR, "Dan Pelleg and Andrew W. Moore"); result.setValue(Field.TITLE, "X-means: Extending K-means with Efficient Estimation of the Number of Clusters"); result.setValue(Field.BOOKTITLE, "Seventeenth International Conference on Machine Learning"); result.setValue(Field.YEAR, "2000"); result.setValue(Field.PAGES, "727-734"); result.setValue(Field.PUBLISHER, "Morgan Kaufmann"); return result; } /** * Returns default capabilities of the clusterer. * * @return the capabilities of this clusterer */ public Capabilities getCapabilities() { Capabilities result = super.getCapabilities(); // attributes result.enable(Capability.NUMERIC_ATTRIBUTES); result.enable(Capability.DATE_ATTRIBUTES); result.enable(Capability.MISSING_VALUES); return result; } /** * Generates the X-Means clusterer. * * @param data set of instances serving as training data * @throws Exception if the clusterer has not been * generated successfully */ public void buildClusterer(Instances data) throws Exception { // can clusterer handle the data? getCapabilities().testWithFail(data); m_NumSplits = 0; m_NumSplitsDone = 0; m_NumSplitsStillDone = 0; // replace missing values m_ReplaceMissingFilter = new ReplaceMissingValues(); m_ReplaceMissingFilter.setInputFormat(data); m_Instances = Filter.useFilter(data, m_ReplaceMissingFilter); // initialize random function Random random0 = new Random(m_Seed); // num of clusters to start with m_NumClusters = m_MinNumClusters; // set distance function to default if (m_DistanceF == null) { m_DistanceF = new EuclideanDistance(); } m_DistanceF.setInstances(m_Instances); checkInstances(); if (m_DebugVectorsFile.exists() && m_DebugVectorsFile.isFile()) initDebugVectorsInput(); // make list of indexes for m_Instances int[] allInstList = new int[m_Instances.numInstances()]; for (int i = 0; i < m_Instances.numInstances(); i++) { allInstList[i] = i; } // set model used (just for convenience) m_Model = new Instances(m_Instances, 0); // produce the starting centers if (m_CenterInput != null) { // read centers from file m_ClusterCenters = new Instances(m_CenterInput); m_NumClusters = m_ClusterCenters.numInstances(); } else // makes the first centers randomly m_ClusterCenters = makeCentersRandomly(random0, m_Instances, m_NumClusters); PFD(D_FOLLOWSPLIT, "\n*** Starting centers "); for (int k = 0; k < m_ClusterCenters.numInstances(); k++) { PFD(D_FOLLOWSPLIT, "Center " + k + ": " + m_ClusterCenters.instance(k)); } PrCentersFD(D_PRINTCENTERS); boolean finished = false; Instances children; // builds up a KDTree if (m_UseKDTree) m_KDTree.setInstances(m_Instances); // loop counter of main loop m_IterationCount = 0; /**
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -