📄 clusterevaluation.java
字号:
/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//* * ClusterEvaluation.java * Copyright (C) 1999 University of Waikato, Hamilton, New Zealand * */package weka.clusterers;import weka.core.Drawable;import weka.core.Instance;import weka.core.Instances;import weka.core.Option;import weka.core.OptionHandler;import weka.core.Range;import weka.core.SerializationHelper;import weka.core.Utils;import weka.core.converters.ConverterUtils.DataSource;import weka.filters.Filter;import weka.filters.unsupervised.attribute.Remove;import java.io.BufferedWriter;import java.io.FileWriter;import java.io.Serializable;import java.util.Enumeration;import java.util.Random;import java.util.Vector;/** * Class for evaluating clustering models.<p/> * * Valid options are: <p/> * * -t name of the training file <br/> * Specify the training file. <p/> * * -T name of the test file <br/> * Specify the test file to apply clusterer to. <p/> * * -d name of file to save clustering model to <br/> * Specify output file. <p/> * * -l name of file to load clustering model from <br/> * Specifiy input file. <p/> * * -p attribute range <br/> * Output predictions. Predictions are for the training file if only the * training file is specified, otherwise they are for the test file. The range * specifies attribute values to be output with the predictions. * Use '-p 0' for none. <p/> * * -x num folds <br/> * Set the number of folds for a cross validation of the training data. * Cross validation can only be done for distribution clusterers and will * be performed if the test file is missing. <p/> * * -s num <br/> * Sets the seed for randomizing the data for cross-validation. <p/> * * -c class <br/> * Set the class attribute. If set, then class based evaluation of clustering * is performed. <p/> * * -g name of graph file <br/> * Outputs the graph representation of the clusterer to the file. Only for * clusterer that implemented the <code>weka.core.Drawable</code> interface. * <p/> * * @author Mark Hall (mhall@cs.waikato.ac.nz) * @version $Revision: 1.41 $ * @see weka.core.Drawable */public class ClusterEvaluation implements Serializable { /** for serialization */ static final long serialVersionUID = -830188327319128005L; /** the clusterer */ private Clusterer m_Clusterer; /** holds a string describing the results of clustering the training data */ private StringBuffer m_clusteringResults; /** holds the number of clusters found by the clusterer */ private int m_numClusters; /** holds the assigments of instances to clusters for a particular testing dataset */ private double[] m_clusterAssignments; /** holds the average log likelihood for a particular testing dataset if the clusterer is a DensityBasedClusterer */ private double m_logL; /** will hold the mapping of classes to clusters (for class based evaluation) */ private int[] m_classToCluster = null; /** * set the clusterer * @param clusterer the clusterer to use */ public void setClusterer(Clusterer clusterer) { m_Clusterer = clusterer; } /** * return the results of clustering. * @return a string detailing the results of clustering a data set */ public String clusterResultsToString() { return m_clusteringResults.toString(); } /** * Return the number of clusters found for the most recent call to * evaluateClusterer * @return the number of clusters found */ public int getNumClusters() { return m_numClusters; } /** * Return an array of cluster assignments corresponding to the most * recent set of instances clustered. * @return an array of cluster assignments */ public double[] getClusterAssignments() { return m_clusterAssignments; } /** * Return the array (ordered by cluster number) of minimum error class to * cluster mappings * @return an array of class to cluster mappings */ public int[] getClassesToClusters() { return m_classToCluster; } /** * Return the log likelihood corresponding to the most recent * set of instances clustered. * * @return a <code>double</code> value */ public double getLogLikelihood() { return m_logL; } /** * Constructor. Sets defaults for each member variable. Default Clusterer * is EM. */ public ClusterEvaluation () { setClusterer(new SimpleKMeans()); m_clusteringResults = new StringBuffer(); m_clusterAssignments = null; } /** * Evaluate the clusterer on a set of instances. Calculates clustering * statistics and stores cluster assigments for the instances in * m_clusterAssignments * * @param test the set of instances to cluster * @throws Exception if something goes wrong */ public void evaluateClusterer(Instances test) throws Exception { evaluateClusterer(test, ""); } /** * Evaluate the clusterer on a set of instances. Calculates clustering * statistics and stores cluster assigments for the instances in * m_clusterAssignments * * @param test the set of instances to cluster * @param testFileName the name of the test file for incremental testing, * if "" or null then not used * @throws Exception if something goes wrong */ public void evaluateClusterer(Instances test, String testFileName) throws Exception { int i = 0; int cnum; double loglk = 0.0; int cc = m_Clusterer.numberOfClusters(); m_numClusters = cc; double[] instanceStats = new double[cc]; Instances testRaw = null; boolean hasClass = (test.classIndex() >= 0); int unclusteredInstances = 0; Vector<Double> clusterAssignments = new Vector<Double>(); Filter filter = null; DataSource source = null; Instance inst; if (testFileName == null) testFileName = ""; // load data if (testFileName.length() != 0) source = new DataSource(testFileName); else source = new DataSource(test); testRaw = source.getStructure(test.classIndex()); // If class is set then do class based evaluation as well if (hasClass) { if (testRaw.classAttribute().isNumeric()) throw new Exception("ClusterEvaluation: Class must be nominal!"); filter = new Remove(); ((Remove) filter).setAttributeIndices("" + (testRaw.classIndex() + 1)); ((Remove) filter).setInvertSelection(false); filter.setInputFormat(testRaw); } i = 0; while (source.hasMoreElements(testRaw)) { // next instance inst = source.nextElement(testRaw); if (filter != null) { filter.input(inst); filter.batchFinished(); inst = filter.output(); } cnum = -1; try { if (m_Clusterer instanceof DensityBasedClusterer) { loglk += ((DensityBasedClusterer)m_Clusterer). logDensityForInstance(inst); cnum = m_Clusterer.clusterInstance(inst); clusterAssignments.add((double) cnum); } else { cnum = m_Clusterer.clusterInstance(inst); clusterAssignments.add((double) cnum); } } catch (Exception e) { clusterAssignments.add(0.0); unclusteredInstances++; } if (cnum != -1) { instanceStats[cnum]++; } } double sum = Utils.sum(instanceStats); loglk /= sum; m_logL = loglk; m_clusterAssignments = new double [clusterAssignments.size()]; for (i = 0; i < clusterAssignments.size(); i++) m_clusterAssignments[i] = clusterAssignments.get(i); int numInstFieldWidth = (int)((Math.log(clusterAssignments.size())/Math.log(10))+1); m_clusteringResults.append(m_Clusterer.toString()); m_clusteringResults.append("Clustered Instances\n\n"); int clustFieldWidth = (int)((Math.log(cc)/Math.log(10))+1); for (i = 0; i < cc; i++) { if (instanceStats[i] > 0) m_clusteringResults.append(Utils.doubleToString((double)i, clustFieldWidth, 0) + " " + Utils.doubleToString(instanceStats[i], numInstFieldWidth, 0) + " (" + Utils.doubleToString((instanceStats[i] / sum * 100.0) , 3, 0) + "%)\n"); } if (unclusteredInstances > 0) m_clusteringResults.append("\nUnclustered instances : " +unclusteredInstances); if (m_Clusterer instanceof DensityBasedClusterer) m_clusteringResults.append("\n\nLog likelihood: " + Utils.doubleToString(loglk, 1, 5) + "\n"); if (hasClass) evaluateClustersWithRespectToClass(test, testFileName); } /** * Evaluates cluster assignments with respect to actual class labels. * Assumes that m_Clusterer has been trained and tested on * inst (minus the class). * * @param inst the instances (including class) to evaluate with respect to * @param fileName the name of the test file for incremental testing, * if "" or null then not used * @throws Exception if something goes wrong */ private void evaluateClustersWithRespectToClass(Instances inst, String fileName) throws Exception { int numClasses = inst.classAttribute().numValues(); int[][] counts = new int [m_numClusters][numClasses]; int[] clusterTotals = new int[m_numClusters]; double[] best = new double[m_numClusters+1]; double[] current = new double[m_numClusters+1]; DataSource source = null; Instances instances = null; Instance instance = null; int i; int numInstances; if (fileName == null) fileName = ""; if (fileName.length() != 0) source = new DataSource(fileName); else source = new DataSource(inst); instances = source.getStructure(inst.classIndex()); i = 0; while (source.hasMoreElements(instances)) { instance = source.nextElement(instances); counts[(int)m_clusterAssignments[i]][(int)instance.classValue()]++; clusterTotals[(int)m_clusterAssignments[i]]++; i++; } numInstances = i; best[m_numClusters] = Double.MAX_VALUE; mapClasses(m_numClusters, 0, counts, clusterTotals, current, best, 0); m_clusteringResults.append("\n\nClass attribute: " +inst.classAttribute().name() +"\n"); m_clusteringResults.append("Classes to Clusters:\n"); String matrixString = toMatrixString(counts, clusterTotals, new Instances(inst, 0)); m_clusteringResults.append(matrixString).append("\n"); int Cwidth = 1 + (int)(Math.log(m_numClusters) / Math.log(10)); // add the minimum error assignment for (i = 0; i < m_numClusters; i++) { if (clusterTotals[i] > 0) { m_clusteringResults.append("Cluster " +Utils.doubleToString((double)i,Cwidth,0)); m_clusteringResults.append(" <-- "); if (best[i] < 0) { m_clusteringResults.append("No class\n"); } else { m_clusteringResults. append(inst.classAttribute().value((int)best[i])).append("\n"); } } } m_clusteringResults.append("\nIncorrectly clustered instances :\t" +best[m_numClusters]+"\t" +(Utils.doubleToString((best[m_numClusters] / numInstances * 100.0), 8, 4)) +" %\n"); // copy the class assignments m_classToCluster = new int [m_numClusters]; for (i = 0; i < m_numClusters; i++) { m_classToCluster[i] = (int)best[i]; } } /**
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -