clusterevaluation.java

来自「Java 编写的多种数据挖掘算法包括聚类、分类、预处理等」· Java 代码 · 共 1,130 行 · 第 1/3 页
JAVA
1,130 行
/* *    This program is free software; you can redistribute it and/or modify *    it under the terms of the GNU General Public License as published by *    the Free Software Foundation; either version 2 of the License, or *    (at your option) any later version. * *    This program is distributed in the hope that it will be useful, *    but WITHOUT ANY WARRANTY; without even the implied warranty of *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the *    GNU General Public License for more details. * *    You should have received a copy of the GNU General Public License *    along with this program; if not, write to the Free Software *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//* *    ClusterEvaluation.java *    Copyright (C) 1999 Mark Hall * */package  weka.clusterers;import weka.core.Drawable;import weka.core.Instance;import weka.core.Instances;import weka.core.Option;import weka.core.OptionHandler;import weka.core.Range;import weka.core.Utils;import weka.filters.Filter;import weka.filters.unsupervised.attribute.Remove;import java.io.BufferedReader;import java.io.BufferedWriter;import java.io.FileInputStream;import java.io.FileOutputStream;import java.io.FileReader;import java.io.FileWriter;import java.io.ObjectInputStream;import java.io.ObjectOutputStream;import java.io.Serializable;import java.util.Enumeration;import java.util.Random;/** * Class for evaluating clustering models.<p> * * Valid options are: <p> * * -t name of the training file <br> * Specify the training file. <p> * * -T name of the test file <br> * Specify the test file to apply clusterer to. <p> * * -d name of file to save clustering model to <br> * Specify output file. <p> * * -l name of file to load clustering model from <br> * Specifiy input file. <p> * * -p attribute range <br> * Output predictions. Predictions are for the training file if only the * training file is specified, otherwise they are for the test file. 
The range
 * specifies attribute values to be output with the predictions.
 * Use '-p 0' for none. <p>
 *
 * -x num folds <br>
 * Set the number of folds for a cross validation of the training data.
 * Cross validation can only be done for distribution clusterers and will
 * be performed if the test file is missing. <p>
 *
 * -c class <br>
 * Set the class attribute. If set, then class based evaluation of clustering
 * is performed. <p>
 *
 * @author   Mark Hall (mhall@cs.waikato.ac.nz)
 * @version  $Revision: 1.30 $
 */
public class ClusterEvaluation
  implements Serializable {

  /** for serialization */
  static final long serialVersionUID = -830188327319128005L;

  /** the instances to cluster */
  private Instances m_trainInstances;

  /** the clusterer being evaluated; replaced via setClusterer */
  private Clusterer m_Clusterer;

  /** holds a string describing the results of clustering; evaluateClusterer
      appends its report to this buffer */
  private StringBuffer m_clusteringResults;

  /** holds the number of clusters found by the clusterer */
  private int m_numClusters;

  /** holds the assignments of instances to clusters for a particular testing
      dataset (one entry per instance, stored as double) */
  private double [] m_clusterAssignments;

  /** holds the average log likelihood for a particular testing dataset
      if the clusterer is a DensityBasedClusterer */
  private double m_logL;

  /** will hold the mapping of classes to clusters (for class based
      evaluation); stays null until such an evaluation has been run */
  private int [] m_classToCluster = null;

  /**
   * Set the clusterer to evaluate.
   *
   * @param clusterer the clusterer to use
   */
  public void setClusterer(Clusterer clusterer) {
    m_Clusterer = clusterer;
  }

  /**
   * Return the results of clustering.
* @return a string detailing the results of clustering a data set   */  public String clusterResultsToString() {    return m_clusteringResults.toString();  }  /**   * Return the number of clusters found for the most recent call to   * evaluateClusterer   * @return the number of clusters found   */  public int getNumClusters() {    return m_numClusters;  }  /**   * Return an array of cluster assignments corresponding to the most   * recent set of instances clustered.   * @return an array of cluster assignments   */  public double [] getClusterAssignments() {    return m_clusterAssignments;  }  /**   * Return the array (ordered by cluster number) of minimum error class to   * cluster mappings   * @return an array of class to cluster mappings   */  public int [] getClassesToClusters() {    return m_classToCluster;  }  /**   * Return the log likelihood corresponding to the most recent   * set of instances clustered.   *   * @return a <code>double</code> value   */  public double getLogLikelihood() {    return m_logL;  }  /**   * Constructor. Sets defaults for each member variable. Default Clusterer   * is EM.   */  public ClusterEvaluation () {    setClusterer(new EM());    m_trainInstances = null;    m_clusteringResults = new StringBuffer();    m_clusterAssignments = null;  }  /**   * Evaluate the clusterer on a set of instances. 
Calculates clustering   * statistics and stores cluster assigments for the instances in   * m_clusterAssignments   * @param test the set of instances to cluster   * @throws Exception if something goes wrong   */  public void evaluateClusterer(Instances test) throws Exception {    int i = 0;    int cnum;    double loglk = 0.0;    int cc = m_Clusterer.numberOfClusters();    m_numClusters = cc;    int numInstFieldWidth = (int)((Math.log(test.numInstances())/				   Math.log(10))+1);    double[] instanceStats = new double[cc];    m_clusterAssignments = new double [test.numInstances()];    Instances testCopy = test;    boolean hasClass = (testCopy.classIndex() >= 0);    int unclusteredInstances = 0;    // If class is set then do class based evaluation as well    if (hasClass) {      if (testCopy.classAttribute().isNumeric()) {	throw new Exception("ClusterEvaluation: Class must be nominal!");      }      Remove removeClass = new Remove();      removeClass.setAttributeIndices(""+(testCopy.classIndex()+1));      removeClass.setInvertSelection(false);      removeClass.setInputFormat(testCopy);      testCopy = Filter.useFilter(testCopy, removeClass);    }    for (i=0;i<testCopy.numInstances();i++) {      cnum = -1;      try {	if (m_Clusterer instanceof DensityBasedClusterer) {	  loglk += ((DensityBasedClusterer)m_Clusterer).	    
logDensityForInstance(testCopy.instance(i));	  //	  temp = Utils.sum(dist);	  	  //	  Utils.normalize(dist);	  cnum = m_Clusterer.clusterInstance(testCopy.instance(i)); 	  // Utils.maxIndex(dist);	  m_clusterAssignments[i] = (double)cnum;	} else {	  cnum = m_Clusterer.clusterInstance(testCopy.instance(i));	  m_clusterAssignments[i] = (double)cnum;	}      }      catch (Exception e) {	unclusteredInstances++;      }            if (cnum != -1) {	instanceStats[cnum]++;      }    }    /* // count the actual number of used clusters    int count = 0;    for (i = 0; i < cc; i++) {      if (instanceStats[i] > 0) {	count++;      }    }    if (count > 0) {      double [] tempStats = new double [count];      double [] map = new double [m_clusterAssignments.length];      count=0;      for (i=0;i<cc;i++) {	if (instanceStats[i] > 0) {	  tempStats[count] = instanceStats[i];	  map[i] = count;	  count++;	}      }      instanceStats = tempStats;      cc = instanceStats.length;      for (i=0;i<m_clusterAssignments.length;i++) {	m_clusterAssignments[i] = map[(int)m_clusterAssignments[i]];      }      } */     double sum = Utils.sum(instanceStats);    loglk /= sum;    m_logL = loglk;        m_clusteringResults.append(m_Clusterer.toString());    m_clusteringResults.append("Clustered Instances\n\n");    int clustFieldWidth = (int)((Math.log(cc)/Math.log(10))+1);    for (i = 0; i < cc; i++) {      if (instanceStats[i] > 0) {	m_clusteringResults.append(Utils.doubleToString((double)i, 							clustFieldWidth, 0) 				   + "      " 				   + Utils.doubleToString(instanceStats[i],							  numInstFieldWidth, 0) 				   + " (" 				   + Utils.doubleToString((instanceStats[i] / 							   sum * 100.0)							  , 3, 0) + "%)\n");      }    }        if (unclusteredInstances > 0) {      m_clusteringResults.append("\nUnclustered instances : "				 +unclusteredInstances);    }    if (m_Clusterer instanceof DensityBasedClusterer) {      m_clusteringResults.append("\n\nLog likelihood: " 				 + 
Utils.doubleToString(loglk, 1, 5) 				 + "\n");    }        if (hasClass) {      evaluateClustersWithRespectToClass(test);    }  }  /**   * Evaluates cluster assignments with respect to actual class labels.   * Assumes that m_Clusterer has been trained and tested on    * inst (minus the class).   * @param inst the instances (including class) to evaluate with respect to   * @throws Exception if something goes wrong   */  private void evaluateClustersWithRespectToClass(Instances inst)    throws Exception {    int numClasses = inst.classAttribute().numValues();    int [][] counts = new int [m_numClusters][numClasses];    int [] clusterTotals = new int[m_numClusters];    double [] best = new double[m_numClusters+1];    double [] current = new double[m_numClusters+1];    for (int i = 0; i < inst.numInstances(); i++) {      counts[(int)m_clusterAssignments[i]][(int)inst.instance(i).classValue()]++;      clusterTotals[(int)m_clusterAssignments[i]]++;    }        best[m_numClusters] = Double.MAX_VALUE;    mapClasses(0, counts, clusterTotals, current, best, 0);    m_clusteringResults.append("\n\nClass attribute: "			+inst.classAttribute().name()			+"\n");    m_clusteringResults.append("Classes to Clusters:\n");    String matrixString = toMatrixString(counts, clusterTotals, inst);    m_clusteringResults.append(matrixString).append("\n");    int Cwidth = 1 + (int)(Math.log(m_numClusters) / Math.log(10));    // add the minimum error assignment    for (int i = 0; i < m_numClusters; i++) {      if (clusterTotals[i] > 0) {	m_clusteringResults.append("Cluster "				   +Utils.doubleToString((double)i,Cwidth,0));	m_clusteringResults.append(" <-- ");		if (best[i] < 0) {	  m_clusteringResults.append("No class\n");	} else {	  m_clusteringResults.	    
append(inst.classAttribute().value((int)best[i])).append("\n");	}      }    }    m_clusteringResults.append("\nIncorrectly clustered instances :\t"			       +best[m_numClusters]+"\t"			       +(Utils.doubleToString((best[m_numClusters] / 						       inst.numInstances() * 						       100.0), 8, 4))			       +" %\n");    // copy the class assignments    m_classToCluster = new int [m_numClusters];    for (int i = 0; i < m_numClusters; i++) {      m_classToCluster[i] = (int)best[i];    }  }  /**   * Returns a "confusion" style matrix of classes to clusters assignments   * @param counts the counts of classes for each cluster   * @param clusterTotals total number of examples in each cluster   * @param inst the training instances (with class)   * @return the "confusion" style matrix as string   * @throws Exception if matrix can't be generated   */  private String toMatrixString(int [][] counts, int [] clusterTotals,				Instances inst)     throws Exception {    StringBuffer ms = new StringBuffer();    int maxval = 0;    for (int i = 0; i < m_numClusters; i++) {      for (int j = 0; j < counts[i].length; j++) {	if (counts[i][j] > maxval) {	  maxval = counts[i][j];	}      }    }
clusterevaluation.java - 源码说明

本页面展示了「Java 编写的多种数据挖掘算法包括聚类、分类、预处理等」中的 clusterevaluation.java 源码文件，采用 Java 编程语言编写，共 1,130 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与Java相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?