📄 clusterevaluation.java
字号:
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
* ClusterEvaluation.java
* Copyright (C) 1999 Mark Hall
*
*/
package weka.clusterers;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.util.Enumeration;
import java.util.Random;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Range;
import weka.core.Utils;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.Remove;
/**
* Class for evaluating clustering models.<p>
*
* Valid options are: <p>
*
* -t <name of the training file> <br>
* Specify the training file. <p>
*
* -T <name of the test file> <br>
* Specify the test file to apply clusterer to. <p>
*
* -d <name of file to save clustering model to> <br>
* Specify output file. <p>
*
* -l <name of file to load clustering model from> <br>
* Specify input file. <p>
*
* -p <attribute range> <br>
* Output predictions. Predictions are for the training file if only the
* training file is specified, otherwise they are for the test file. The range
* specifies attribute values to be output with the predictions.
* Use '-p 0' for none. <p>
*
* -x <num folds> <br>
* Set the number of folds for a cross validation of the training data.
* Cross validation can only be done for distribution clusterers and will
* be performed if the test file is missing. <p>
*
* -c <class> <br>
* Set the class attribute. If set, then class based evaluation of clustering
* is performed. <p>
*
* @author Mark Hall (mhall@cs.waikato.ac.nz)
* @version $Revision$
*/
public class ClusterEvaluation implements Serializable {
/** The instances to cluster. The constructor initialises this to null. */
private Instances m_trainInstances;
/** The clusterer under evaluation; replaced through setClusterer. */
private Clusterer m_Clusterer;
/** Text report of the clustering results. evaluateClusterer appends to it
and clusterResultsToString returns its contents. */
private StringBuffer m_clusteringResults;
/** The number of clusters reported by the clusterer during the most recent
call to evaluateClusterer. */
private int m_numClusters;
/** Holds the assignments of instances to clusters for a particular testing
dataset: one entry per instance, in dataset order. */
private double [] m_clusterAssignments;
/** Holds the average log likelihood for a particular testing dataset.
It is only accumulated when the clusterer is a DensityBasedClusterer. */
private double m_logL;
/** Will hold the class mapped to each cluster, indexed by cluster number
(for class based evaluation). Stays null until a class based evaluation
has run; a negative entry means no class was mapped to that cluster. */
private int [] m_classToCluster = null;
/**
 * Replaces the clusterer that subsequent evaluations will use.
 *
 * @param clusterer the clusterer to evaluate
 */
public void setClusterer(Clusterer clusterer) {
  this.m_Clusterer = clusterer;
}
/**
 * Returns the textual report accumulated so far by the evaluation methods.
 *
 * @return the current contents of the results buffer as a string
 */
public String clusterResultsToString() {
  final StringBuffer report = this.m_clusteringResults;
  return report.toString();
}
/**
 * Returns the cluster count recorded by the most recent call to
 * evaluateClusterer.
 *
 * @return the number of clusters found
 */
public int getNumClusters() {
  return this.m_numClusters;
}
/**
 * Returns the per-instance cluster assignments produced for the most
 * recently clustered set of instances. The internal array itself is
 * returned, not a copy.
 *
 * @return the array of cluster assignments, one entry per instance
 */
public double [] getClusterAssignments() {
  return this.m_clusterAssignments;
}
/**
 * Returns the minimum error mapping of classes to clusters, indexed by
 * cluster number. The field starts out as null and is filled in by class
 * based evaluation.
 *
 * @return the array of class to cluster mappings
 */
public int [] getClassesToClusters() {
  return this.m_classToCluster;
}
/**
 * Returns the log likelihood stored by the most recent call to
 * evaluateClusterer.
 *
 * @return the stored log likelihood as a <code>double</code>
 */
public double getLogLikelihood() {
  return this.m_logL;
}
/**
 * Creates an evaluation object with default state: an EM clusterer, an
 * empty results buffer, and no training instances or cluster assignments.
 */
public ClusterEvaluation () {
  // Install the default clusterer first, exactly as before, so that an
  // overriding setClusterer sees the same field state it always did.
  setClusterer(new EM());
  this.m_trainInstances = null;
  this.m_clusterAssignments = null;
  this.m_clusteringResults = new StringBuffer();
}
/**
 * Evaluate the clusterer on a set of instances. Calculates clustering
 * statistics, stores the cluster assignment of every instance in
 * m_clusterAssignments and appends a textual report to the results buffer
 * (the buffer is appended to, not reset).
 * <p>
 * If the data has a class attribute set, the class is removed before the
 * instances are handed to the clusterer and a class based evaluation is
 * appended afterwards.
 * <p>
 * An instance for which the clusterer throws an exception is counted as
 * "unclustered" and otherwise ignored. NOTE: such an instance keeps the
 * array default of 0.0 in m_clusterAssignments, so it cannot be told apart
 * from an instance assigned to cluster 0 by looking at that array alone.
 * <p>
 * For a DensityBasedClusterer the average log likelihood is computed over
 * exactly those instances that were counted into a cluster. If no instance
 * was counted the average is 0/0, i.e. NaN.
 *
 * @param test the set of instances to cluster
 * @exception Exception if the class attribute is numeric, if the class
 * cannot be filtered out, or if the clusterer reports a cluster index
 * outside the range it announced
 */
public void evaluateClusterer(Instances test) throws Exception {
  int cc = m_Clusterer.numberOfClusters();
  m_numClusters = cc;
  // Number of decimal digits needed to print an instance count.
  int numInstFieldWidth = (int)((Math.log(test.numInstances())/
                                 Math.log(10))+1);
  double[] instanceStats = new double[cc];
  m_clusterAssignments = new double [test.numInstances()];
  Instances testCopy = test;
  boolean hasClass = (testCopy.classIndex() >= 0);
  boolean densityBased = (m_Clusterer instanceof DensityBasedClusterer);
  int unclusteredInstances = 0;
  double loglk = 0.0;

  // If class is set then do class based evaluation as well
  if (hasClass) {
    if (testCopy.classAttribute().isNumeric()) {
      throw new Exception("ClusterEvaluation: Class must be nominal!");
    }
    // The clusterer must not see the class, so filter it out of the copy.
    Remove removeClass = new Remove();
    removeClass.setAttributeIndices(""+(testCopy.classIndex()+1));
    removeClass.setInvertSelection(false);
    removeClass.setInputFormat(testCopy);
    testCopy = Filter.useFilter(testCopy, removeClass);
  }

  for (int i = 0; i < testCopy.numInstances(); i++) {
    int cnum = -1;
    double logDensity = 0.0;
    try {
      if (densityBased) {
        logDensity = ((DensityBasedClusterer)m_Clusterer).
          logDensityForInstance(testCopy.instance(i));
      }
      cnum = m_Clusterer.clusterInstance(testCopy.instance(i));
      m_clusterAssignments[i] = (double)cnum;
    } catch (Exception ignored) {
      // Deliberate best effort: an instance the clusterer cannot handle is
      // only tallied as unclustered and the evaluation carries on.
      unclusteredInstances++;
    }
    if (cnum != -1) {
      instanceStats[cnum]++;
      // Add the density only for instances that are part of the divisor
      // below; otherwise the average would be skewed by instances that
      // were never counted into a cluster.
      loglk += logDensity;
    }
  }

  double sum = Utils.sum(instanceStats);
  loglk /= sum;
  m_logL = loglk;

  m_clusteringResults.append(m_Clusterer.toString());
  m_clusteringResults.append("Clustered Instances\n\n");
  // Number of decimal digits needed to print a cluster index.
  int clustFieldWidth = (int)((Math.log(cc)/Math.log(10))+1);
  for (int i = 0; i < cc; i++) {
    // Empty clusters are left out of the report.
    if (instanceStats[i] > 0) {
      m_clusteringResults.append(Utils.doubleToString((double)i,
                                                      clustFieldWidth, 0)
                                 + " "
                                 + Utils.doubleToString(instanceStats[i],
                                                        numInstFieldWidth, 0)
                                 + " ("
                                 + Utils.doubleToString((instanceStats[i] /
                                                         sum * 100.0)
                                                        , 3, 0) + "%)\n");
    }
  }

  if (unclusteredInstances > 0) {
    m_clusteringResults.append("\nUnclustered instances : "
                               +unclusteredInstances);
  }

  if (densityBased) {
    m_clusteringResults.append("\n\nLog likelihood: "
                               + Utils.doubleToString(loglk, 1, 5)
                               + "\n");
  }

  if (hasClass) {
    // Pass the original data: the class values are needed here.
    evaluateClustersWithRespectToClass(test);
  }
}
/**
 * Evaluates cluster assignments with respect to actual class labels.
 * Assumes that m_Clusterer has been trained and tested on
 * inst (minus the class), i.e. that m_clusterAssignments holds one entry
 * per instance of inst, in the same order.
 * <p>
 * Builds the class/cluster contingency counts, asks mapClasses for the
 * minimum error mapping, appends the result to the results buffer and
 * stores the mapping in m_classToCluster.
 * <p>
 * Instances with a negative cluster assignment or with a missing class
 * value are left out of the counts. They are still part of the instance
 * total used for the error percentage.
 *
 * @param inst the instances (including class) to evaluate with respect to
 * @exception Exception if something goes wrong
 */
private void evaluateClustersWithRespectToClass(Instances inst)
  throws Exception {
  int numClasses = inst.classAttribute().numValues();
  int [][] counts = new int [m_numClusters][numClasses];
  int [] clusterTotals = new int[m_numClusters];
  // Slot m_numClusters of "best" carries the error of the best mapping.
  double [] best = new double[m_numClusters+1];
  double [] current = new double[m_numClusters+1];

  for (int i = 0; i < inst.numInstances(); i++) {
    int cluster = (int)m_clusterAssignments[i];
    // A negative assignment means "no cluster" and must not be used as an
    // index. A missing class value would be turned into index 0 by the
    // (int) cast and be tallied as the first class label by mistake.
    if (cluster >= 0 && !inst.instance(i).classIsMissing()) {
      counts[cluster][(int)inst.instance(i).classValue()]++;
      clusterTotals[cluster]++;
    }
  }

  best[m_numClusters] = Double.MAX_VALUE;
  mapClasses(0, counts, clusterTotals, current, best, 0);

  m_clusteringResults.append("\n\nClass attribute: "
                             +inst.classAttribute().name()
                             +"\n");
  m_clusteringResults.append("Classes to Clusters:\n");
  String matrixString = toMatrixString(counts, clusterTotals, inst);
  m_clusteringResults.append(matrixString).append("\n");

  // Number of decimal digits needed to print a cluster index.
  int Cwidth = 1 + (int)(Math.log(m_numClusters) / Math.log(10));
  // add the minimum error assignment
  for (int i = 0; i < m_numClusters; i++) {
    if (clusterTotals[i] > 0) {
      m_clusteringResults.append("Cluster "
                                 +Utils.doubleToString((double)i,Cwidth,0));
      m_clusteringResults.append(" <-- ");
      if (best[i] < 0) {
        m_clusteringResults.append("No class\n");
      } else {
        m_clusteringResults.
          append(inst.classAttribute().value((int)best[i])).append("\n");
      }
    }
  }

  m_clusteringResults.append("\nIncorrectly clustered instances :\t"
                             +best[m_numClusters]+"\t"
                             +(Utils.doubleToString((best[m_numClusters] /
                                                     inst.numInstances() *
                                                     100.0), 8, 4))
                             +" %\n");

  // copy the class assignments
  m_classToCluster = new int [m_numClusters];
  for (int i = 0; i < m_numClusters; i++) {
    m_classToCluster[i] = (int)best[i];
  }
}
/**
* Returns a "confusion" style matrix of classes to clusters assignments
* @param counts the counts of classes for each cluster
* @param clusterTotals total number of examples in each cluster
* @param inst the training instances (with class)
* @exception Exception if matrix can't be generated
*/
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -