📄 .#semisupclustererevaluation.java.1.9
字号:
/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//* * SemiSupClustererEvaluation.java * Copyright (C) 2002 Sugato Basu, Misha Bilenko * */package weka.clusterers;import java.util.*;import java.io.*;import weka.core.*;import weka.filters.Filter;import weka.filters.unsupervised.attribute.Remove;/** * Class for evaluating clustering models - extends ClusterEvaluation.java<p> * Implements different clustering evaluation metrics * * @author Sugato Basu, Misha Bilenko */public class SemiSupClustererEvaluation extends ClusterEvaluation { /** Purity of the clustering */ protected double m_Purity; /** Entropy of the clustering */ protected double m_Entropy; /** Objective function of the clustering */ protected double m_Objective; /** MI Metric the clustering */ protected double m_MIMetric; /** KL Divergence of the clustering */ protected double m_KLDivergence; /** The number of underlying classes */ protected int m_NumClasses; /** The number of produced clusters */ protected int m_NumClusters; /** All labeled training instances */ protected Instances m_LabeledTrain; /** All unlabaled training instances */ protected Instances m_UnlabeledTrain; /** All test instances */ protected Instances m_Test; /** Training pairs */ protected ArrayList m_labeledTrainPairs; /** The weight of all incorrectly categorized test instances. 
*/ protected double m_WeightTestIncorrect; /** The weight of all correctly categorized test instances. */ protected double m_WeightTestCorrect; /** The weight of all uncategorized test instances. */ protected double m_WeightTestUnclassified; /** The weight of test instances that had a class assigned to them. */ protected double m_WeightTestWithClass; /** Array for storing the confusion matrix. */ protected double [][] m_ConfusionMatrix; /** The names of the classes. */ protected String [] m_ClassNames; /** Is the class nominal or numeric? */ protected boolean m_ClassIsNominal; /** If the class is not nominal, we do not need the confusion matrix but do pairs counts directly */ protected int m_totalPairs; protected int m_goodPairs; protected int m_trueGoodPairs; /** The total cost of predictions (includes instance weights) */ protected double m_TotalCost; public String toSummaryString() { return super.toString(); } /** * Returns a string describing this evaluator * @return a description of the evaluator suitable for * displaying in the explorer/experimenter gui */ public String globalInfo() { return " A clusterer evaluator that evaluates results of running a " + "semi-supervised clustering algorithm."; } public SemiSupClustererEvaluation (Instances test, int numClasses, int numClusters) { m_NumClasses = numClasses; m_NumClusters = numClusters; m_ClassIsNominal = test.classAttribute().isNominal(); if (m_ClassIsNominal) { m_ConfusionMatrix = new double [m_NumClusters][m_NumClasses]; m_ClassNames = new String [m_NumClasses]; for(int i = 0; i < m_NumClasses; i++) { m_ClassNames[i] = test.classAttribute().value(i); } } } public SemiSupClustererEvaluation (ArrayList labeledTrainPairs, Instances test, int numClasses, int numClusters) { this (test,numClasses,numClusters); m_labeledTrainPairs = labeledTrainPairs; } /** * Evaluates the semi-sup clusterer on a given set of test instances * * @param clusterer semi-supervised clusterer * @param testInstances set of test instances 
for evaluation * @exception Exception if model could not be evaluated successfully */ public void evaluateModel (Clusterer clusterer, Instances testInstances, Instances unlabeledTest) throws Exception { if (m_ClassIsNominal) { m_Test = testInstances; m_Objective = ((SemiSupClusterer) clusterer).objectiveFunction(); // Assuming transductive clustering here ... will need to generalize in future System.out.println("Evaluating cluster results ..."); for (int i = 0; i < unlabeledTest.numInstances(); i++) { evaluateModelOnce(clusterer, unlabeledTest.instance(i), (int) (testInstances.instance(i)).classValue()); } } else { // string-based class attributes int numInstances = testInstances.numInstances(); Attribute classAttr = testInstances.classAttribute(); int [][] sharedClass = new int[numInstances][numInstances]; HashSet dontCareSet = new HashSet(); final int HAVE_SHARED_CLASS = 0; final int NO_SHARED_CLASS = 1; final int DONT_CARE = 2; m_totalPairs = 0; m_goodPairs = 0; // calculate the number of true pairs m_trueGoodPairs = 0; HashSet [] classSets = new HashSet[numInstances]; for (int i = 0; i < numInstances; i++) { System.out.println("Classattr: " + classAttr); String classList = testInstances.instance(i).stringValue(classAttr); if (classList.length() != 0) { // skip unassigned instances // parse the list of classes into a hashset HashSet classSet = new HashSet(); StringTokenizer tokenizer = new StringTokenizer(classList, "_"); while (tokenizer.hasMoreTokens()) { classSet.add(tokenizer.nextToken()); } classSets[i] = classSet; for (int j = 0; j < i; j++) { if (classSets[j] != null) { // skip unassigned instances HashSet prevSet = (HashSet) classSets[j]; Iterator iterator = prevSet.iterator(); boolean shareClass = false; // go through previously assigned instance's classes and see if current class list contains any while (iterator.hasNext() && !shareClass) { String classString = (String) iterator.next(); if (classSet.contains(classString)) { shareClass = true; } } if 
(shareClass) { m_trueGoodPairs++; sharedClass[i][j] = sharedClass[j][i] = HAVE_SHARED_CLASS; } else { sharedClass[i][j] = sharedClass[j][i] = NO_SHARED_CLASS; } } } } else { // all pairs with this instance are don't care dontCareSet.add(new Integer(i)); for (int j = 0; j < numInstances; j++) { sharedClass[i][j] = sharedClass[j][i] = DONT_CARE; } } } // now cluster and evaluate precision ArrayList[] classLists = new ArrayList[m_NumClasses]; for (int i = 0; i < classLists.length; i++) { classLists[i] = new ArrayList(); } for (int i = 0; i < unlabeledTest.numInstances(); i++) { if (!dontCareSet.contains(new Integer(i))) { int clusterIdx = clusterer.clusterInstance(unlabeledTest.instance(i)); // go through all instances previously assigned to the same cluster and check whether they have common classes for (int j = 0; j < classLists[clusterIdx].size(); j++) { int sameClusterInstanceIdx = ((Integer) classLists[clusterIdx].get(j)).intValue(); if (sharedClass[j][sameClusterInstanceIdx] == HAVE_SHARED_CLASS) { m_goodPairs++; } m_totalPairs++; } classLists[clusterIdx].add(new Integer(i)); } } } } /** * Evaluates the semi-sup clusterer on a given test instance * * @param clusterer semi-supervised clusterer * @param test test instance for evaluation * @exception Exception if model could not be evaluated successfully */ public void evaluateModelOnce (Clusterer clusterer, Instance testWithoutLabel, int classValue) throws Exception { double [] pred; if (m_ClassIsNominal) { if (clusterer instanceof DistributionClusterer) { pred = ((DistributionClusterer) clusterer).distributionForInstance(testWithoutLabel); } else { pred = makeDistribution(clusterer.clusterInstance(testWithoutLabel)); } updateStatsForClusterer(pred, classValue); } } /** * Convert a single prediction into a probability distribution * with all zero probabilities except the predicted value which * has probability 1.0; * * @param predictedClass the index of the predicted class * @return the probability distribution */ 
protected double [] makeDistribution(int predictedCluster) {
  // The one-hot slot below is addressed by cluster index, so the array has to
  // hold every cluster. The length never drops below m_NumClasses, so code
  // that relied on the previous length still sees at least that many entries.
  double [] result = new double [Math.max(m_NumClasses, m_NumClusters)];
  if (m_ClassIsNominal) {
    result[predictedCluster] = 1.0;
  } else {
    // non-nominal class: the raw cluster index is stored in the first slot
    result[0] = predictedCluster;
  }
  return result;
}

/**
 * Updates all the statistics about a clusterer performance for
 * the current test instance.
 *
 * @param distrib the probabilities assigned to each class
 * @param test the test instance
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -