⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 clusterevaluation.java

📁 一个数据挖掘软件ALPHAMINERR的整个过程的JAVA版源代码
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    ClusterEvaluation.java
 *    Copyright (C) 1999 Mark Hall
 *
 */

package  weka.clusterers;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.util.Enumeration;
import java.util.Random;

import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Range;
import weka.core.Utils;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.Remove;

/**
 * Class for evaluating clustering models.<p>
 *
 * Valid options are: <p>
 *
 * -t <name of the training file> <br>
 * Specify the training file. <p>
 *
 * -T <name of the test file> <br>
 * Specify the test file to apply clusterer to. <p>
 *
 * -d <name of file to save clustering model to> <br>
 * Specify output file. <p>
 *
 * -l <name of file to load clustering model from> <br>
 * Specifiy input file. <p>
 *
 * -p <attribute range> <br>
 * Output predictions. Predictions are for the training file if only the
 * training file is specified, otherwise they are for the test file. The range
 * specifies attribute values to be output with the predictions.
 * Use '-p 0' for none. <p>
 *
 * -x <num folds> <br>
 * Set the number of folds for a cross validation of the training data.
 * Cross validation can only be done for distribution clusterers and will
 * be performed if the test file is missing. <p>
 *
 * -c <class> <br>
 * Set the class attribute. If set, then class based evaluation of clustering
 * is performed. <p>
 *
 * @author   Mark Hall (mhall@cs.waikato.ac.nz)
 * @version  $Revision$
 */
public class ClusterEvaluation implements Serializable {

  /** the instances to cluster */
  private Instances m_trainInstances;
  
  /** the clusterer */
  private Clusterer m_Clusterer;

  /** holds a string describing the results of clustering the training data */
  private StringBuffer m_clusteringResults;

  /** holds the number of clusters found by the clusterer */
  private int m_numClusters;

  /** holds the assigments of instances to clusters for a particular testing
      dataset */
  private double [] m_clusterAssignments;

  /* holds the average log likelihood for a particular testing dataset
     if the clusterer is a DensityBasedClusterer */
  private double m_logL;

  /** will hold the mapping of classes to clusters (for class based 
      evaluation) */
  private int [] m_classToCluster = null;

  /**
   * set the clusterer
   * @param clusterer the clusterer to use
   */
  public void setClusterer(Clusterer clusterer) {
    m_Clusterer = clusterer;
  }

  /**
   * return the results of clustering.
   * @return a string detailing the results of clustering a data set
   */
  public String clusterResultsToString() {
    return m_clusteringResults.toString();
  }

  /**
   * Return the number of clusters found for the most recent call to
   * evaluateClusterer
   * @return the number of clusters found
   */
  public int getNumClusters() {
    return m_numClusters;
  }

  /**
   * Return an array of cluster assignments corresponding to the most
   * recent set of instances clustered.
   * @return an array of cluster assignments
   */
  public double [] getClusterAssignments() {
    return m_clusterAssignments;
  }

  /**
   * Return the array (ordered by cluster number) of minimum error class to
   * cluster mappings
   * @return an array of class to cluster mappings
   */
  public int [] getClassesToClusters() {
    return m_classToCluster;
  }

  /**
   * Return the log likelihood corresponding to the most recent
   * set of instances clustered.
   *
   * @return a <code>double</code> value
   */
  public double getLogLikelihood() {
    return m_logL;
  }

  /**
   * Constructor. Sets defaults for each member variable. Default Clusterer
   * is EM.
   */
  public ClusterEvaluation () {
    setClusterer(new EM());
    m_trainInstances = null;
    m_clusteringResults = new StringBuffer();
    m_clusterAssignments = null;
  }

  /**
   * Evaluate the clusterer on a set of instances. Calculates clustering
   * statistics and stores cluster assigments for the instances in
   * m_clusterAssignments
   * @param test the set of instances to cluster
   * @exception Exception if something goes wrong
   */
  public void evaluateClusterer(Instances test) throws Exception {
    int i = 0;
    int cnum;
    double loglk = 0.0;
    double[] dist;
    double temp;
    int cc = m_Clusterer.numberOfClusters();
    m_numClusters = cc;
    int numInstFieldWidth = (int)((Math.log(test.numInstances())/
				   Math.log(10))+1);
    double[] instanceStats = new double[cc];
    m_clusterAssignments = new double [test.numInstances()];
    Instances testCopy = test;
    boolean hasClass = (testCopy.classIndex() >= 0);
    int unclusteredInstances = 0;

    // If class is set then do class based evaluation as well
    if (hasClass) {
      if (testCopy.classAttribute().isNumeric()) {
	throw new Exception("ClusterEvaluation: Class must be nominal!");
      }
      Remove removeClass = new Remove();
      removeClass.setAttributeIndices(""+(testCopy.classIndex()+1));
      removeClass.setInvertSelection(false);
      removeClass.setInputFormat(testCopy);
      testCopy = Filter.useFilter(testCopy, removeClass);
    }

    for (i=0;i<testCopy.numInstances();i++) {
      cnum = -1;
      try {
	if (m_Clusterer instanceof DensityBasedClusterer) {
	  loglk += ((DensityBasedClusterer)m_Clusterer).
	    logDensityForInstance(testCopy.instance(i));
	  //	  temp = Utils.sum(dist);
	  
	  //	  Utils.normalize(dist);
	  cnum = m_Clusterer.clusterInstance(testCopy.instance(i)); 
	  // Utils.maxIndex(dist);
	  m_clusterAssignments[i] = (double)cnum;
	} else {
	  cnum = m_Clusterer.clusterInstance(testCopy.instance(i));
	  m_clusterAssignments[i] = (double)cnum;
	}
      }
      catch (Exception e) {
	unclusteredInstances++;
      }
      
      if (cnum != -1) {
	instanceStats[cnum]++;
      }
    }

    /* // count the actual number of used clusters
    int count = 0;
    for (i = 0; i < cc; i++) {
      if (instanceStats[i] > 0) {
	count++;
      }
    }
    if (count > 0) {
      double [] tempStats = new double [count];
      double [] map = new double [m_clusterAssignments.length];
      count=0;
      for (i=0;i<cc;i++) {
	if (instanceStats[i] > 0) {
	  tempStats[count] = instanceStats[i];
	  map[i] = count;
	  count++;
	}
      }
      instanceStats = tempStats;
      cc = instanceStats.length;
      for (i=0;i<m_clusterAssignments.length;i++) {
	m_clusterAssignments[i] = map[(int)m_clusterAssignments[i]];
      }
      } */ 

    double sum = Utils.sum(instanceStats);
    loglk /= sum;
    m_logL = loglk;
    
    m_clusteringResults.append(m_Clusterer.toString());
    m_clusteringResults.append("Clustered Instances\n\n");
    int clustFieldWidth = (int)((Math.log(cc)/Math.log(10))+1);
    for (i = 0; i < cc; i++) {
      if (instanceStats[i] > 0) {
	m_clusteringResults.append(Utils.doubleToString((double)i, 
							clustFieldWidth, 0) 
				   + "      " 
				   + Utils.doubleToString(instanceStats[i],
							  numInstFieldWidth, 0) 
				   + " (" 
				   + Utils.doubleToString((instanceStats[i] / 
							   sum * 100.0)
							  , 3, 0) + "%)\n");
      }
    }
    
    if (unclusteredInstances > 0) {
      m_clusteringResults.append("\nUnclustered instances : "
				 +unclusteredInstances);
    }

    if (m_Clusterer instanceof DensityBasedClusterer) {
      m_clusteringResults.append("\n\nLog likelihood: " 
				 + Utils.doubleToString(loglk, 1, 5) 
				 + "\n");
    }
    
    if (hasClass) {
      evaluateClustersWithRespectToClass(test);
    }
  }

  /**
   * Evaluates cluster assignments with respect to actual class labels.
   * Assumes that m_Clusterer has been trained and tested on 
   * inst (minus the class).
   * @param inst the instances (including class) to evaluate with respect to
   * @exception Exception if something goes wrong
   */
  private void evaluateClustersWithRespectToClass(Instances inst)
    throws Exception {
    int numClasses = inst.classAttribute().numValues();
    int [][] counts = new int [m_numClusters][numClasses];
    int [] clusterTotals = new int[m_numClusters];
    double [] best = new double[m_numClusters+1];
    double [] current = new double[m_numClusters+1];

    for (int i = 0; i < inst.numInstances(); i++) {
      counts[(int)m_clusterAssignments[i]][(int)inst.instance(i).classValue()]++;
      clusterTotals[(int)m_clusterAssignments[i]]++;
    }
    
    best[m_numClusters] = Double.MAX_VALUE;
    mapClasses(0, counts, clusterTotals, current, best, 0);

    m_clusteringResults.append("\n\nClass attribute: "
			+inst.classAttribute().name()
			+"\n");
    m_clusteringResults.append("Classes to Clusters:\n");
    String matrixString = toMatrixString(counts, clusterTotals, inst);
    m_clusteringResults.append(matrixString).append("\n");

    int Cwidth = 1 + (int)(Math.log(m_numClusters) / Math.log(10));
    // add the minimum error assignment
    for (int i = 0; i < m_numClusters; i++) {
      if (clusterTotals[i] > 0) {
	m_clusteringResults.append("Cluster "
				   +Utils.doubleToString((double)i,Cwidth,0));
	m_clusteringResults.append(" <-- ");
	
	if (best[i] < 0) {
	  m_clusteringResults.append("No class\n");
	} else {
	  m_clusteringResults.
	    append(inst.classAttribute().value((int)best[i])).append("\n");
	}
      }
    }
    m_clusteringResults.append("\nIncorrectly clustered instances :\t"
			       +best[m_numClusters]+"\t"
			       +(Utils.doubleToString((best[m_numClusters] / 
						       inst.numInstances() * 
						       100.0), 8, 4))
			       +" %\n");

    // copy the class assignments
    m_classToCluster = new int [m_numClusters];
    for (int i = 0; i < m_numClusters; i++) {
      m_classToCluster[i] = (int)best[i];
    }
  }

  /**
   * Returns a "confusion" style matrix of classes to clusters assignments
   * @param counts the counts of classes for each cluster
   * @param clusterTotals total number of examples in each cluster
   * @param inst the training instances (with class)
   * @exception Exception if matrix can't be generated
   */

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -