📄 clusterevaluation.java

📁 一个数据挖掘系统的源码
💻 JAVA
📖 第 1 页 / 共 4 页
字号:
12 3 4 下一页

/**
 *
 *   AgentAcademy - an open source Data Mining framework for
 *   training intelligent agents
 *
 *   Copyright (C)   2001-2003 AA Consortium.
 *
 *   This library is open source software; you can redistribute it
 *   and/or modify it under the terms of the GNU Lesser General
 *   Public License as published by the Free Software Foundation;
 *   either version 2.0 of the License, or (at your option) any later
 *   version.
 *
 *   This library is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU Lesser General Public
 *   License along with this library; if not, write to the Free
 *   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
 *   MA  02111-1307 USA
 *
 */

package org.agentacademy.modules.dataminer.clusterers;

/**
 * <p>Title: The Data Miner prototype</p>
 * <p>Description: A prototype for the DataMiner (DM), the Agent Academy (AA) module responsible for performing data mining on the contents of the Agent Use Repository (AUR). The extracted knowledge is to be sent back to the AUR in the form of a PMML document.</p>
 * <p>Copyright: Copyright (c) 2002</p>
 * <p>Company: CERTH</p>
 * @author asymeon
 * @version 0.3
 */


import  java.util.*;
import  java.io.*;
import  org.agentacademy.modules.dataminer.core.*;
import  org.agentacademy.modules.dataminer.filters.Filter;
import  org.agentacademy.modules.dataminer.filters.AttributeFilter;
import org.apache.log4j.Logger;

/**
 * Class for evaluating clustering models.<p>
 *
 * Valid options are: <p>
 *
 * -t <name of the training file> <br>
 * Specify the training file. <p>
 *
 * -T <name of the test file> <br>
 * Specify the test file to apply clusterer to. <p>
 *
 * -d <name of file to save clustering model to> <br>
 * Specify output file. <p>
 *
 * -l <name of file to load clustering model from> <br>
 * Specify input file. <p>
 *
 * -p <attribute range> <br>
 * Output predictions. Predictions are for the training file if only the
 * training file is specified, otherwise they are for the test file. The range
 * specifies attribute values to be output with the predictions.
 * Use '-p 0' for none. <p>
 *
 * -x <num folds> <br>
 * Set the number of folds for a cross validation of the training data.
 * Cross validation can only be done for distribution clusterers and will
 * be performed if the test file is missing. <p>
 *
 * -c <class> <br>
 * Set the class attribute. If set, then class based evaluation of clustering
 * is performed. <p>
 *
 */
public class ClusterEvaluation {

 /** log4j logger shared by all instances of this class */
 public static Logger                log = Logger.getLogger(ClusterEvaluation.class);
  /** the instances to cluster */
  private Instances m_trainInstances;

  /** the clusterer being evaluated */
  private Clusterer m_Clusterer;

  /** whether to do cross validation (DistributionClusterers only) */
  private boolean m_doXval;

  /** the number of folds to use for cross validation */
  private int m_numFolds;

  /** the seed to use for cross validation */
  private int m_seed;

  /** accumulates the textual report describing the results of clustering;
      evaluateClusterer appends to this buffer */
  private StringBuffer m_clusteringResults;

  /** holds the number of clusters found by the clusterer */
  private int m_numClusters;

  /** holds the assignments of instances to clusters for a particular testing
      dataset (one entry per instance, stored as a double) */
  private double [] m_clusterAssignments;

  /** will hold the mapping of classes to clusters (for class based
      evaluation); starts out as null */
  private int [] m_classToCluster = null;

  /**
   * set the clusterer
   * @param clusterer the clusterer to use
   */
  public void setClusterer(Clusterer clusterer) {
    // Remember the clusterer that later evaluations will run against.
    this.m_Clusterer = clusterer;
  }

  /**
   * set whether or not to do cross validation
   * @param x true if cross validation is to be done
   */
  public void setDoXval(boolean x) {
    // Store the cross validation on/off switch.
    this.m_doXval = x;
  }

  /**
   * set the number of folds to use for cross validation
   * @param folds the number of folds
   */
  public void setFolds(int folds) {
    // Store the fold count as given; no range check is applied here.
    this.m_numFolds = folds;
  }

  /**
   * set the seed to use for cross validation
   * @param s the seed.
   */
  public void setSeed(int s) {
    // Store the cross validation seed.
    this.m_seed = s;
  }

  /**
   * return the results of clustering.
   * @return a string detailing the results of clustering a data set
   */
  public String clusterResultsToString() {
    // Snapshot the accumulated report buffer as an immutable String.
    final String report = this.m_clusteringResults.toString();
    return report;
  }

  /**
   * Return the number of clusters found for the most recent call to
   * evaluateClusterer
   * @return the number of clusters found
   */
  public int getNumClusters() {
    // Value recorded by the last evaluateClusterer call.
    return this.m_numClusters;
  }

  /**
   * Return an array of cluster assignments corresponding to the most
   * recent set of instances clustered.
   * @return an array of cluster assignments
   */
  public double [] getClusterAssignments() {
    // Hands back the internal array itself (no copy is made).
    return this.m_clusterAssignments;
  }

  /**
   * Return the array (ordered by cluster number) of minimum error class to
   * cluster mappings
   * @return an array of class to cluster mappings
   */
  public int [] getClassesToClusters() {
    // Hands back the internal mapping array itself (no copy is made).
    return this.m_classToCluster;
  }

  /**
   * Constructor. Sets defaults for each member variable. Default Clusterer
   * is EM.
   */
  public ClusterEvaluation () {
    // Assign the defaults directly rather than through the public setters:
    // those setters are overridable, and an overriding subclass would have
    // its method invoked before its own fields are initialised.
    m_numFolds = 10;                          // default number of folds
    m_doXval = false;                         // cross validation off
    m_seed = 1;                               // default cross validation seed
    m_Clusterer = new EM();                   // default clusterer is EM
    m_trainInstances = null;
    m_clusteringResults = new StringBuffer(); // empty report buffer
    m_clusterAssignments = null;              // nothing clustered yet
  }

  /**
   * Evaluate the clusterer on a set of instances. Calculates clustering
   * statistics and stores cluster assignments for the instances in
   * m_clusterAssignments
   * @param test the set of instances to cluster
   * @exception Exception if something goes wrong
   */
  public void evaluateClusterer(Instances test) throws Exception {
    /*
     * Outline:
     *  1. If the data has a class attribute, cluster a copy with that
     *     attribute filtered out (the class must be nominal).
     *  2. Cluster every instance, recording its cluster number in
     *     m_clusterAssignments. An instance that cannot be clustered is
     *     recorded as -1 so that it is never mistaken for a member of
     *     cluster 0 by the class based evaluation.
     *  3. Append the per-cluster counts (and, for DistributionClusterers,
     *     the average log likelihood) to m_clusteringResults.
     *  4. If a class attribute is set, run the class based evaluation.
     */
    int cc = m_Clusterer.numberOfClusters();
    m_numClusters = cc;
    // number of decimal digits needed to print an instance count
    int numInstFieldWidth = (int)((Math.log(test.numInstances())/
                                   Math.log(10))+1);
    double[] instanceStats = new double[cc];   // instances per cluster
    m_clusterAssignments = new double [test.numInstances()];
    Instances testCopy = test;
    boolean hasClass = (testCopy.classIndex() >= 0);
    int unclusteredInstances = 0;
    double loglk = 0.0;

    // If class is set then do class based evaluation as well
    if (hasClass) {
      if (testCopy.classAttribute().isNumeric()) {
        throw new Exception("ClusterEvaluation: Class must be nominal!");
      }
      // Strip the class attribute (filter indices are 1-based) so that the
      // clusterer does not see it.
      AttributeFilter removeClass = new AttributeFilter();
      removeClass.setAttributeIndices(""+(testCopy.classIndex()+1));
      removeClass.setInvertSelection(false);
      removeClass.setInputFormat(testCopy);
      testCopy = Filter.useFilter(testCopy, removeClass);
    }

    for (int i = 0; i < testCopy.numInstances(); i++) {
      int cnum = -1;
      try {
        if (m_Clusterer instanceof DistributionClusterer) {
          double density = ((DistributionClusterer)m_Clusterer).
            densityForInstance(testCopy.instance(i));
          // only positive densities contribute to the log likelihood
          if (density > 0) {
            loglk += Math.log(density);
          }
        }
        cnum = m_Clusterer.clusterInstance(testCopy.instance(i));
        m_clusterAssignments[i] = (double)cnum;
      }
      catch (Exception e) {
        // Best effort: the instance is counted as unclustered and the
        // evaluation carries on. Mark it explicitly; leaving the array's
        // default of 0.0 would silently assign it to cluster 0.
        m_clusterAssignments[i] = -1;
        unclusteredInstances++;
        log.debug("ClusterEvaluation: could not cluster instance " + i, e);
      }

      if (cnum != -1) {
        instanceStats[cnum]++;
      }
    }

    // total number of instances that received a cluster
    double sum = Utils.sum(instanceStats);
    loglk /= sum;

    m_clusteringResults.append(m_Clusterer.toString());
    m_clusteringResults.append("Clustered Instances\n\n");
    int clustFieldWidth = (int)((Math.log(cc)/Math.log(10))+1);
    for (int i = 0; i < cc; i++) {
      // empty clusters are left out of the report
      if (instanceStats[i] > 0) {
        m_clusteringResults.append(Utils.doubleToString((double)i,
                                                        clustFieldWidth, 0)
                                   + "      "
                                   + Utils.doubleToString(instanceStats[i],
                                                          numInstFieldWidth, 0)
                                   + " ("
                                   + Utils.doubleToString((instanceStats[i] /
                                                           sum * 100.0)
                                                          , 3, 0) + "%)\n");
      }
    }

    if (unclusteredInstances > 0) {
      m_clusteringResults.append("\nUnclustered instances : "
                                 +unclusteredInstances);
    }

    if (m_Clusterer instanceof DistributionClusterer) {
      m_clusteringResults.append("\n\nLog likelihood: "
                                 + Utils.doubleToString(loglk, 1, 5)
                                 + "\n");
    }
    if (hasClass) {
      // use the original data here: it still carries the class attribute
      evaluateClustersWithRespectToClass(test);
    }
  }

  /**
   * Evaluates cluster assignments with respect to actual class labels.
   * Assumes that m_Clusterer has been trained and tested on
   * inst (minus the class), i.e. that m_clusterAssignments and
   * m_numClusters describe inst. Instances whose assignment is negative
   * (unclustered) are left out of the class/cluster counts. Appends the
   * classes-to-clusters matrix, the class chosen for each non-empty cluster
   * and the number of incorrectly clustered instances to the results, and
   * stores the class chosen for each cluster in m_classToCluster.
   * @param inst the instances (including class) to evaluate with respect to
   * @exception Exception if something goes wrong
   */
  private void evaluateClustersWithRespectToClass(Instances inst)
    throws Exception {
    int numClasses = inst.classAttribute().numValues();
    int [][] counts = new int [m_numClusters][numClasses];
    int [] clusterTotals = new int[m_numClusters];
    // best[0..m_numClusters-1] is read below as the class chosen for each
    // cluster (negative means "No class"); best[m_numClusters] is read as
    // the error count. mapClasses (defined elsewhere in this class) is
    // expected to fill both arrays.
    double [] best = new double[m_numClusters+1];
    double [] current = new double[m_numClusters+1];

    for (int i = 0; i < inst.numInstances(); i++) {
      int cluster = (int)m_clusterAssignments[i];
      if (cluster < 0) {
        // unclustered instance: belongs to no cluster, so it must not be
        // counted (and must not be used as an array index)
        continue;
      }
      counts[cluster][(int)inst.instance(i).classValue()]++;
      clusterTotals[cluster]++;
    }

    // start from the worst possible error
    best[m_numClusters] = Double.MAX_VALUE;
    mapClasses(0, counts, clusterTotals, current, best, 0);

    m_clusteringResults.append("\n\nClass attribute: "
                        +inst.classAttribute().name()
                        +"\n");
    m_clusteringResults.append("Classes to Clusters:\n");
    String matrixString = toMatrixString(counts, clusterTotals, inst);
    m_clusteringResults.append(matrixString).append("\n");

    int Cwidth = 1 + (int)(Math.log(m_numClusters) / Math.log(10));
    // add the minimum error assignment
    for (int i = 0; i < m_numClusters; i++) {
      if (clusterTotals[i] > 0) {
        m_clusteringResults.append("Cluster "
                                   +Utils.doubleToString((double)i,Cwidth,0));
        m_clusteringResults.append(" <-- ");

        if (best[i] < 0) {
          m_clusteringResults.append("No class\n");
        } else {
          m_clusteringResults.
            append(inst.classAttribute().value((int)best[i])).append("\n");
        }
      }
    }
    m_clusteringResults.append("\nIncorrectly clustered instances :\t"
                               +best[m_numClusters]+"\t"
                               +(Utils.doubleToString((best[m_numClusters] /
                                                       inst.numInstances() *
                                                       100.0), 8, 4))
                               +" %\n");

    // copy the class assignments
    m_classToCluster = new int [m_numClusters];
    for (int i = 0; i < m_numClusters; i++) {
      m_classToCluster[i] = (int)best[i];
    }
  }

  /**
   * Returns a "confusion" style matrix of classes to clusters assignments
   * @param counts the counts of classes for each cluster
   * @param clusterTotals total number of examples in each cluster
   * @param inst the training instances (with class)
   * @exception Exception if matrix can't be generated
   */
  private String toMatrixString(int [][] counts, int [] clusterTotals,
				Instances inst)
    throws Exception {
    StringBuffer ms = new StringBuffer();

    int maxval = 0;
    for (int i = 0; i < m_numClusters; i++) {
      for (int j = 0; j < counts[i].length; j++) {
	if (counts[i][j] > maxval) {
	  maxval = counts[i][j];
	}
      }
    }

    int Cwidth = 1 + Math.max((int)(Math.log(maxval) / Math.log(10)),
			      (int)(Math.log(m_numClusters) / Math.log(10)));

    ms.append("\n");

    for (int i = 0; i < m_numClusters; i++) {
      if (clusterTotals[i] > 0) {
12 3 4 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -