
FarthestFirst.java

wekaUT is a collection of Weka-based semi-supervised learning classifiers developed at the University of Texas at Austin.
/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    FarthestFirst.java
 *    Copyright (C) 2002 Bernhard Pfahringer
 *    based on SimpleKMeans, which is
 *    Copyright (C) 2000 Mark Hall (mhall@cs.waikato.ac.nz)
 */

package weka.clusterers;

import java.io.*;
import java.util.*;

import weka.core.*;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.ReplaceMissingValues;

/**
 * Implements the "Farthest First Traversal Algorithm" by
 * Hochbaum and Shmoys 1985: A best possible heuristic for the
 * k-center problem, Mathematics of Operations Research, 10(2):180-184,
 * as cited by Sanjoy Dasgupta, "Performance guarantees for hierarchical
 * clustering", COLT 2002, Sydney.
 *
 * Works as a fast, simple, approximate clusterer.
 *
 * Modelled after SimpleKMeans; might be a useful initializer for it.
 *
 * Valid options are:<p>
 *
 * -N <number of clusters> <br>
 * Specify the number of clusters to generate. <p>
 *
 * -S <seed> <br>
 * Specify random number seed. <p>
 *
 * @author Bernhard Pfahringer (bernhard@cs.waikato.ac.nz)
 * @version $Revision: 1.1.1.1 $
 * @see Clusterer
 * @see OptionHandler
 */
// Todo: rewrite to be fully incremental
//       cleanup, like deleting m_instances
public class FarthestFirst extends Clusterer implements OptionHandler {

  /**
   * training instances, not necessary to keep,
   * could be replaced by m_ClusterCentroids where needed for header info
   */
  protected Instances m_instances;

  /**
   * replace missing values in training instances
   */
  protected ReplaceMissingValues m_ReplaceMissingFilter;

  /**
   * number of clusters to generate
   */
  protected int m_NumClusters = 2;

  /**
   * holds the cluster centroids
   */
  protected Instances m_ClusterCentroids;

  /**
   * attribute min values
   */
  private double[] m_Min;

  /**
   * attribute max values
   */
  private double[] m_Max;

  /**
   * random seed
   */
  protected int m_Seed = 1;

  /**
   * Returns a string describing this clusterer.
   * @return a description of the clusterer suitable for
   * displaying in the explorer/experimenter gui
   */
  public String globalInfo() {
    return "Cluster data using the FarthestFirst algorithm";
  }

  /**
   * Generates a clusterer. Has to initialize all fields of the clusterer
   * that are not being set via options.
   *
   * @param data set of instances serving as training data
   * @exception Exception if the clusterer has not been
   * generated successfully
   */
  public void buildClusterer(Instances data) throws Exception {
    //long start = System.currentTimeMillis();

    if (data.checkForStringAttributes()) {
      throw new Exception("Can't handle string attributes!");
    }

    m_ReplaceMissingFilter = new ReplaceMissingValues();
    m_ReplaceMissingFilter.setInputFormat(data);
    m_instances = Filter.useFilter(data, m_ReplaceMissingFilter);

    initMinMax(m_instances);

    m_ClusterCentroids = new Instances(m_instances, m_NumClusters);

    int n = m_instances.numInstances();
    Random r = new Random(m_Seed);
    boolean[] selected = new boolean[n];
    double[] minDistance = new double[n];

    for (int i = 0; i < n; i++) minDistance[i] = Double.MAX_VALUE;

    // pick the first center uniformly at random
    int firstI = r.nextInt(n);
    m_ClusterCentroids.add(m_instances.instance(firstI));
    selected[firstI] = true;

    updateMinDistance(minDistance, selected, m_instances, m_instances.instance(firstI));

    if (m_NumClusters > n) m_NumClusters = n;

    // each further center is the instance farthest from all centers chosen so far
    for (int i = 1; i < m_NumClusters; i++) {
      int nextI = farthestAway(minDistance, selected);
      m_ClusterCentroids.add(m_instances.instance(nextI));
      selected[nextI] = true;
      updateMinDistance(minDistance, selected, m_instances, m_instances.instance(nextI));
    }

    // keep only the header information
    m_instances = new Instances(m_instances, 0);

    //long end = System.currentTimeMillis();
    //System.out.println("Clustering Time = " + (end-start));
  }

  /**
   * Updates each unselected instance's distance to its nearest center
   * after a new center has been added.
   */
  protected void updateMinDistance(double[] minDistance, boolean[] selected,
                                   Instances data, Instance center) {
    for (int i = 0; i < selected.length; i++)
      if (!selected[i]) {
        double d = distance(center, data.instance(i));
        if (d < minDistance[i])
          minDistance[i] = d;
      }
  }

  /**
   * Returns the index of the unselected instance farthest away from
   * the centers chosen so far.
   */
  protected int farthestAway(double[] minDistance, boolean[] selected) {
    double maxDistance = -1.0;
    int maxI = -1;
    for (int i = 0; i < selected.length; i++)
      if (!selected[i])
        if (maxDistance < minDistance[i]) {
          maxDistance = minDistance[i];
          maxI = i;
        }
    return maxI;
  }

  /**
   * Initializes the per-attribute minimum and maximum values from the data.
   */
  protected void initMinMax(Instances data) {
    m_Min = new double[data.numAttributes()];
    m_Max = new double[data.numAttributes()];
    for (int i = 0; i < data.numAttributes(); i++) {
      m_Min[i] = m_Max[i] = Double.NaN;
    }

    for (int i = 0; i < data.numInstances(); i++) {
      updateMinMax(data.instance(i));
    }
  }

  /**
   * Updates the minimum and maximum values for all the attributes
   * based on a new instance.
   *
   * @param instance the new instance
   */
  private void updateMinMax(Instance instance) {
    for (int j = 0; j < instance.numAttributes(); j++) {
      if (Double.isNaN(m_Min[j])) {
        m_Min[j] = instance.value(j);
        m_Max[j] = instance.value(j);
      } else {
        if (instance.value(j) < m_Min[j]) {
          m_Min[j] = instance.value(j);
        } else {
          if (instance.value(j) > m_Max[j]) {
            m_Max[j] = instance.value(j);
          }
        }
      }
    }
  }

  /**
   * Clusters an instance that has been through the filters.
   *
   * @param instance the instance to assign a cluster to
   * @return a cluster number
   */
  protected int clusterProcessedInstance(Instance instance) {
    double minDist = Double.MAX_VALUE;
    int bestCluster = 0;
    for (int i = 0; i < m_NumClusters; i++) {
      double dist = distance(instance, m_ClusterCentroids.instance(i));
      if (dist < minDist) {
        minDist = dist;
        bestCluster = i;
      }
    }
    return bestCluster;
  }

  /**
   * Classifies a given instance.
   *
   * @param instance the instance to be assigned to a cluster
   * @return the number of the assigned cluster as an integer
   * if the class is enumerated, otherwise the predicted value
   * @exception Exception if instance could not be classified
   * successfully
   */
  public int clusterInstance(Instance instance) throws Exception {
    m_ReplaceMissingFilter.input(instance);
    m_ReplaceMissingFilter.batchFinished();
    Instance inst = m_ReplaceMissingFilter.output();

    return clusterProcessedInstance(inst);
  }

  /**
   * Calculates the distance between two instances.
   *
   * @param first the first instance
   * @param second the second instance
   * @return the distance between the two given instances, between 0 and 1
   */
  protected double distance(Instance first, Instance second) {

    double distance = 0;
    int firstI, secondI;

    for (int p1 = 0, p2 = 0;
         p1 < first.numValues() || p2 < second.numValues();) {
      if (p1 >= first.numValues()) {
        firstI = m_instances.numAttributes();
      } else {
        firstI = first.index(p1);
      }
      if (p2 >= second.numValues()) {
        secondI = m_instances.numAttributes();
      } else {
        secondI = second.index(p2);
      }
      if (firstI == m_instances.classIndex()) {
        p1++; continue;
      }
      if (secondI == m_instances.classIndex()) {
        p2++; continue;
      }
      double diff;
      if (firstI == secondI) {
        diff = difference(firstI,
                          first.valueSparse(p1),
                          second.valueSparse(p2));
        p1++; p2++;
      } else if (firstI > secondI) {
        diff = difference(secondI,
                          0, second.valueSparse(p2));
        p2++;
      } else {
        diff = difference(firstI,
                          first.valueSparse(p1), 0);
        p1++;
      }
      distance += diff * diff;
    }

    return Math.sqrt(distance / m_instances.numAttributes());
  }

  /**
   * Computes the difference between two given attribute values.
   */
  protected double difference(int index, double val1, double val2) {

    switch (m_instances.attribute(index).type()) {
    case Attribute.NOMINAL:

      // If attribute is nominal
      if (Instance.isMissingValue(val1) ||
          Instance.isMissingValue(val2) ||
          ((int) val1 != (int) val2)) {
        return 1;
      } else {
        return 0;
      }
    case Attribute.NUMERIC:

      // If attribute is numeric
      if (Instance.isMissingValue(val1) ||
          Instance.isMissingValue(val2)) {
        if (Instance.isMissingValue(val1) &&
            Instance.isMissingValue(val2)) {
          return 1;
        } else {
          double diff;
          if (Instance.isMissingValue(val2)) {
            diff = norm(val1, index);
          } else {
            diff = norm(val2, index);
          }
          if (diff < 0.5) {
            diff = 1.0 - diff;
          }
          return diff;
        }
      } else {
        return norm(val1, index) - norm(val2, index);
      }
    default:
      return 0;
    }
  }

  /**
   * Normalizes a given value of a numeric attribute.
   *
   * @param x the value to be normalized
   * @param i the attribute's index
   */
  protected double norm(double x, int i) {
    if (Double.isNaN(m_Min[i]) || Utils.eq(m_Max[i], m_Min[i])) {
      return 0;
    } else {
      return (x - m_Min[i]) / (m_Max[i] - m_Min[i]);
    }
  }

  /**
   * Returns the number of clusters.
   *
   * @return the number of clusters generated for a training dataset
   * @exception Exception if number of clusters could not be returned
   * successfully
   */
  public int numberOfClusters() throws Exception {
    return m_NumClusters;
  }

  /**
   * Returns an enumeration describing the available options. <p>
   *
   * Valid options are:<p>
   *
   * -N <number of clusters> <br>
   * Specify the number of clusters to generate (default 2). <p>
   *
   * -S <seed> <br>
   * Specify random number seed (default 1). <p>
   *
   * @return an enumeration of all the available options
   */
  public Enumeration listOptions() {
    Vector newVector = new Vector(2);

    newVector.addElement(new Option("\tnumber of clusters. (default = 2).",
                                    "N", 1, "-N <num>"));
    newVector.addElement(new Option("\trandom number seed.\n (default 1)",
                                    "S", 1, "-S <num>"));

    return newVector.elements();
  }

  /**
   * Returns the tip text for this property.
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String numClustersTipText() {
    return "set number of clusters";
  }

  /**
   * Sets the number of clusters to generate.
   *
   * @param n the number of clusters to generate
   */
  public void setNumClusters(int n) {
    m_NumClusters = n;
  }

  /**
   * Gets the number of clusters to generate.
   *
   * @return the number of clusters to generate
   */
  public int getNumClusters() {
    return m_NumClusters;
  }

  /**
   * Returns the tip text for this property.
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String seedTipText() {
    return "random number seed";
  }

  /**
   * Sets the random number seed.
   *
   * @param s the seed
   */
  public void setSeed(int s) {
    m_Seed = s;
  }

  /**
   * Gets the random number seed.
   *
   * @return the seed
   */
  public int getSeed() {
    return m_Seed;
  }

  /**
   * Parses a given list of options.
   * @param options the list of options as an array of strings
   * @exception Exception if an option is not supported
   */
  public void setOptions(String[] options) throws Exception {

    String optionString = Utils.getOption('N', options);
    if (optionString.length() != 0) {
      setNumClusters(Integer.parseInt(optionString));
    }

    optionString = Utils.getOption('S', options);
    if (optionString.length() != 0) {
      setSeed(Integer.parseInt(optionString));
    }
  }

  /**
   * Gets the current settings of FarthestFirst.
   *
   * @return an array of strings suitable for passing to setOptions()
   */
  public String[] getOptions() {
    String[] options = new String[4];
    int current = 0;

    options[current++] = "-N";
    options[current++] = "" + getNumClusters();
    options[current++] = "-S";
    options[current++] = "" + getSeed();

    while (current < options.length) {
      options[current++] = "";
    }
    return options;
  }

  /**
   * Returns a string describing this clusterer.
   *
   * @return a description of the clusterer as a string
   */
  public String toString() {
    StringBuffer temp = new StringBuffer();

    temp.append("\n FarthestFirst\n==============\n");

    temp.append("\nCluster centroids:\n");
    for (int i = 0; i < m_NumClusters; i++) {
      temp.append("\nCluster " + i + "\n\t");
      for (int j = 0; j < m_ClusterCentroids.numAttributes(); j++) {
        if (m_ClusterCentroids.attribute(j).isNominal()) {
          temp.append(" " + m_ClusterCentroids.attribute(j).
                      value((int) m_ClusterCentroids.instance(i).value(j)));
        } else {
          temp.append(" " + m_ClusterCentroids.instance(i).value(j));
        }
      }
    }
    temp.append("\n\n");
    return temp.toString();
  }

  /**
   * Main method for testing this class.
   *
   * @param argv should contain the following arguments: <p>
   * -t training file [-N number of clusters]
   */
  public static void main(String[] argv) {
    try {
      System.out.println(ClusterEvaluation.
                         evaluateClusterer(new FarthestFirst(), argv));
    } catch (Exception e) {
      System.out.println(e.getMessage());
      e.printStackTrace();
    }
  }
}
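For context, here is a minimal usage sketch that is not part of the original Weka source. It assumes a hypothetical ARFF file at data/iris.arff and a hypothetical wrapper class name (FarthestFirstDemo); it exercises only the public API defined above (setNumClusters, setSeed, buildClusterer, clusterInstance), so adapt the path and cluster count to your data.

import java.io.BufferedReader;
import java.io.FileReader;
import weka.clusterers.FarthestFirst;
import weka.core.Instances;

public class FarthestFirstDemo {
  public static void main(String[] args) throws Exception {
    // Hypothetical dataset path; replace with a real ARFF file.
    Instances data = new Instances(new BufferedReader(new FileReader("data/iris.arff")));

    FarthestFirst ff = new FarthestFirst();
    ff.setNumClusters(3);   // corresponds to the -N option
    ff.setSeed(42);         // corresponds to the -S option
    ff.buildClusterer(data);

    // Print the selected cluster centroids, then the cluster of each instance.
    System.out.println(ff);
    for (int i = 0; i < data.numInstances(); i++) {
      System.out.println("instance " + i + " -> cluster " + ff.clusterInstance(data.instance(i)));
    }
  }
}

The same kind of run is available from the command line through the main method above, for example: java weka.clusterers.FarthestFirst -t data/iris.arff -N 3.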
