⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 farthestfirst.java

📁 数据挖掘中聚类的算法
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
/* *    This program is free software; you can redistribute it and/or modify *    it under the terms of the GNU General Public License as published by *    the Free Software Foundation; either version 2 of the License, or *    (at your option) any later version. * *    This program is distributed in the hope that it will be useful, *    but WITHOUT ANY WARRANTY; without even the implied warranty of *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the *    GNU General Public License for more details. * *    You should have received a copy of the GNU General Public License *    along with this program; if not, write to the Free Software *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//* *    FarthestFirst.java *    Copyright (C) 2002 University of Waikato, Hamilton, New Zealand * */package weka.clusterers;import weka.core.Attribute;import weka.core.Capabilities;import weka.core.Instance;import weka.core.Instances;import weka.core.Option;import weka.core.TechnicalInformation;import weka.core.TechnicalInformationHandler;import weka.core.Utils;import weka.core.Capabilities.Capability;import weka.core.TechnicalInformation.Field;import weka.core.TechnicalInformation.Type;import weka.filters.Filter;import weka.filters.unsupervised.attribute.ReplaceMissingValues;import java.util.Enumeration;import java.util.Random;import java.util.Vector;/** <!-- globalinfo-start --> * Cluster data using the FarthestFirst algorithm.<br/> * <br/> * For more information see:<br/> * <br/> * Hochbaum, Shmoys (1985). A best possible heuristic for the k-center problem. Mathematics of Operations Research. 10(2):180-184.<br/> * <br/> * Sanjoy Dasgupta: Performance Guarantees for Hierarchical Clustering. 
In: 15th Annual Conference on Computational Learning Theory, 351-363, 2002.<br/> * <br/> * Notes:<br/> * - works as a fast simple approximate clusterer<br/> * - modelled after SimpleKMeans, might be a useful initializer for it * <p/> <!-- globalinfo-end --> * <!-- technical-bibtex-start --> * BibTeX: * <pre> * &#64;article{Hochbaum1985, *    author = {Hochbaum and Shmoys}, *    journal = {Mathematics of Operations Research}, *    number = {2}, *    pages = {180-184}, *    title = {A best possible heuristic for the k-center problem}, *    volume = {10}, *    year = {1985} * } *  * &#64;inproceedings{Dasgupta2002, *    author = {Sanjoy Dasgupta}, *    booktitle = {15th Annual Conference on Computational Learning Theory}, *    pages = {351-363}, *    publisher = {Springer}, *    title = {Performance Guarantees for Hierarchical Clustering}, *    year = {2002} * } * </pre> * <p/> <!-- technical-bibtex-end --> * <!-- options-start --> * Valid options are: <p/> *  * <pre> -N &lt;num&gt; *  number of clusters. (default = 2).</pre> *  * <pre> -S &lt;num&gt; *  Random number seed. 
 *  (default 1)</pre>
 *  <!-- options-end -->
 *
 * @author Bernhard Pfahringer (bernhard@cs.waikato.ac.nz)
 * @version $Revision: 1.8 $
 * @see RandomizableClusterer
 */
public class FarthestFirst
  extends RandomizableClusterer
  implements TechnicalInformationHandler {

  // TODO: rewrite to be fully incremental
  //       cleanup, like deleting m_instances

  /** for serialization */
  static final long serialVersionUID = 7499838100631329509L;

  /**
   * training instances, not necessary to keep,
   * could be replaced by m_ClusterCentroids where needed for header info
   * (buildClusterer replaces this with an empty header-only copy when done)
   */
  protected Instances m_instances;

  /** filter used to replace missing values in the training instances */
  protected ReplaceMissingValues m_ReplaceMissingFilter;

  /** number of clusters to generate (the -N option; default 2) */
  protected int m_NumClusters = 2;

  /** holds the cluster centroids chosen by the farthest-first traversal */
  protected Instances m_ClusterCentroids;

  /** per-attribute minimum values; NaN until the attribute has been seen */
  private double [] m_Min;

  /** per-attribute maximum values; NaN until the attribute has been seen */
  private double [] m_Max;

  /**
   * Returns a string describing this clusterer.
   *
   * @return a description of the evaluator suitable for
   * displaying in the explorer/experimenter gui
   */
  public String globalInfo() {
    return "Cluster data using the FarthestFirst algorithm.\n\n"
      + "For more information see:\n\n"
      + getTechnicalInformation().toString() + "\n\n"
      + "Notes:\n"
      + "- works as a fast simple approximate clusterer\n"
      + "- modelled after SimpleKMeans, might be a useful initializer for it";
  }

  /**
   * Returns an instance of a TechnicalInformation object, containing
   * detailed information about the technical background of this class,
 *
 * @return the technical information about this class
 */
  public TechnicalInformation getTechnicalInformation() {
    TechnicalInformation result;
    TechnicalInformation additional;

    // primary reference: the original best-possible k-center heuristic
    result = new TechnicalInformation(Type.ARTICLE);
    result.setValue(Field.AUTHOR, "Hochbaum and Shmoys");
    result.setValue(Field.YEAR, "1985");
    result.setValue(Field.TITLE, "A best possible heuristic for the k-center problem");
    result.setValue(Field.JOURNAL, "Mathematics of Operations Research");
    result.setValue(Field.VOLUME, "10");
    result.setValue(Field.NUMBER, "2");
    result.setValue(Field.PAGES, "180-184");

    // additional reference: performance guarantees for hierarchical clustering
    additional = result.add(Type.INPROCEEDINGS);
    additional.setValue(Field.AUTHOR, "Sanjoy Dasgupta");
    additional.setValue(Field.TITLE, "Performance Guarantees for Hierarchical Clustering");
    additional.setValue(Field.BOOKTITLE, "15th Annual Conference on Computational Learning Theory");
    additional.setValue(Field.YEAR, "2002");
    additional.setValue(Field.PAGES, "351-363");
    additional.setValue(Field.PUBLISHER, "Springer");

    return result;
  }

  /**
   * Returns default capabilities of the clusterer.
   *
   * @return      the capabilities of this clusterer
   */
  public Capabilities getCapabilities() {
    Capabilities result = super.getCapabilities();

    // attributes this clusterer accepts; missing values are handled by
    // the ReplaceMissingValues filter applied in buildClusterer
    result.enable(Capability.NOMINAL_ATTRIBUTES);
    result.enable(Capability.NUMERIC_ATTRIBUTES);
    result.enable(Capability.DATE_ATTRIBUTES);
    result.enable(Capability.MISSING_VALUES);

    return result;
  }

  /**
   * Generates a clusterer. Has to initialize all fields of the clusterer
   * that are not being set via options.
   *
   * @param data set of instances serving as training data
   * @throws Exception if the clusterer has not been
   * generated successfully
   */
  public void buildClusterer(Instances data) throws Exception {
    // can clusterer handle the data?
getCapabilities().testWithFail(data);

    //long start = System.currentTimeMillis();

    // replace missing values before any distance computation
    m_ReplaceMissingFilter = new ReplaceMissingValues();
    m_ReplaceMissingFilter.setInputFormat(data);
    m_instances = Filter.useFilter(data, m_ReplaceMissingFilter);

    // record per-attribute min/max for distance normalization
    initMinMax(m_instances);

    m_ClusterCentroids = new Instances(m_instances, m_NumClusters);

    int n = m_instances.numInstances();
    Random r = new Random(getSeed());
    boolean[] selected = new boolean[n];   // true once an instance is a centroid
    double[] minDistance = new double[n];  // distance of each instance to its nearest centroid

    for (int i = 0; i < n; i++) minDistance[i] = Double.MAX_VALUE;

    // pick the first centroid uniformly at random
    int firstI = r.nextInt(n);
    m_ClusterCentroids.add(m_instances.instance(firstI));
    selected[firstI] = true;

    updateMinDistance(minDistance, selected, m_instances, m_instances.instance(firstI));

    // cannot have more clusters than instances
    if (m_NumClusters > n) m_NumClusters = n;

    // farthest-first traversal: repeatedly add the instance farthest
    // from all centroids chosen so far
    for (int i = 1; i < m_NumClusters; i++) {
      int nextI = farthestAway(minDistance, selected);
      m_ClusterCentroids.add(m_instances.instance(nextI));
      selected[nextI] = true;
      updateMinDistance(minDistance, selected, m_instances, m_instances.instance(nextI));
    }

    // keep only the header; the training data itself is no longer needed
    m_instances = new Instances(m_instances, 0);

    //long end = System.currentTimeMillis();
    //System.out.println("Clustering Time = " + (end-start));
  }

  /**
   * Lowers each unselected instance's distance-to-nearest-centroid if the
   * newly added center is closer than any previously chosen one.
   *
   * @param minDistance current nearest-centroid distance per instance (updated in place)
   * @param selected    flags marking instances already chosen as centroids
   * @param data        the training instances
   * @param center      the centroid just added
   */
  protected void updateMinDistance(double[] minDistance, boolean[] selected,
				   Instances data, Instance center) {
    for (int i = 0; i < selected.length; i++)
      if (!selected[i]) {
	// NOTE(review): distance() is defined elsewhere in this class
	// (not visible in this listing) — presumably a min/max-normalized
	// distance using m_Min/m_Max; confirm on the continuation page
	double d = distance(center, data.instance(i));
	if (d < minDistance[i])
	  minDistance[i] = d;
      }
  }

  /**
   * Returns the index of the unselected instance farthest from every
   * centroid chosen so far, i.e. the one with the largest minDistance.
   *
   * @param minDistance nearest-centroid distance per instance
   * @param selected    flags marking instances already chosen as centroids
   * @return index of the farthest unselected instance, or -1 if all selected
   */
  protected int farthestAway(double[] minDistance, boolean[] selected) {
    double maxDistance = -1.0;
    int maxI = -1;
    for (int i = 0; i < selected.length; i++)
      if (!selected[i])
	if (maxDistance < minDistance[i]) {
	  maxDistance = minDistance[i];
	  maxI = i;
	}
    return maxI;
  }

  /**
   * Initializes the per-attribute min/max arrays to NaN (meaning "unseen")
   * and then folds in every training instance via updateMinMax.
   *
   * @param data the training instances
   */
  protected void initMinMax(Instances data) {
    m_Min = new double [data.numAttributes()];
    m_Max = new double [data.numAttributes()];
    for (int i = 0; i <
data.numAttributes(); i++) {
      // NaN marks "no value seen yet" for this attribute
      m_Min[i] = m_Max[i] = Double.NaN;
    }

    for (int i = 0; i < data.numInstances(); i++) {
      updateMinMax(data.instance(i));
    }
  }

  /**
   * Updates the minimum and maximum values for all the attributes
   * based on a new instance.
   *
   * @param instance the new instance
   */
  private void updateMinMax(Instance instance) {
    for (int j = 0; j < instance.numAttributes(); j++) {
      if (Double.isNaN(m_Min[j])) {
	// first observed value initializes both bounds
	m_Min[j] = instance.value(j);
	m_Max[j] = instance.value(j);
      } else {
	if (instance.value(j) < m_Min[j]) {
	  m_Min[j] = instance.value(j);
	} else {
	  if (instance.value(j) > m_Max[j]) {
	    m_Max[j] = instance.value(j);
	    // NOTE(review): listing is truncated here (page 1 of 2) — the
	    // remainder of updateMinMax and the rest of the class (including
	    // distance() and clusterInstance()) continue on the next page

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -