📄 farthestfirst.java
字号:
/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//* * FarthestFirst.java * Copyright (C) 2002 University of Waikato, Hamilton, New Zealand * */package weka.clusterers;import weka.core.Attribute;import weka.core.Capabilities;import weka.core.Instance;import weka.core.Instances;import weka.core.Option;import weka.core.TechnicalInformation;import weka.core.TechnicalInformationHandler;import weka.core.Utils;import weka.core.Capabilities.Capability;import weka.core.TechnicalInformation.Field;import weka.core.TechnicalInformation.Type;import weka.filters.Filter;import weka.filters.unsupervised.attribute.ReplaceMissingValues;import java.util.Enumeration;import java.util.Random;import java.util.Vector;/** <!-- globalinfo-start --> * Cluster data using the FarthestFirst algorithm.<br/> * <br/> * For more information see:<br/> * <br/> * Hochbaum, Shmoys (1985). A best possible heuristic for the k-center problem. Mathematics of Operations Research. 10(2):180-184.<br/> * <br/> * Sanjoy Dasgupta: Performance Guarantees for Hierarchical Clustering. In: 15th Annual Conference on Computational Learning Theory, 351-363, 2002.<br/> * <br/> * Notes:<br/> * - works as a fast simple approximate clusterer<br/> * - modelled after SimpleKMeans, might be a useful initializer for it * <p/> <!-- globalinfo-end --> * <!-- technical-bibtex-start --> * BibTeX: * <pre> * @article{Hochbaum1985, * author = {Hochbaum and Shmoys}, * journal = {Mathematics of Operations Research}, * number = {2}, * pages = {180-184}, * title = {A best possible heuristic for the k-center problem}, * volume = {10}, * year = {1985} * } * * @inproceedings{Dasgupta2002, * author = {Sanjoy Dasgupta}, * booktitle = {15th Annual Conference on Computational Learning Theory}, * pages = {351-363}, * publisher = {Springer}, * title = {Performance Guarantees for Hierarchical Clustering}, * year = {2002} * } * </pre> * <p/> <!-- technical-bibtex-end --> * <!-- options-start --> * Valid options are: <p/> * * <pre> -N <num> * number of clusters. (default = 2).</pre> * * <pre> -S <num> * Random number seed. * (default 1)</pre> * <!-- options-end --> * * @author Bernhard Pfahringer (bernhard@cs.waikato.ac.nz) * @version $Revision: 1.8 $ * @see RandomizableClusterer */public class FarthestFirst extends RandomizableClusterer implements TechnicalInformationHandler { //Todo: rewrite to be fully incremental // cleanup, like deleting m_instances /** for serialization */ static final long serialVersionUID = 7499838100631329509L; /** * training instances, not necessary to keep, * could be replaced by m_ClusterCentroids where needed for header info */ protected Instances m_instances; /** * replace missing values in training instances */ protected ReplaceMissingValues m_ReplaceMissingFilter; /** * number of clusters to generate */ protected int m_NumClusters = 2; /** * holds the cluster centroids */ protected Instances m_ClusterCentroids; /** * attribute min values */ private double [] m_Min; /** * attribute max values */ private double [] m_Max; /** * Returns a string describing this clusterer * @return a description of the evaluator suitable for * displaying in the explorer/experimenter gui */ public String globalInfo() { return "Cluster data using the FarthestFirst algorithm.\n\n" + "For more information see:\n\n" + getTechnicalInformation().toString() + "\n\n" + "Notes:\n" + "- works as a fast simple approximate clusterer\n" + "- modelled after SimpleKMeans, might be a useful initializer for it"; } /** * Returns an instance of a TechnicalInformation object, containing * detailed information about the technical background of this class, * e.g., paper reference or book this class is based on. * * @return the technical information about this class */ public TechnicalInformation getTechnicalInformation() { TechnicalInformation result; TechnicalInformation additional; result = new TechnicalInformation(Type.ARTICLE); result.setValue(Field.AUTHOR, "Hochbaum and Shmoys"); result.setValue(Field.YEAR, "1985"); result.setValue(Field.TITLE, "A best possible heuristic for the k-center problem"); result.setValue(Field.JOURNAL, "Mathematics of Operations Research"); result.setValue(Field.VOLUME, "10"); result.setValue(Field.NUMBER, "2"); result.setValue(Field.PAGES, "180-184"); additional = result.add(Type.INPROCEEDINGS); additional.setValue(Field.AUTHOR, "Sanjoy Dasgupta"); additional.setValue(Field.TITLE, "Performance Guarantees for Hierarchical Clustering"); additional.setValue(Field.BOOKTITLE, "15th Annual Conference on Computational Learning Theory"); additional.setValue(Field.YEAR, "2002"); additional.setValue(Field.PAGES, "351-363"); additional.setValue(Field.PUBLISHER, "Springer"); return result; } /** * Returns default capabilities of the clusterer. * * @return the capabilities of this clusterer */ public Capabilities getCapabilities() { Capabilities result = super.getCapabilities(); // attributes result.enable(Capability.NOMINAL_ATTRIBUTES); result.enable(Capability.NUMERIC_ATTRIBUTES); result.enable(Capability.DATE_ATTRIBUTES); result.enable(Capability.MISSING_VALUES); return result; } /** * Generates a clusterer. Has to initialize all fields of the clusterer * that are not being set via options. * * @param data set of instances serving as training data * @throws Exception if the clusterer has not been * generated successfully */ public void buildClusterer(Instances data) throws Exception { // can clusterer handle the data? getCapabilities().testWithFail(data); //long start = System.currentTimeMillis(); m_ReplaceMissingFilter = new ReplaceMissingValues(); m_ReplaceMissingFilter.setInputFormat(data); m_instances = Filter.useFilter(data, m_ReplaceMissingFilter); initMinMax(m_instances); m_ClusterCentroids = new Instances(m_instances, m_NumClusters); int n = m_instances.numInstances(); Random r = new Random(getSeed()); boolean[] selected = new boolean[n]; double[] minDistance = new double[n]; for(int i = 0; i<n; i++) minDistance[i] = Double.MAX_VALUE; int firstI = r.nextInt(n); m_ClusterCentroids.add(m_instances.instance(firstI)); selected[firstI] = true; updateMinDistance(minDistance,selected,m_instances,m_instances.instance(firstI)); if (m_NumClusters > n) m_NumClusters = n; for(int i = 1; i < m_NumClusters; i++) { int nextI = farthestAway(minDistance, selected); m_ClusterCentroids.add(m_instances.instance(nextI)); selected[nextI] = true; updateMinDistance(minDistance,selected,m_instances,m_instances.instance(nextI)); } m_instances = new Instances(m_instances,0); //long end = System.currentTimeMillis(); //System.out.println("Clustering Time = " + (end-start)); } protected void updateMinDistance(double[] minDistance, boolean[] selected, Instances data, Instance center) { for(int i = 0; i<selected.length; i++) if (!selected[i]) { double d = distance(center,data.instance(i)); if (d<minDistance[i]) minDistance[i] = d; } } protected int farthestAway(double[] minDistance, boolean[] selected) { double maxDistance = -1.0; int maxI = -1; for(int i = 0; i<selected.length; i++) if (!selected[i]) if (maxDistance < minDistance[i]) { maxDistance = minDistance[i]; maxI = i; } return maxI; } protected void initMinMax(Instances data) { m_Min = new double [data.numAttributes()]; m_Max = new double [data.numAttributes()]; for (int i = 0; i < data.numAttributes(); i++) { m_Min[i] = m_Max[i] = Double.NaN; } for (int i = 0; i < data.numInstances(); i++) { updateMinMax(data.instance(i)); } } /** * Updates the minimum and maximum values for all the attributes * based on a new instance. * * @param instance the new instance */ private void updateMinMax(Instance instance) { for (int j = 0;j < instance.numAttributes(); j++) { if (Double.isNaN(m_Min[j])) { m_Min[j] = instance.value(j); m_Max[j] = instance.value(j); } else { if (instance.value(j) < m_Min[j]) { m_Min[j] = instance.value(j); } else { if (instance.value(j) > m_Max[j]) { m_Max[j] = instance.value(j);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -