📄 hac.java
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * MatrixHAC.java
 * Copyright (C) 2001 Mikhail Bilenko
 */

/**
 * Similarity-matrix implementation of Hierarchical Agglomerative Clustering.
 * <p>
 * Valid options are:<p>
 *
 * -N <0-10000> <br>
 * Number of clusters. <p>
 *
 * @author Mikhail Bilenko (mbilenko@cs.utexas.edu)
 * @version $Revision: 1.11 $
 */

package weka.clusterers;

import java.io.*;
import java.util.*;
import java.text.*;

import weka.core.*;
import weka.core.metrics.*;
import weka.filters.unsupervised.attribute.Remove;
import weka.filters.unsupervised.attribute.Normalize;
import weka.filters.Filter;

public class HAC extends Clusterer implements SemiSupClusterer, OptionHandler {

  /* name of the clusterer */
  String m_name = "HAC";

  /** Number of clusters */
  protected int m_numClusters = -1;

  /** Number of clusters in the process */
  protected int m_numCurrentClusters = 0;

  /** ID of current cluster */
  protected int m_clusterID = 0;

  /** Number of seeded clusters */
  protected int m_numSeededClusters = 0;

  /** Dot file name for dumping graph for tree visualization */
  protected String m_dotFileName = "user-features.dot";

  /** Writer for the dot file */
  protected PrintWriter m_dotWriter = null;

  /** Instances that we are working with */
  Instances m_instances;
  Instances m_descrInstances;

  /** holds the clusters */
  protected ArrayList m_clusters = null;

  /** temporary variable holding cluster assignments */
  protected int[] m_clusterAssignments;

  /** distance matrix */
  protected double[][] m_distanceMatrix = null;

  /** cluster similarity type */
  public final static int SINGLE_LINK = 0;
  public final static int COMPLETE_LINK = 1;
  public final static int GROUP_AVERAGE = 2;
  public static final Tag[] TAGS_LINKING = {
    new Tag(SINGLE_LINK, "Single link"),
    new Tag(COMPLETE_LINK, "Complete link"),
    new Tag(GROUP_AVERAGE, "Group-average")
  };

  /** Default linking method */
  protected int m_linkingType = GROUP_AVERAGE;

  /** starting index of test data in unlabeledData if transductive clustering */
  protected int m_StartingIndexOfTest = -1;

  /** seeding */
  protected boolean m_seedable = false;

  /** holds the ([seed instance] -> [clusterLabel of seed instance]) mapping */
  protected HashMap m_SeedHash = null;

  /** A 'checksum hash' where indices are hashed to the sum of their attribute values */
  protected HashMap m_checksumHash = null;
  protected double[] m_checksumPerturb = null;

  /** holds the random seed, useful for random selection initialization */
  protected int m_randomSeed = 100;
  protected Random m_randomGen = null;

  /** instance hash */
  protected HashMap m_instancesHash = null;

  /** reverse instance hash */
  protected HashMap m_reverseInstancesHash = null;

  /** The threshold distance beyond which no clusters are merged (except for one - TODO) */
  protected double m_mergeThreshold = 0.8;

  /** verbose? */
  protected boolean m_verbose = false;

  /** metric used to calculate similarity/distance */
  // protected Metric m_metric = new WeightedDotP();
  // protected String m_metricName = new String("weka.core.metrics.WeightedDotP");
  protected Metric m_metric = new WeightedEuclidean();
  protected String m_metricName = new String("weka.core.metrics.WeightedEuclidean");

  /** Is the metric (and hence the algorithm) relying on similarities or distances? */
  protected boolean m_isDistanceBased = false;

  /** has the metric been constructed? a fix for multiple buildClusterer's */
  protected boolean m_metricBuilt = false;

  // ===============
  // Public methods.
  // ===============

  /** empty constructor, required to call using Class.forName */
  public HAC() {}

  /** Constructor taking the metric to use */
  public HAC(Metric metric) {
    m_metric = metric;
    m_metricName = m_metric.getClass().getName();
    m_isDistanceBased = metric.isDistanceBased();
  }

  /** Sets training instances */
  public void setInstances(Instances instances) {
    m_instances = instances;
  }

  /** Return training instances */
  public Instances getInstances() {
    return m_instances;
  }

  /**
   * Set the number of clusters to generate
   *
   * @param n the number of clusters to generate
   */
  public void setNumClusters(int n) {
    m_numClusters = n;
    if (m_verbose) {
      System.out.println("Number of clusters: " + n);
    }
  }

  /** Set the merge threshold */
  public void setMergeThreshold(double threshold) {
    m_mergeThreshold = threshold;
  }

  /** Get the merge threshold */
  public double getMergeThreshold() {
    return m_mergeThreshold;
  }

  /**
   * Set the distance metric
   *
   * @param m the metric
   */
  public void setMetric(LearnableMetric m) {
    m_metric = m;
    m_metricName = m_metric.getClass().getName();
    m_isDistanceBased = m.isDistanceBased();
  }

  /**
   * Get the distance metric
   *
   * @return the distance metric used
   */
  public Metric getMetric() {
    return m_metric;
  }

  /**
   * Get the distance metric name
   *
   * @return the name of the distance metric used
   */
  public String metricName() {
    return m_metricName;
  }

  /**
   * We always want to implement SemiSupClusterer from a class extending Clusterer.
   * We want to be able to return the underlying parent class.
   * @return parent Clusterer class
   */
  public Clusterer getThisClusterer() {
    return this;
  }

  /**
   * Cluster given instances to form the specified number of clusters.
   *
   * @param data instances to be clustered
   * @param num_clusters number of clusters to create
   * @exception Exception if something goes wrong.
   */
  public void buildClusterer(Instances data, int num_clusters) throws Exception {
    setNumClusters(num_clusters);
    buildClusterer(data);
  }

  /**
   * Clusters unlabeledData and labeledData (with labels removed),
   * using labeledData as seeds
   *
   * @param labeledData labeled instances to be used as seeds
   * @param unlabeledData unlabeled instances
   * @param classIndex attribute index in labeledData which holds class info
   * @param numClusters number of clusters
   * @exception Exception if something goes wrong.
   */
  public void buildClusterer(Instances labeledData, Instances unlabeledData,
                             int classIndex, int numClusters) throws Exception {
    // remove labels of labeledData before putting in SeedHash
    Instances clusterData = new Instances(labeledData);
    clusterData.deleteClassAttribute();

    // create SeedHash from labeledData
    if (getSeedable()) {
      Seeder seeder = new Seeder(clusterData, labeledData);
      setSeedHash(seeder.getAllSeeds());
    }

    // add unlabeled data to labeled data (labels removed), not the
    // other way around, so that the hash table entries are consistent
    // with the labeled data without labels
    for (int i = 0; i < unlabeledData.numInstances(); i++) {
      clusterData.add(unlabeledData.instance(i));
    }
    if (m_verbose) {
      System.out.println("combinedData has size: " + clusterData.numInstances() + "\n");
    }

    // learn metric using labeled data, then cluster both the labeled and unlabeled data
    m_metric.buildMetric(labeledData);
    m_metricBuilt = true;

    // check if the number of clusters is dynamically set to the number of classes
    if (m_numClusters == -1) {
      m_numClusters = labeledData.numClasses();
      System.out.println("DYNAMIC NUMBER OF CLUSTERS, setting to " + m_numClusters);
    }
    buildClusterer(clusterData, numClusters);
  }

  /**
   * Clusters unlabeledData and labeledData (with labels removed),
   * using labeledData as seeds
   *
   * @param labeledData labeled instances to be used as seeds
   * @param unlabeledData unlabeled instances
   * @param classIndex attribute index in labeledData which holds class info
   * @param numClusters number of clusters
   * @param startingIndexOfTest from where test data starts in unlabeledData, useful if clustering is transductive
   * @exception Exception if something goes wrong.
   */
  public void buildClusterer(Instances labeledData, Instances unlabeledData, int classIndex,
                             int numClusters, int startingIndexOfTest) throws Exception {
    m_StartingIndexOfTest = startingIndexOfTest + labeledData.numInstances();
    buildClusterer(labeledData, unlabeledData, classIndex, numClusters);
  }

  /**
   * Cluster given instances. If no threshold or number of clusters is set,
   * clustering proceeds until two clusters are left.
   *
   * @param data instances to be clustered
   * @exception Exception if something goes wrong.
   */
  public void buildClusterer(Instances data) throws Exception {
    m_randomGen = new Random(m_randomSeed);
    m_dotWriter = new PrintWriter(new BufferedOutputStream(new FileOutputStream(m_dotFileName)));
    m_dotWriter.println("digraph HAC {\n");
    setInstances(data);
    m_clusterAssignments = new int[m_instances.numInstances()];

    if (m_verbose && m_SeedHash != null) {
      System.out.println("Using seeding ...");
    }

    // reject nominal/string attributes: the metric assumes numeric data
    if (m_instances.checkForNominalAttributes() || m_instances.checkForStringAttributes()) {
      throw new UnsupportedAttributeTypeException("Cannot handle nominal attributes\n");
    }
    //m_instances = filterInstanceDescriptions(m_instances);

    // Don't rebuild the metric if it was already trained
    if (!m_metricBuilt) {
      m_metric.buildMetric(data);
    }
    hashInstances(m_instances);
    createDistanceMatrix();
    cluster();
    unhashClusters();

    m_dotWriter.println("}");
    m_dotWriter.close();
  }

  /** If some of the attributes start with "__", form a separate Instances set with
   * descriptions and filter them out of the argument dataset.
   * Return the original dataset without the filtered out attributes
   */
  protected Instances filterInstanceDescriptions(Instances instances) throws Exception {
    Instances filteredInstances;
    // Normalize normalizeFilter = new Normalize();
    // normalizeFilter.setInputFormat(instances);
    // instances = Filter.useFilter(instances, normalizeFilter);
    // System.out.println("Normalized the instance attributes");

    // go through the attributes and find the description attributes
    ArrayList descrIndexList = new ArrayList();
    for (int i = 0; i < instances.numAttributes(); i++) {
      Attribute attr = instances.attribute(i);
      if (attr.name().startsWith("__")) {
        descrIndexList.add(new Integer(i));
        System.out.println("filtering " + attr);
      }
    }

    // filter out the description attributes if necessary
    if (descrIndexList.size() > 0) {
      m_descrInstances = new Instances(instances);
      // filter out the descriptions first
      int[] descrIndeces = new int[descrIndexList.size()];
      for (int i = 0; i < descrIndexList.size(); i++) {
        descrIndeces[i] = ((Integer) descrIndexList.get(i)).intValue();
      }
      Remove attributeFilter = new Remove();
      attributeFilter.setAttributeIndicesArray(descrIndeces);
      attributeFilter.setInvertSelection(false);
      attributeFilter.setInputFormat(instances);
      filteredInstances = Filter.useFilter(instances, attributeFilter);

      attributeFilter.setInvertSelection(true);
      attributeFilter.setInputFormat(instances);
      m_descrInstances = Filter.useFilter(instances, attributeFilter);
    } else {
      filteredInstances = new Instances(instances);
    }
    return filteredInstances;
  }

  /**
   * Reset all values that have been learned
   */
  public void resetClusterer() throws Exception {
    if (m_metric instanceof LearnableMetric)
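(The listing is cut off in the source at this point.)

For reference, a minimal usage sketch, not part of the file above: it assumes the WekaUT codebase that provides this HAC class and weka.core.metrics, and a purely numeric ARFF file whose name here is hypothetical.

// HACDemo.java - minimal sketch under the assumptions stated above.
import java.io.FileReader;
import weka.core.Instances;
import weka.core.metrics.WeightedEuclidean;
import weka.clusterers.HAC;

public class HACDemo {
  public static void main(String[] args) throws Exception {
    // load an all-numeric dataset (hypothetical file name)
    Instances data = new Instances(new FileReader("data-numeric.arff"));

    HAC hac = new HAC(new WeightedEuclidean()); // distance-based metric, matching the class defaults
    hac.setNumClusters(3);                      // stop agglomeration when 3 clusters remain
    hac.setMergeThreshold(0.8);                 // ...or when the closest pair exceeds this distance
    hac.buildClusterer(data);                   // also writes user-features.dot for tree visualization
  }
}

Note that buildClusterer(Instances) rejects nominal and string attributes, and it writes the dendrogram to the dot file named by m_dotFileName in the working directory.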