PCKMeans.java
/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    PCKMeans.java
 *    Copyright (C) 2003 Sugato Basu
 */

package weka.clusterers;

import java.io.*;
import java.util.*;
import weka.core.*;
import weka.core.metrics.*;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.Remove;

/**
 * Pairwise constrained k-means clustering class.
 *
 * Valid options are:<p>
 *
 * -N <number of clusters> <br>
 * Specify the number of clusters to generate. <p>
 *
 * -R <random seed> <br>
 * Specify the random number seed. <p>
 *
 * -A <algorithm> <br>
 * The algorithm can be "Simple" (simple KMeans) or "Spherical" (spherical KMeans). <p>
 *
 * -M <metric-class> <br>
 * Specifies the name of the distance metric class that should be used. <p>
 *
 * .... etc.
 *
 * @author Sugato Basu (sugato@cs.utexas.edu)
 * @see Clusterer
 * @see OptionHandler
 */
public class PCKMeans extends Clusterer
  implements OptionHandler, SemiSupClusterer, ActiveLearningClusterer {

  /** Name of clusterer */
  String m_name = "PCKMeans";

  /** holds the instances in the clusters */
  protected ArrayList m_Clusters = null;

  /** holds the instance indices in the clusters */
  protected HashSet[] m_IndexClusters = null;

  /** holds the ([instance pair] -> [type of constraint]) mapping. Note that
      the instance pairs stored in the hash always have constraint type
      InstancePair.DONT_CARE_LINK; the actual link type is stored in the
      hashed value */
  protected HashMap m_ConstraintsHash = null;

  /** holds the ([instance i] -> [ArrayList of constraints involving i])
      mapping. Note that the instance pairs stored in the ArrayList have the
      actual link type */
  protected HashMap m_instanceConstraintHash = null;

  /** adjacency list for neighborhoods */
  protected HashSet[] m_AdjacencyList;

  /** colors required for keeping track of DFS visits */
  final int WHITE = 300;
  final int GRAY = 301;
  final int BLACK = 302;

  /** holds the points involved in the constraints */
  protected HashSet m_SeedHash = null;

  /** weight to be given to each cannot-link constraint */
  protected double m_CannotLinkWeight = 1;

  /** weight to be given to each must-link constraint */
  protected double m_MustLinkWeight = 1;

  /** the maximum number of cannot-link constraints allowed */
  protected final static int m_MaxConstraintsAllowed = 10000;

  /** verbose? */
  protected boolean m_verbose = false;

  /** distance metric */
  protected Metric m_metric = new WeightedEuclidean();

  /** has the metric been constructed? a fix for multiple buildClusterer's */
  protected boolean m_metricBuilt = false;

  /** indicates whether instances are sparse */
  protected boolean m_isSparseInstance = false;
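  // --------------------------------------------------------------------------
  // Editor's note (not part of the original source): a small worked example of
  // how the two constraint hashes above are meant to be populated, mirroring
  // the constraint-loading loop in the pairwise buildClusterer() further below.
  // The MUST_LINK constant is an assumption here; only DONT_CARE_LINK appears
  // in this truncated listing.
  //
  //   // a must-link between instances 3 and 7 (indices with 3 < 7):
  //   InstancePair key = new InstancePair(3, 7, InstancePair.DONT_CARE_LINK);
  //   m_ConstraintsHash.put(key, new Integer(InstancePair.MUST_LINK));
  //
  //   // m_instanceConstraintHash maps each endpoint to the list of its
  //   // constraints, where the stored pair carries the real link type:
  //   //   3 -> [ InstancePair(3, 7, MUST_LINK) ]
  //   //   7 -> [ InstancePair(3, 7, MUST_LINK) ]
  // --------------------------------------------------------------------------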
  /** Is the objective function increasing or decreasing? Depends on the type
      of metric used: increasing for a similarity-based metric, decreasing for
      a distance-based metric */
  protected boolean m_objFunDecreasing = true;

  /** Seedable or not (true by default) */
  protected boolean m_Seedable = true;

  /** Round robin or Random in active Phase Two */
  protected boolean m_PhaseTwoRandom = false;

  /** Two-phase active learning or All Explore */
  protected boolean m_AllExplore = false;

  /** keeps track of the number of iterations completed before convergence */
  protected int m_Iterations = 0;

  /** Define possible algorithms */
  public static final int ALGORITHM_SIMPLE = 1;
  public static final int ALGORITHM_SPHERICAL = 2;
  public static final Tag[] TAGS_ALGORITHM = {
    new Tag(ALGORITHM_SIMPLE, "Simple"),
    new Tag(ALGORITHM_SPHERICAL, "Spherical")
  };

  /** algorithm to use, simple k-means by default */
  protected int m_Algorithm = ALGORITHM_SIMPLE;

  /** min difference of objective function values for convergence */
  protected double m_ObjFunConvergenceDifference = 1e-5;

  /** value of the objective function */
  protected double m_Objective;

  /** returns the objective function value */
  public double objectiveFunction() {
    return m_Objective;
  }

  /** training instances with labels */
  protected Instances m_TotalTrainWithLabels;

  /** training instances */
  protected Instances m_Instances;

  /** a hash where the instance checksums are hashed */
  protected HashMap m_checksumHash = null;

  protected double[] m_checksumCoeffs = null;

  /** test data -- required to make sure that test points are not selected
      during active learning */
  protected int m_StartingIndexOfTest = -1;

  /** number of pairs to seed with */
  protected int m_NumActive;

  /** active mode? */
  protected boolean m_Active = false;

  /** number of clusters to generate, default is -1 to get it from labeled data */
  protected int m_NumClusters = 3;

  /** number of clusters in the process */
  protected int m_NumCurrentClusters = 0;

  /** holds the cluster centroids */
  protected Instances m_ClusterCentroids;

  /** holds the global centroid */
  protected Instance m_GlobalCentroid;

  /** holds the default perturbation value for randomPerturbInit */
  protected double m_DefaultPerturb = 0.7;

  /** holds the default merge threshold for matchMergeStep */
  protected double m_MergeThreshold = 0.15;

  /** temporary variable holding cluster assignments while iterating */
  protected int[] m_ClusterAssignments;

  /** temporary variable holding cluster sums while iterating */
  protected Instance[] m_SumOfClusterInstances;

  /** holds the random seed used to seed the random number generator */
  protected int m_RandomSeed = 42;

  /** holds the random number generator used in various parts of the code */
  protected Random m_RandomNumberGenerator = null;

  /** Define possible instance orderings */
  public static final int ORDERING_DEFAULT = 1;
  public static final int ORDERING_RANDOM = 2;
  public static final int ORDERING_SORTED = 3;
  public static final Tag[] TAGS_ORDERING = {
    new Tag(ORDERING_DEFAULT, "Default-Ordering"),
    new Tag(ORDERING_RANDOM, "Random-Ordering"),
    new Tag(ORDERING_SORTED, "Sorted-Ordering")
  };

  protected int m_InstanceOrdering = ORDERING_DEFAULT;
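  // --------------------------------------------------------------------------
  // Editor's note (not part of the original source): in WEKA, Tag arrays such
  // as TAGS_ALGORITHM above are normally driven through weka.core.SelectedTag
  // in an option setter; the option-handling methods fall outside this
  // truncated listing, so the two lines below are only an illustrative sketch:
  //
  //   SelectedTag choice = new SelectedTag(ALGORITHM_SPHERICAL, TAGS_ALGORITHM);
  //   m_Algorithm = choice.getSelectedTag().getID();  // selects spherical k-means
  // --------------------------------------------------------------------------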
  /** Move points in assignment step till stabilization? */
  protected boolean m_MovePointsTillAssignmentStabilizes = false;

  /** neighbor list for active learning: points in each cluster neighborhood */
  protected HashSet[] m_NeighborSets;

  /** assigned set for active learning: whether a point has been assigned or not */
  HashSet m_AssignedSet;

  /* Constructor */
  public PCKMeans() {
  }

  /* Constructor */
  public PCKMeans(Metric metric) {
    m_metric = metric;
    m_objFunDecreasing = metric.isDistanceBased();
  }

  /**
   * We always want to implement SemiSupClusterer from a class extending
   * Clusterer, and we want to be able to return the underlying parent class.
   *
   * @return parent Clusterer class
   */
  public Clusterer getThisClusterer() {
    return this;
  }

  /**
   * Clusters the given instances to form the specified number of clusters.
   *
   * @param data instances to be clustered
   * @param num_clusters number of clusters to create
   * @exception Exception if something goes wrong
   */
  public void buildClusterer(Instances data, int num_clusters) throws Exception {
    m_NumClusters = num_clusters;
    if (m_Algorithm == ALGORITHM_SPHERICAL && m_metric instanceof WeightedDotP) {
      // since instances and clusters are already normalized, we don't need to
      // normalize again while computing similarity - saves time
      ((WeightedDotP) m_metric).setLengthNormalized(false);
    }
    if (data.instance(0) instanceof SparseInstance) {
      m_isSparseInstance = true;
    }
    buildClusterer(data);
  }

  /**
   * Generates the clustering using labeled seeds.
   *
   * @param labeledData set of labeled instances to use as seeds
   * @param unlabeledData set of unlabeled instances
   * @param classIndex attribute index in labeledData which holds the class info
   * @param numClusters number of clusters to create
   * @param startingIndexOfTest from where the test data starts in unlabeledData,
   *        useful if clustering is transductive, set to -1 if not relevant
   * @exception Exception if something is wrong
   */
  public void buildClusterer(Instances labeledData, Instances unlabeledData,
                             int classIndex, int numClusters,
                             int startingIndexOfTest) throws Exception {
    // Dummy function
    throw new Exception("Not implemented for PCKMeans, only here for compatibility to SemiSupClusterer interface");
  }
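  // --------------------------------------------------------------------------
  // Editor's note (not part of the original source): the pairwise
  // buildClusterer below calls a normalize() helper for spherical k-means that
  // lies outside this truncated listing. The commented sketch below shows the
  // intent (scaling an instance to unit Euclidean length); the real helper
  // presumably also handles SparseInstance objects and skips any class attribute.
  //
  //   protected void normalize(Instance inst) {
  //     double norm = 0;
  //     for (int i = 0; i < inst.numAttributes(); i++) {
  //       norm += inst.value(i) * inst.value(i);
  //     }
  //     norm = Math.sqrt(norm);
  //     if (norm > 0) {
  //       for (int i = 0; i < inst.numAttributes(); i++) {
  //         inst.setValue(i, inst.value(i) / norm);
  //       }
  //     }
  //   }
  // --------------------------------------------------------------------------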
  /**
   * Clusters unlabeledData and labeledData (with labels removed), using
   * labeledData as seeds.
   *
   * @param labeledPairs labeled instance pairs to be used as seeds
   * @param unlabeledData unlabeled training (+ test for transductive) instances
   * @param labeledTrain labeled training instances
   * @param startingIndexOfTest starting index of the test set in the unlabeled data
   * @exception Exception if something goes wrong
   */
  public void buildClusterer(ArrayList labeledPairs, Instances unlabeledData,
                             Instances labeledTrain, int startingIndexOfTest) throws Exception {
    int classIndex = labeledTrain.numAttributes(); // assuming that the last attribute is always the class
    m_TotalTrainWithLabels = labeledTrain;
    m_SeedHash = new HashSet((int) (unlabeledData.numInstances() / 0.75 + 10));
    m_ConstraintsHash = new HashMap(m_MaxConstraintsAllowed);
    m_instanceConstraintHash = new HashMap(m_MaxConstraintsAllowed);

    if (!m_Active && labeledPairs != null) {
      for (int i = 0; i < labeledPairs.size(); i++) {
        InstancePair pair = (InstancePair) labeledPairs.get(i);
        Integer firstInt = new Integer(pair.first);
        Integer secondInt = new Integer(pair.second);

        // for the first point
        if (!m_SeedHash.contains(firstInt)) { // add instances with constraints to seedHash
          m_SeedHash.add(firstInt);
        }
        // for the second point
        if (!m_SeedHash.contains(secondInt)) {
          m_SeedHash.add(secondInt);
        }

        if (pair.first >= pair.second) {
          throw new Exception("Ordering reversed - something wrong!!");
        } else {
          InstancePair newPair = new InstancePair(pair.first, pair.second, InstancePair.DONT_CARE_LINK);
          m_ConstraintsHash.put(newPair, new Integer(pair.linkType)); // WLOG first < second

          // hash the constraints for the instances involved
          Object constraintList1 = m_instanceConstraintHash.get(firstInt);
          if (constraintList1 == null) {
            ArrayList constraintList = new ArrayList();
            constraintList.add(pair);
            m_instanceConstraintHash.put(firstInt, constraintList);
          } else {
            ((ArrayList) constraintList1).add(pair);
          }
          Object constraintList2 = m_instanceConstraintHash.get(secondInt);
          if (constraintList2 == null) {
            ArrayList constraintList = new ArrayList();
            constraintList.add(pair);
            m_instanceConstraintHash.put(secondInt, constraintList);
          } else {
            ((ArrayList) constraintList2).add(pair);
          }
        }
      }
    } else {
      m_NumActive = labeledPairs.size();
    }

    // normalize all data for SPKMeans
    if (m_Algorithm == ALGORITHM_SPHERICAL) {
      for (int i = 0; i < unlabeledData.numInstances(); i++) {
        normalize(unlabeledData.instance(i));
      }
    }

    m_StartingIndexOfTest = startingIndexOfTest;
    if (m_verbose) {
      System.out.println("Starting index of test: " + m_StartingIndexOfTest);
    }

    // learn the metric using labeled data, then cluster both the labeled and unlabeled data
    m_metric.buildMetric(unlabeledData.numAttributes());
    m_metricBuilt = true;
    buildClusterer(unlabeledData, labeledTrain.numClasses());
  }

  /**
   * Generates a clusterer. Instances in data have to be either all sparse or
   * all non-sparse.
   *
   * @param data set of instances serving as training data
   * @exception Exception if the clusterer has not been generated successfully
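A minimal driver sketch for the pairwise buildClusterer above (an editor's illustration, not part of the original file): it assumes InstancePair exposes MUST_LINK and CANNOT_LINK constants alongside the DONT_CARE_LINK used in the listing, reads a hypothetical train.arff, and relies on the last attribute being the class, as buildClusterer itself assumes.

import java.io.BufferedReader;
import java.io.FileReader;
import java.util.ArrayList;

import weka.clusterers.*;
import weka.core.Instances;
import weka.core.metrics.WeightedEuclidean;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.Remove;

public class PCKMeansDemo {
  public static void main(String[] args) throws Exception {
    // labeled training data; the last attribute is the class
    Instances labeled = new Instances(new BufferedReader(new FileReader("train.arff")));
    labeled.setClassIndex(labeled.numAttributes() - 1);

    // unlabeled view of the same data: strip the class attribute
    Remove removeClass = new Remove();
    removeClass.setAttributeIndices("" + labeled.numAttributes());
    removeClass.setInputFormat(labeled);
    Instances unlabeled = Filter.useFilter(labeled, removeClass);

    // pairwise constraints; the smaller index must come first, as buildClusterer checks
    ArrayList pairs = new ArrayList();
    pairs.add(new InstancePair(3, 7, InstancePair.MUST_LINK));    // assumed constant
    pairs.add(new InstancePair(2, 9, InstancePair.CANNOT_LINK));  // assumed constant

    // cluster with the default weighted Euclidean metric;
    // -1 means no separate test block in the unlabeled data
    PCKMeans clusterer = new PCKMeans(new WeightedEuclidean());
    clusterer.buildClusterer(pairs, unlabeled, labeled, -1);
  }
}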