📄 mpckmeans.java
字号:
/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//* * MPCKMeans.java * Copyright (C) 2003 Sugato Basu and Misha Bilenko * */package weka.clusterers;import java.io.*;import java.util.*;import weka.core.*;import weka.core.metrics.*;import weka.filters.Filter;import weka.filters.unsupervised.attribute.Remove;import Jama.Matrix;import Jama.EigenvalueDecomposition;import weka.clusterers.assigners.*;import weka.clusterers.regularizers.*;import weka.clusterers.initializers.*;import weka.clusterers.metriclearners.*;/** * Pairwise constrained k means clustering class. * * Valid options are:<p> * * -N <number of clusters> <br> * Specify the number of clusters to generate. <p> * * -R <random seed> <br> * Specify random number seed <p> * * -M <metric-class> <br> * Specifies the name of the distance metric class that should be used * * @author Sugato Basu(sugato@cs.utexas.edu) and Misha Bilenko (mbilenko@cs.utexas.edu) * @see Clusterer * @see OptionHandler */public class MPCKMeans extends Clusterer implements OptionHandler,SemiSupClusterer { /** Name of clusterer */ String m_name = "MPCKMeans"; /** holds the instances in the clusters */ protected ArrayList m_Clusters = null; /** holds the instance indices in the clusters */ protected HashSet[] m_IndexClusters = null; /** holds the ([instance pair] -> [type of constraint]) mapping, where the hashed value stores the type of link but the instance pair does not hold the type of constraint - it holds (instanceIdx1, instanceIdx2, DONT_CARE_LINK). This is done to make lookup easier in future */ protected HashMap m_ConstraintsHash = null; public HashMap getConstraintsHash() { return m_ConstraintsHash; } /** stores the ([instanceIdx] -> [ArrayList of constraints]) mapping, where the arraylist contains the constraints in which instanceIdx is involved. Note that the instance pairs stored in the Arraylist have the actual link type. */ protected HashMap m_instanceConstraintHash = null; public HashMap getInstanceConstraintsHash() { return m_instanceConstraintHash; } public void setInstanceConstraintsHash(HashMap instanceConstraintHash) { m_instanceConstraintHash = instanceConstraintHash; } /** holds the points involved in the constraints */ protected HashSet m_SeedHash = null; /** Access */ public HashSet getSeedHash () { return m_SeedHash; } /** weight to be given to each constraint */ protected double m_CLweight = 1; /** weight to be given to each constraint */ protected double m_MLweight = 1; /** should constraints from transitive closure be added? */ protected boolean m_useTransitiveConstraints = true; /** is it an offline metric (BarHillelMetric or XingMetric)? */ protected boolean m_isOfflineMetric; public boolean getIsOfflineMetric () { return m_isOfflineMetric; } /** the maximum distance between cannot-link constraints */ protected double m_MaxCannotLinkDistance = 0; /** the min similarity between cannot-link constraints */ protected double m_MaxCannotLinkSimilarity = 0; /** the maximum distance between cannot-link constraints */ protected double m_maxCLPenalties[] = null; public Instance m_maxCLPoints[][] = null; public Instance m_maxCLDiffInstances[] = null; /** verbose? */ protected boolean m_verbose = false; /** distance Metric */ protected LearnableMetric m_metric = new WeightedEuclidean(); protected MPCKMeansMetricLearner m_metricLearner = new WEuclideanLearner(); /** Individual metrics for each cluster can be used */ protected boolean m_useMultipleMetrics = false; protected LearnableMetric [] m_metrics = null; protected MPCKMeansMetricLearner [] m_metricLearners = null; /** Relative importance of the log-term for the weights in the objective function */ protected double m_logTermWeight = 0.01; /** Regularization for weights */ protected boolean m_regularize = false; protected double m_regularizerTermWeight = 0.001; /** We will hash log terms to avoid recomputing every time TODO: implement for Euclidean*/ protected double[] m_logTerms = null; /** has the metric has been constructed? a fix for multiple buildClusterer's */ protected boolean m_metricBuilt = false; /** indicates whether instances are sparse */ protected boolean m_isSparseInstance = false; /** Is the objective function increasing or decreasing? Depends on type * of metric used: for similarity-based metric, increasing, for distance-based - decreasing */ protected boolean m_objFunDecreasing = true; /** Seedable or not (true by default) */ protected boolean m_Seedable = true; /** Possible metric training */ public static final int TRAINING_NONE = 1; public static final int TRAINING_EXTERNAL = 2; public static final int TRAINING_INTERNAL = 4; public static final Tag[] TAGS_TRAINING = { new Tag(TRAINING_NONE, "None"), new Tag(TRAINING_EXTERNAL, "External"), new Tag(TRAINING_INTERNAL, "Internal")}; protected int m_Trainable = TRAINING_INTERNAL; /** keep track of the number of iterations completed before convergence */ protected int m_Iterations = 0; /** number of constraint violations */ protected int m_numViolations = 0; /** keep track of the number of iterations when no points were moved */ protected int m_numBlankIterations = 0; /** the maximum number of iterations */ protected int m_maxIterations = Integer.MAX_VALUE; /** the maximum number of iterations with no points moved */ protected int m_maxBlankIterations = 20; /** min difference of objective function values for convergence*/ protected double m_ObjFunConvergenceDifference = 1e-5; /** value of current objective function */ protected double m_Objective = Double.MAX_VALUE; /** value of last objective function */ protected double m_OldObjective; /** Variables to track components of the objective function */ protected double m_objVariance; protected double m_objCannotLinks; protected double m_objMustLinks; protected double m_objNormalizer; protected double m_objRegularizer; /** Variable to track the contribution of the currently considered point */ protected double m_objVarianceCurrPoint; protected double m_objCannotLinksCurrPoint; protected double m_objMustLinksCurrPoint; protected double m_objNormalizerCurrPoint; protected double m_objVarianceCurrPointBest; protected double m_objCannotLinksCurrPointBest; protected double m_objMustLinksCurrPointBest; protected double m_objNormalizerCurrPointBest; /** returns objective function */ public double objectiveFunction() { return m_Objective; } /** * training instances with labels */ protected Instances m_TotalTrainWithLabels; public Instances getTotalTrainWithLabels() { return m_TotalTrainWithLabels; } public void setTotalTrainWithLabels(Instances inst) { m_TotalTrainWithLabels = inst; } /** * training instances */ protected Instances m_Instances; /** A hash where the instance checksums are hashed */ protected HashMap m_checksumHash = null; protected double []m_checksumCoeffs = null; /** test data -- required to make sure that test points are not selected during active learning */ protected int m_StartingIndexOfTest = -1; /** * number of clusters to generate, default is -1 to get it from labeled data */ protected int m_NumClusters = -1; /** * holds the cluster centroids */ protected Instances m_ClusterCentroids; /** Accessor */ public Instances getClusterCentroids() { return m_ClusterCentroids; } public void setClusterCentroids(Instances centroids) { m_ClusterCentroids = centroids; } /** * temporary variable holding cluster assignments while iterating */ protected int [] m_ClusterAssignments; public int[] getClusterAssignments() { return m_ClusterAssignments; } public void setClusterAssignments(int [] clusterAssignments) { m_ClusterAssignments = clusterAssignments; } protected String m_ClusterAssignmentsOutputFile; public String getClusterAssignmentsOutputFile() { return m_ClusterAssignmentsOutputFile; } public void setClusterAssignmentsOutputFile(String file) { m_ClusterAssignmentsOutputFile = file; } protected String m_ConstraintIncoherenceFile; public String getConstraintIncoherenceFile() { return m_ConstraintIncoherenceFile; } public void setConstraintIncoherenceFile(String file) { m_ConstraintIncoherenceFile = file; } /** * holds the random Seed, useful for randomPerturbInit */ protected int m_RandomSeed = 42; /** * holds the random number generator used in various parts of the code */ protected Random m_RandomNumberGenerator = null; /** Define possible assignment strategies */ protected MPCKMeansAssigner m_Assigner = new SimpleAssigner(this); /** Define possible initialization strategies */ // protected MPCKMeansInitializer m_Initializer = new RandomPerturbInitializer(this); protected MPCKMeansInitializer m_Initializer = new WeightedFFNeighborhoodInit(this); /** Access */ public Random getRandomNumberGenerator() { return m_RandomNumberGenerator; } /* Constructor */ public MPCKMeans() { } /* Constructor */ public MPCKMeans(LearnableMetric metric) { m_metric = metric; m_objFunDecreasing = metric.isDistanceBased(); } /** * We always want to implement SemiSupClusterer from a class extending Clusterer. * We want to be able to return the underlying parent class. * @return parent Clusterer class */ public Clusterer getThisClusterer() { return this; } /** * Cluster given instances to form the specified number of clusters. * * @param data instances to be clustered * @param numClusters number of clusters to create * @exception Exception if something goes wrong. */ public void buildClusterer(Instances data, int numClusters) throws Exception { m_NumClusters = numClusters; System.out.println("Creating " + m_NumClusters + " clusters"); m_Initializer.setNumClusters(m_NumClusters); if (data.instance(0) instanceof SparseInstance) { m_isSparseInstance = true; } buildClusterer(data); } /** * Generates the clustering using labeled seeds * * @param labeledData set of labeled instances to use as seeds * @param unlabeledData set of unlabeled instances * @param classIndex attribute index in labeledData which holds class info * @param numClusters number of clusters to create * @param startingIndexOfTest from where test data starts in unlabeledData, useful if clustering is transductive, set to -1 if not relevant * @exception Exception if something is wrong */ public void buildClusterer (Instances labeledData, Instances unlabeledData, int classIndex, int numClusters, int startingIndexOfTest) throws Exception { // Dummy function throw new Exception ("Not implemented for MPCKMeans, only here for " + "compatibility to SemiSupClusterer interface"); } /** * Clusters unlabeledData and labeledData (with labels removed), * using constraints in labeledPairs to initialize * * @param labeledPairs labeled pairs to be used to initialize * @param unlabeledData unlabeled instances * @param labeledData labeled instances * @param numClusters number of clusters * @param startingIndexOfTest starting index of test set in unlabeled data * @exception Exception if something goes wrong. */ public void buildClusterer(ArrayList labeledPairs, Instances unlabeledData, Instances labeledData, int numClusters, int startingIndexOfTest) throws Exception { m_TotalTrainWithLabels = labeledData; if (labeledPairs != null) { m_SeedHash = new HashSet((int) (unlabeledData.numInstances()/0.75 + 10)) ; m_ConstraintsHash = new HashMap(); m_instanceConstraintHash = new HashMap();
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -