📄 mpckmeans.java

📁 WekaUT 是 University of Texas at Austin 开发的基于 Weka 的半指导学习(semi-supervised learning)分类器
💻 JAVA
📖 第 1 页 / 共 5 页
字号:
12 3 4 5 下一页
/* *    This program is free software; you can redistribute it and/or modify *    it under the terms of the GNU General Public License as published by *    the Free Software Foundation; either version 2 of the License, or *    (at your option) any later version. * *    This program is distributed in the hope that it will be useful, *    but WITHOUT ANY WARRANTY; without even the implied warranty of *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the *    GNU General Public License for more details. * *    You should have received a copy of the GNU General Public License *    along with this program; if not, write to the Free Software *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//* *    MPCKMeans.java *    Copyright (C) 2003 Sugato Basu and Misha Bilenko * */package weka.clusterers;import  java.io.*;import  java.util.*;import  weka.core.*;import  weka.core.metrics.*;import  weka.filters.Filter;import  weka.filters.unsupervised.attribute.Remove;import  Jama.Matrix;import  Jama.EigenvalueDecomposition;import  weka.clusterers.assigners.*;import  weka.clusterers.regularizers.*;import  weka.clusterers.initializers.*;import  weka.clusterers.metriclearners.*;/** * Pairwise constrained k means clustering class. * * Valid options are:<p> * * -N <number of clusters> <br> * Specify the number of clusters to generate. 
<p>
 *
 * -R <random seed> <br>
 * Specify random number seed <p>
 *
 * -M <metric-class> <br>
 * Specifies the name of the distance metric class that should be used
 *
 * @author Sugato Basu(sugato@cs.utexas.edu) and Misha Bilenko (mbilenko@cs.utexas.edu)
 * @see Clusterer
 * @see OptionHandler
 */
public class MPCKMeans extends Clusterer implements OptionHandler,SemiSupClusterer {
  /** Name of clusterer */
  String m_name = "MPCKMeans";

  /** holds the instances in the clusters */
  protected ArrayList m_Clusters = null;

  /** holds the instance indices in the clusters */
  protected HashSet[] m_IndexClusters = null;

  /**
   * Holds the ([instance pair] -> [type of constraint]) mapping,
   * where the hashed value stores the type of link but the instance
   * pair does not hold the type of constraint - it holds (instanceIdx1,
   * instanceIdx2, DONT_CARE_LINK). This is done to make lookup easier
   * in future.
   */
  protected HashMap m_ConstraintsHash = null;

  /**
   * Returns the constraint map itself (no copy is made).
   *
   * @return the current value of m_ConstraintsHash, which is null until assigned
   */
  public HashMap getConstraintsHash() {
    return m_ConstraintsHash;
  }

  /**
   * Stores the ([instanceIdx] -> [ArrayList of constraints])
   * mapping, where the ArrayList contains the constraints in which
   * instanceIdx is involved. Note that the instance pairs stored in
   * the ArrayList have the actual link type.
   */
  protected HashMap m_instanceConstraintHash = null;

  /**
   * Returns the per-instance constraint map itself (no copy is made).
   *
   * @return the current value of m_instanceConstraintHash
   */
  public HashMap getInstanceConstraintsHash() {
    return m_instanceConstraintHash;
  }

  /**
   * Replaces the per-instance constraint map; the reference is stored as given.
   *
   * @param instanceConstraintHash the new ([instanceIdx] -> [constraints]) map
   */
  public void setInstanceConstraintsHash(HashMap instanceConstraintHash) {
    m_instanceConstraintHash = instanceConstraintHash;
  }

  /** holds the points involved in the constraints */
  protected HashSet m_SeedHash = null;

  /**
   * Returns the set of constrained points itself (no copy is made).
   *
   * @return the current value of m_SeedHash
   */
  public HashSet getSeedHash () {
    return m_SeedHash;
  }

  /**
   * Weight to be given to each constraint.
   * NOTE(review): presumably applies to cannot-link constraints ("CL" prefix) - confirm.
   */
  protected double m_CLweight = 1;

  /**
   * Weight to be given to each constraint.
   * NOTE(review): presumably applies to must-link constraints ("ML" prefix) - confirm.
   */
  protected double m_MLweight = 1;

  /** should constraints from transitive closure be added?
*/
  protected boolean m_useTransitiveConstraints = true;

  /** is it an offline metric (BarHillelMetric or XingMetric)? */
  protected boolean m_isOfflineMetric;

  /**
   * Reports whether the offline-metric flag is set.
   *
   * @return the current value of m_isOfflineMetric
   */
  public boolean getIsOfflineMetric () {
    return m_isOfflineMetric;
  }

  /** the maximum distance between cannot-link constraints */
  protected double m_MaxCannotLinkDistance = 0;

  /**
   * The min similarity between cannot-link constraints.
   * NOTE(review): the description says "min" while the field name says "Max" -
   * confirm which one is intended.
   */
  protected double m_MaxCannotLinkSimilarity = 0;

  /**
   * NOTE(review): the original comment here repeated "the maximum distance between
   * cannot-link constraints"; presumably this array holds maximum cannot-link
   * penalties (one per metric?) - TODO confirm against the code that fills it.
   */
  protected double m_maxCLPenalties[] = null;

  // NOTE(review): the two public arrays below are undocumented in the original;
  // their contents are not evident from this section of the file.
  public Instance m_maxCLPoints[][] = null;
  public Instance m_maxCLDiffInstances[] = null;

  /** verbose? */
  protected boolean m_verbose = false;

  /** distance metric; a WeightedEuclidean instance unless replaced */
  protected LearnableMetric m_metric = new WeightedEuclidean();

  /** learner paired with m_metric; a WEuclideanLearner instance unless replaced */
  protected MPCKMeansMetricLearner m_metricLearner = new WEuclideanLearner();

  /** Individual metrics for each cluster can be used */
  protected boolean m_useMultipleMetrics = false;
  protected LearnableMetric [] m_metrics = null;
  protected MPCKMeansMetricLearner [] m_metricLearners = null;

  /** Relative importance of the log-term for the weights in the objective function */
  protected double m_logTermWeight = 0.01;

  /** Regularization for weights */
  protected boolean m_regularize = false;
  protected double m_regularizerTermWeight = 0.001;

  /** We will hash log terms to avoid recomputing every time. TODO: implement for Euclidean */
  protected double[] m_logTerms = null;

  /** has the metric been constructed?  a fix for multiple buildClusterer calls */
  protected boolean m_metricBuilt = false;

  /** indicates whether instances are sparse */
  protected  boolean m_isSparseInstance = false;

  /** Is the objective function increasing or decreasing?
Depends on type   * of metric used:  for similarity-based metric, increasing, for distance-based - decreasing */  protected boolean m_objFunDecreasing = true;  /** Seedable or not (true by default) */  protected boolean m_Seedable = true;  /** Possible metric training */  public static final int TRAINING_NONE = 1;  public static final int TRAINING_EXTERNAL = 2;  public static final int TRAINING_INTERNAL = 4;  public static final Tag[] TAGS_TRAINING = {    new Tag(TRAINING_NONE, "None"),    new Tag(TRAINING_EXTERNAL, "External"),    new Tag(TRAINING_INTERNAL, "Internal")};    protected int m_Trainable = TRAINING_INTERNAL;  /** keep track of the number of iterations completed before convergence   */  protected int m_Iterations = 0;    /** number of constraint violations     */  protected int m_numViolations = 0;  /** keep track of the number of iterations when no points were moved */  protected int m_numBlankIterations = 0;  /** the maximum number of iterations */  protected int m_maxIterations = Integer.MAX_VALUE;  /** the maximum number of iterations with no points moved */  protected int m_maxBlankIterations = 20;    /** min difference of objective function values for convergence*/  protected double m_ObjFunConvergenceDifference = 1e-5;  /** value of current objective function */  protected double m_Objective = Double.MAX_VALUE;  /** value of last objective function */  protected double m_OldObjective;  /** Variables to track components of the objective function */  protected double m_objVariance;  protected double m_objCannotLinks;  protected double m_objMustLinks;  protected double m_objNormalizer;  protected double m_objRegularizer;  /** Variable to track the contribution of the currently considered point */  protected double m_objVarianceCurrPoint;  protected double m_objCannotLinksCurrPoint;  protected double m_objMustLinksCurrPoint;  protected double m_objNormalizerCurrPoint;  protected double m_objVarianceCurrPointBest;  protected double 
m_objCannotLinksCurrPointBest;  protected double m_objMustLinksCurrPointBest;  protected double m_objNormalizerCurrPointBest;  /** returns objective function */  public double objectiveFunction() {    return m_Objective;  }  /**   * training instances with labels   */  protected Instances m_TotalTrainWithLabels;  public Instances getTotalTrainWithLabels() {    return m_TotalTrainWithLabels;  }  public void setTotalTrainWithLabels(Instances inst) {    m_TotalTrainWithLabels = inst;  }  /**   * training instances   */  protected Instances m_Instances;  /** A hash where the instance checksums are hashed */  protected HashMap m_checksumHash = null;   protected double []m_checksumCoeffs = null;   /** test data -- required to make sure that test points are not      selected during active learning */  protected int m_StartingIndexOfTest = -1;  /**   * number of clusters to generate, default is -1 to get it from labeled data   */  protected int m_NumClusters = -1;  /**   * holds the cluster centroids   */  protected Instances m_ClusterCentroids;  /** Accessor */   public Instances getClusterCentroids() {    return m_ClusterCentroids;  }   public void setClusterCentroids(Instances centroids) {    m_ClusterCentroids = centroids;  }     /**   * temporary variable holding cluster assignments while iterating   */  protected int [] m_ClusterAssignments;  public int[] getClusterAssignments() {    return m_ClusterAssignments;  }   public void setClusterAssignments(int [] clusterAssignments) {    m_ClusterAssignments = clusterAssignments;  }   protected String m_ClusterAssignmentsOutputFile;  public String getClusterAssignmentsOutputFile() {    return m_ClusterAssignmentsOutputFile;  }   public void setClusterAssignmentsOutputFile(String file) {    m_ClusterAssignmentsOutputFile = file;  }   protected String m_ConstraintIncoherenceFile;  public String getConstraintIncoherenceFile() {    return m_ConstraintIncoherenceFile;  }   public void setConstraintIncoherenceFile(String file) { 
   m_ConstraintIncoherenceFile = file;  }   /**   * holds the random Seed, useful for randomPerturbInit   */  protected int m_RandomSeed = 42;  /**   * holds the random number generator used in various parts of the code   */  protected Random m_RandomNumberGenerator = null;  /** Define possible assignment strategies */  protected MPCKMeansAssigner m_Assigner = new SimpleAssigner(this);  /** Define possible initialization strategies */  //  protected MPCKMeansInitializer m_Initializer = new RandomPerturbInitializer(this);  protected MPCKMeansInitializer m_Initializer = new WeightedFFNeighborhoodInit(this);  /** Access */  public Random getRandomNumberGenerator() {    return m_RandomNumberGenerator;  }    /* Constructor */  public MPCKMeans() {  }  /* Constructor */  public MPCKMeans(LearnableMetric metric) {    m_metric = metric;    m_objFunDecreasing = metric.isDistanceBased();  }  /**   * We always want to implement SemiSupClusterer from a class extending Clusterer.     * We want to be able to return the underlying parent class.   * @return parent Clusterer class   */  public Clusterer getThisClusterer() {    return this;  }   /**   * Cluster given instances to form the specified number of clusters.   *   * @param data instances to be clustered   * @param numClusters number of clusters to create   * @exception Exception if something goes wrong.   
*/  public void buildClusterer(Instances data, int numClusters) throws Exception {    m_NumClusters = numClusters;    System.out.println("Creating " + m_NumClusters + " clusters");    m_Initializer.setNumClusters(m_NumClusters);        if (data.instance(0) instanceof SparseInstance) {      m_isSparseInstance = true;    }    buildClusterer(data);  }  /**   * Generates the clustering using labeled seeds   *   * @param labeledData set of labeled instances to use as seeds   * @param unlabeledData set of unlabeled instances   * @param classIndex attribute index in labeledData which holds class info   * @param numClusters number of clusters to create   * @param startingIndexOfTest from where test data starts in unlabeledData, useful if clustering is transductive, set to -1 if not relevant   * @exception Exception if something is wrong   */  public void buildClusterer (Instances labeledData, Instances unlabeledData,			      int classIndex, int numClusters,			      int startingIndexOfTest) throws Exception {    // Dummy function    throw new Exception ("Not implemented for MPCKMeans, only here for "			 + "compatibility to SemiSupClusterer interface");  }  /**   * Clusters unlabeledData and labeledData (with labels removed),   * using constraints in labeledPairs to initialize   *   * @param labeledPairs labeled pairs to be used to initialize   * @param unlabeledData unlabeled instances   * @param labeledData labeled instances   * @param numClusters number of clusters   * @param startingIndexOfTest starting index of test set in unlabeled data   * @exception Exception if something goes wrong.  
*/  public void buildClusterer(ArrayList labeledPairs, Instances unlabeledData,			     Instances labeledData, int numClusters,			     int startingIndexOfTest) throws Exception {    m_TotalTrainWithLabels = labeledData;    if (labeledPairs != null) {      m_SeedHash = new HashSet((int) (unlabeledData.numInstances()/0.75 + 10)) ;      m_ConstraintsHash = new HashMap();      m_instanceConstraintHash = new HashMap();
12 3 4 5 下一页
💿 文件大小 12323 K
👤 上传用户 ilovexzhu
📂 所属分类人工智能/神经网络
🏷️ 相关标签

#university #supervised #learning #wekaUT
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -