PCKMeans.java
/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    PCKMeans.java
 *    Copyright (C) 2003 Sugato Basu
 */

package weka.clusterers;

import java.io.*;
import java.util.*;
import weka.core.*;
import weka.core.metrics.*;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.Remove;

/**
 * Pairwise constrained k-means clustering class.
 *
 * Valid options are:<p>
 *
 * -N <number of clusters> <br>
 * Specify the number of clusters to generate. <p>
 *
 * -R <random seed> <br>
 * Specify the random number seed. <p>
 *
 * -A <algorithm> <br>
 * The algorithm can be "Simple" (simple KMeans) or "Spherical" (spherical KMeans). <p>
 *
 * -M <metric-class> <br>
 * Specifies the name of the distance metric class that should be used. <p>
 *
 * .... etc.
 *
 * @author Sugato Basu (sugato@cs.utexas.edu)
 * @see Clusterer
 * @see OptionHandler
 */
public class PCKMeans extends Clusterer
  implements OptionHandler, SemiSupClusterer, ActiveLearningClusterer {

  /** Name of clusterer */
  String m_name = "PCKMeans";

  /** holds the instances in the clusters */
  protected ArrayList m_Clusters = null;

  /** holds the instance indices in the clusters */
  protected HashSet[] m_IndexClusters = null;

  /** holds the ([instance pair] -> [type of constraint]) mapping. Note that
      the instance pairs stored in the hash always have constraint type
      InstancePair.DONT_CARE_LINK; the actual link type is stored in the
      hashed value */
  protected HashMap m_ConstraintsHash = null;

  /** holds the ([instance i] -> [ArrayList of constraints involving i])
      mapping. Note that the instance pairs stored in the ArrayList have the
      actual link type */
  protected HashMap m_instanceConstraintHash = null;

  /** adjacency list for neighborhoods */
  protected HashSet[] m_AdjacencyList;

  /** colors required for keeping track of DFS visits */
  final int WHITE = 300;
  final int GRAY = 301;
  final int BLACK = 302;

  /** holds the points involved in the constraints */
  protected HashSet m_SeedHash = null;

  /** weight to be given to each cannot-link constraint */
  protected double m_CannotLinkWeight = 1;

  /** weight to be given to each must-link constraint */
  protected double m_MustLinkWeight = 1;

  /** the maximum number of cannot-link constraints allowed */
  protected final static int m_MaxConstraintsAllowed = 10000;

  /** verbose? */
  protected boolean m_verbose = false;

  /** distance metric */
  protected Metric m_metric = new WeightedEuclidean();

  /** has the metric been constructed? a fix for multiple buildClusterer's */
  protected boolean m_metricBuilt = false;

  /** indicates whether instances are sparse */
  protected boolean m_isSparseInstance = false;
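  // --------------------------------------------------------------------------
  // Editor's note (not part of the original source): a small worked example of
  // how the two constraint hashes above are meant to be populated, mirroring
  // the constraint-loading loop in the pairwise buildClusterer() further below.
  // The MUST_LINK constant is an assumption here; only DONT_CARE_LINK appears
  // in this truncated listing.
  //
  //   // a must-link between instances 3 and 7 (indices with 3 < 7):
  //   InstancePair key = new InstancePair(3, 7, InstancePair.DONT_CARE_LINK);
  //   m_ConstraintsHash.put(key, new Integer(InstancePair.MUST_LINK));
  //
  //   // m_instanceConstraintHash maps each endpoint to the list of its
  //   // constraints, where the stored pair carries the real link type:
  //   //   3 -> [ InstancePair(3, 7, MUST_LINK) ]
  //   //   7 -> [ InstancePair(3, 7, MUST_LINK) ]
  // --------------------------------------------------------------------------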
  /** Is the objective function increasing or decreasing? Depends on the type
      of metric used: increasing for a similarity-based metric, decreasing for
      a distance-based metric */
  protected boolean m_objFunDecreasing = true;

  /** Seedable or not (true by default) */
  protected boolean m_Seedable = true;

  /** Round robin or Random in active Phase Two */
  protected boolean m_PhaseTwoRandom = false;

  /** Two-phase active learning or All Explore */
  protected boolean m_AllExplore = false;

  /** keeps track of the number of iterations completed before convergence */
  protected int m_Iterations = 0;

  /** Define possible algorithms */
  public static final int ALGORITHM_SIMPLE = 1;
  public static final int ALGORITHM_SPHERICAL = 2;
  public static final Tag[] TAGS_ALGORITHM = {
    new Tag(ALGORITHM_SIMPLE, "Simple"),
    new Tag(ALGORITHM_SPHERICAL, "Spherical")
  };

  /** algorithm to use, simple k-means by default */
  protected int m_Algorithm = ALGORITHM_SIMPLE;

  /** min difference of objective function values for convergence */
  protected double m_ObjFunConvergenceDifference = 1e-5;

  /** value of the objective function */
  protected double m_Objective;

  /** returns the objective function value */
  public double objectiveFunction() {
    return m_Objective;
  }

  /** training instances with labels */
  protected Instances m_TotalTrainWithLabels;

  /** training instances */
  protected Instances m_Instances;

  /** a hash where the instance checksums are hashed */
  protected HashMap m_checksumHash = null;

  protected double[] m_checksumCoeffs = null;

  /** test data -- required to make sure that test points are not selected
      during active learning */
  protected int m_StartingIndexOfTest = -1;

  /** number of pairs to seed with */
  protected int m_NumActive;

  /** active mode? */
  protected boolean m_Active = false;

  /** number of clusters to generate, default is -1 to get it from labeled data */
  protected int m_NumClusters = 3;

  /** number of clusters in the process */
  protected int m_NumCurrentClusters = 0;

  /** holds the cluster centroids */
  protected Instances m_ClusterCentroids;

  /** holds the global centroid */
  protected Instance m_GlobalCentroid;

  /** holds the default perturbation value for randomPerturbInit */
  protected double m_DefaultPerturb = 0.7;

  /** holds the default merge threshold for matchMergeStep */
  protected double m_MergeThreshold = 0.15;

  /** temporary variable holding cluster assignments while iterating */
  protected int[] m_ClusterAssignments;

  /** temporary variable holding cluster sums while iterating */
  protected Instance[] m_SumOfClusterInstances;

  /** holds the random seed used to seed the random number generator */
  protected int m_RandomSeed = 42;

  /** holds the random number generator used in various parts of the code */
  protected Random m_RandomNumberGenerator = null;

  /** Define possible instance orderings */
  public static final int ORDERING_DEFAULT = 1;
  public static final int ORDERING_RANDOM = 2;
  public static final int ORDERING_SORTED = 3;
  public static final Tag[] TAGS_ORDERING = {
    new Tag(ORDERING_DEFAULT, "Default-Ordering"),
    new Tag(ORDERING_RANDOM, "Random-Ordering"),
    new Tag(ORDERING_SORTED, "Sorted-Ordering")
  };

  protected int m_InstanceOrdering = ORDERING_DEFAULT;
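  // --------------------------------------------------------------------------
  // Editor's note (not part of the original source): in WEKA, Tag arrays such
  // as TAGS_ALGORITHM above are normally driven through weka.core.SelectedTag
  // in an option setter; the option-handling methods fall outside this
  // truncated listing, so the two lines below are only an illustrative sketch:
  //
  //   SelectedTag choice = new SelectedTag(ALGORITHM_SPHERICAL, TAGS_ALGORITHM);
  //   m_Algorithm = choice.getSelectedTag().getID();  // selects spherical k-means
  // --------------------------------------------------------------------------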
  /** Move points in assignment step till stabilization? */
  protected boolean m_MovePointsTillAssignmentStabilizes = false;

  /** neighbor list for active learning: points in each cluster neighborhood */
  protected HashSet[] m_NeighborSets;

  /** assigned set for active learning: whether a point has been assigned or not */
  HashSet m_AssignedSet;

  /* Constructor */
  public PCKMeans() {
  }

  /* Constructor */
  public PCKMeans(Metric metric) {
    m_metric = metric;
    m_objFunDecreasing = metric.isDistanceBased();
  }

  /**
   * We always want to implement SemiSupClusterer from a class extending
   * Clusterer, and we want to be able to return the underlying parent class.
   *
   * @return parent Clusterer class
   */
  public Clusterer getThisClusterer() {
    return this;
  }

  /**
   * Clusters the given instances to form the specified number of clusters.
   *
   * @param data instances to be clustered
   * @param num_clusters number of clusters to create
   * @exception Exception if something goes wrong
   */
  public void buildClusterer(Instances data, int num_clusters) throws Exception {
    m_NumClusters = num_clusters;
    if (m_Algorithm == ALGORITHM_SPHERICAL && m_metric instanceof WeightedDotP) {
      // since instances and clusters are already normalized, we don't need to
      // normalize again while computing similarity - saves time
      ((WeightedDotP) m_metric).setLengthNormalized(false);
    }
    if (data.instance(0) instanceof SparseInstance) {
      m_isSparseInstance = true;
    }
    buildClusterer(data);
  }

  /**
   * Generates the clustering using labeled seeds.
   *
   * @param labeledData set of labeled instances to use as seeds
   * @param unlabeledData set of unlabeled instances
   * @param classIndex attribute index in labeledData which holds the class info
   * @param numClusters number of clusters to create
   * @param startingIndexOfTest from where the test data starts in unlabeledData,
   *        useful if clustering is transductive, set to -1 if not relevant
   * @exception Exception if something is wrong
   */
  public void buildClusterer(Instances labeledData, Instances unlabeledData,
                             int classIndex, int numClusters,
                             int startingIndexOfTest) throws Exception {
    // Dummy function
    throw new Exception("Not implemented for PCKMeans, only here for compatibility to SemiSupClusterer interface");
  }
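  // --------------------------------------------------------------------------
  // Editor's note (not part of the original source): the pairwise
  // buildClusterer below calls a normalize() helper for spherical k-means that
  // lies outside this truncated listing. The commented sketch below shows the
  // intent (scaling an instance to unit Euclidean length); the real helper
  // presumably also handles SparseInstance objects and skips any class attribute.
  //
  //   protected void normalize(Instance inst) {
  //     double norm = 0;
  //     for (int i = 0; i < inst.numAttributes(); i++) {
  //       norm += inst.value(i) * inst.value(i);
  //     }
  //     norm = Math.sqrt(norm);
  //     if (norm > 0) {
  //       for (int i = 0; i < inst.numAttributes(); i++) {
  //         inst.setValue(i, inst.value(i) / norm);
  //       }
  //     }
  //   }
  // --------------------------------------------------------------------------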
  /**
   * Clusters unlabeledData and labeledData (with labels removed), using
   * labeledData as seeds.
   *
   * @param labeledPairs labeled instance pairs to be used as seeds
   * @param unlabeledData unlabeled training (+ test for transductive) instances
   * @param labeledTrain labeled training instances
   * @param startingIndexOfTest starting index of the test set in the unlabeled data
   * @exception Exception if something goes wrong
   */
  public void buildClusterer(ArrayList labeledPairs, Instances unlabeledData,
                             Instances labeledTrain, int startingIndexOfTest) throws Exception {
    int classIndex = labeledTrain.numAttributes(); // assuming that the last attribute is always the class
    m_TotalTrainWithLabels = labeledTrain;
    m_SeedHash = new HashSet((int) (unlabeledData.numInstances() / 0.75 + 10));
    m_ConstraintsHash = new HashMap(m_MaxConstraintsAllowed);
    m_instanceConstraintHash = new HashMap(m_MaxConstraintsAllowed);

    if (!m_Active && labeledPairs != null) {
      for (int i = 0; i < labeledPairs.size(); i++) {
        InstancePair pair = (InstancePair) labeledPairs.get(i);
        Integer firstInt = new Integer(pair.first);
        Integer secondInt = new Integer(pair.second);

        // for the first point
        if (!m_SeedHash.contains(firstInt)) { // add instances with constraints to seedHash
          m_SeedHash.add(firstInt);
        }
        // for the second point
        if (!m_SeedHash.contains(secondInt)) {
          m_SeedHash.add(secondInt);
        }

        if (pair.first >= pair.second) {
          throw new Exception("Ordering reversed - something wrong!!");
        } else {
          InstancePair newPair = new InstancePair(pair.first, pair.second, InstancePair.DONT_CARE_LINK);
          m_ConstraintsHash.put(newPair, new Integer(pair.linkType)); // WLOG first < second

          // hash the constraints for the instances involved
          Object constraintList1 = m_instanceConstraintHash.get(firstInt);
          if (constraintList1 == null) {
            ArrayList constraintList = new ArrayList();
            constraintList.add(pair);
            m_instanceConstraintHash.put(firstInt, constraintList);
          } else {
            ((ArrayList) constraintList1).add(pair);
          }
          Object constraintList2 = m_instanceConstraintHash.get(secondInt);
          if (constraintList2 == null) {
            ArrayList constraintList = new ArrayList();
            constraintList.add(pair);
            m_instanceConstraintHash.put(secondInt, constraintList);
          } else {
            ((ArrayList) constraintList2).add(pair);
          }
        }
      }
    } else {
      m_NumActive = labeledPairs.size();
    }

    // normalize all data for SPKMeans
    if (m_Algorithm == ALGORITHM_SPHERICAL) {
      for (int i = 0; i < unlabeledData.numInstances(); i++) {
        normalize(unlabeledData.instance(i));
      }
    }

    m_StartingIndexOfTest = startingIndexOfTest;
    if (m_verbose) {
      System.out.println("Starting index of test: " + m_StartingIndexOfTest);
    }

    // learn the metric using labeled data, then cluster both the labeled and unlabeled data
    m_metric.buildMetric(unlabeledData.numAttributes());
    m_metricBuilt = true;
    buildClusterer(unlabeledData, labeledTrain.numClasses());
  }

  /**
   * Generates a clusterer. Instances in data have to be either all sparse or
   * all non-sparse.
   *
   * @param data set of instances serving as training data
   * @exception Exception if the clusterer has not been generated successfully
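A minimal driver sketch for the pairwise buildClusterer above (an editor's illustration, not part of the original file): it assumes InstancePair exposes MUST_LINK and CANNOT_LINK constants alongside the DONT_CARE_LINK used in the listing, reads a hypothetical train.arff, and relies on the last attribute being the class, as buildClusterer itself assumes.

import java.io.BufferedReader;
import java.io.FileReader;
import java.util.ArrayList;

import weka.clusterers.*;
import weka.core.Instances;
import weka.core.metrics.WeightedEuclidean;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.Remove;

public class PCKMeansDemo {
  public static void main(String[] args) throws Exception {
    // labeled training data; the last attribute is the class
    Instances labeled = new Instances(new BufferedReader(new FileReader("train.arff")));
    labeled.setClassIndex(labeled.numAttributes() - 1);

    // unlabeled view of the same data: strip the class attribute
    Remove removeClass = new Remove();
    removeClass.setAttributeIndices("" + labeled.numAttributes());
    removeClass.setInputFormat(labeled);
    Instances unlabeled = Filter.useFilter(labeled, removeClass);

    // pairwise constraints; the smaller index must come first, as buildClusterer checks
    ArrayList pairs = new ArrayList();
    pairs.add(new InstancePair(3, 7, InstancePair.MUST_LINK));    // assumed constant
    pairs.add(new InstancePair(2, 9, InstancePair.CANNOT_LINK));  // assumed constant

    // cluster with the default weighted Euclidean metric;
    // -1 means no separate test block in the unlabeled data
    PCKMeans clusterer = new PCKMeans(new WeightedEuclidean());
    clusterer.buildClusterer(pairs, unlabeled, labeled, -1);
  }
}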