📄 hac.java
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * MatrixHAC.java
 * Copyright (C) 2001 Mikhail Bilenko
 */

/**
 * Similarity-matrix implementation of Hierarchical Agglomerative Clustering.
 * <p>
 * Valid options are:<p>
 *
 * -N <0-10000> <br>
 * Number of clusters. <p>
 *
 * @author Mikhail Bilenko (mbilenko@cs.utexas.edu)
 * @version $Revision: 1.11 $
 */

package weka.clusterers;

import java.io.*;
import java.util.*;
import java.text.*;

import weka.core.*;
import weka.core.metrics.*;
import weka.filters.unsupervised.attribute.Remove;
import weka.filters.unsupervised.attribute.Normalize;
import weka.filters.Filter;

public class HAC extends Clusterer implements SemiSupClusterer, OptionHandler {

  /* name of the clusterer */
  String m_name = "HAC";

  /** Number of clusters */
  protected int m_numClusters = -1;

  /** Number of clusters in the process */
  protected int m_numCurrentClusters = 0;

  /** ID of current cluster */
  protected int m_clusterID = 0;

  /** Number of seeded clusters */
  protected int m_numSeededClusters = 0;

  /** Dot file name for dumping graph for tree visualization */
  protected String m_dotFileName = "user-features.dot";

  /** Writer for the dot file */
  protected PrintWriter m_dotWriter = null;

  /** Instances that we are working with */
  Instances m_instances;
  Instances m_descrInstances;

  /** holds the clusters */
  protected ArrayList m_clusters = null;

  /** temporary variable holding cluster assignments */
  protected int[] m_clusterAssignments;

  /** distance matrix */
  protected double[][] m_distanceMatrix = null;

  /** cluster similarity type */
  public final static int SINGLE_LINK = 0;
  public final static int COMPLETE_LINK = 1;
  public final static int GROUP_AVERAGE = 2;
  public static final Tag[] TAGS_LINKING = {
    new Tag(SINGLE_LINK, "Single link"),
    new Tag(COMPLETE_LINK, "Complete link"),
    new Tag(GROUP_AVERAGE, "Group-average")
  };

  /** Default linking method */
  protected int m_linkingType = GROUP_AVERAGE;

  /** starting index of test data in unlabeledData if transductive clustering */
  protected int m_StartingIndexOfTest = -1;

  /** seeding */
  protected boolean m_seedable = false;

  /** holds the ([seed instance] -> [clusterLabel of seed instance]) mapping */
  protected HashMap m_SeedHash = null;

  /** A 'checksum hash' where indices are hashed to the sum of their attribute values */
  protected HashMap m_checksumHash = null;
  protected double[] m_checksumPerturb = null;

  /** holds the random seed, useful for random selection initialization */
  protected int m_randomSeed = 100;
  protected Random m_randomGen = null;

  /** instance hash */
  protected HashMap m_instancesHash = null;

  /** reverse instance hash */
  protected HashMap m_reverseInstancesHash = null;

  /** The threshold distance beyond which no clusters are merged (except for one - TODO) */
  protected double m_mergeThreshold = 0.8;

  /** verbose? */
  protected boolean m_verbose = false;

  /** metric used to calculate similarity/distance */
  // protected Metric m_metric = new WeightedDotP();
  // protected String m_metricName = new String("weka.core.metrics.WeightedDotP");
  protected Metric m_metric = new WeightedEuclidean();
  protected String m_metricName = new String("weka.core.metrics.WeightedEuclidean");

  /** Is the metric (and hence the algorithm) relying on similarities or distances? */
  protected boolean m_isDistanceBased = false;

  /** has the metric been constructed? a fix for multiple buildClusterer's */
  protected boolean m_metricBuilt = false;

  // ===============
  // Public methods.
  // ===============

  /** empty constructor, required to call using Class.forName */
  public HAC() {}

  /** Constructor taking the metric to use */
  public HAC(Metric metric) {
    m_metric = metric;
    m_metricName = m_metric.getClass().getName();
    m_isDistanceBased = metric.isDistanceBased();
  }

  /** Sets training instances */
  public void setInstances(Instances instances) {
    m_instances = instances;
  }

  /** Return training instances */
  public Instances getInstances() {
    return m_instances;
  }

  /**
   * Set the number of clusters to generate
   *
   * @param n the number of clusters to generate
   */
  public void setNumClusters(int n) {
    m_numClusters = n;
    if (m_verbose) {
      System.out.println("Number of clusters: " + n);
    }
  }

  /** Set the merge threshold */
  public void setMergeThreshold(double threshold) {
    m_mergeThreshold = threshold;
  }

  /** Get the merge threshold */
  public double getMergeThreshold() {
    return m_mergeThreshold;
  }

  /**
   * Set the distance metric
   *
   * @param m the metric
   */
  public void setMetric(LearnableMetric m) {
    m_metric = m;
    m_metricName = m_metric.getClass().getName();
    m_isDistanceBased = m.isDistanceBased();
  }

  /**
   * Get the distance metric
   *
   * @return the distance metric used
   */
  public Metric getMetric() {
    return m_metric;
  }

  /**
   * Get the distance metric name
   *
   * @return the name of the distance metric used
   */
  public String metricName() {
    return m_metricName;
  }

  /**
   * We always want to implement SemiSupClusterer from a class extending Clusterer.
   * We want to be able to return the underlying parent class.
   * @return parent Clusterer class
   */
  public Clusterer getThisClusterer() {
    return this;
  }

  /**
   * Cluster given instances to form the specified number of clusters.
   *
   * @param data instances to be clustered
   * @param num_clusters number of clusters to create
   * @exception Exception if something goes wrong.
   */
  public void buildClusterer(Instances data, int num_clusters) throws Exception {
    setNumClusters(num_clusters);
    buildClusterer(data);
  }

  /**
   * Clusters unlabeledData and labeledData (with labels removed),
   * using labeledData as seeds
   *
   * @param labeledData labeled instances to be used as seeds
   * @param unlabeledData unlabeled instances
   * @param classIndex attribute index in labeledData which holds class info
   * @param numClusters number of clusters
   * @exception Exception if something goes wrong.
   */
  public void buildClusterer(Instances labeledData, Instances unlabeledData,
                             int classIndex, int numClusters) throws Exception {
    // remove labels of labeledData before putting in SeedHash
    Instances clusterData = new Instances(labeledData);
    clusterData.deleteClassAttribute();

    // create SeedHash from labeledData
    if (getSeedable()) {
      Seeder seeder = new Seeder(clusterData, labeledData);
      setSeedHash(seeder.getAllSeeds());
    }

    // add unlabeled data to labeled data (labels removed), not the
    // other way around, so that the hash table entries are consistent
    // with the labeled data without labels
    for (int i = 0; i < unlabeledData.numInstances(); i++) {
      clusterData.add(unlabeledData.instance(i));
    }
    if (m_verbose) {
      System.out.println("combinedData has size: " + clusterData.numInstances() + "\n");
    }

    // learn metric using labeled data, then cluster both the labeled and unlabeled data
    m_metric.buildMetric(labeledData);
    m_metricBuilt = true;

    // check if the number of clusters is dynamically set to the number of classes
    if (m_numClusters == -1) {
      m_numClusters = labeledData.numClasses();
      System.out.println("DYNAMIC NUMBER OF CLUSTERS, setting to " + m_numClusters);
    }
    buildClusterer(clusterData, numClusters);
  }

  /**
   * Clusters unlabeledData and labeledData (with labels removed),
   * using labeledData as seeds
   *
   * @param labeledData labeled instances to be used as seeds
   * @param unlabeledData unlabeled instances
   * @param classIndex attribute index in labeledData which holds class info
   * @param numClusters number of clusters
   * @param startingIndexOfTest from where test data starts in unlabeledData, useful if clustering is transductive
   * @exception Exception if something goes wrong.
   */
  public void buildClusterer(Instances labeledData, Instances unlabeledData, int classIndex,
                             int numClusters, int startingIndexOfTest) throws Exception {
    m_StartingIndexOfTest = startingIndexOfTest + labeledData.numInstances();
    buildClusterer(labeledData, unlabeledData, classIndex, numClusters);
  }

  /**
   * Cluster given instances. If no threshold or number of clusters is set,
   * clustering proceeds until two clusters are left.
   *
   * @param data instances to be clustered
   * @exception Exception if something goes wrong.
   */
  public void buildClusterer(Instances data) throws Exception {
    m_randomGen = new Random(m_randomSeed);
    m_dotWriter = new PrintWriter(new BufferedOutputStream(new FileOutputStream(m_dotFileName)));
    m_dotWriter.println("digraph HAC {\n");
    setInstances(data);
    m_clusterAssignments = new int[m_instances.numInstances()];

    if (m_verbose && m_SeedHash != null) {
      System.out.println("Using seeding ...");
    }

    // reject nominal/string attributes: the metric assumes numeric data
    if (m_instances.checkForNominalAttributes() || m_instances.checkForStringAttributes()) {
      throw new UnsupportedAttributeTypeException("Cannot handle nominal attributes\n");
    }
    //m_instances = filterInstanceDescriptions(m_instances);

    // Don't rebuild the metric if it was already trained
    if (!m_metricBuilt) {
      m_metric.buildMetric(data);
    }
    hashInstances(m_instances);
    createDistanceMatrix();
    cluster();
    unhashClusters();

    m_dotWriter.println("}");
    m_dotWriter.close();
  }

  /** If some of the attributes start with "__", form a separate Instances set with
   * descriptions and filter them out of the argument dataset.
   * Return the original dataset without the filtered out attributes
   */
  protected Instances filterInstanceDescriptions(Instances instances) throws Exception {
    Instances filteredInstances;
    // Normalize normalizeFilter = new Normalize();
    // normalizeFilter.setInputFormat(instances);
    // instances = Filter.useFilter(instances, normalizeFilter);
    // System.out.println("Normalized the instance attributes");

    // go through the attributes and find the description attributes
    ArrayList descrIndexList = new ArrayList();
    for (int i = 0; i < instances.numAttributes(); i++) {
      Attribute attr = instances.attribute(i);
      if (attr.name().startsWith("__")) {
        descrIndexList.add(new Integer(i));
        System.out.println("filtering " + attr);
      }
    }

    // filter out the description attributes if necessary
    if (descrIndexList.size() > 0) {
      m_descrInstances = new Instances(instances);
      // filter out the descriptions first
      int[] descrIndeces = new int[descrIndexList.size()];
      for (int i = 0; i < descrIndexList.size(); i++) {
        descrIndeces[i] = ((Integer) descrIndexList.get(i)).intValue();
      }
      Remove attributeFilter = new Remove();
      attributeFilter.setAttributeIndicesArray(descrIndeces);
      attributeFilter.setInvertSelection(false);
      attributeFilter.setInputFormat(instances);
      filteredInstances = Filter.useFilter(instances, attributeFilter);

      attributeFilter.setInvertSelection(true);
      attributeFilter.setInputFormat(instances);
      m_descrInstances = Filter.useFilter(instances, attributeFilter);
    } else {
      filteredInstances = new Instances(instances);
    }
    return filteredInstances;
  }

  /**
   * Reset all values that have been learned
   */
  public void resetClusterer() throws Exception {
    if (m_metric instanceof LearnableMetric)
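(The listing is cut off in the source at this point.)

For reference, a minimal usage sketch, not part of the file above: it assumes the WekaUT codebase that provides this HAC class and weka.core.metrics, and a purely numeric ARFF file whose name here is hypothetical.

// HACDemo.java - minimal sketch under the assumptions stated above.
import java.io.FileReader;
import weka.core.Instances;
import weka.core.metrics.WeightedEuclidean;
import weka.clusterers.HAC;

public class HACDemo {
  public static void main(String[] args) throws Exception {
    // load an all-numeric dataset (hypothetical file name)
    Instances data = new Instances(new FileReader("data-numeric.arff"));

    HAC hac = new HAC(new WeightedEuclidean()); // distance-based metric, matching the class defaults
    hac.setNumClusters(3);                      // stop agglomeration when 3 clusters remain
    hac.setMergeThreshold(0.8);                 // ...or when the closest pair exceeds this distance
    hac.buildClusterer(data);                   // also writes user-features.dot for tree visualization
  }
}

Note that buildClusterer(Instances) rejects nominal and string attributes, and it writes the dendrogram to the dot file named by m_dotFileName in the working directory.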