📄 hac.java

📁 wekaUT是 university texas austin 开发的基于weka的半指导学习(semi supervised learning)的分类器
💻 JAVA
📖 第 1 页 / 共 4 页
字号:
      ((LearnableMetric)m_metric).resetMetric();    m_SeedHash = null;  }  /** Set the m_SeedHash */  public void setSeedHash(HashMap seedhash) {    m_SeedHash = seedhash;    m_seedable = true;  }     /**   * Set the random number seed   * @param s the seed   */  public void setRandomSeed (int s) {    m_randomSeed = s;  }      /** Return the random number seed */  public int getRandomSeed () {    return  m_randomSeed;  }    /** Turn seeding on and off   * @param seedable should seeding be done?   */  public void setSeedable(boolean seedable) {    m_seedable = seedable;  }  /** Turn seeding on and off   * @param seedable should seeding be done?   */  public boolean getSeedable() {    return m_seedable;  }       /** Read the seeds from a hastable, where every key is an instance and every value is:   * a FastVector of Doubles: [(Double) probInCluster0 ... (Double) probInClusterN]   * @param seedVector vector containing seeds   */  public void seedClusterer(HashMap SeedHash) {    if (m_seedable) {      m_SeedHash = SeedHash;    }  }  /**   * returns the SeedHash   * @return seeds hash   */  public HashMap getSeedHash() {return m_SeedHash;}  /**   * Create the hashtable from given Instances;   * keys are numeric indeces, values are actual Instances   *   * @param data Instances   *   */  protected void hashInstances (Instances data) {    int next_value = 0;    m_instancesHash = new HashMap();    m_reverseInstancesHash = new HashMap();    m_checksumHash = new HashMap();    // initialize checksum perturbations    m_checksumPerturb = new double[data.numAttributes()];    for (int i = 0; i < m_checksumPerturb.length; i++) {      m_checksumPerturb[i] = m_randomGen.nextFloat();    }        for (Enumeration enum = data.enumerateInstances(); enum.hasMoreElements();) {      Instance instance = (Instance) enum.nextElement();      if (!m_instancesHash.containsValue(instance)) {	Integer idx = new Integer(next_value);	next_value++;	m_instancesHash.put(idx, instance);	m_reverseInstancesHash.put(instance, idx);      	// hash the checksum value	double [] values = instance.toDoubleArray();	double checksum = 0; 	for (int i = 0; i < values.length; i++) { checksum += m_checksumPerturb[i] * values[i];}	Double checksumIdx = new Double(checksum);	if (m_checksumHash.containsKey(checksumIdx)) {	  Object prev = m_checksumHash.get(checksumIdx);	  ArrayList chain; 	  if (prev instanceof Integer) { 	    chain = new ArrayList();	    Integer prevIdx = (Integer) m_checksumHash.get(checksumIdx);	    chain.add(prevIdx);	  } else {  //instanceof Arraylist	    chain = (ArrayList) m_checksumHash.get(checksumIdx);	  }	  chain.add(idx);	  m_checksumHash.put(checksumIdx, chain);	} else {  // no collisions	  m_checksumHash.put(checksumIdx, idx);	}      } else {	System.err.println("Already encountered instance, skipping " + instance);      }    }  }  /**   * assuming m_clusters contains the clusters of indeces, convert it to   * clusters containing actual instances   */   protected void unhashClusters() throws Exception{    if (m_clusters == null  || m_instancesHash == null)      throw new Exception ("Clusters or hash not initialized");    ArrayList clusters = new ArrayList();    for (int i = 0; i < m_clusters.size(); i++ ) {      Cluster cluster = (Cluster) m_clusters.get(i);      Cluster newCluster = new Cluster();      for (int j = 0; j < cluster.size(); j++) {	Integer instanceIdx = (Integer) cluster.get(j);	double wt = cluster.weightAt(j);	newCluster.add((Instance)m_instancesHash.get(instanceIdx), wt);      }      clusters.add(newCluster);    }    m_clusters = clusters;  }	  /**   * Fill the distance matrix with values using the metric   *   */  protected void createDistanceMatrix () throws Exception {    int n = m_instancesHash.size();    double sim;	    m_distanceMatrix = new double[n][n];    for (int i = 0; i < n; i++) {      for (int j = i+1; j < n; j++) {	m_distanceMatrix[i][j] = m_distanceMatrix[j][i] =	  m_metric.distance((Instance) m_instancesHash.get(new Integer(i)),			    (Instance) m_instancesHash.get(new Integer(j)));      }    }  }  /**   * Set the type of clustering   *   * @param type Clustering type: can be HAC.SINGLE_LINK, HAC.COMPLETE_LINK,   * or HAC.GROUP_AVERAGE   */  public void setLinkingType (SelectedTag linkingType)  {    if (linkingType.getTags() == TAGS_LINKING) {      m_linkingType = linkingType.getSelectedTag().getID();    }  }  /**   * Get the linking type   *   * @returns the linking type   */  public SelectedTag getLinkingType ()  {      return new SelectedTag(m_linkingType, TAGS_LINKING);  }  /**   * Internal method that initializes distances between seed clusters to   * POSITIVE_INFINITY   */  protected void initConstraints() {    for (int i = 0; i < m_instances.numInstances(); i++) {      if (m_clusterAssignments[i] < m_numSeededClusters) {	// make distances to elements from other seeded clusters POSITIVE_INFINITY	for (int j = i+1; j < m_instances.numInstances(); j++) {	  if (m_clusterAssignments[j] < m_numSeededClusters &&	      m_clusterAssignments[j] != m_clusterAssignments[i]) {	    m_distanceMatrix[i][j] =	      m_distanceMatrix[j][i] = Double.POSITIVE_INFINITY;	  }	}      }    }   }  /**   * Internal method that produces the actual clusters   */  protected void cluster() throws Exception {    double last_distance = Double.MIN_VALUE;    m_numCurrentClusters = 0;    m_numSeededClusters = 0;    m_clusters = new ArrayList();    TreeSet leftOverSet = null;    // Initialize singleton clusters    m_clusterAssignments = new int[m_instances.numInstances()];    for (int i = 0; i < m_instances.numInstances(); i++) {      m_clusterAssignments[i] = -1;    }    // utilize seeds if available    if (m_SeedHash != null) {      if (m_verbose) {	System.out.println("Seeding HAC using " + m_SeedHash.size() + " seeds");      }      Iterator iterator = m_SeedHash.entrySet().iterator();      int maxClassIdx = -1;      HashSet classIdxSet = new HashSet();      while (iterator.hasNext()) {	Map.Entry entry = (Map.Entry) iterator.next();	Instance instance = (Instance) entry.getKey();	Integer instanceIdx = (Integer) m_reverseInstancesHash.get(instance);	Integer clusterIdx = (Integer) entry.getValue();	classIdxSet.add(clusterIdx);	m_clusterAssignments[instanceIdx.intValue()] = clusterIdx.intValue();	if (clusterIdx.intValue() > maxClassIdx) {	  maxClassIdx = clusterIdx.intValue();	}      }      m_numCurrentClusters = m_numSeededClusters = classIdxSet.size();      System.out.println("Seeded " + m_numSeededClusters + " clusters");      // If the seeding is incomplete, need to memorize "unseeded" cluster numbers      if (m_numCurrentClusters < m_numClusters) {	leftOverSet = new TreeSet();	for (int i = 0; i < m_numClusters; i++) {	  if (!classIdxSet.contains(new Integer(i))) {	    leftOverSet.add(new Integer(i));	  } 	}       }    }        // assign unseeded instances to singleton clusters    for (int i = 0; i < m_instances.numInstances(); i++) {      if (m_clusterAssignments[i] == -1) {	// utilize "left over clusters first"	if (leftOverSet != null) {	  Integer clusterIdx = (Integer) leftOverSet.first();	  m_clusterAssignments[i] = clusterIdx.intValue();	  leftOverSet.remove(clusterIdx);	  if (leftOverSet.isEmpty()) {	    leftOverSet = null;	  }	} else {	  m_clusterAssignments[i] = m_numCurrentClusters;	}	m_numCurrentClusters++;       }    }    // initialize m_clusters arraylist    getIntClusters();    if (m_SeedHash != null) {      initConstraints();    }        // merge clusters until desired number of clusters is reached    double mergeDistance = 0;     while (m_numCurrentClusters >  m_numClusters && mergeDistance < m_mergeThreshold) {      mergeDistance = mergeStep();      if (m_verbose) {	System.out.println("Merged with " + (m_numCurrentClusters) + " clusters left; distance=" + mergeDistance);      }    }    System.out.println("Done clustering with " + m_clusters.size() + " clusters");    for (int i = 0; i < m_clusters.size(); i++) System.out.print(((Cluster)m_clusters.get(i)).size() + "\t");    initClusterAssignments();  }  /**   * Internal method that finds two most similar clusters and merges them   */  protected double mergeStep() throws Exception{    double bestDistance = Double.MAX_VALUE;    double thisDistance;    Cluster thisCluster, nextCluster;    ArrayList mergeCandidatesList = new ArrayList();    int cluster1_index, cluster2_index;    if (m_verbose) {       System.out.println("\nBefore merge step there are " + m_clusters.size() +			 " clusters; m_numCurrentClusters=" + m_numCurrentClusters);    }    // find two most similar clusters     for (int i = 0; i < m_clusters.size()-1; i++){      thisCluster = (Cluster)m_clusters.get(i);      for (int j = i+1; j < m_clusters.size(); j++) {	thisDistance = clusterDistance(thisCluster, (Cluster) m_clusters.get(j));	if (m_verbose) {	  //	  System.out.println("Distance between " + i + " and " + j + " is " + thisDistance);	}	// If there is a tie, add to the list of top distances	if (thisDistance == bestDistance) {	  mergeCandidatesList.add(new Integer(i));	  mergeCandidatesList.add(new Integer(j));	} else if (thisDistance < bestDistance) {  // this is the best distance seen this far	  mergeCandidatesList.clear();	  mergeCandidatesList.add(new Integer(i));	  mergeCandidatesList.add(new Integer(j));	  bestDistance = thisDistance;	}      }    }    // randomly pick a most similar pair from the list of candidates    int i1 = (int) (mergeCandidatesList.size() * m_randomGen.nextFloat());    int i2 = (i1 % 2 > 0) ? (i1 - 1) : (i1 + 1);    int cluster1Idx = ((Integer) mergeCandidatesList.get(i1)).intValue();    int cluster2Idx = ((Integer) mergeCandidatesList.get(i2)).intValue();    if (m_verbose) {      System.out.println("\nMerging clusters " + cluster1Idx + " and " + cluster2Idx + ";  distance=" + bestDistance);    }    System.out.print("Best distance=" + ((float)bestDistance) + "; Merging:\n");    printCluster(cluster1Idx);    System.out.print("AND\n");    printCluster(cluster2Idx);    System.out.print("\n");    Cluster newCluster = mergeClusters(cluster1Idx, cluster2Idx);    // check if the new cluster is sufficiently large and "good"    HashMap groupCountMap = new HashMap();     for (int i = 0; i < newCluster.size(); i++) {      int idx = ((Integer)newCluster.get(i)).intValue();      Instance instance = m_descrInstances.instance(idx);      // get the set of groups      String groupString = instance.stringValue(1);      StringTokenizer tokenizer = new StringTokenizer(groupString, "|");      while (tokenizer.hasMoreTokens()) {	String group = tokenizer.nextToken();	if (groupCountMap.containsKey(group)) {	  Integer count = (Integer) groupCountMap.get(group);	  groupCountMap.put(group, new Integer(count.intValue() + 1));	} else {	  groupCountMap.put(group, new Integer(1));	}      }          }    int largestGroupCount = -1;    Iterator iterator = groupCountMap.entrySet().iterator();    while(iterator.hasNext()) {      Map.Entry entry = (Map.Entry) iterator.next();      int thisCount = ((Integer)entry.getValue()).intValue();      String group = (String) entry.getKey();      if (thisCount > largestGroupCount && !group.equals("grad")) {	largestGroupCount = thisCount;      }     }     // if the most common group includes 80% of cluster members, yell!    if ((largestGroupCount + 0.0)/(newCluster.size() + 0.0) > 0.6 && newCluster.size() > 2) {      System.out.println("HAPPY JOY JOY!  LOOK HERE!");     }             // have to remove in order because we're using index, argh    if (cluster1Idx > cluster2Idx) {      m_clusters.remove(cluster1Idx);      m_clusters.remove(cluster2Idx);    } else {      m_clusters.remove(cluster2Idx);      m_clusters.remove(cluster1Idx);    }    m_clusters.add(newCluster);    m_numCurrentClusters--;    return bestDistance;  }      /**   * Computes the clusters from the cluster assignments   *    * @exception Exception if clusters could not be computed successfully
💿 文件大小 12323 K
👤 上传用户 ilovexzhu
📂 所属分类人工智能/神经网络
🏷️ 相关标签

#university #supervised #learning #wekaUT
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -