📄 hac.java
字号:
((LearnableMetric)m_metric).resetMetric(); m_SeedHash = null; } /** Set the m_SeedHash */ public void setSeedHash(HashMap seedhash) { m_SeedHash = seedhash; m_seedable = true; } /** * Set the random number seed * @param s the seed */ public void setRandomSeed (int s) { m_randomSeed = s; } /** Return the random number seed */ public int getRandomSeed () { return m_randomSeed; } /** Turn seeding on and off * @param seedable should seeding be done? */ public void setSeedable(boolean seedable) { m_seedable = seedable; } /** Turn seeding on and off * @param seedable should seeding be done? */ public boolean getSeedable() { return m_seedable; } /** Read the seeds from a hastable, where every key is an instance and every value is: * a FastVector of Doubles: [(Double) probInCluster0 ... (Double) probInClusterN] * @param seedVector vector containing seeds */ public void seedClusterer(HashMap SeedHash) { if (m_seedable) { m_SeedHash = SeedHash; } } /** * returns the SeedHash * @return seeds hash */ public HashMap getSeedHash() {return m_SeedHash;} /** * Create the hashtable from given Instances; * keys are numeric indeces, values are actual Instances * * @param data Instances * */ protected void hashInstances (Instances data) { int next_value = 0; m_instancesHash = new HashMap(); m_reverseInstancesHash = new HashMap(); m_checksumHash = new HashMap(); // initialize checksum perturbations m_checksumPerturb = new double[data.numAttributes()]; for (int i = 0; i < m_checksumPerturb.length; i++) { m_checksumPerturb[i] = m_randomGen.nextFloat(); } for (Enumeration enum = data.enumerateInstances(); enum.hasMoreElements();) { Instance instance = (Instance) enum.nextElement(); if (!m_instancesHash.containsValue(instance)) { Integer idx = new Integer(next_value); next_value++; m_instancesHash.put(idx, instance); m_reverseInstancesHash.put(instance, idx); // hash the checksum value double [] values = instance.toDoubleArray(); double checksum = 0; for (int i = 0; i < values.length; i++) { checksum += m_checksumPerturb[i] * values[i];} Double checksumIdx = new Double(checksum); if (m_checksumHash.containsKey(checksumIdx)) { Object prev = m_checksumHash.get(checksumIdx); ArrayList chain; if (prev instanceof Integer) { chain = new ArrayList(); Integer prevIdx = (Integer) m_checksumHash.get(checksumIdx); chain.add(prevIdx); } else { //instanceof Arraylist chain = (ArrayList) m_checksumHash.get(checksumIdx); } chain.add(idx); m_checksumHash.put(checksumIdx, chain); } else { // no collisions m_checksumHash.put(checksumIdx, idx); } } else { System.err.println("Already encountered instance, skipping " + instance); } } } /** * assuming m_clusters contains the clusters of indeces, convert it to * clusters containing actual instances */ protected void unhashClusters() throws Exception{ if (m_clusters == null || m_instancesHash == null) throw new Exception ("Clusters or hash not initialized"); ArrayList clusters = new ArrayList(); for (int i = 0; i < m_clusters.size(); i++ ) { Cluster cluster = (Cluster) m_clusters.get(i); Cluster newCluster = new Cluster(); for (int j = 0; j < cluster.size(); j++) { Integer instanceIdx = (Integer) cluster.get(j); double wt = cluster.weightAt(j); newCluster.add((Instance)m_instancesHash.get(instanceIdx), wt); } clusters.add(newCluster); } m_clusters = clusters; } /** * Fill the distance matrix with values using the metric * */ protected void createDistanceMatrix () throws Exception { int n = m_instancesHash.size(); double sim; m_distanceMatrix = new double[n][n]; for (int i = 0; i < n; i++) { for (int j = i+1; j < n; j++) { m_distanceMatrix[i][j] = m_distanceMatrix[j][i] = m_metric.distance((Instance) m_instancesHash.get(new Integer(i)), (Instance) m_instancesHash.get(new Integer(j))); } } } /** * Set the type of clustering * * @param type Clustering type: can be HAC.SINGLE_LINK, HAC.COMPLETE_LINK, * or HAC.GROUP_AVERAGE */ public void setLinkingType (SelectedTag linkingType) { if (linkingType.getTags() == TAGS_LINKING) { m_linkingType = linkingType.getSelectedTag().getID(); } } /** * Get the linking type * * @returns the linking type */ public SelectedTag getLinkingType () { return new SelectedTag(m_linkingType, TAGS_LINKING); } /** * Internal method that initializes distances between seed clusters to * POSITIVE_INFINITY */ protected void initConstraints() { for (int i = 0; i < m_instances.numInstances(); i++) { if (m_clusterAssignments[i] < m_numSeededClusters) { // make distances to elements from other seeded clusters POSITIVE_INFINITY for (int j = i+1; j < m_instances.numInstances(); j++) { if (m_clusterAssignments[j] < m_numSeededClusters && m_clusterAssignments[j] != m_clusterAssignments[i]) { m_distanceMatrix[i][j] = m_distanceMatrix[j][i] = Double.POSITIVE_INFINITY; } } } } } /** * Internal method that produces the actual clusters */ protected void cluster() throws Exception { double last_distance = Double.MIN_VALUE; m_numCurrentClusters = 0; m_numSeededClusters = 0; m_clusters = new ArrayList(); TreeSet leftOverSet = null; // Initialize singleton clusters m_clusterAssignments = new int[m_instances.numInstances()]; for (int i = 0; i < m_instances.numInstances(); i++) { m_clusterAssignments[i] = -1; } // utilize seeds if available if (m_SeedHash != null) { if (m_verbose) { System.out.println("Seeding HAC using " + m_SeedHash.size() + " seeds"); } Iterator iterator = m_SeedHash.entrySet().iterator(); int maxClassIdx = -1; HashSet classIdxSet = new HashSet(); while (iterator.hasNext()) { Map.Entry entry = (Map.Entry) iterator.next(); Instance instance = (Instance) entry.getKey(); Integer instanceIdx = (Integer) m_reverseInstancesHash.get(instance); Integer clusterIdx = (Integer) entry.getValue(); classIdxSet.add(clusterIdx); m_clusterAssignments[instanceIdx.intValue()] = clusterIdx.intValue(); if (clusterIdx.intValue() > maxClassIdx) { maxClassIdx = clusterIdx.intValue(); } } m_numCurrentClusters = m_numSeededClusters = classIdxSet.size(); System.out.println("Seeded " + m_numSeededClusters + " clusters"); // If the seeding is incomplete, need to memorize "unseeded" cluster numbers if (m_numCurrentClusters < m_numClusters) { leftOverSet = new TreeSet(); for (int i = 0; i < m_numClusters; i++) { if (!classIdxSet.contains(new Integer(i))) { leftOverSet.add(new Integer(i)); } } } } // assign unseeded instances to singleton clusters for (int i = 0; i < m_instances.numInstances(); i++) { if (m_clusterAssignments[i] == -1) { // utilize "left over clusters first" if (leftOverSet != null) { Integer clusterIdx = (Integer) leftOverSet.first(); m_clusterAssignments[i] = clusterIdx.intValue(); leftOverSet.remove(clusterIdx); if (leftOverSet.isEmpty()) { leftOverSet = null; } } else { m_clusterAssignments[i] = m_numCurrentClusters; } m_numCurrentClusters++; } } // initialize m_clusters arraylist getIntClusters(); if (m_SeedHash != null) { initConstraints(); } // merge clusters until desired number of clusters is reached double mergeDistance = 0; while (m_numCurrentClusters > m_numClusters && mergeDistance < m_mergeThreshold) { mergeDistance = mergeStep(); if (m_verbose) { System.out.println("Merged with " + (m_numCurrentClusters) + " clusters left; distance=" + mergeDistance); } } System.out.println("Done clustering with " + m_clusters.size() + " clusters"); for (int i = 0; i < m_clusters.size(); i++) System.out.print(((Cluster)m_clusters.get(i)).size() + "\t"); initClusterAssignments(); } /** * Internal method that finds two most similar clusters and merges them */ protected double mergeStep() throws Exception{ double bestDistance = Double.MAX_VALUE; double thisDistance; Cluster thisCluster, nextCluster; ArrayList mergeCandidatesList = new ArrayList(); int cluster1_index, cluster2_index; if (m_verbose) { System.out.println("\nBefore merge step there are " + m_clusters.size() + " clusters; m_numCurrentClusters=" + m_numCurrentClusters); } // find two most similar clusters for (int i = 0; i < m_clusters.size()-1; i++){ thisCluster = (Cluster)m_clusters.get(i); for (int j = i+1; j < m_clusters.size(); j++) { thisDistance = clusterDistance(thisCluster, (Cluster) m_clusters.get(j)); if (m_verbose) { // System.out.println("Distance between " + i + " and " + j + " is " + thisDistance); } // If there is a tie, add to the list of top distances if (thisDistance == bestDistance) { mergeCandidatesList.add(new Integer(i)); mergeCandidatesList.add(new Integer(j)); } else if (thisDistance < bestDistance) { // this is the best distance seen this far mergeCandidatesList.clear(); mergeCandidatesList.add(new Integer(i)); mergeCandidatesList.add(new Integer(j)); bestDistance = thisDistance; } } } // randomly pick a most similar pair from the list of candidates int i1 = (int) (mergeCandidatesList.size() * m_randomGen.nextFloat()); int i2 = (i1 % 2 > 0) ? (i1 - 1) : (i1 + 1); int cluster1Idx = ((Integer) mergeCandidatesList.get(i1)).intValue(); int cluster2Idx = ((Integer) mergeCandidatesList.get(i2)).intValue(); if (m_verbose) { System.out.println("\nMerging clusters " + cluster1Idx + " and " + cluster2Idx + "; distance=" + bestDistance); } System.out.print("Best distance=" + ((float)bestDistance) + "; Merging:\n"); printCluster(cluster1Idx); System.out.print("AND\n"); printCluster(cluster2Idx); System.out.print("\n"); Cluster newCluster = mergeClusters(cluster1Idx, cluster2Idx); // check if the new cluster is sufficiently large and "good" HashMap groupCountMap = new HashMap(); for (int i = 0; i < newCluster.size(); i++) { int idx = ((Integer)newCluster.get(i)).intValue(); Instance instance = m_descrInstances.instance(idx); // get the set of groups String groupString = instance.stringValue(1); StringTokenizer tokenizer = new StringTokenizer(groupString, "|"); while (tokenizer.hasMoreTokens()) { String group = tokenizer.nextToken(); if (groupCountMap.containsKey(group)) { Integer count = (Integer) groupCountMap.get(group); groupCountMap.put(group, new Integer(count.intValue() + 1)); } else { groupCountMap.put(group, new Integer(1)); } } } int largestGroupCount = -1; Iterator iterator = groupCountMap.entrySet().iterator(); while(iterator.hasNext()) { Map.Entry entry = (Map.Entry) iterator.next(); int thisCount = ((Integer)entry.getValue()).intValue(); String group = (String) entry.getKey(); if (thisCount > largestGroupCount && !group.equals("grad")) { largestGroupCount = thisCount; } } // if the most common group includes 80% of cluster members, yell! if ((largestGroupCount + 0.0)/(newCluster.size() + 0.0) > 0.6 && newCluster.size() > 2) { System.out.println("HAPPY JOY JOY! LOOK HERE!"); } // have to remove in order because we're using index, argh if (cluster1Idx > cluster2Idx) { m_clusters.remove(cluster1Idx); m_clusters.remove(cluster2Idx); } else { m_clusters.remove(cluster2Idx); m_clusters.remove(cluster1Idx); } m_clusters.add(newCluster); m_numCurrentClusters--; return bestDistance; } /** * Computes the clusters from the cluster assignments * * @exception Exception if clusters could not be computed successfully
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -