📄 pckmeans.java
字号:
*/ public void buildClusterer(Instances data) throws Exception { System.out.println("Must link weight: " + m_MustLinkWeight); System.out.println("Cannot link weight: " + m_CannotLinkWeight); m_RandomNumberGenerator = new Random(m_RandomSeed); setInstances(data); // Don't rebuild the metric if it was already trained if (!m_metricBuilt) { m_metric.buildMetric(data.numAttributes()); } m_ClusterCentroids = new Instances(m_Instances, m_NumClusters); m_ClusterAssignments = new int [m_Instances.numInstances()]; if (m_Instances.checkForNominalAttributes() && m_Instances.checkForStringAttributes()) { throw new UnsupportedAttributeTypeException("Cannot handle nominal attributes\n"); } System.out.println("Initializing clustering ... "); if (m_Active) { bestPairsForActiveLearning(m_NumActive); } else { nonActivePairwiseInit(); } System.out.println("Done initializing clustering ..."); if (m_verbose) { if (m_Seedable) { System.out.println("Initial assignments of seed points:"); getIndexClusters(); printIndexClusters(); } for (int i=0; i<m_NumClusters; i++) { System.out.println("Centroid " + i + ": " + m_ClusterCentroids.instance(i)); } } runKMeans(); } /** * Reset all values that have been learned */ public void resetClusterer() throws Exception{ if (m_metric instanceof LearnableMetric) { ((LearnableMetric)m_metric).resetMetric(); } m_SeedHash = null; m_ConstraintsHash = null; m_instanceConstraintHash = null; } /** Set default perturbation value * @param p perturbation fraction */ public void setDefaultPerturb(double p) { m_DefaultPerturb = p; } /** Get default perturbation value * @return perturbation fraction */ public double getDefaultPerturb(){ return m_DefaultPerturb; } /** Turn seeding on and off * @param seedable should seeding be done? */ public void setSeedable(boolean seedable) { m_Seedable = seedable; } /** Is seeding performed? * @return is seeding being done? 
*/
public boolean getSeedable() {
  return m_Seedable;
}

/**
 * We can have clusterers that don't utilize seeding
 * @return whether seeding is being done
 */
public boolean seedable() {
  return m_Seedable;
}

/**
 * Phase 1 ("Explore") code for active learning.
 *
 * Grows up to m_NumClusters disjoint neighborhoods by repeatedly taking the
 * point farthest from everything assigned so far and querying the oracle
 * against one representative of each existing neighborhood until a must-link
 * is found (or the point is cannot-linked to all of them, starting a new
 * neighborhood). Updates m_NeighborSets, m_SumOfClusterInstances,
 * m_ClusterAssignments, m_AssignedSet and m_NumCurrentClusters.
 *
 * @param numQueries maximum number of oracle queries allowed in this phase
 * @return the number of oracle queries actually used
 * @throws Exception if something goes wrong
 */
protected int activePhaseOne(int numQueries) throws Exception {
  int numInstances = m_Instances.numInstances();
  int X, Y, Z;
  int query, lambda, Label, CLcount;
  boolean MLmode = true;  // NOTE(review): never read in this method
  System.out.println("In Explore phase, with numqueries: " + numQueries);

  // these are the main data-structures to be updated here
  m_NeighborSets = new HashSet[m_NumClusters];  // set of points in each cluster neighborhood
  m_SumOfClusterInstances = new Instance[m_NumClusters];  // running attribute sums per neighborhood
  m_ClusterAssignments = new int[numInstances];
  // set of points that have already been assigned to some neighborhood
  // (capacity sized for a default-load-factor HashSet holding ~numQueries points)
  m_AssignedSet = new HashSet((int) (numQueries/0.75+10));
  for (int i=0; i<m_Instances.numInstances(); i++) {
    m_ClusterAssignments[i] = -1;  // -1 == unassigned
  }

  query = 0;    // current num queries
  lambda = -1;  // current number of disjoint neighborhoods (highest index in use)
  X = 0;        // current point under investigation

  while( query < numQueries ){
    if( m_NeighborSets[0] == null ){
      // start the first neighborhood from the first point; costs no query
      lambda++;
      if (m_verbose) System.out.println("Setting cluster of " + X + " to " + lambda);
      // update data structures
      m_NeighborSets[lambda] = new HashSet();
      m_NeighborSets[lambda].add(new Integer(X));
      m_SumOfClusterInstances[lambda] = sumWithInstance(m_SumOfClusterInstances[lambda],m_Instances.instance(X));
      m_ClusterAssignments[X] = lambda;
      m_AssignedSet.add(new Integer(X));
    } else if( lambda == m_NumClusters-1 && !m_AllExplore) {
      // NOTE: this condition is fired only if we are doing 2 phase (Explore + Consolidate):
      // once one neighborhood per cluster exists, stop exploring early
      System.out.println("Explore phase over after " + query + " queries");
      m_NumCurrentClusters = lambda+1;
      return query;
    } else {
      // pick the point farthest from everything assigned so far
      Z = (int) farthestFromSet(m_AssignedSet, null);
      CLcount = -1;  // number of neighborhoods Z is cannot-linked to, minus one
      for( int h = 0; h <= lambda; h++ ){
        if (m_verbose) System.out.println("Starting for loop CLcount: " + CLcount);
        Iterator NbrIt = null;
        if (m_NeighborSets[h] != null) {
          NbrIt = m_NeighborSets[h].iterator();
        }
        if( NbrIt != null && NbrIt.hasNext() ){
          // query Z against a single representative of neighborhood h —
          // transitivity of must-links makes one representative sufficient
          X = ((Integer) NbrIt.next()).intValue();
          if (m_verbose) System.out.println("Inside iterator next ... X: " + X);
          Label = askOracle(X,Z);
          query++;
          System.out.println("Making query: " + query);
          if( Label == InstancePair.CANNOT_LINK ){
            // Cannot-link, update CLcount
            CLcount++;
          } else{
            // Must-link, add to neighborset
            // update data structures
            m_NeighborSets[h].add(new Integer(Z));
            m_SumOfClusterInstances[h] = sumWithInstance(m_SumOfClusterInstances[h],m_Instances.instance(Z));
            m_ClusterAssignments[Z] = h;
            m_AssignedSet.add(new Integer(Z));
            break; // get out of for loop
          }
          if(query >= numQueries){
            // query budget exhausted mid-scan
            if (m_verbose) System.out.println("Run out of queries");
            m_NumCurrentClusters = lambda+1;
            return query;
          }
        }
      }
      if (m_verbose) {
        System.out.println("Out of for loop");
      }
      if( CLcount == lambda ){
        // found a point cannot-linked to all current clusters: start a new neighborhood
        lambda++;
        // update data structures
        m_NeighborSets[lambda] = new HashSet();
        m_NeighborSets[lambda].add(new Integer(Z));
        m_SumOfClusterInstances[lambda] = sumWithInstance(m_SumOfClusterInstances[lambda],m_Instances.instance(Z));
        m_ClusterAssignments[Z] = lambda;
        m_AssignedSet.add(new Integer(Z));
      }
    } // close else
  } // close while

  if (m_verbose) System.out.println("Number of queries: " + query);
  m_NumCurrentClusters = lambda+1;
  return query;
}

/**
 * Phase 2 ("Consolidate") code for active learning, with round robin.
 *
 * Repeatedly picks the point nearest to the centroid of the smallest
 * neighborhood and queries it against each neighborhood (smallest first,
 * then the rest in descending similarity to the point) until a must-link
 * places it. Finishes by building global centroids and taking the
 * ML/CL transitive closure.
 *
 * @param numQueries maximum number of oracle queries allowed in this phase
 * @throws Exception if a test point is selected (sanity check) or on failure
 */
protected void activePhaseTwoRoundRobin(int numQueries) throws Exception {
  int numInstances = m_Instances.numInstances();
  int X,Y;
  int query = 0, Label;
  System.out.println("In Consolidate phase, with numqueries: " + numQueries);

  while( query < numQueries ){
    if (m_verbose) System.out.println("Starting round robin");
    // starting round robin
    Instance[] clusterCentroids = new Instance[m_NumClusters];

    // find cluster with smallest size
    int smallestSize = Integer.MAX_VALUE, smallestCluster = -1;
    for (int i=0; i<m_NumClusters; i++) {
      if (m_NeighborSets[i].size() < smallestSize) {
        smallestSize = m_NeighborSets[i].size();
        smallestCluster = i;
      }
    }

    // compute centroid for smallest cluster from its running attribute sum
    if (m_isSparseInstance) {
      clusterCentroids[smallestCluster] = new SparseInstance(m_SumOfClusterInstances[smallestCluster]);
    } else {
      clusterCentroids[smallestCluster] = new Instance(m_SumOfClusterInstances[smallestCluster]);
    }
    clusterCentroids[smallestCluster].setDataset(m_Instances);
    if (!m_objFunDecreasing) {
      normalize(clusterCentroids[smallestCluster]);
    } else {
      normalizeByWeight(clusterCentroids[smallestCluster]);
    }

    // find next point, closest to centroid of smallest cluster
    X = nearestFromPoint(clusterCentroids[smallestCluster], m_AssignedSet);
    if (X == -1) {
      if (m_verbose) System.out.println("No more points left unassigned, we are DONE!!");
      createGlobalCentroids();
      addMLAndCLTransitiveClosure(null);
      return;
    }
    if (m_verbose) System.out.println("Nearest point is " + X);
    if (X >= m_StartingIndexOfTest) { // Sanity Check
      throw new Exception ("Test point selected, something went wrong!");
    }

    Iterator NbrIt = m_NeighborSets[smallestCluster].iterator();
    Y = ((Integer) NbrIt.next()).intValue(); // get any point from the smallest neighborhood
    Label = askOracle(X,Y);
    query++;
    System.out.println("Making query:" + query);
    if (m_verbose) System.out.println("Number of queries: " + query);

    if( Label == InstancePair.MUST_LINK ){
      // update data structures
      m_NeighborSets[smallestCluster].add(new Integer(X));
      m_SumOfClusterInstances[smallestCluster] = sumWithInstance(m_SumOfClusterInstances[smallestCluster], m_Instances.instance(X));
      m_ClusterAssignments[X] = smallestCluster;
      if (m_verbose) System.out.println("Adding " + X + " to cluster: " + smallestCluster);
      m_AssignedSet.add(new Integer(X));
      if( query >= numQueries ){
        if (m_verbose) System.out.println("Ran out of queries");
        System.out.println("Consolidate phase over after " + query + " queries");
        createGlobalCentroids();
        addMLAndCLTransitiveClosure(null);
        return;
      }
    } else {
      // must-link not found with smallest neighborhood, process other neighborhoods now
      if (m_verbose) System.out.println("Processing other centroids now");

      // compute centroids of other clusters
      for (int i=0; i<m_NumClusters; i++) {
        if (i != smallestCluster) { // already made query for smallest cluster
          if (m_isSparseInstance) {
            clusterCentroids[i] = new SparseInstance(m_SumOfClusterInstances[i]);
          } else {
            clusterCentroids[i] = new Instance(m_SumOfClusterInstances[i]);
          }
          clusterCentroids[i].setDataset(m_Instances);
          if (!m_objFunDecreasing) {
            normalize(clusterCentroids[i]);
          } else {
            normalizeByWeight(clusterCentroids[i]);
          }
        }
      }

      double[] similaritiesToCentroids = new double[m_NumClusters];
      for (int i=0; i<m_NumClusters; i++) {
        if (i != smallestCluster) { // already made query for smallestCluster
          similaritiesToCentroids[i] = m_metric.similarity(clusterCentroids[i], m_Instances.instance(X));
        }
      } // handles both Euclidean and WeightedDotP

      if (m_verbose) {
        System.out.println("Before sort");
        for (int i=0; i<m_NumClusters; i++) {
          System.out.println(similaritiesToCentroids[i]);
        }
      }
      int[] indices = Utils.sort(similaritiesToCentroids); // sorts in ascending order of similarity
      if (m_verbose) {
        System.out.println("After sort");
        for (int i=0; i<m_NumClusters; i++) {
          System.out.println(indices[i]);
        }
      }

      // walk neighborhoods in descending similarity to X
      for(int h = m_NumClusters-1; h >=0; h-- ){ // since sort is ascending, and we want descending sort of similarity values
        int index = indices[h];
        if (index != smallestCluster) { // already made query for smallest cluster
          NbrIt = m_NeighborSets[index].iterator();
          Y = ((Integer) NbrIt.next()).intValue(); // get any point from the neighborhood
          Label = askOracle(X,Y);
          query++;
          System.out.println("Making query:" + query);
          if (m_verbose) System.out.println("Number of queries: " + query);
          if( Label == InstancePair.MUST_LINK ){
            // update data structures
            m_NeighborSets[index].add(new Integer(X));
            m_SumOfClusterInstances[index] = sumWithInstance(m_SumOfClusterInstances[index], m_Instances.instance(X));
            m_ClusterAssignments[X] = index;
            if (m_verbose) System.out.println("Adding " + X + " to cluster: " + index);
            m_AssignedSet.add(new Integer(X));
            if (m_verbose) System.out.println("Exiting phase 2 for loop");
            break; // exit from for
          }
          if( query >= numQueries ){
            if (m_verbose) System.out.println("Ran out of queries");
            createGlobalCentroids();
            addMLAndCLTransitiveClosure(null);
            return;
          }
        }
      } // end reverse for
    } // end else
  } // end while

  createGlobalCentroids();
  addMLAndCLTransitiveClosure(null);
  return;
}

/**
 * Phase 2 code for active learning, random variant: queries randomly chosen
 * unassigned points against the neighborhood centroids.
 * NOTE(review): method is truncated here — the rest of the source was lost;
 * body below is the visible fragment only.
 *
 * @param numQueries maximum number of oracle queries allowed in this phase
 * @throws Exception if something goes wrong
 */
protected void activePhaseTwoRandom(int numQueries) throws Exception {
  int numInstances = m_Instances.numInstances();
  int X,Y;
  int query = 0, Label;
  System.out.println("In Phase 2 with random, with numqueries: " + numQueries);

  while( query < numQueries ){
    if (m_verbose) System.out.println("Starting phase 2");
    Instance[] clusterCentroids = new Instance[m_NumClusters];
    if (m_AssignedSet.size() == m_StartingIndexOfTest) {
      if (m_verbose) System.out.println("No more points left unassigned, we are DONE!!");
      createGlobalCentroids();
      addMLAndCLTransitiveClosure(null);
      return;
    }

    // find next point at random (resample until an unassigned point is found)
    X = m_RandomNumberGenerator.nextInt(m_StartingIndexOfTest);
    while (m_AssignedSet != null && m_AssignedSet.contains(new Integer(X))) {
      X = m_RandomNumberGenerator.nextInt(m_StartingIndexOfTest);
    }
    if (m_verbose) System.out.println("X = " + X + ", finding distances to centroids now");

    // compute centroids of other clusters
    for (int i=0; i<m_NumClusters; i++) {
      if (m_isSparseInstance) {
        clusterCentroids[i] = new SparseInstance(m_SumOfClusterInstances[i]);
      } else {
        clusterCentroids[i] = new Instance(m_SumOfClusterInstances[i]);
      }
      clusterCentroids[i].setDataset(m_Instances);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -