📄 xmeans.java
字号:
* "finished" does get true as soon as: * 1. number of clusters gets >= m_MaxClusters, * 2. in the last round, none of the centers have been split * * if number of clusters is already >= m_MaxClusters * part 1 (= Improve-Params) is done at least once. */ while (!finished && !stopIteration(m_IterationCount, m_MaxIterations)) { /* ==================================================================== * 1. Improve-Params * conventional K-means */ PFD(D_FOLLOWSPLIT, "\nBeginning of main loop - centers:"); PrCentersFD(D_FOLLOWSPLIT); PFD(D_ITERCOUNT, "\n*** 1. Improve-Params " + m_IterationCount + ". time"); m_IterationCount++; // prepare to converge boolean converged = false; // initialize assignments to -1 m_ClusterAssignments = initAssignments(m_Instances.numInstances()); // stores a list of indexes of instances belonging to each center int[][] instOfCent = new int[m_ClusterCenters.numInstances()][]; // KMeans loop counter int kMeansIteration = 0; // converge in conventional K-means ---------------------------------- PFD(D_FOLLOWSPLIT, "\nConverge in K-Means:"); while (!converged && !stopKMeansIteration(kMeansIteration, m_MaxKMeans)) { kMeansIteration++; converged = true; // assign instances to centers ------------------------------------- converged = assignToCenters(m_UseKDTree ? m_KDTree : null, m_ClusterCenters, instOfCent, allInstList, m_ClusterAssignments, kMeansIteration); PFD(D_FOLLOWSPLIT, "\nMain loop - Assign - centers:"); PrCentersFD(D_FOLLOWSPLIT); // compute new centers = centers of mass of points converged = recomputeCenters(m_ClusterCenters, // clusters instOfCent, // their instances m_Model); // model information PFD(D_FOLLOWSPLIT, "\nMain loop - Recompute - centers:"); PrCentersFD(D_FOLLOWSPLIT); } PFD(D_FOLLOWSPLIT, ""); PFD(D_FOLLOWSPLIT, "End of Part: 1. Improve-Params - conventional K-means"); /** ===================================================================== * 2. Improve-Structur */ // BIC before split distortioning the centres m_Mle = distortion(instOfCent, m_ClusterCenters); m_Bic = calculateBIC(instOfCent, m_ClusterCenters, m_Mle); PFD(D_FOLLOWSPLIT, "m_Bic " + m_Bic); int currNumCent = m_ClusterCenters.numInstances(); Instances splitCenters = new Instances(m_ClusterCenters, currNumCent * 2); // store BIC values of parent and children double[] pbic = new double [currNumCent]; double[] cbic = new double [currNumCent]; // split each center for (int i = 0; i < currNumCent // this could help to optimize the algorithm // && currNumCent + numSplits <= m_MaxNumClusters ; i++) { PFD(D_FOLLOWSPLIT, "\nsplit center " + i + " " + m_ClusterCenters.instance(i)); Instance currCenter = m_ClusterCenters.instance(i); int[] currInstList = instOfCent[i]; int currNumInst = instOfCent[i].length; // not enough instances; than continue with next if (currNumInst <= 2) { pbic[i] = Double.MAX_VALUE; cbic[i] = 0.0; // add center itself as dummy splitCenters.add(currCenter); splitCenters.add(currCenter); continue; } // split centers ---------------------------------------------- double variance = m_Mle[i] / (double)currNumInst; children = splitCenter(random0, currCenter, variance, m_Model); // initialize assignments to -1 int[] oneCentAssignments = initAssignments(currNumInst); int[][] instOfChCent = new int [2][]; // todo maybe split didn't work // converge the children -------------------------------------- converged = false; int kMeansForChildrenIteration = 0; PFD(D_FOLLOWSPLIT, "\nConverge, K-Means for children: " + i); while (!converged && !stopKMeansIteration(kMeansForChildrenIteration, m_MaxKMeansForChildren)) { kMeansForChildrenIteration++; converged = assignToCenters(children, instOfChCent, currInstList, oneCentAssignments); if (!converged) { recomputeCentersFast(children, instOfChCent, m_Model); } } // store new centers for later decision if they are taken splitCenters.add(children.instance(0)); splitCenters.add(children.instance(1)); PFD(D_FOLLOWSPLIT, "\nconverged cildren "); PFD(D_FOLLOWSPLIT, " " + children.instance(0)); PFD(D_FOLLOWSPLIT, " " + children.instance(1)); // compare parent and children model by their BIC-value pbic[i] = calculateBIC(currInstList, currCenter, m_Mle[i], m_Model); double[] chMLE = distortion(instOfChCent, children); cbic[i] = calculateBIC(instOfChCent, children, chMLE); } // end of loop over clusters // decide which one to split and make new list of cluster centers Instances newClusterCenters = null; newClusterCenters = newCentersAfterSplit(pbic, cbic, m_CutOffFactor, splitCenters); /** * Compare with before Improve-Structure */ int newNumClusters = newClusterCenters.numInstances(); if (newNumClusters != m_NumClusters) { PFD(D_FOLLOWSPLIT, "Compare with non-split"); // initialize assignments to -1 int[] newClusterAssignments = initAssignments(m_Instances.numInstances()); // stores a list of indexes of instances belonging to each center int[][] newInstOfCent = new int[newClusterCenters.numInstances()][]; // assign instances to centers ------------------------------------- converged = assignToCenters(m_UseKDTree ? m_KDTree : null, newClusterCenters, newInstOfCent, allInstList, newClusterAssignments, m_IterationCount); double[] newMle = distortion(newInstOfCent, newClusterCenters); double newBic = calculateBIC(newInstOfCent, newClusterCenters, newMle); PFD(D_FOLLOWSPLIT, "newBic " + newBic); if (newBic > m_Bic) { PFD(D_FOLLOWSPLIT, "*** decide for new clusters"); m_Bic = newBic; m_ClusterCenters = newClusterCenters; m_ClusterAssignments = newClusterAssignments; } else { PFD(D_FOLLOWSPLIT, "*** keep old clusters"); } } newNumClusters = m_ClusterCenters.numInstances(); // decide if finished: max num cluster reached // or last centers where not split at all if ((newNumClusters >= m_MaxNumClusters) || (newNumClusters == m_NumClusters)) { finished = true; } m_NumClusters = newNumClusters; } } /** * Checks for nominal attributes in the dataset. * Class attribute is ignored. * @param data the data to check * @return false if no nominal attributes are present */ public boolean checkForNominalAttributes(Instances data) { int i = 0; while (i < data.numAttributes()) { if ((i != data.classIndex()) && data.attribute(i++).isNominal()) { return true; } } return false; } /** * Set array of int, used to store assignments, to -1. * @param ass integer array used for storing assignments * @return integer array used for storing assignments */ protected int[] initAssignments(int[] ass) { for (int i = 0; i < ass.length; i++) ass[i] = -1; return ass; } /** * Creates and initializes integer array, used to store assignments. * @param numInstances length of array used for assignments * @return integer array used for storing assignments */ protected int[] initAssignments(int numInstances) { int[] ass = new int[numInstances]; for (int i = 0; i < numInstances; i++) ass[i] = -1; return ass; } /** * Creates and initializes boolean array. * @param len length of new array * @return the new array */ boolean[] initBoolArray(int len) { boolean[] boolArray = new boolean [len]; for (int i = 0; i < len; i++) { boolArray[i] = false; } return boolArray; } /** * Returns new center list. * * The following steps 1. and 2. both take care that the number of centers * does not exceed maxCenters. * * 1. Compare BIC values of parent and children and takes the one as * new centers which do win (= BIC-value is smaller). * * 2. If in 1. none of the children are chosen * && and cutoff factor is > 0 * cutoff factor is taken as the percentage of "best" centers that are * still taken. * @param pbic array of parents BIC-values * @param cbic array of childrens BIC-values * @param cutoffFactor cutoff factor * @param splitCenters all children * @return the new centers */ protected Instances newCentersAfterSplit(double[] pbic, double[] cbic, double cutoffFactor, Instances splitCenters) { // store if split won boolean splitPerCutoff = false; boolean takeSomeAway = false; boolean[] splitWon = initBoolArray(m_ClusterCenters.numInstances()); int numToSplit = 0; Instances newCenters = null; // how many would be split, because the children have a better bic value for (int i = 0; i < cbic.length; i++) { if (cbic[i] > pbic[i]) { // decide for splitting ---------------------------------------- splitWon[i] = true; numToSplit++; PFD(D_FOLLOWSPLIT, "Center " + i + " decide for children"); } else { // decide for parents and finished stays true ----------------- PFD(D_FOLLOWSPLIT, "Center " + i + " decide for parent"); } } // no splits yet so split per cutoff factor if ((numToSplit == 0) && (cutoffFactor > 0)) { splitPerCutoff = true; // how many to split per cutoff factor numToSplit = (int) ((double) m_ClusterCenters.numInstances() * m_CutOffFactor); } // prepare indexes of values in ascending order double[] diff = new double [m_NumClusters]; for (int j = 0; j < diff.length; j++) { diff[j] = pbic[j] - cbic[j]; } int[] sortOrder = Utils.sort(diff); // check if maxNumClusters would be exceeded int possibleToSplit = m_MaxNumClusters - m_NumClusters; if (possibleToSplit > numToSplit) { // still enough possible, do the whole amount possibleToSplit = numToSplit; } else takeSomeAway = true; // prepare for splitting the one that are supposed to be split if (splitPerCutoff) { for (int j = 0; (j < possibleToSplit) && (cbic[sortOrder[j]] > 0.0); j++) { splitWon[sortOrder[j]] = true; } m_NumSplitsStillDone += possibleToSplit; } else { // take some splits away if max number of clusters would be exceeded if (takeSomeAway) { int count = 0; int j = 0; for (;j < splitWon.length && count < possibleToSplit; j++){ if (splitWon[sortOrder[j]] == true) count++; } while (j < splitWon.length) { splitWon[sortOrder[j]] = false; j++; } } } // finally split if (possibleToSplit > 0) newCenters = newCentersAfterSplit(splitWon, splitCenters); else newCenters = m_ClusterCenters; return newCenters; } /** * Returns new centers. Depending on splitWon: if true takes children, if * false takes parent = current center. * * @param splitWon * array of boolean to indicate to take split or not * @param splitCenters * list of splitted centers * @return the new centers */ protected Instances newCentersAfterSplit(boolean[] splitWon, Instances splitCenters) { Instances newCenters = new Instances(splitCenters, 0); int sIndex = 0; for (int i = 0; i < splitWon.length; i++) { if (splitWon[i]) { m_NumSplitsDone++; newCenters.add(splitCenters.instance(sIndex++)); newCenters.add(splitCenters.instance(sIndex++)); } else { sIndex++; sIndex++; newCenters.add(m_ClusterCenters.instance(i)); } } return newCenters; } /** * Controls that counter does not exceed max iteration value. Special function * for kmeans iterations. * * @param iterationCount * current value of counter * @param max * maximum value for counter * @return true if iteration should be stopped */ protected boolean stopKMeansIteration(int iterationCount, int max) { boolean stopIterate = false; if (max >= 0) stopIterate = (iterationCount >= max); if (stopIterate) m_KMeansStopped++; return stopIterate; } /** * Checks if iterationCount has to be checked and if yes * (this means max is > 0) compares it with max. * * @param iterationCount the current iteration count * @param max the maximum number of iterations * @return true if maximum has been reached */ protected boolean stopIteration(int iterationCount, int max) { boolean stopIterate = false; if (max >= 0) stopIterate = (iterationCount >= max); return stopIterate; } /** * Recompute the new centers. New cluster center is center of mass of its * instances. Returns true if cluster stays the same. * @param centers the input and output centers * @param instOfCent the instances to the centers * @param model data model information * @return true if converged. */ protected boolean recomputeCenters(Instances centers, int[][] instOfCent, Instances model) { boolean converged = true; for (int i = 0; i < centers.numInstances(); i++) { double val; for (int j = 0; j < model.numAttributes(); j++) { val = meanOrMode(m_Instances, instOfCent[i], j); for (int k = 0; k < instOfCent[i].length; k++) if (converged && m_ClusterCenters.instance(i).value(j) != val) converged = false; if (!converged) m_ClusterCenters.instance(i).setValue(j, val); } } return converged; } /** * Recompute the new centers - 2nd version
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -