📄 xmeans.java

📁 数据挖掘中聚类的算法
💻 JAVA
📖 第 1 页 / 共 5 页
字号:
     * "finished" does get true as soon as:     * 1. number of clusters gets >= m_MaxClusters,      * 2. in the last round, none of the centers have been split     *      * if number of clusters is already >= m_MaxClusters      * part 1 (= Improve-Params) is done at least once.     */    while (!finished &&           !stopIteration(m_IterationCount, m_MaxIterations)) {            /* ====================================================================       * 1. Improve-Params                         *    conventional K-means       */      PFD(D_FOLLOWSPLIT, "\nBeginning of main loop - centers:");      PrCentersFD(D_FOLLOWSPLIT);      PFD(D_ITERCOUNT, "\n*** 1. Improve-Params " + m_IterationCount + 	  ". time");      m_IterationCount++;      // prepare to converge      boolean converged = false;      // initialize assignments to -1      m_ClusterAssignments = initAssignments(m_Instances.numInstances());      // stores a list of indexes of instances belonging to each center      int[][] instOfCent = new int[m_ClusterCenters.numInstances()][];      // KMeans loop counter      int kMeansIteration = 0;      // converge in conventional K-means ----------------------------------      PFD(D_FOLLOWSPLIT, "\nConverge in K-Means:");      while (!converged && 	     !stopKMeansIteration(kMeansIteration, m_MaxKMeans)) {		kMeansIteration++;	converged = true;	        // assign instances to centers -------------------------------------        converged = assignToCenters(m_UseKDTree ? m_KDTree : null,				    m_ClusterCenters, 				    instOfCent,				    allInstList, 				    m_ClusterAssignments,				    kMeansIteration);		PFD(D_FOLLOWSPLIT, "\nMain loop - Assign - centers:");	PrCentersFD(D_FOLLOWSPLIT);	// compute new centers = centers of mass of points        converged = recomputeCenters(m_ClusterCenters, // clusters				     instOfCent,       // their instances				     m_Model);         // model information      PFD(D_FOLLOWSPLIT, "\nMain loop - Recompute - centers:");      PrCentersFD(D_FOLLOWSPLIT);      }      PFD(D_FOLLOWSPLIT, "");      PFD(D_FOLLOWSPLIT, "End of Part: 1. Improve-Params - conventional K-means");      /** =====================================================================       * 2. Improve-Structur       */      // BIC before split distortioning the centres      m_Mle = distortion(instOfCent, m_ClusterCenters);      m_Bic = calculateBIC(instOfCent, m_ClusterCenters, m_Mle);      PFD(D_FOLLOWSPLIT, "m_Bic " + m_Bic);      int currNumCent = m_ClusterCenters.numInstances();      Instances splitCenters = new Instances(m_ClusterCenters, 					     currNumCent * 2);            // store BIC values of parent and children      double[] pbic = new double [currNumCent];      double[] cbic = new double [currNumCent];                  // split each center      for (int i = 0; i < currNumCent 	   // this could help to optimize the algorithm	   //	     && currNumCent + numSplits <= m_MaxNumClusters           ; 	   i++) {		PFD(D_FOLLOWSPLIT, "\nsplit center " + i +		      " " + m_ClusterCenters.instance(i));	Instance currCenter = m_ClusterCenters.instance(i);	int[] currInstList = instOfCent[i];	int currNumInst = instOfCent[i].length;		// not enough instances; than continue with next	if (currNumInst <= 2) {	  pbic[i] = Double.MAX_VALUE;	  cbic[i] = 0.0;	  // add center itself as dummy	  splitCenters.add(currCenter);	  splitCenters.add(currCenter);	  continue;	}		// split centers  ----------------------------------------------	double variance = m_Mle[i] / (double)currNumInst;	children = splitCenter(random0, currCenter, variance, m_Model);		// initialize assignments to -1	int[] oneCentAssignments = initAssignments(currNumInst);	int[][] instOfChCent = new int [2][]; // todo maybe split didn't work		// converge the children  --------------------------------------	converged = false;	int kMeansForChildrenIteration = 0;	PFD(D_FOLLOWSPLIT, "\nConverge, K-Means for children: " + i);	while (!converged &&           !stopKMeansIteration(kMeansForChildrenIteration, 			       m_MaxKMeansForChildren)) {	  kMeansForChildrenIteration++;	  	  converged =	    assignToCenters(children, instOfChCent,			    currInstList, oneCentAssignments);	  if (!converged) {       	    recomputeCentersFast(children, instOfChCent, m_Model);	  }	} 	// store new centers for later decision if they are taken	splitCenters.add(children.instance(0));	splitCenters.add(children.instance(1));	PFD(D_FOLLOWSPLIT, "\nconverged cildren ");	PFD(D_FOLLOWSPLIT, " " + children.instance(0));	PFD(D_FOLLOWSPLIT, " " + children.instance(1));	// compare parent and children model by their BIC-value	pbic[i] = calculateBIC(currInstList, currCenter,  m_Mle[i], m_Model);	double[] chMLE = distortion(instOfChCent, children);	cbic[i] = calculateBIC(instOfChCent, children, chMLE);      } // end of loop over clusters      // decide which one to split and make new list of cluster centers      Instances newClusterCenters = null;      newClusterCenters = newCentersAfterSplit(pbic, cbic, m_CutOffFactor,                                                 splitCenters);      /**       * Compare with before Improve-Structure       */      int newNumClusters = newClusterCenters.numInstances();      if (newNumClusters != m_NumClusters) {		PFD(D_FOLLOWSPLIT, "Compare with non-split");	// initialize assignments to -1	int[] newClusterAssignments = 	  initAssignments(m_Instances.numInstances());		// stores a list of indexes of instances belonging to each center	int[][] newInstOfCent = new int[newClusterCenters.numInstances()][];		// assign instances to centers -------------------------------------	converged = assignToCenters(m_UseKDTree ? m_KDTree : null,				    newClusterCenters, 				    newInstOfCent,				    allInstList, 				    newClusterAssignments,				    m_IterationCount);		double[] newMle = distortion(newInstOfCent, newClusterCenters);	double newBic = calculateBIC(newInstOfCent, newClusterCenters, newMle);	PFD(D_FOLLOWSPLIT, "newBic " + newBic);	if (newBic > m_Bic) {          PFD(D_FOLLOWSPLIT, "*** decide for new clusters");	  m_Bic = newBic;	  m_ClusterCenters = newClusterCenters;	  m_ClusterAssignments = newClusterAssignments;	} else {          PFD(D_FOLLOWSPLIT, "*** keep old clusters");        }      }      newNumClusters = m_ClusterCenters.numInstances();      // decide if finished: max num cluster reached       // or last centers where not split at all       if ((newNumClusters >= m_MaxNumClusters) 	  || (newNumClusters == m_NumClusters)) {	finished = true;      }      m_NumClusters = newNumClusters;    }  }  /**   * Checks for nominal attributes in the dataset.   * Class attribute is ignored.   * @param data the data to check   * @return false if no nominal attributes are present   */  public boolean checkForNominalAttributes(Instances data) {    int i = 0;    while (i < data.numAttributes()) {      if ((i != data.classIndex()) && data.attribute(i++).isNominal()) {	return true;      }    }    return false;  }  /**   * Set array of int, used to store assignments, to -1.   * @param ass integer array used for storing assignments   * @return integer array used for storing assignments   */  protected int[] initAssignments(int[] ass) {    for (int i = 0; i < ass.length; i++)      ass[i] = -1;    return ass;  }       /**   * Creates and initializes integer array, used to store assignments.   * @param numInstances length of array used for assignments   * @return integer array used for storing assignments   */  protected int[] initAssignments(int numInstances) {    int[] ass = new int[numInstances];    for (int i = 0; i < numInstances; i++)      ass[i] = -1;    return ass;  }        /**   * Creates and initializes boolean array.   * @param len length of new array   * @return the new array   */  boolean[] initBoolArray(int len) {    boolean[] boolArray = new boolean [len];    for (int i = 0; i < len; i++) {      boolArray[i] = false;    }    return boolArray;  }  /**   * Returns new center list.   *   * The following steps 1. and 2. both take care that the number of centers   * does not exceed maxCenters.   *   * 1. Compare BIC values of parent and children and takes the one as   * new centers which do win (= BIC-value is smaller).   *   * 2. If in 1. none of the children are chosen    *    && and cutoff factor is > 0   * cutoff factor is taken as the percentage of "best" centers that are   * still taken.   * @param pbic array of parents BIC-values   * @param cbic array of childrens BIC-values   * @param cutoffFactor cutoff factor    * @param splitCenters all children    * @return the new centers   */  protected Instances newCentersAfterSplit(double[] pbic, 					 double[] cbic,					 double cutoffFactor,					 Instances splitCenters) {    // store if split won    boolean splitPerCutoff = false;    boolean takeSomeAway = false;    boolean[] splitWon = initBoolArray(m_ClusterCenters.numInstances());    int numToSplit = 0;    Instances newCenters = null;        // how many would be split, because the children have a better bic value    for (int i = 0; i < cbic.length; i++) {      if (cbic[i] > pbic[i]) {	// decide for splitting ----------------------------------------	splitWon[i] = true; numToSplit++;	PFD(D_FOLLOWSPLIT, "Center " + i + " decide for children");      }      else {	// decide for parents and finished stays true  -----------------	PFD(D_FOLLOWSPLIT, "Center " + i + " decide for parent");      }    }    // no splits yet so split per cutoff factor    if ((numToSplit == 0) && (cutoffFactor > 0)) {      splitPerCutoff = true;            // how many to split per cutoff factor      numToSplit = (int)         ((double) m_ClusterCenters.numInstances() * m_CutOffFactor);     }    // prepare indexes of values in ascending order      double[] diff = new double [m_NumClusters];    for (int j = 0; j < diff.length; j++) {      diff[j] = pbic[j] - cbic[j];    }        int[] sortOrder = Utils.sort(diff);        // check if maxNumClusters would be exceeded    int possibleToSplit = m_MaxNumClusters - m_NumClusters;     if (possibleToSplit > numToSplit) {      // still enough possible, do the whole amount      possibleToSplit = numToSplit;    }    else      takeSomeAway = true;    // prepare for splitting the one that are supposed to be split    if (splitPerCutoff) {      for (int j = 0; (j < possibleToSplit) && (cbic[sortOrder[j]] > 0.0);	   j++) {	splitWon[sortOrder[j]] = true;      }      m_NumSplitsStillDone += possibleToSplit;    }     else {      // take some splits away if max number of clusters would be exceeded      if (takeSomeAway) {	int count = 0;	int j = 0;	for (;j < splitWon.length && count < possibleToSplit; j++){	  if (splitWon[sortOrder[j]] == true) count++;	}		while (j < splitWon.length) {	  splitWon[sortOrder[j]] = false;	  j++;	}      }    }       // finally split    if (possibleToSplit > 0)       newCenters = newCentersAfterSplit(splitWon, splitCenters);    else      newCenters = m_ClusterCenters;    return newCenters;  }  /**   * Returns new centers. Depending on splitWon: if true takes children, if   * false takes parent = current center.   *    * @param splitWon   *          array of boolean to indicate to take split or not   * @param splitCenters   *          list of splitted centers   * @return the new centers   */  protected Instances newCentersAfterSplit(boolean[] splitWon,      Instances splitCenters) {    Instances newCenters = new Instances(splitCenters, 0);    int sIndex = 0;    for (int i = 0; i < splitWon.length; i++) {      if (splitWon[i]) {        m_NumSplitsDone++;        newCenters.add(splitCenters.instance(sIndex++));        newCenters.add(splitCenters.instance(sIndex++));      } else {        sIndex++;        sIndex++;        newCenters.add(m_ClusterCenters.instance(i));      }    }    return newCenters;  }  /**   * Controls that counter does not exceed max iteration value. Special function   * for kmeans iterations.   *    * @param iterationCount   *          current value of counter   * @param max   *          maximum value for counter   * @return true if iteration should be stopped   */   protected boolean stopKMeansIteration(int iterationCount, int max) {    boolean stopIterate = false;    if (max >= 0)       stopIterate = (iterationCount >= max);    if (stopIterate)       m_KMeansStopped++;    return stopIterate;  }  /**   * Checks if iterationCount has to be checked and if yes   * (this means max is > 0) compares it with max.   *    * @param iterationCount the current iteration count   * @param max the maximum number of iterations   * @return true if maximum has been reached   */   protected boolean stopIteration(int iterationCount, int max) {    boolean stopIterate = false;    if (max >= 0)       stopIterate = (iterationCount >= max);    return stopIterate;  }  /**   * Recompute the new centers. New cluster center is center of mass of its    * instances. Returns true if cluster stays the same.   * @param centers the input and output centers   * @param instOfCent the instances to the centers    * @param model data model information   * @return true if converged.   */   protected boolean recomputeCenters(Instances centers,          				   int[][] instOfCent, 				   Instances model) {    boolean converged = true;        for (int i = 0; i < centers.numInstances(); i++) {      double val;      for (int j = 0; j < model.numAttributes(); j++) {	val = meanOrMode(m_Instances, instOfCent[i], j);	for (int k = 0; k < instOfCent[i].length; k++)	if (converged && m_ClusterCenters.instance(i).value(j) != val) 	  converged = false;	if (!converged)	  m_ClusterCenters.instance(i).setValue(j, val);      }    }    return converged;  }  /**   * Recompute the new centers - 2nd version
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -