📄 hcl.java

📁 数据挖掘中的聚合层次聚类算法
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页
import java.text.*;
import java.io.*;

/**  
 * Carry out pairwise agglomerations.  For n items, therefore there
 * are n-1 agglomerations.  Represent the cluster labels is an nxn 
 * cluster label matrix.  Column no. n will be the singleton labels, 
 * 1 to n.  Column no. n-1 will have n-1 unique values (or label sequence
 * numbers).  Column no. n-2 will have n-2 unique values.  Column no. 1 
 * will have the value 1 only, implying that all n items are in one 
 * cluster.
 * <p>
 * ClustMat is our agglomeration "engine".  It looks after labeling 
 * only, and is independent of any agglomerative clustering criterion.
 * <p>
 * Other utility methods:
 * <p>
 * Dissim ... calculate dissimilarity matrix <br>
 * getNNs ... get nearest neighbors and associated
 *            nearest neighbor dissimilarities <br>
 * getSpaces ... helping in output formating <br>
 * printMatrix ... print matrix of doubles, or integers <br>
 * printVect ... print vector of doubles, or integers
 * <p> 
 * main does the following:
 * <ol>
 * <li> Reads data. (Format: integer row, column dimensions, followed
 *      by matrix values, read row-wise.)  No preprocessing on input. 
 * <li> Calculate pairwise dissimilarities, and determines nearest neighbors 
 *       and corresponding dissimilarities.  (Squared Euclidean distance 
 *       used.) 
 * <li> Determines the closest nearest neighbors.
 * <li> Carries out an agglomeration in ClustMat.
 * <li> Updates the  pairwise dissimilarity matrix, and then, on the basis
 *  of this, the nearest neighbors, and the nearest neighbor 
 *  dissimilarities.
 * <li> Repeats while no. of clusters is greater than 2. 
 * </ol>
 * Constant MAXVAL is used in, resp., dissimilarities and 
 * nearest neighbor dissimilarities, to indicate when items are processed and 
 * no longer exist as singletons.  <br>
 * Note also how flag = 1 denotes an active observation, and flag = 0
 * denotes an inactive one (since it has been agglomerated).  It is not 
 * necessary to use the flag since exceptionally high dissimilarities 
 * will signify inactive observations.  However the use of flag is 
 * helpful computationally.  
 * <p>
 * Step 5 here determines the agglomerative clustering criterion.  We are
 * currently using the minimum variance method.  It is indicated
 * in the code where to change to use other agglomerative criteria. <p>
 * Output cluster labels using original sequence numbers.  The ordering
 * of observations is not such that a dendrogram can be directly 
 * constructed.  However the cluster labels do allow selections and
 * further inter and intra cluster processing of the input data.  <p>
 * Example of use: <p>
 * <tt> javac HCL.java </tt> <br>
 * <tt> java HCL <a href="../iris.dat">iris.dat</a> > 
 *           <a href="../hcloutput.txt">hcloutput.txt</a> </tt> <p>
 * First version: 1999 Nov. <br>
 * Version: 2002 Oct. 26 <br>
 * Author: F. Murtagh, f.murtagh@qub.ac.uk
 * @version 2002 Oct. 26
 * @author F. Murtagh, f.murtagh@qub.ac.uk
 */

public class HCL
{
    public static final double MAXVAL = 1.0e12;

    /** 
     * Method Dissim, calculates dissimilarity n x n array
     * @param nrow integer row dimension
     * @param ncol integer column dimension
     * @param A floating row/column matrix
     * @return Adiss floating n x n dissimilarity array 
     */
    public static double[][] Dissim(int nrow, int ncol, 
                  double[] mass, double[][] A)
    {
    double[][] Adiss = new double[nrow][nrow];

       // Initialize to 0.  Caters for the diagonal which stays 0.
       for (int i1 = 0; i1 < nrow; i1++)  {
         for (int i2 = 0; i2 < nrow; i2++) Adiss[i1][i2] = 0.0;
       }
       for (int i1 = 0; i1 < nrow; i1++)  {
	   // All dissimilarity we are dealing with are symmetric
	   // so just calculate for half the array and fill in later
	   for (int i2 = 0; i2 < i1; i2++) {
	     for (int j = 0; j < ncol; j++) {
	       // We are happy with the squared dissimilarity
	       // 0.5 term since this equals 
               // (clcard[i1]*clcard[i2])/(clcard[i1]+clcard[i2]) 
               // for minimum variance criterion
	       Adiss[i1][i2] += 0.5*Math.pow(A[i1][j] - A[i2][j], 2.0); }
	       // Adiss[i1][i2] += ((mass[i1]*mass[i2])/(mass[i1]+mass[i2]))*
	       //   Math.pow( A[i1][j] - A[i2][j], 2.0 ); 
  	       // Fill the other half of the array 
               Adiss[i2][i1] = Adiss[i1][i2];
	   }
        }
        return Adiss;
    } // Dissim 
           

    /**
     * Method getNNs, determine NNs and NN dissimilarities
     * @param nrow row dimension or number of observations (input)
     * @param flag =1 for active observation, = 0 for inactive one (input) 
     * @param diss dissimilarity matrix (input)
     * @param nn nearest neighbor sequence number (calculated)
     * @param nndiss nearest neigbor dissimilarity (calculated)
     */
   public static void getNNs(int nrow, int[] flag, double[][] diss,
           int[] nn, double[]nndiss)
    {
       int minobs;  
       double mindist; 
       for (int i1 = 0; i1 < nrow; i1++)  {
           if (flag[i1] == 1) {
              minobs = -1;
              mindist = MAXVAL;
              for (int i2 = 0; i2 < nrow; i2++)   {
	         if ((diss[i1][i2] < mindist) && (i1 != i2)) {
                    mindist = diss[i1][i2];
                    minobs = i2;
                  }
              }
              nn[i1] = minobs + 1;
              nndiss[i1] = mindist;
           }
       }
       // Return type void => no return stmt
    } // getNNs


    /**
     * Method ClustMat, updates cluster structure matrix following 
     * an agglomeration
     * @param nrow row dimension or number of observations (input)
     * @param clusters list of agglomerations, stored as array of 
     *        pairs of cluster sequence numbers (input, and updated)
     * @param clust1 first agglomerand (input)
     * @param clust2 second agglomerand (input)
     * @param ncl number of clusters remaining (input)
     */
    public static void ClustMat(int nrow, int[][] clusters, 
           int clust1, int clust2, int ncl)
    {
        // If either clust* is not initialized, then we must init. clusters
	if ((clust1 == 0) || (clust2 == 0)) {
           for (int j = 0; j < nrow; j++) {
               for (int i = 0; i < nrow; i++) {
                   clusters[i][j] = 0;
	       }
	   }
	   // Adjust for 0-sequencing in extreme right-hand col values.
           for (int i = 0; i < nrow; i++) clusters[i][ncl-1] = i + 1;
           return;
	}

        // For some agglomeration, we are told that label clust1 and 
        // label clust2, among all labels in col. ncl-1 (ncl ranges over
        // 0 thru nrow-1) are to be agglomerated
	// We have arranged that order is such that clust1 < clust2
	// Note: clust1 and clust2 are 1-sequenced and not 0-sequenced 

        int ncl1; 
        ncl1 = ncl - 1; 
        for (int i = 0; i < nrow; i++) {
            clusters[i][ncl1] = clusters[i][ncl];
            if (clusters[i][ncl1] == clust2) clusters[i][ncl1] = clust1;
        }
        // Return type void => no return stmt
    } // ClustMat
 

    // Little method for helping in output formating  
    public static String getSpaces(int n) {
    StringBuffer sb = new StringBuffer(n);
    for (int i = 0; i < n; i++) sb.append(' ');
    return sb.toString();
    }


    /**
     * Method for printing a double float matrix  <br>
     * Based on ER Harold, "Java I/O", O'Reilly, around p. 473.
     * @param n1 row dimension of matrix
     * @param n2 column dimension of matrix
     * @param m input matrix values, double 
     * @param d display precision, number of decimal places
     * @param w display precision, total width of floating value
     */
    public static void printMatrix(int n1, int n2, double[][] m, int d, int w)
    {
        // Some definitions for handling output formating
      NumberFormat myFormat = NumberFormat.getNumberInstance();
      FieldPosition fp = new FieldPosition(NumberFormat.INTEGER_FIELD);
      myFormat.setMaximumIntegerDigits(d);
      myFormat.setMaximumFractionDigits(d);
      myFormat.setMinimumFractionDigits(d);
      for (int i=0; i<n1; i++)
          {
              // Print each row, elements separated by spaces
              for (int j=0; j<n2; j++) {
                  String valString = myFormat.format(
                       m[i][j], new StringBuffer(), fp).toString();
                  valString = getSpaces(w - fp.getEndIndex()) + valString;
                  System.out.print(valString);
                  }
              // Start a new line at the end of a row
              System.out.println();
          }
      // Leave a gap after the entire matrix
      System.out.println();
    }


    /**
     * Method for printing an integer matrix  <br>
     * Based on ER Harold, "Java I/O", O'Reilly, around p. 473.
     * @param n1 row dimension of matrix
     * @param n2 column dimension of matrix
     * @param m input matrix values 
     * @param d display precision, number of decimal places
     * @param w display precision, total width of floating value
     */
    public static void printMatrix(int n1, int n2, int[][] m, int d, int w)
    {
        // Some definitions for handling output formating
      NumberFormat myFormat = NumberFormat.getNumberInstance();
      FieldPosition fp = new FieldPosition(NumberFormat.INTEGER_FIELD);
      myFormat.setMaximumIntegerDigits(d);
      for (int i=0; i<n1; i++)   {
              // Print each row, elements separated by spaces
              for (int j=0; j<n2; j++)  {
                  String valString = myFormat.format(
                       m[i][j], new StringBuffer(), fp).toString();
		  // 4 character locations per integer number 
                  valString = getSpaces(w - fp.getEndIndex()) + valString;
                  System.out.print(valString);
                  }
              // Start a new line at the end of a row
              System.out.println();
          }
      // Leave a gap after the entire matrix
      System.out.println();
    }  // printMatrix


    /**
     * Method printVect for printing a double float vector <br>
     * Based on ER Harold, "Java I/O", O'Reilly, around p. 473.
     * @param m input vector of length m.length
     * @param d display precision, number of decimal places
     * @param w display precision, total width of floating value
     */
    public static void printVect(double[] m, int d, int w)
    {
        // Some definitions for handling output formating
      NumberFormat myFormat = NumberFormat.getNumberInstance();
      FieldPosition fp = new FieldPosition(NumberFormat.INTEGER_FIELD);
      myFormat.setMaximumIntegerDigits(d);
      myFormat.setMaximumFractionDigits(d);
      myFormat.setMinimumFractionDigits(d);
      int len = m.length;
      for (int i=0; i<len; i++)  {
             String valString = myFormat.format(
                   m[i], new StringBuffer(), fp).toString();
             valString = getSpaces(w - fp.getEndIndex()) + valString;
                   System.out.print(valString);
          }
          // Start a new line at the row end
          System.out.println();
      // Leave a gap after the entire vector
      System.out.println();
    }  // printVect


    /**
     * Method printVect for printing an integer vector <br>
12 下一页
💿 文件大小 7 K
👤 上传用户 a1102882595
📂 所属分类数值算法/人工智能
🏷️ 相关标签

#数据挖掘 #层次聚类 #算法
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -