📄 hcl.java

📁 数据挖掘中的聚合层次聚类算法
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
上一页 12
     * Based on ER Harold, "Java I/O", O'Reilly, around p. 473.
     * @param m input vector of length m.length
     * @param d display precision, number of decimal places
     * @param w display precision, total width of floating value
     */
    public static void printVect(int[] m, int d, int w)
    {
        // Some definitions for handling output formating
      NumberFormat myFormat = NumberFormat.getNumberInstance();
      FieldPosition fp = new FieldPosition(NumberFormat.INTEGER_FIELD);
      myFormat.setMaximumIntegerDigits(d);
      int len = m.length;
      for (int i=0; i<len; i++)  {
             String valString = myFormat.format(
                   m[i], new StringBuffer(), fp).toString();
             valString = getSpaces(w - fp.getEndIndex()) + valString;
                   System.out.print(valString);
          }
          // Start a new line at the row end
          System.out.println();
      // Leave a gap after the entire vector
      System.out.println();
    } // printVect


    // The main method contains the body of the program
    public static void main(String[] argv)
    {  
   PrintStream out = System.out;

   try{
     if (argv.length == 0) {
        System.out.println(" Syntax: java HCL infile.dat ");
        System.out.println(" Input file format: ");
        System.out.println(" Line 1: integer no. rows, no. cols.");
        System.out.println(" Successive lines: matrix values, floating");
        System.out.println(" Read in row-wise");
        System.exit (1);
     }
     String filname = argv[0]; 
     System.out.println (" Input file name: " + filname);

     // Open the matrix file
     FileInputStream is  = new FileInputStream(filname);
     BufferedReader  bis = new BufferedReader(new InputStreamReader(is));
     StreamTokenizer st  = new StreamTokenizer(bis);

     // Row and column sizes, read in first
     st.nextToken();  int nrow = (int)st.nval;
     st.nextToken();  int ncol = (int)st.nval;

     System.out.println(" No. of rows, nrow = " + nrow);
     System.out.println(" No. of cols, ncol = " + ncol);
     
     // Input array, values to be read in successively, float
     double[][] indat = new double[nrow][ncol]; 
     double inval;

     // New read in input array values, successively
     System.out.println
        (" Input data sample follows as a check, first 4 values.");
     for (int i = 0; i < nrow; i++) {
        for (int j = 0; j < ncol; j++) {
             st.nextToken();
             inval = (double)st.nval;
             indat[i][j] = inval;
             if (i < 2 && j < 2) {
                System.out.println(" value = " + inval);
             }
        }
     }  
     System.out.println();

     //-------------------------------------------------------------------

     // Some definitions and initializations
        int[][] clusters = new int[nrow][nrow];  // cluster label matrix
        int[] nn = new int[nrow];                // cluster nearest neigh's
        int[] flag = new int[nrow];              // flag of active obs
        double[] nndiss = new double[nrow];      // nearest neigh diss's
        double[] clcard = new double[nrow];      // cluster cardinality
	double[] mass = new double[nrow];        // observation masses
	double[] cpoids = new double[ncol];      // column weights
        int minobs;
        double mindist;  
        int ncl;   // Initial number of clusters 
        ncl = nrow; 

        for (int i = 0; i < nrow; i++) {
            flag[i] = 1;
            clcard[i] = 1.0;
            mass[i] = 1.0; 
        }
        for (int j = 0; j < ncol; j++) {
            cpoids[j] = 0.0; 
        }

        // We may wish to normalize out input data, 
	// and create mass and cpoids, resp. row and col. masses
	// In the following: row/col. masses are row/col sums div. by total
	// double tot = 0.0; 
        // for (int i = 0; i < nrow; i++) {
        //   for (int j = 0; j < ncol; j++) {
        //      mass[i] += indat[i][j];
	//	cpoids[j] += indat[i][j]; 
        //      tot += indat[i][j];
	//    }
	// }
	// And then we det. fij <-  xij/fi*sqrt(fj)
	// where fi and fj are, resp., row and col. masses
        // for (int j = 0; j < ncol; j++) {
	//  cpoids[j] = Math.sqrt(cpoids[j]/tot);
	// }
	// for (int i = 0; i < nrow; i++) {
	//  mass[i] /= tot; 
	//  for (int j = 0; j < ncol; j++) {
	//	indat[i][j] /= (tot*mass[i]*cpoids[j]);
	//    }
	// }
	//System.out.println("Normalized array to be analyzed:");
        //printMatrix(nrow, ncol, indat, 4, 10);
	//System.out.println("Row masses:");
        //printVect(mass, 4, 10);

	// Alternative normalizations include:
	// Centre row vectors to zero mean, and reduce to unit std. dev.
	// We'll use no normalization.  We'll take masses as equal to 
	// cluster cardinalities.  

        // Get dissimilarities
        double[][] diss = new double[nrow][nrow];
        diss = Dissim(nrow, ncol, mass, indat); 
        // Print it out
        System.out.println("Dissimilarity matrix for analysis:");
        printMatrix(nrow, nrow, diss, 4, 10);

	// Get nearest neighbors and nearest neighbor dissimilarities
        getNNs(nrow, flag, diss, nn, nndiss);
	//  System.out.println("Nearest neighbors:");
	//  printVect(nn, 4, 10); 
        //  System.out.println("Nearest neighbors dissimilarities:");
        //  printVect(nndiss, 4, 10);

        // Get closest neighbors, using nndiss, followed by nn
        int clust1 = 0;
        int clust2 = 0;
        int cl1    = 0;
        int cl2    = 0;
	// Call to initialize
        ClustMat(nrow, clusters, clust1, clust2, ncl);
        //  System.out.println("No. of clusters:  " + ncl);  
	//  System.out.println(" Cluster label matrix:");
        //  printMatrix(nrow, nrow, clusters, 4, 10);


	// Loop to carry out series of nrow-1 agglomerations
        do 
	    {

        minobs = -1;
        mindist = MAXVAL;
	    for (int i = 0; i < nrow; i++) {
                if (flag[i] == 1) {
                   if (nndiss[i] < mindist) {
                      mindist = nndiss[i];
                      minobs = i;
		   } 
                }
	    } 
	    // minobs is one cluster label, the other is nn[minobs]
	    // Adjust for 0-sequencing
  	    if (minobs < nn[minobs]) {
               clust1 = minobs + 1;
               clust2 = nn[minobs];
            }
  	    if (minobs > nn[minobs]) {
               clust2 = minobs + 1;
               clust1 = nn[minobs];
            }
	     //    Now we have clust1 < clust2, and we'll agglomerate
	    System.out.println(" clus#1: "  + clust1 +
                               ";  clus#2: "  + clust2 +
                               ";  new card: " + (clcard[clust1-1]+
                                                  clcard[clust2-1]) + 
                               "; # clus left: " + ncl +
                               "; mindiss: " + mindist);

	    // Now we will carry out an agglomeration
            ncl = ncl - 1; 
            ClustMat(nrow, clusters, clust1, clust2, ncl);
	    // System.out.println("#clusters left: " + ncl +
            //                 "; cluster label matrix: ");
            // printMatrix(nrow, nrow, clusters, 4, 10);

	    // Update the following: diss, nndiss
            // Strategy:
            // nn[clust2] ceases to exist; similarly nndiss[clust2]
            // ... for all occurrences of clust2
            // nn[clust1] must be updated, as must nndiss[clust1]
            // Only other change is for any nn[i] such that nn[i] =
            // ... clust1 or clust2; this must be updated.

            // First, update diss
           
            cl1 = clust1 - 1;
            cl2 = clust2 - 1;
            for (int i = 0; i < nrow; i++) {
		// The following test is important:
		if ( (i != cl1) && (i != cl2) && (flag[i] == 1) ) {
 		   // Minimum variance criterion
                   diss[cl1][i] = 
                   (mass[cl1]+mass[i])/(mass[cl1]+mass[cl2]+mass[i])
                   *diss[cl1][i]   +
                   (mass[cl2]+mass[i])/(mass[cl1]+mass[cl2]+mass[i])
                   *diss[cl2][i]   -
                   (mass[i])/(mass[cl1]+mass[cl2]+mass[i])
                   *diss[cl1][cl2] ;
		   // For other alternative agglomeration criteria,
		   // e.g. single or complete linkage, see the update
		   // formulas in F. Murtagh, "Multidimensional 
		   // Clustering Algorithms", Physica-Verlag, 1985, p. 68.
		// ( (clcard[cl1]*clcard[cl2])/(clcard[cl1]+clcard[cl2]) )*
                //   (diss[cl1][i] - diss[cl2][i]);
                diss[i][cl1] = diss[cl1][i];
		}
	    }
            clcard[cl1] = clcard[cl1] + clcard[cl2]; 
	    mass[cl1] = mass[cl1] + mass[cl2];
	    // Cluster label clust2 is knocked out
            for (int i = 0; i < nrow; i++) {
                diss[cl2][i] = MAXVAL;
                diss[i][cl2] = diss[cl2][i];
                flag[cl2] = 0; 
                nndiss[cl2] = MAXVAL; 
		mass[cl2] = 0; 
	    }


	// Get nearest neighbors and nearest neighbor dissimilarities
        getNNs(nrow, flag, diss, nn, nndiss);
	// System.out.println("Nearest neighbors of items 1, 2, ... n:");
	// printVect(nn, 4, 10); 
        // System.out.println 
        //   ("Corresponding nearest neighbors dissimilarities:");
        // printVect(nndiss, 4, 10);

	    }  // End of agglomerations loop 
            while (ncl > 1);

	// For convenience, transpose the cluster labels
        int[][] tclusters = new int[nrow][nrow]; 
        for (int i1 = 0; i1 < nrow; i1++) {
            for (int i2 = 0; i2 < nrow; i2++) {
                tclusters[i2][i1] = clusters[i1][i2];
            }
        }

	// Row 1: just one cluster
        // Row 2: two clusters
        // Row 3: three clusters
        // ...
        // Row nrow: nrow clusters
	printMatrix(nrow, nrow, tclusters, 4, 4);

  } // End of try

  catch (IOException e) {
     out.println("error: " + e);
     System.exit(1);
  }
  
    } // End of main  

} // End of class HCL
上一页 12
💿 文件大小 7 K
👤 上传用户 a1102882595
📂 所属分类数值算法/人工智能
🏷️ 相关标签

#数据挖掘 #层次聚类 #算法
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -