📄 hcl.java
字号:
import java.text.*;
import java.io.*;
/**
* Carry out pairwise agglomerations. For n items, therefore there
* are n-1 agglomerations. Represent the cluster labels is an nxn
* cluster label matrix. Column no. n will be the singleton labels,
* 1 to n. Column no. n-1 will have n-1 unique values (or label sequence
* numbers). Column no. n-2 will have n-2 unique values. Column no. 1
* will have the value 1 only, implying that all n items are in one
* cluster.
* <p>
* ClustMat is our agglomeration "engine". It looks after labeling
* only, and is independent of any agglomerative clustering criterion.
* <p>
* Other utility methods:
* <p>
* Dissim ... calculate dissimilarity matrix <br>
* getNNs ... get nearest neighbors and associated
* nearest neighbor dissimilarities <br>
* getSpaces ... helping in output formating <br>
* printMatrix ... print matrix of doubles, or integers <br>
* printVect ... print vector of doubles, or integers
* <p>
* main does the following:
* <ol>
* <li> Reads data. (Format: integer row, column dimensions, followed
* by matrix values, read row-wise.) No preprocessing on input.
* <li> Calculate pairwise dissimilarities, and determines nearest neighbors
* and corresponding dissimilarities. (Squared Euclidean distance
* used.)
* <li> Determines the closest nearest neighbors.
* <li> Carries out an agglomeration in ClustMat.
* <li> Updates the pairwise dissimilarity matrix, and then, on the basis
* of this, the nearest neighbors, and the nearest neighbor
* dissimilarities.
* <li> Repeats while no. of clusters is greater than 2.
* </ol>
* Constant MAXVAL is used in, resp., dissimilarities and
* nearest neighbor dissimilarities, to indicate when items are processed and
* no longer exist as singletons. <br>
* Note also how flag = 1 denotes an active observation, and flag = 0
* denotes an inactive one (since it has been agglomerated). It is not
* necessary to use the flag since exceptionally high dissimilarities
* will signify inactive observations. However the use of flag is
* helpful computationally.
* <p>
* Step 5 here determines the agglomerative clustering criterion. We are
* currently using the minimum variance method. It is indicated
* in the code where to change to use other agglomerative criteria. <p>
* Output cluster labels using original sequence numbers. The ordering
* of observations is not such that a dendrogram can be directly
* constructed. However the cluster labels do allow selections and
* further inter and intra cluster processing of the input data. <p>
* Example of use: <p>
* <tt> javac HCL.java </tt> <br>
* <tt> java HCL <a href="../iris.dat">iris.dat</a> >
* <a href="../hcloutput.txt">hcloutput.txt</a> </tt> <p>
* First version: 1999 Nov. <br>
* Version: 2002 Oct. 26 <br>
* Author: F. Murtagh, f.murtagh@qub.ac.uk
* @version 2002 Oct. 26
* @author F. Murtagh, f.murtagh@qub.ac.uk
*/
public class HCL
{
public static final double MAXVAL = 1.0e12;
/**
* Method Dissim, calculates dissimilarity n x n array
* @param nrow integer row dimension
* @param ncol integer column dimension
* @param A floating row/column matrix
* @return Adiss floating n x n dissimilarity array
*/
public static double[][] Dissim(int nrow, int ncol,
double[] mass, double[][] A)
{
double[][] Adiss = new double[nrow][nrow];
// Initialize to 0. Caters for the diagonal which stays 0.
for (int i1 = 0; i1 < nrow; i1++) {
for (int i2 = 0; i2 < nrow; i2++) Adiss[i1][i2] = 0.0;
}
for (int i1 = 0; i1 < nrow; i1++) {
// All dissimilarity we are dealing with are symmetric
// so just calculate for half the array and fill in later
for (int i2 = 0; i2 < i1; i2++) {
for (int j = 0; j < ncol; j++) {
// We are happy with the squared dissimilarity
// 0.5 term since this equals
// (clcard[i1]*clcard[i2])/(clcard[i1]+clcard[i2])
// for minimum variance criterion
Adiss[i1][i2] += 0.5*Math.pow(A[i1][j] - A[i2][j], 2.0); }
// Adiss[i1][i2] += ((mass[i1]*mass[i2])/(mass[i1]+mass[i2]))*
// Math.pow( A[i1][j] - A[i2][j], 2.0 );
// Fill the other half of the array
Adiss[i2][i1] = Adiss[i1][i2];
}
}
return Adiss;
} // Dissim
/**
* Method getNNs, determine NNs and NN dissimilarities
* @param nrow row dimension or number of observations (input)
* @param flag =1 for active observation, = 0 for inactive one (input)
* @param diss dissimilarity matrix (input)
* @param nn nearest neighbor sequence number (calculated)
* @param nndiss nearest neigbor dissimilarity (calculated)
*/
public static void getNNs(int nrow, int[] flag, double[][] diss,
int[] nn, double[]nndiss)
{
int minobs;
double mindist;
for (int i1 = 0; i1 < nrow; i1++) {
if (flag[i1] == 1) {
minobs = -1;
mindist = MAXVAL;
for (int i2 = 0; i2 < nrow; i2++) {
if ((diss[i1][i2] < mindist) && (i1 != i2)) {
mindist = diss[i1][i2];
minobs = i2;
}
}
nn[i1] = minobs + 1;
nndiss[i1] = mindist;
}
}
// Return type void => no return stmt
} // getNNs
/**
* Method ClustMat, updates cluster structure matrix following
* an agglomeration
* @param nrow row dimension or number of observations (input)
* @param clusters list of agglomerations, stored as array of
* pairs of cluster sequence numbers (input, and updated)
* @param clust1 first agglomerand (input)
* @param clust2 second agglomerand (input)
* @param ncl number of clusters remaining (input)
*/
public static void ClustMat(int nrow, int[][] clusters,
int clust1, int clust2, int ncl)
{
// If either clust* is not initialized, then we must init. clusters
if ((clust1 == 0) || (clust2 == 0)) {
for (int j = 0; j < nrow; j++) {
for (int i = 0; i < nrow; i++) {
clusters[i][j] = 0;
}
}
// Adjust for 0-sequencing in extreme right-hand col values.
for (int i = 0; i < nrow; i++) clusters[i][ncl-1] = i + 1;
return;
}
// For some agglomeration, we are told that label clust1 and
// label clust2, among all labels in col. ncl-1 (ncl ranges over
// 0 thru nrow-1) are to be agglomerated
// We have arranged that order is such that clust1 < clust2
// Note: clust1 and clust2 are 1-sequenced and not 0-sequenced
int ncl1;
ncl1 = ncl - 1;
for (int i = 0; i < nrow; i++) {
clusters[i][ncl1] = clusters[i][ncl];
if (clusters[i][ncl1] == clust2) clusters[i][ncl1] = clust1;
}
// Return type void => no return stmt
} // ClustMat
// Little method for helping in output formating
public static String getSpaces(int n) {
StringBuffer sb = new StringBuffer(n);
for (int i = 0; i < n; i++) sb.append(' ');
return sb.toString();
}
/**
* Method for printing a double float matrix <br>
* Based on ER Harold, "Java I/O", O'Reilly, around p. 473.
* @param n1 row dimension of matrix
* @param n2 column dimension of matrix
* @param m input matrix values, double
* @param d display precision, number of decimal places
* @param w display precision, total width of floating value
*/
public static void printMatrix(int n1, int n2, double[][] m, int d, int w)
{
// Some definitions for handling output formating
NumberFormat myFormat = NumberFormat.getNumberInstance();
FieldPosition fp = new FieldPosition(NumberFormat.INTEGER_FIELD);
myFormat.setMaximumIntegerDigits(d);
myFormat.setMaximumFractionDigits(d);
myFormat.setMinimumFractionDigits(d);
for (int i=0; i<n1; i++)
{
// Print each row, elements separated by spaces
for (int j=0; j<n2; j++) {
String valString = myFormat.format(
m[i][j], new StringBuffer(), fp).toString();
valString = getSpaces(w - fp.getEndIndex()) + valString;
System.out.print(valString);
}
// Start a new line at the end of a row
System.out.println();
}
// Leave a gap after the entire matrix
System.out.println();
}
/**
* Method for printing an integer matrix <br>
* Based on ER Harold, "Java I/O", O'Reilly, around p. 473.
* @param n1 row dimension of matrix
* @param n2 column dimension of matrix
* @param m input matrix values
* @param d display precision, number of decimal places
* @param w display precision, total width of floating value
*/
public static void printMatrix(int n1, int n2, int[][] m, int d, int w)
{
// Some definitions for handling output formating
NumberFormat myFormat = NumberFormat.getNumberInstance();
FieldPosition fp = new FieldPosition(NumberFormat.INTEGER_FIELD);
myFormat.setMaximumIntegerDigits(d);
for (int i=0; i<n1; i++) {
// Print each row, elements separated by spaces
for (int j=0; j<n2; j++) {
String valString = myFormat.format(
m[i][j], new StringBuffer(), fp).toString();
// 4 character locations per integer number
valString = getSpaces(w - fp.getEndIndex()) + valString;
System.out.print(valString);
}
// Start a new line at the end of a row
System.out.println();
}
// Leave a gap after the entire matrix
System.out.println();
} // printMatrix
/**
* Method printVect for printing a double float vector <br>
* Based on ER Harold, "Java I/O", O'Reilly, around p. 473.
* @param m input vector of length m.length
* @param d display precision, number of decimal places
* @param w display precision, total width of floating value
*/
public static void printVect(double[] m, int d, int w)
{
// Some definitions for handling output formating
NumberFormat myFormat = NumberFormat.getNumberInstance();
FieldPosition fp = new FieldPosition(NumberFormat.INTEGER_FIELD);
myFormat.setMaximumIntegerDigits(d);
myFormat.setMaximumFractionDigits(d);
myFormat.setMinimumFractionDigits(d);
int len = m.length;
for (int i=0; i<len; i++) {
String valString = myFormat.format(
m[i], new StringBuffer(), fp).toString();
valString = getSpaces(w - fp.getEndIndex()) + valString;
System.out.print(valString);
}
// Start a new line at the row end
System.out.println();
// Leave a gap after the entire vector
System.out.println();
} // printVect
/**
* Method printVect for printing an integer vector <br>
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -