📄 wkmeans.java
字号:
package com.prudsys.pdm.Models.Clustering.CDBased.Algorithms.WKMeans;
import java.util.ArrayList;
import java.util.Random;
import java.util.Vector;
import com.prudsys.pdm.Core.AttributeType;
import com.prudsys.pdm.Core.CategoricalAttribute;
import com.prudsys.pdm.Core.MiningException;
import com.prudsys.pdm.Core.NumericAttribute;
import com.prudsys.pdm.Input.MiningVector;
import com.prudsys.pdm.Models.Clustering.Cluster;
import com.prudsys.pdm.Models.Clustering.CDBased.CDBasedClusteringAlgorithm;
/**
* @author Administrator
*/
public class WKMeans extends CDBasedClusteringAlgorithm {
// ------------------------------------------------------------------
// These variables are specified by the user and
// need to be defined in the config file "algorithm.xml".
// ------------------------------------------------------------------
/**
* Number of clusters to be generated by the algorithm (specified by the
* user).
*/
private int numberOfClusters;
/**
* Maximum number of iterations (specified by the user). The original note
* says it is not used in this algorithm; verify() only checks that it is
* not negative.
*/
private int maxNumberOfIterations = 100;
/** Number of iterations (specified by the user). */
private int numberOfIterations;
/**
* If false, no weighting process is performed (specified by the user).
*/
private boolean weight;
// ------------------------------------------------------------------
// Working variables of WKMeans that are shared by several methods;
// these variables do not need to be defined in the config file
// "algorithm.xml".
// ------------------------------------------------------------------
/** Number of all attributes. */
private int numAtt;
/** Number of all objects (vectors) to be clustered. */
private int numVec;
/** Number of categorical attributes. */
private int numOfCat;
/** Number of numeric attributes. */
private int numOfNumeric;
/** Indices of the categorical attributes within a MiningVector. */
private int[] indexOfCate;
/** Indices of the numeric attributes within a MiningVector. */
private int[] indexOfNum;
/**
* Mean value of each numeric attribute, indexed in parallel to
* indexOfNum.
*/
private double[] meanValue;
/**
* Cluster membership of each object: objClusterID[i] is the index of the
* cluster that object i is assigned to.
*/
private int[] objClusterID;
/** Dispersion (summed distance to the prototype) of each cluster. */
private double[] clusterDispersions;
/**
* Sum of the values of each numeric attribute per cluster, indexed as
* [cluster][numeric attribute].
*/
private double[][] totalSum;
/**
* Summed distances of the categorical attributes, indexed as
* [cluster][categorical attribute].
*/
private long[][] misMatch;
/** Number of vectors (objects) in each cluster. */
private int[] clusterCount;
/**
* Category frequency counters, indexed as [cluster][categorical attribute];
* each list holds one Integer counter per category key.
*/
private ArrayList[][] clusterCateVar;
// Distance between vectors to be clustered. The declaration below is
// commented out. NOTE(review): the `distance` object used by the methods
// of this class is presumably declared elsewhere (e.g. inherited) - confirm.
// private Distance distance;
/**
* empty constructor
*/
public WKMeans() {
// Nothing to do here: the working arrays are allocated later by
// initialization(int).
}
/**
* Checks the mining algorithm for completeness by calling the verify method
* of the superclass. Additionally, it checks that numberOfClusters and
* maxNumberOfIterations are admissible, i.e. not negative.
*
* @throws IllegalArgumentException
* if some algorithm attributes are incorrect.
*/
public void verify() throws IllegalArgumentException {
    // The superclass checks run first, so its failures take precedence.
    super.verify();

    // Pick the first violated constraint (cluster count is checked before
    // the iteration limit) and report it.
    String problem = null;
    if (numberOfClusters < 0) {
        problem = "numberOfClusters can't be negative";
    } else if (maxNumberOfIterations < 0) {
        problem = "maxNumberOfIterations can't be negative";
    }
    if (problem != null) {
        throw new IllegalArgumentException(problem);
    }
}
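/**
* Records the attribute count, splits the attribute indices into
* categorical and numeric ones and allocates the per-cluster working
* arrays, including one zero-initialised frequency counter per category of
* every categorical attribute.
*
* @param numAtt
* the number of all attributes.
*/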
private void initialization(int numAtt) {
    // Attribute bookkeeping: remember the count and split the indices by type.
    this.numAtt = numAtt;
    indexOfCate = getIndexOfCate(numAtt);
    numOfCat = indexOfCate.length;
    indexOfNum = getIndexOfNum(numAtt);
    numOfNumeric = indexOfNum.length;

    // Per-attribute and per-cluster accumulators (all start at zero).
    meanValue = new double[numOfNumeric];
    clusterDispersions = new double[numberOfClusters];
    totalSum = new double[numberOfClusters][numOfNumeric];
    misMatch = new long[numberOfClusters][numOfCat];
    clusterCount = new int[numberOfClusters];
    // this.distance=new Distance();

    // One frequency list per (cluster, categorical attribute), holding a
    // zero counter for every category of that attribute.
    clusterCateVar = new ArrayList[numberOfClusters][numOfCat];
    for (int c = 0; c < numberOfClusters; c++) {
        for (int a = 0; a < numOfCat; a++) {
            ArrayList counters = new ArrayList();
            clusterCateVar[c][a] = counters;
            CategoricalAttribute attribute = (CategoricalAttribute) metaData
                    .getMiningAttribute(indexOfCate[a]);
            // Appending while the list is shorter than the category count
            // fills positions 0..n-1 in order.
            while (counters.size() < attribute.getCategoriesNumber()) {
                counters.add(new Integer(0));
            }
        }
    }
}
/**
* Initializes the weights of all attributes to the same value, 1/numAtt,
* and passes them to the distance object.
*/
private void weightInit() {
    // Uniform start: every attribute receives the same share, 1/numAtt.
    double[] uniform = new double[numAtt];
    java.util.Arrays.fill(uniform, 1.0 / numAtt);
    distance.setFieldWeights(uniform);
}
/**
* Randomly chooses numberOfClusters distinct vectors (objects) as the
* initial prototypes and stores them as the center vectors of the clusters.
* The random generator is created with a fixed seed.
*
* @throws MiningException
* propagated from the underlying mining calls
*/
private void prototypeInit() throws MiningException {
    // Each prototype must be a different vector. With more clusters than
    // vectors the rejection loop below could never find a free index and
    // would spin forever, so fail fast instead.
    if (numberOfClusters > numVec) {
        throw new IllegalArgumentException("numberOfClusters ("
                + numberOfClusters
                + ") can't exceed the number of vectors (" + numVec + ")");
    }

    boolean[] selected = new boolean[numVec];
    // Fixed seed: the same prototypes are drawn on every run.
    Random rand = new Random(10);
    for (int i = 0; i < numberOfClusters; i++) {
        int index;
        do {
            int draw = rand.nextInt();
            // Math.abs(Integer.MIN_VALUE) is still negative and would yield
            // a negative index; map that single value to 0. Every other
            // draw produces the same index as before.
            int magnitude = (draw == Integer.MIN_VALUE) ? 0 : Math.abs(draw);
            index = magnitude % numVec;
        } while (selected[index]);

        // Use the drawn vector as the center of cluster i.
        MiningVector vec = miningInputStream.read(index);
        clusters[i].setCenterVec(vec);
        selected[index] = true;
    }
}
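/**
* Computes the value of the objective function P1 and assigns every object
* to its nearest prototype. The per-cluster dispersions, counts, numeric
* sums and categorical statistics are reset and accumulated again.
*
* @param start
* the index of the first vector (object), inclusive
* @param end
* the index after the last vector (object), exclusive
* @return the value of the objective function P1
* @throws MiningException
* propagated from the underlying mining calls
*/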
private double getP1Cost(int start, int end) throws MiningException {
    // Reset all per-cluster accumulators before the new assignment pass.
    for (int c = 0; c < numberOfClusters; c++) {
        clusterDispersions[c] = 0.0;
        clusterCount[c] = 0;
        for (int j = 0; j < numOfNumeric; j++) {
            totalSum[c][j] = 0.0;
        }
        for (int j = 0; j < numOfCat; j++) {
            misMatch[c][j] = 0;
        }
    }
    this.clearFrequency();

    double dispersion = 0.0;
    for (int i = start; i < end; i++) {
        MiningVector vec = miningInputStream.read(i);
        // findNearestCluster records the chosen cluster in objClusterID[i]
        // and returns the distance to that cluster's prototype.
        double dmin = findNearestCluster(i, vec);
        int cluster = objClusterID[i];
        dispersion += dmin;

        /* Sum up distance in clusters */
        clusterDispersions[cluster] += dmin;
        /* Count the number of examples in cluster */
        clusterCount[cluster]++;

        /* Sum up values of each numeric variable in the cluster */
        for (int j = 0; j < numOfNumeric; j++) {
            totalSum[cluster][j] += vec.getValue(indexOfNum[j]);
        }

        for (int j = 0; j < numOfCat; j++) {
            double value = vec.getValue(indexOfCate[j]);

            /* Count the occurrence of this category in the cluster */
            int key = (int) value;
            ArrayList frequencies = clusterCateVar[cluster][j];
            int freq = ((Integer) frequencies.get(key)).intValue();
            frequencies.set(key, new Integer(freq + 1));

            /*
             * Sum up the mismatches of attribute values: the counter is the
             * categorical distance, so it grows when the value DIFFERS from
             * the prototype. (The previous code incremented on equality,
             * i.e. it counted matches.)
             * NOTE(review): the consumer of misMatch is outside this
             * section - confirm it expects mismatch counts.
             */
            if (value != clusters[cluster].getCenterVec().getValue(
                    indexOfCate[j])) {
                misMatch[cluster][j]++;
            }
        }
    }
    /* All examples are allocated to clusters */
    return dispersion;
}
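/**
* Computes the value of the objective function P2 using the current cluster
* assignments in objClusterID, and refreshes the per-cluster dispersions.
*
* @param start
* the index of the first vector (object), inclusive
* @param end
* the index after the last vector (object), exclusive
* @return the value of the objective function P2
* @throws MiningException
* propagated from the underlying mining calls
*/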
private double getP2Cost(int start, int end) throws MiningException {
    double total = 0.0;
    for (int c = 0; c < numberOfClusters; c++) {
        clusterDispersions[c] = 0.0;
    }
    if (start >= end) {
        // Empty range: nothing to accumulate, and gamma is not needed.
        return total;
    }

    // getGamma() scans the whole input stream and does not depend on the
    // vector being processed, so compute it once instead of once per vector.
    final double gamma = this.getGamma();

    for (int i = start; i < end; i++) {
        MiningVector vec = miningInputStream.read(i);
        int cluster = objClusterID[i];

        // Distance over the numeric attributes (0 when there are none).
        double numDist = 0.0;
        if (numOfNumeric > 0) {
            numDist = distance.distance(vec, clusters[cluster]
                    .getCenterVec(), indexOfNum, AttributeType.NUMERICAL);
        }
        // Distance over the categorical attributes (0 when there are none).
        double catDist = 0.0;
        if (numOfCat > 0) {
            catDist = distance.distance(vec, clusters[cluster]
                    .getCenterVec(), indexOfCate, AttributeType.CATEGORICAL);
        }

        // Mixed distance: numeric part plus gamma-scaled categorical part.
        double mixed = numDist + gamma * catDist;
        /* Sum up distance in clusters */
        clusterDispersions[cluster] += mixed;
        total += mixed;
    }
    return total;
}
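/**
* Returns the parameter gamma that balances the categorical against the
* numeric part of the distance. If numeric attributes exist, gamma is 0.3
* times the square of the average sample standard deviation of the numeric
* attributes (taken around meanValue); if no numeric attributes exist,
* gamma is 1.0.
*
* @return gamma
* @throws MiningException
* propagated from reading the input vectors
*/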
private double getGamma() throws MiningException {
    if (numOfNumeric <= 0) {
        // No numeric attributes: nothing to balance against.
        return 1.0;
    }

    // One pass over the input: every vector is read once and contributes
    // to the squared-deviation sum of each numeric attribute. The per
    // attribute sums are accumulated in the same vector order as before,
    // so the results are unchanged; only the number of reads drops from
    // numVec * numOfNumeric to numVec.
    double[] squareSums = new double[numOfNumeric];
    for (int i = 0; i < numVec; i++) {
        MiningVector vec = miningInputStream.read(i);
        for (int j = 0; j < numOfNumeric; j++) {
            squareSums[j] += Math.pow(vec.getValue(indexOfNum[j])
                    - meanValue[j], 2);
        }
    }

    // Average the sample standard deviations. With fewer than two vectors
    // the divisor (numVec - 1) is not positive (0/0 would give NaN for a
    // single vector), so the spread is taken as 0 in that case.
    double gamma = 0.0;
    for (int j = 0; j < numOfNumeric; j++) {
        gamma += (numVec > 1)
                ? Math.sqrt(squareSums[j] / (numVec - 1))
                : 0.0;
    }
    gamma /= numOfNumeric;
    return 0.3 * gamma * gamma;
}
/**
* Resets every category frequency counter of every cluster to zero.
*/
private void clearFrequency() {
    for (int c = 0; c < numberOfClusters; c++) {
        for (int a = 0; a < numOfCat; a++) {
            ArrayList counters = clusterCateVar[c][a];
            // Overwrite every slot with a fresh zero counter; the slots are
            // independent, so the walk direction does not matter.
            int slot = counters.size();
            while (--slot >= 0) {
                counters.set(slot, new Integer(0));
            }
        }
    }
}
/**
* Assigns the vector (object) to the nearest cluster prototype.
*
* @param vecIndex
* the index of the vector (object) to be clustered
* @param vec
* the vector (object) to be clustered
* @return the distance between the vector (object) and the nearest
* cluster prototype
* @throws MiningException
* propagated from the underlying mining calls
*/
private double findNearestCluster(int vecIndex, MiningVector vec)
throws MiningException {
double numDist, catDist, dmin, dist;
/* Compute distance to the first cluster centroid */
if (numOfNumeric > 0)
numDist = distance.distance(vec, clusters[0].getCenterVec(),
indexOfNum, AttributeType.NUMERICAL);
else
numDist = 0.0;
if (numOfCat > 0)
catDist = distance.distance(vec, clusters[0].getCenterVec(),
indexOfCate, AttributeType.CATEGORICAL);
else
catDist = 0.0;
dmin = numDist + this.getGamma() * catDist;
objClusterID[vecIndex] = 0;
/* Test distances to the remaining cluster centroids. */
for (int i = 1; i < numberOfClusters; i++) {
if (numOfNumeric > 0)
numDist = distance.distance(vec, clusters[i].getCenterVec(),
indexOfNum, AttributeType.NUMERICAL);
else
numDist = 0.0;
if (numOfCat > 0)
catDist = distance.distance(vec, clusters[i].getCenterVec(),
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -