📄 wkmeans.java

📁 一个数据挖掘软件ALPHAMINERR的整个过程的JAVA版源代码
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页
package com.prudsys.pdm.Models.Clustering.CDBased.Algorithms.WKMeans;

import java.util.ArrayList;
import java.util.Random;
import java.util.Vector;

import com.prudsys.pdm.Core.AttributeType;
import com.prudsys.pdm.Core.CategoricalAttribute;
import com.prudsys.pdm.Core.MiningException;
import com.prudsys.pdm.Core.NumericAttribute;
import com.prudsys.pdm.Input.MiningVector;
import com.prudsys.pdm.Models.Clustering.Cluster;
import com.prudsys.pdm.Models.Clustering.CDBased.CDBasedClusteringAlgorithm;

/**
 * @author   Administrator
 */
public class WKMeans extends CDBasedClusteringAlgorithm {
	// ------------------------------------------------------------------
	// Configuration values. According to the original author these are
	// specified by the user and have to be defined in the config file
	// "algorithm.xml".
	// ------------------------------------------------------------------
	/**
	 * Number of clusters to be generated by the algorithm (specified by the
	 * user).
	 */
	private int numberOfClusters;

	/**
	 * Maximum number of iterations (specified by the user, defaults to 100).
	 * The original author notes that it is not used by this algorithm;
	 * verify() only checks that it is not negative.
	 */
	private int maxNumberOfIterations = 100;

	/** Number of iterations (specified by the user). */
	private int numberOfIterations;

	/** Weighting switch (specified by the user); false means no weighting process. */
	private boolean weight;

	// ------------------------------------------------------------------
	// Internal working state of WKMeans that is shared between its methods.
	// These values do not need to be defined in the config file
	// "algorithm.xml".
	// ------------------------------------------------------------------
	/** Total number of attributes. */
	private int numAtt;

	/** Number of objects (vectors) to be clustered. */
	private int numVec;

	/** Number of categorical attributes. */
	private int numOfCat;

	/** Number of numeric attributes. */
	private int numOfNumeric;

	/** Positions of the categorical attributes within a MiningVector. */
	private int[] indexOfCate;

	/** Positions of the numeric attributes within a MiningVector. */
	private int[] indexOfNum;

	/** Mean value of each numeric attribute, parallel to indexOfNum. */
	private double[] meanValue;

	/** Cluster membership: objClusterID[v] is the index of the cluster vector v is assigned to. */
	private int[] objClusterID;

	/** Dispersion of each cluster: the summed distances of its vectors to the cluster prototype. */
	private double[] clusterDispersions;

	/** Sums of the numeric attribute values, indexed [cluster][numeric attribute]. */
	private double[][] totalSum;

	/**
	 * Counters for the categorical attributes, indexed
	 * [cluster][categorical attribute]; described by the original author as
	 * the summation of distances of the categorical attributes.
	 */
	private long[][] misMatch;

	/** Number of vectors (objects) in each cluster. */
	private int[] clusterCount;

	/**
	 * Category frequencies, indexed [cluster][categorical attribute]; each
	 * list holds one Integer count per category key.
	 */
	private ArrayList[][] clusterCateVar;

	// Distance between the vectors to be clustered. The local declaration
	// below was commented out; the `distance` object used by the methods is
	// not declared in this class (presumably inherited from the superclass --
	// TODO confirm).
	// private Distance distance;
	/**
	 * Creates an instance without configuring it; every field keeps its
	 * declared default value.
	 */
	public WKMeans() {
		super();
	}

	/**
	 * Checks the algorithm configuration for completeness. The verify method
	 * of the superclass runs first; afterwards numberOfClusters and
	 * maxNumberOfIterations are rejected when they are negative.
	 * 
	 * @throws IllegalArgumentException
	 *             if numberOfClusters or maxNumberOfIterations is negative
	 *             (the superclass check presumably reports its own failures
	 *             the same way).
	 */
	public void verify() throws IllegalArgumentException {
		super.verify();

		// Pick the first violated constraint, if any; the cluster count is
		// checked before the iteration limit.
		String problem = null;
		if (numberOfClusters < 0) {
			problem = "numberOfClusters can't be negative";
		} else if (maxNumberOfIterations < 0) {
			problem = "maxNumberOfIterations can't be negative";
		}

		if (problem != null) {
			throw new IllegalArgumentException(problem);
		}
	}

	/**
	 * Stores the attribute count and allocates the working structures of the
	 * algorithm.
	 * <p>
	 * The index arrays of the categorical and numeric attributes are obtained
	 * from getIndexOfCate / getIndexOfNum, the accumulator arrays are sized by
	 * numberOfClusters and the two attribute counts, and every cluster gets,
	 * for each categorical attribute, a frequency list that holds one zero
	 * Integer per category reported by the attribute's meta data.
	 * 
	 * @param numAtt
	 *            the number of attributes; stored in the field of the same
	 *            name.
	 */
	private void initialization(int numAtt) {
		this.numAtt = numAtt;

		indexOfCate = getIndexOfCate(numAtt);
		numOfCat = indexOfCate.length;
		indexOfNum = getIndexOfNum(numAtt);
		numOfNumeric = indexOfNum.length;

		meanValue = new double[numOfNumeric];
		clusterDispersions = new double[numberOfClusters];
		totalSum = new double[numberOfClusters][numOfNumeric];
		misMatch = new long[numberOfClusters][numOfCat];
		clusterCount = new int[numberOfClusters];
		clusterCateVar = new ArrayList[numberOfClusters][numOfCat];

		for (int cluster = 0; cluster < numberOfClusters; cluster++) {
			ArrayList[] lists = clusterCateVar[cluster];

			// First pass: create an empty frequency list per categorical
			// attribute.
			for (int att = 0; att < numOfCat; att++) {
				lists[att] = new ArrayList();
			}

			// Second pass: append one zero counter per category of the
			// attribute.
			for (int att = 0; att < numOfCat; att++) {
				CategoricalAttribute attribute = (CategoricalAttribute) metaData
						.getMiningAttribute(indexOfCate[att]);
				ArrayList counts = lists[att];
				while (counts.size() < attribute.getCategoriesNumber()) {
					counts.add(new Integer(0));
				}
			}
		}
	}

	/**
	 * Initializes the attribute weights uniformly: each of the numAtt
	 * attributes receives the weight 1 / numAtt, and the resulting array is
	 * handed to the distance object.
	 */
	private void weightInit() {
		final double[] uniform = new double[numAtt];
		java.util.Arrays.fill(uniform, 1.0 / numAtt);
		distance.setFieldWeights(uniform);
	}

	/**
	 * Chooses numberOfClusters distinct vectors (objects) pseudo-randomly and
	 * installs them as the initial prototypes (center vectors) of the
	 * clusters. The random generator uses the fixed seed 10, so repeated runs
	 * on the same data select the same vectors.
	 * 
	 * @throws IllegalArgumentException
	 *             if more clusters are requested than there are vectors; no
	 *             set of distinct prototypes exists in that case and the
	 *             selection loop could never finish.
	 * @throws MiningException
	 *             if a selected vector cannot be read from the input stream.
	 */
	private void prototypeInit() throws MiningException {
		if (numberOfClusters > numVec) {
			throw new IllegalArgumentException("numberOfClusters ("
					+ numberOfClusters
					+ ") can't exceed the number of vectors (" + numVec + ")");
		}

		boolean[] selected = new boolean[numVec];
		Random rand = new Random(10);
		for (int i = 0; i < numberOfClusters; i++) {
			int index;
			do {
				// Take the remainder BEFORE the absolute value: the remainder
				// lies strictly between -numVec and numVec, so Math.abs cannot
				// overflow. Math.abs(Integer.MIN_VALUE) is negative and would
				// yield a negative index. For every other draw the result is
				// the same as Math.abs(r) % numVec.
				index = Math.abs(rand.nextInt() % numVec);
			} while (selected[index]);

			// Install the chosen vector as the center of cluster i.
			MiningVector vec = miningInputStream.read(index);
			clusters[i].setCenterVec(vec);
			selected[index] = true;
		}
	}

	/**
	 * Assigns every vector in the range to its nearest prototype and computes
	 * the objective function value of P1, i.e. the sum of the distances
	 * between the vectors and the prototypes they were assigned to.
	 * <p>
	 * As a side effect the per-cluster accumulators are reset and refilled:
	 * cluster dispersions, member counts, sums of the numeric attribute
	 * values, category frequencies and categorical mismatch counts.
	 * 
	 * @param start
	 *            the index of the first vector (object) to process.
	 * @param end
	 *            the index one past the last vector (object) to process.
	 * @return the objective function value of P1.
	 * @throws MiningException
	 *             if a vector cannot be read or a distance cannot be
	 *             computed.
	 */
	private double getP1Cost(int start, int end) throws MiningException {
		/* Reset all per-cluster accumulators */
		for (int c = 0; c < numberOfClusters; c++) {
			clusterDispersions[c] = 0.0;
			clusterCount[c] = 0;
			for (int j = 0; j < numOfNumeric; j++) {
				totalSum[c][j] = 0.0;
			}
			for (int j = 0; j < numOfCat; j++) {
				misMatch[c][j] = 0;
			}
		}
		this.clearFrequency();

		double dispersion = 0.0;
		for (int i = start; i < end; i++) {
			MiningVector vec = miningInputStream.read(i);

			/* findNearestCluster stores the chosen cluster in objClusterID[i] */
			double dmin = findNearestCluster(i, vec);
			int cluster = objClusterID[i];
			dispersion += dmin;

			/* Sum up distance in clusters */
			clusterDispersions[cluster] += dmin;

			/* Count the number of examples in cluster */
			clusterCount[cluster]++;

			/* Sum up values of each numeric variable in the cluster */
			for (int j = 0; j < numOfNumeric; j++) {
				totalSum[cluster][j] += vec.getValue(indexOfNum[j]);
			}

			for (int j = 0; j < numOfCat; j++) {
				double value = vec.getValue(indexOfCate[j]);

				/* Count the occurrence of this category in the cluster */
				int key = (int) value;
				ArrayList frequencies = clusterCateVar[cluster][j];
				int freq = ((Integer) frequencies.get(key)).intValue();
				frequencies.set(key, new Integer(freq + 1));

				/*
				 * Sum up the mismatches of attribute values: a mismatch is a
				 * categorical value that DIFFERS from the prototype's value.
				 * The comparison used to be "==", which counted the matches
				 * and contradicted the purpose of the misMatch counter.
				 */
				if (value != clusters[cluster].getCenterVec().getValue(
						indexOfCate[j])) {
					misMatch[cluster][j]++;
				}
			}
		}

		/* All examples are allocated to clusters */
		return dispersion;
	}

	/**
	 * Computes the objective function value of P2: the sum of the distances
	 * between the vectors in the range and the prototypes of the clusters they
	 * are currently assigned to. The assignment itself is not changed; the
	 * cluster dispersions are reset and recomputed as a side effect.
	 * 
	 * @param start
	 *            the index of the first vector (object) to process.
	 * @param end
	 *            the index one past the last vector (object) to process.
	 * @return the objective function value of P2.
	 * @throws MiningException
	 *             if a vector cannot be read or a distance cannot be
	 *             computed.
	 */
	private double getP2Cost(int start, int end) throws MiningException {
		for (int c = 0; c < numberOfClusters; c++) {
			clusterDispersions[c] = 0.0;
		}
		if (start >= end) {
			// Nothing to process, and no reason to compute gamma.
			return 0.0;
		}

		// getGamma() scans the complete input stream and does not depend on
		// the vector or cluster at hand, so it is evaluated once here instead
		// of once per vector.
		final double gamma = this.getGamma();

		double dispersion = 0.0;
		for (int i = start; i < end; i++) {
			MiningVector vec = miningInputStream.read(i);
			int cluster = objClusterID[i];

			double numDist = 0.0;
			if (numOfNumeric > 0) {
				numDist = distance.distance(vec, clusters[cluster]
						.getCenterVec(), indexOfNum, AttributeType.NUMERICAL);
			}

			double catDist = 0.0;
			if (numOfCat > 0) {
				catDist = distance.distance(vec, clusters[cluster]
						.getCenterVec(), indexOfCate,
						AttributeType.CATEGORICAL);
			}

			// Mixed distance: numeric part plus gamma-weighted categorical
			// part.
			double dmin = numDist + gamma * catDist;

			/* Sum up distance in clusters */
			clusterDispersions[cluster] += dmin;
			dispersion += dmin;
		}
		return dispersion;
	}

	/**
	 * Computes the parameter gamma that balances the categorical distance
	 * against the numeric distance.
	 * <p>
	 * Without numeric attributes gamma is 1.0. Otherwise the sample standard
	 * deviation (denominator numVec - 1, centered on meanValue) of every
	 * numeric attribute is computed over all numVec vectors, the deviations
	 * are averaged, and gamma is 0.3 times the square of that average.
	 * <p>
	 * Note: with numVec == 1 the denominator is zero, so the result is NaN or
	 * infinite.
	 * 
	 * @return gamma
	 * @throws MiningException
	 *             if a vector cannot be read from the input stream.
	 */
	private double getGamma() throws MiningException {
		if (numOfNumeric <= 0) {
			return 1.0;
		}

		double deviationSum = 0.0;
		for (int att = 0; att < numOfNumeric; att++) {
			// Sum of the squared differences from the attribute mean.
			double squares = 0.0;
			for (int v = 0; v < numVec; v++) {
				double delta = miningInputStream.read(v).getValue(
						indexOfNum[att])
						- meanValue[att];
				squares += Math.pow(delta, 2);
			}
			deviationSum += Math.sqrt(squares / (numVec - 1));
		}

		double meanDeviation = deviationSum / numOfNumeric;
		return 0.3 * meanDeviation * meanDeviation;
	}

	/**
	 * Resets every category frequency counter of every cluster to zero. The
	 * lists keep their size; only their entries are replaced.
	 */
	private void clearFrequency() {
		for (int cluster = 0; cluster < numberOfClusters; cluster++) {
			for (int att = 0; att < numOfCat; att++) {
				ArrayList counts = clusterCateVar[cluster][att];
				int position = 0;
				while (position < counts.size()) {
					counts.set(position, new Integer(0));
					position++;
				}
			}
		}
	}

	/**
	 * assign the vector(object) to the nearest cluster prototype
	 * 
	 * @param i,the
	 *            index of vector(object) to be clustered
	 * @return the distance between the vector(object)and the the nearest
	 *         cluster prototype
	 * @throws MiningException
	 */
	private double findNearestCluster(int vecIndex, MiningVector vec)
			throws MiningException {
		double numDist, catDist, dmin, dist;

		/* Compute distance to the first cluster centroid */
		if (numOfNumeric > 0)
			numDist = distance.distance(vec, clusters[0].getCenterVec(),
					indexOfNum, AttributeType.NUMERICAL);
		else
			numDist = 0.0;
		if (numOfCat > 0)
			catDist = distance.distance(vec, clusters[0].getCenterVec(),
					indexOfCate, AttributeType.CATEGORICAL);
		else
			catDist = 0.0;
		dmin = numDist + this.getGamma() * catDist;
		objClusterID[vecIndex] = 0;

		/* Test distances to the remaining cluster centroids. */
		for (int i = 1; i < numberOfClusters; i++) {
			if (numOfNumeric > 0)
				numDist = distance.distance(vec, clusters[i].getCenterVec(),
						indexOfNum, AttributeType.NUMERICAL);
			else
				numDist = 0.0;
			if (numOfCat > 0)
				catDist = distance.distance(vec, clusters[i].getCenterVec(),
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -