📄 simplechameleon.java
字号:
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
/*
* SimpleChameleon.java
* Copyright (C) 2008 Athrun Zala
*/
package weka.clusterers;
import weka.core.Capabilities;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.Utils;
import weka.core.Capabilities.Capability;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.ReplaceMissingValues;
import weka.core.matrix.Matrix;
import java.util.Enumeration;
import java.util.Vector;
/**
<!-- globalinfo-start -->
* Cluster data using the spectral clustering algorithm
* <p/>
<!-- globalinfo-end -->
*
<!-- options-start -->
* Valid options are: <p/>
*
* <pre> -N <num>
* number of clusters.
* (default 2).</pre>
*
* <pre> -S <num>
* Random number seed.
* (default 10)</pre>
*
<!-- options-end -->
*
* @author Athrun Zala
* @version 1.00
* @see RandomizableClusterer
*/
public class SimpleChameleon
extends RandomizableClusterer
implements NumberOfClustersRequestable{
/**
* for serialization
*/
private static final long serialVersionUID = 1L;
/**
* replace the missing values in training instances
*/
private ReplaceMissingValues m_ReplaceMissingFilter;
/**
* number of clusters to generate
*/
private int m_NumClusters = 2;
/**
* threshold
*/
private int processed = 1000000;
/**
* holds the index of the instance
*/
private int m_Index;
/**
* Holds the standard deviations of the numeric attributes in each cluster
*/
private Instances m_ClusterStdDevs;
/**
* Holds the number of instances in each cluster
*/
private int [] m_ClusterSizes;
/**
* the vector to hold the cluster
*/
private int[][] category;
/**
* the default constructor
*/
public SimpleChameleon() {
super();
m_SeedDefault = 10;
setSeed(m_SeedDefault);
}
/**
* Returns a string describing this clusterer
* @return a description of the evaluator suitable for
* displaying in the explorer/experimenter gui
*/
public String globalInfo() {
return "Cluster data using the Chameleon Clustering Algorithm";
}
/**
* Returns default capabilities of the clusterer.
* @return the capabilities of this clusterer
*/
public Capabilities getCapabilities() {
Capabilities result = super.getCapabilities();
result.enable(Capability.NUMERIC_ATTRIBUTES);
result.enable(Capability.MISSING_VALUES);
return result;
}
/**
* Generates a clusterer. Has to initialize all fields of the clusterer
* that are not being set via options.
*
* @param data set of instances serving as training data
* @throws Exception if the clusterer has not been
* generated successfully
*/
public void buildClusterer(Instances data) throws Exception {
// can clusterer handle the data?
getCapabilities().testWithFail(data);
m_ReplaceMissingFilter = new ReplaceMissingValues();
Instances instances = new Instances(data);
instances.setClassIndex(-1);
m_ReplaceMissingFilter.setInputFormat(instances);
instances = Filter.useFilter(instances, m_ReplaceMissingFilter);
int n = instances.numInstances();
int []clusterAssignments = new int [n];
Instances [] tempI = new Instances[m_NumClusters];
int i;
merge(instances);
labelCluster(instances);
for (i = 0; i < instances.numInstances(); i++) {
Instance toCluster = instances.instance(i);
setIndex(i);
int newC = clusterProcessedInstance(toCluster, true);
clusterAssignments[i] = newC;
}
for (i = 0; i < m_NumClusters; i++) {
tempI[i] = new Instances(instances, 0);
}
for (i = 0; i < instances.numInstances(); i++) {
tempI[clusterAssignments[i]].add(instances.instance(i));
}
setIndex(0);
m_ClusterStdDevs = new Instances(instances, m_NumClusters);
m_ClusterSizes = new int [m_NumClusters];
for (i = 0; i < m_NumClusters; i++) {
double [] vals2 = new double[instances.numAttributes()];
for (int j = 0; j < instances.numAttributes(); j++) {
if (instances.attribute(j).isNumeric()) {
vals2[j] = Math.sqrt(tempI[i].variance(j));
} else {
vals2[j] = Instance.missingValue();
}
}
m_ClusterStdDevs.add(new Instance(1.0, vals2));
}
}
/**
* clusters an instance that has been through the filters
*
* @param instance the instance to assign a cluster to
* @param updateErrors if true, update the within clusters sum of errors
* @return a cluster number
*/
private int clusterProcessedInstance(Instance instance, boolean updateErrors) {
int bestCluster = 0;
for (int i = 0; i < m_NumClusters; i++) {
if (category[0][m_Index] == i) {
bestCluster = i;
}
}
m_Index++;
return bestCluster;
}
/**
* Classifies a given instance.
*
* @param instance the instance to be assigned to a cluster
* @return the number of the assigned cluster as an interger
* if the class is enumerated, otherwise the predicted value
* @throws Exception if instance could not be classified
* successfully
*/
public int clusterInstance(Instance instance) throws Exception {
m_ReplaceMissingFilter.input(instance);
m_ReplaceMissingFilter.batchFinished();
Instance inst = m_ReplaceMissingFilter.output();
return clusterProcessedInstance(inst, false);
}
/**
* Calculates distance, the distance between two instances
*
* @param first the first instance
* @param second the second instance
* @return the distance between the two given instances
*/
private double distance(Instance first, Instance second) {
double val1;
double val2;
double dist = 0.0;
for (int i = 0; i <first.numAttributes(); i++) {
val1 = first.value(i);
val2 = second.value(i);
dist += (val1 - val2) * (val1 - val2);
}
dist = Math.sqrt(dist);
return dist;
}
/**
* construct the similar matrix
*
* @param instances the data set
* @return W the weighted similar matrix
*/
private double[][] solveW(Instances instances) {
int i, j;
int n = instances.numInstances();
double[][] W = new double[n][n];
for (i = 0; i < n; i++) {
for (j = i + 1; j < n; j++) {
W[i][j] = distance(instances.instance(i), instances.instance(j));
W[j][i] = processed;
}
W[i][i] = processed;
}
return W;
}
/**
* Initialize the category vector
* @param instances the data set
*/
private void initCategory(Instances instances) {
int n = instances.numInstances();
category = new int[1][n];
for (int i = 0; i < n; i++) {
category[0][i] = i;
}
}
/**
* merge the two closer cluster
* @param instances
*/
private void merge(Instances instances) {
int i, j;
int idx_of_clusterA = 0, idx_of_clusterB = 0;
int n = instances.numInstances();
int num_instance = instances.numInstances();
Matrix W = new Matrix(solveW(instances));
initCategory(instances);
while (num_instance > m_NumClusters) {
double mindist = 1000000;
for (i = 0; i < n; i++) {
for (j = i + 1; j < n; j++) {
if (W.get(i, j) < mindist) {
mindist = W.get(i, j);
idx_of_clusterA = i;
idx_of_clusterB = j;
}
}
}
W.set(idx_of_clusterA, idx_of_clusterB, processed);
W.set(idx_of_clusterB, idx_of_clusterA, processed);
for (i = 0; i < n; i++) {
if (category[0][i] == idx_of_clusterB) {
category[0][i] = idx_of_clusterA;
}
}
for (i = 0; i < n; i++) {
if (W.get(idx_of_clusterB, i) < W.get(idx_of_clusterA, i)) {
W.set(idx_of_clusterA, i, W.get(idx_of_clusterB, i));
}
W.set(idx_of_clusterB, i, processed);
W.set(i, idx_of_clusterB, processed);
}
num_instance -= 1;
}
}
/**
* label the clusters
* @param instances
*/
private void labelCluster(Instances instances) {
int i, j;
int n = instances.numInstances();
int categories = -1;
int[][] labeled = new int[1][n];
for (i = 0; i < n; i++) {
int temp = category[0][i];
if (labeled[0][temp] == 0) {
categories += 1;
for (j = i; j < n; j++) {
if (category[0][j] == temp) {
category[0][j] = categories;
}
}
labeled[0][categories] = 1;
}
}
}
/**
*
* set the index of the instance
*/
public void setIndex(int position) {
m_Index = position;
}
/**
* get the number of clusters
*
* @return m_NumClusters
*/
public int getNumClusters() {
return m_NumClusters;
}
/**
*
* @return m_ClusterSizes
*/
public int [] getClusterSizes() {
return m_ClusterSizes;
}
/**
* Returns the number of clusters.
*
* @return the number of clusters generated for a training dataset.
* @throws Exception if number of clusters could not be returned
* successfully
*/
public int numberOfClusters() throws Exception {
return m_NumClusters;
}
/**
* set the number of clusters to generate
*
* @param n the number of clusters to generate
* @throws Exception if number of clusters is negative
*/
public void setNumClusters(int n) throws Exception {
if (n <= 0) {
throw new Exception("Number of clusters must be > 0");
}
m_NumClusters = n;
}
/**
* Returns an enumeration describing the available options.
*
* @return an enumeration of all the available options.
*/
public Enumeration listOptions () {
Vector result = new Vector();
result.addElement(new Option(
"\tnumber of clusters.\n"
+ "\t(default 2).",
"N", 1, "-N <num>"));
Enumeration en = super.listOptions();
while (en.hasMoreElements())
result.addElement(en.nextElement());
return result.elements();
}
/**
* Parses a given list of options. <p/>
*
<!-- options-start -->
* Valid options are: <p/>
*
* <pre> -N <num>
* number of clusters.
* (default 2).</pre>
*
* <pre> -S <num>
* Random number seed.
* (default 10)</pre>
*
<!-- options-end -->
*
* @param options the list of options as an array of strings
* @throws Exception if an option is not supported
*/
public void setOptions (String[] options) throws Exception {
String optionString = Utils.getOption('N', options);
if (optionString.length() != 0) {
setNumClusters(Integer.parseInt(optionString));
}
super.setOptions(options);
}
/**
* Gets the current settings of SimpleKMeans
*
* @return an array of strings suitable for passing to setOptions()
*/
public String[] getOptions () {
int i;
Vector result;
String[] options;
result = new Vector();
result.add("-N");
result.add("" + getNumClusters());
options = super.getOptions();
for (i = 0; i < options.length; i++)
result.add(options[i]);
return (String[]) result.toArray(new String[result.size()]);
}
/**
* return a string describing this clusterer
*
* @return a description of the clusterer as a string
*/
public String toString() {
StringBuffer temp = new StringBuffer();
temp.append("\nSimple Chameleon\n======\n");
temp.append("sum of StdDevs: ");
double sum = 0;
for (int i = 0; i < m_NumClusters; i++) {
for (int j = 0; j < m_ClusterStdDevs.numAttributes(); j++) {
sum += m_ClusterStdDevs.instance(i).value(j);
}
}
temp.append(sum + "\n\n\n");
for (int i = 0; i < m_NumClusters; i++) {
temp.append("\nCluster "+i+"\t");
temp.append("\n\tStd Devs: ");
for (int j = 0; j < m_ClusterStdDevs.numAttributes(); j++) {
temp.append(" " + Utils.doubleToString(m_ClusterStdDevs.instance(i).value(j), 5, 4));
}
}
temp.append("\n\n");
return temp.toString();
}
/**
* Main method for testing this class.
*
* @param argv should contain the following arguments: <p>
* -t training file [-N number of clusters]
*/
public static void main (String[] argv) {
runClusterer(new SimpleChameleon(), argv);
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -