📄 kwmeans.txt

📁 改进的k-means方法
💻 TXT
📖 第 1 页 / 共 2 页
字号:
12 下一页
/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    SimpleKMeans.java
 *    Copyright (C) 2000 University of Waikato, Hamilton, New Zealand
 *
 */
package weka.clusterers;

import weka.classifiers.rules.DecisionTableHashKey;
import weka.core.Attribute;
import weka.core.Capabilities;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.Utils;
import weka.core.WeightedInstancesHandler;
import weka.core.Capabilities.Capability;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.ReplaceMissingValues;

import java.util.Enumeration;
import java.util.HashMap;
import java.util.Random;
import java.util.Vector;

/**
 <!-- globalinfo-start -->
 * Cluster data using the k means algorithm
 * <p/>
 <!-- globalinfo-end -->
 *
 <!-- options-start -->
 * Valid options are: <p/>
 * 
 * <pre> -N &lt;num&gt;
 *  number of clusters.
 *  (default 2).</pre>
 * 
 * <pre> -S &lt;num&gt;
 *  Random number seed.
 *  (default 10)</pre>
 * 
 <!-- options-end -->
 *
 * @author Mark Hall (mhall@cs.waikato.ac.nz)
 * @author Eibe Frank (eibe@cs.waikato.ac.nz)
 * @version $Revision: 1.29 $
 * @see RandomizableClusterer
 */
public class KwMeans 
  extends RandomizableClusterer 
  implements NumberOfClustersRequestable, WeightedInstancesHandler {

  /** for serialization */
  static final long serialVersionUID = -3235809600124455376L;
  
  /**
   * replace missing values in training instances
   */
  private ReplaceMissingValues m_ReplaceMissingFilter;

  /**
   * number of clusters to generate
   */
  private int m_NumClusters = 2;

  /**
   * holds the cluster centroids
   */
  private Instances m_ClusterCentroids;

  /**
   * Holds the standard deviations of the numeric attributes in each cluster
   */
  private Instances m_ClusterStdDevs;

  
  /**
   * For each cluster, holds the frequency counts for the values of each 
   * nominal attribute
   */
  private int [][][] m_ClusterNominalCounts;

  /**
   * The number of instances in each cluster
   */
  private int [] m_ClusterSizes;

  /**
   * attribute min values
   */
  private double [] m_Min;
  
  /**
   * attribute max values
   */
  private double [] m_Max;

  /**
   * Keep track of the number of iterations completed before convergence
   */
  private int m_Iterations = 0;

  /**
   * Holds the squared errors for all clusters
   */
  private double [] m_squaredErrors;

  /**
   * the default constructor
   */
  public KwMeans() {
    super();
    
    m_SeedDefault = 10;
    setSeed(m_SeedDefault);
  }
  
  /**
   * Returns a string describing this clusterer
   * @return a description of the evaluator suitable for
   * displaying in the explorer/experimenter gui
   */
  public String globalInfo() {
    return "Cluster data using the k means algorithm";
  }

  /**
   * Returns default capabilities of the clusterer.
   *
   * @return      the capabilities of this clusterer
   */
  public Capabilities getCapabilities() {
    Capabilities result = super.getCapabilities();

    // attributes
    result.enable(Capability.NOMINAL_ATTRIBUTES);
    result.enable(Capability.NUMERIC_ATTRIBUTES);
    result.enable(Capability.MISSING_VALUES);

    return result;
  }

  /**
   * Generates a clusterer. Has to initialize all fields of the clusterer
   * that are not being set via options.
   *
   * @param data set of instances serving as training data 
   * @throws Exception if the clusterer has not been 
   * generated successfully
   */
  public void buildClusterer(Instances data) throws Exception {
	  //生成聚类过程
     // can clusterer handle the data?
    getCapabilities().testWithFail(data);//测试这个聚类能否处理这个数据集

    m_Iterations = 0;//迭代次数初始化为0
    /*ReplaceMissingValues()
     * 用训练数据中的众数或是平均值来代替数据集中的名词性或数值性属性的缺失值
     */
    m_ReplaceMissingFilter = new ReplaceMissingValues();
    //实例化，创建一个样例集对象，从给定的数据集复制所有的实例和引用
    Instances instances = new Instances(data);
    instances.setClassIndex(-1);//设置下标，如果为负数则设置为没有下标
    m_ReplaceMissingFilter.setInputFormat(instances);//设置输入样例的格式
    instances = Filter.useFilter(instances, m_ReplaceMissingFilter);
    //对全部样例进行过滤，返回过滤后的新的样例集

    m_Min = new double [instances.numAttributes()];
    m_Max = new double [instances.numAttributes()];
    for (int i = 0; i < instances.numAttributes(); i++) {
      m_Min[i] = m_Max[i] = Double.NaN;
      //A constant holding a Not-a-Number (NaN) value of type double
      //设置初始值
    }
    //创建一个空的样例集合，从给定的数据集复制引用，如果是负的则设置样例集的容量为0
    m_ClusterCentroids = new Instances(instances, m_NumClusters);
    //此数组用来存放样例所属的聚类标号（一维数组）
    int[] clusterAssignments = new int [instances.numInstances()];
    //基于第i个实例更新所有属性的最大值和最小值
    for (int i = 0; i < instances.numInstances(); i++) {
      updateMinMax(instances.instance(i));
      //给每个实例赋予权重
     
    }
    setWeight(instances,instances.numInstances());
    /*随机取聚类中心
     * 获得的随机数每次运行都是相同的，取得的聚类中心也是相同的。
       因此程序的运行结果也应该是不变的。
     */
    
    Random RandomO = new Random(getSeed());//随机数组
    int instIndex;
    HashMap initC = new HashMap();
    DecisionTableHashKey hk = null;//为决策表提供hash值的类

    for (int j = instances.numInstances() - 1; j >= 0; j--) {
      instIndex = RandomO.nextInt(j+1);
      //Instance weka.core.Instances.instance(int index)返回给定位置上的
      //实例
      hk = new DecisionTableHashKey(instances.instance(instIndex), 
				     instances.numAttributes(), true);
     
      if (!initC.containsKey(hk)) {
    	  //把一个新的实例添加到聚类中心集合中
	m_ClusterCentroids.add(instances.instance(instIndex));
	initC.put(hk, null);
      }
      instances.swap(j, instIndex);//交换j和instIndex的下标
      //如果聚类中心集合包含的样例数目与生成聚类数目相同，则停止添加
      //聚类中心的过程
      if (m_ClusterCentroids.numInstances() == m_NumClusters) {
	break;
      }
    }
   //给生成的聚类数赋值：聚类中心实例个数
    m_NumClusters = m_ClusterCentroids.numInstances();
    
    int i;
    boolean converged = false;//判断是否进行下次迭代
    int emptyClusterCount;
    //生成一个Instances类型的数组，其大小为聚类的数目
    Instances [] tempI = new Instances[m_NumClusters];
    //存放聚类方差的数组，大小为生成的聚类数目
    m_squaredErrors = new double [m_NumClusters];
    //保存每个聚类中每个名词属性出现频率的数组（为什么第三维为0？？？）
    m_ClusterNominalCounts = new int [m_NumClusters][instances.numAttributes()][0];
    while (!converged) {
      emptyClusterCount = 0;//设置空聚类数目为0
      m_Iterations++;//收敛前迭代次数+1
      converged = true;
      //对每个样例进行聚类
      for (i = 0; i < instances.numInstances(); i++) {
   /* wake.core.Instance:Class for handling an instance.所有的值被
    * 初始化并保存为浮点类型。如果是名词性属性，存储的值为这个属性的索引
    * 以下几个方法是这个类的典型应用
    * 1.用三个属性值创建一个空的实例集
    *   Instance inst=new Instance(3);
    * 2.设置实例的属性为length,weight,position
    *   inst.setValue(length,5.3);
    *   inst.setValue(weight,300);
    *   inst.setValue(position,first);
    * 3.设置实例的数据集为race
    *   inst.setDataset(race);
    * 4.输出实例
    *   System.out.println("The instance:"+inst);
   */
	Instance toCluster = instances.instance(i);//将第i个实例赋给toCluster
	//对实例进行聚类，返回一个聚类标号
	int newC = clusterProcessedInstance(toCluster, true);
	if (newC != clusterAssignments[i]) {
	//clusterAssignments[]存放的聚类标号，
	//clusterAssignments[i]表示第i个聚类
	  converged = false;
	}
	clusterAssignments[i] = newC;//存放每个样例所属的聚类标号
      }
      
      // 更新聚类中心
      m_ClusterCentroids = new Instances(instances, m_NumClusters);
      for (i = 0; i < m_NumClusters; i++) {
	tempI[i] = new Instances(instances, 0);
      }
      for (i = 0; i < instances.numInstances(); i++) {
    //把第i个实例添加到相应的聚类集合中
	tempI[clusterAssignments[i]].add(instances.instance(i));
	//tempI数组里面存放的是每一个聚类和每一个聚类包含的样例 
	//此数组为Instance型 每一维都是一个样例集
	//有多少个聚类就有多少维  每一维的样例数目不等 
	//完成添加之后tempI数组中就是分聚类存放所有的样例
      }
      for (i = 0; i < m_NumClusters; i++) {
    //数组大小为实例属性的个数，用来存放每个属性的平均值
	double [] vals = new double[instances.numAttributes()];
	//如果第i个聚类的样例数目为0则判定为空，空聚类的数目+1
	if (tempI[i].numInstances() == 0) {
	  // empty cluster
	  emptyClusterCount++;
	} 
	else {
	  for (int j = 0; j < instances.numAttributes(); j++) {
    /*加权平均 double weka.core.Instances.meanOrMode(int attIndex)
     * 返回数值（名词）属性的平均值为浮点类型，如果属性既不是数值的也不
     * 是名词性的则返回0；若属性丢失返回0
     */
	  	  
	    vals[j] = wMean(tempI[i],j);
	//第i个聚类的第j个属性出现的频率
	//int[][][] weka.clusterers.SimpleKMeans.m_ClusterNominalCounts
	//为每个聚类中的每个名词性属性的值保存出现的频率
	    m_ClusterNominalCounts[i][j] = 
	      tempI[i].attributeStats(j).nominalCounts;
	/*AttributeStats weka.core.Instances.attributeStats(int index)
	 * 统计在此实例集上出现的属性的次数
	 */
	  }
	//初始化实例变量，1.0为设置的权重的值
	  m_ClusterCentroids.add(new Instance(1.0, vals));
	}
      }

      if (emptyClusterCount > 0) {
	m_NumClusters -= emptyClusterCount;
	tempI = new Instances[m_NumClusters];
      }
      if (!converged) {
    //如果有空聚类，则修改聚类数量
	m_squaredErrors = new double [m_NumClusters];
	//修改聚类数组的长度
	m_ClusterNominalCounts = new int [m_NumClusters][instances.numAttributes()][0];
      }
    }
    //ClusterStdDevs存放每个聚类中数值属性的标准差
    m_ClusterStdDevs = new Instances(instances, m_NumClusters);
    m_ClusterSizes = new int [m_NumClusters];//数组的大小为聚类的数目
    for (i = 0; i < m_NumClusters; i++) {
      double [] vals2 = new double[instances.numAttributes()];
      for (int j = 0; j < instances.numAttributes(); j++) {
    //如果第j个属性是数值的，则计算其标准差，否则返回缺失值
	if (instances.attribute(j).isNumeric()) {
	  vals2[j] = Math.sqrt(tempI[i].variance(j));
	} else {
	  vals2[j] = Instance.missingValue();
	}	
      }
      m_ClusterStdDevs.add(new Instance(1.0, vals2));
      //重新定义每个聚类的大小
      m_ClusterSizes[i] = tempI[i].numInstances();
    }
  }

  /**
   * clusters an instance that has been through the filters
   *
   * @param instance the instance to assign a cluster to
   * @param updateErrors if true, update the within clusters sum of errors
   * @return a cluster number
   */
  private int clusterProcessedInstance(Instance instance, boolean updateErrors) {
    double minDist = Integer.MAX_VALUE;
    int bestCluster = 0;
    for (int i = 0; i < m_NumClusters; i++) {
      double dist = distance(instance, m_ClusterCentroids.instance(i));
      if (dist < minDist) {
	minDist = dist;
	bestCluster = i;
      }
    }
    if (updateErrors) {
      m_squaredErrors[bestCluster] += minDist;
    }
    return bestCluster;
  }

  /**
   * Classifies a given instance.
   *
   * @param instance the instance to be assigned to a cluster
   * @return the number of the assigned cluster as an interger
   * if the class is enumerated, otherwise the predicted value
   * @throws Exception if instance could not be classified
   * successfully
   */
  public int clusterInstance(Instance instance) throws Exception {
    m_ReplaceMissingFilter.input(instance);
    m_ReplaceMissingFilter.batchFinished();
    Instance inst = m_ReplaceMissingFilter.output();

    return clusterProcessedInstance(inst, false);
  }

  /**
   * Calculates the distance between two instances
   *
   * @param first the first instance
   * @param second the second instance
   * @return the distance between the two given instances, between 0 and 1
   */          
  private double distance(Instance first, Instance second) {  

    double distance = 0;
    int firstI, secondI;

    for (int p1 = 0, p2 = 0; 
	 p1 < first.numValues() || p2 < second.numValues();) {
      if (p1 >= first.numValues()) {
	firstI = m_ClusterCentroids.numAttributes();
      } else {
12 下一页
💿 文件大小 10 K
👤 上传用户 dongjunxi126
📂 所属分类 Java编程
🏷️ 相关标签

#k-means
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -