📄 kwmeans.txt

📁 改进的k-means方法
💻 TXT
📖 第 1 页 / 共 2 页
字号:
上一页 12
	firstI = first.index(p1); 
      }
      if (p2 >= second.numValues()) {
	secondI = m_ClusterCentroids.numAttributes();
      } else {
	secondI = second.index(p2);
      }
      /*      if (firstI == m_ClusterCentroids.classIndex()) {
	p1++; continue;
      } 
      if (secondI == m_ClusterCentroids.classIndex()) {
	p2++; continue;
        } */
      double diff;
      if (firstI == secondI) {
	diff = difference(firstI, 
			  first.valueSparse(p1),
			  second.valueSparse(p2));
	p1++; p2++;
      } else if (firstI > secondI) {
	diff = difference(secondI, 
			  0, second.valueSparse(p2));
	p2++;
      } else {
	diff = difference(firstI, 
			  first.valueSparse(p1), 0);
	p1++;
      }
      distance += diff * diff;
    }
    
    //return Math.sqrt(distance / m_ClusterCentroids.numAttributes());
    return distance;
  }

  /**
   * Computes the difference between two given attribute
   * values.
   * 
   * @param index the attribute index
   * @param val1 the first value
   * @param val2 the second value
   * @return the difference
   */
  private double difference(int index, double val1, double val2) {

    switch (m_ClusterCentroids.attribute(index).type()) {
    case Attribute.NOMINAL:
      
      // If attribute is nominal
      if (Instance.isMissingValue(val1) || 
	  Instance.isMissingValue(val2) ||
	  ((int)val1 != (int)val2)) {
	return 1;
      } else {
	return 0;
      }
    case Attribute.NUMERIC:

      // If attribute is numeric
      if (Instance.isMissingValue(val1) || 
	  Instance.isMissingValue(val2)) {
	if (Instance.isMissingValue(val1) && 
	    Instance.isMissingValue(val2)) {
	  return 1;
	} else {
	  double diff;
	  if (Instance.isMissingValue(val2)) {
	    diff = norm(val1, index);
	  } else {
	    diff = norm(val2, index);
	  }
	  if (diff < 0.5) {
	    diff = 1.0 - diff;
	  }
	  return diff;
	}
      } else {
	return norm(val1, index) - norm(val2, index);
      }
    default:
      return 0;
    }
  }
  protected void setWeight(Instances instances,int num){
	double Alldistance=0.0,Allpreweight=0.0;
	double[] preweight=new double[num];
	double[] weight=new double[num];
	for(int j=0;j<num;j++){
		for(int i=0;i<num;i++){
		 Alldistance+=distance(instances.instance(i),instances.instance(j));
			}
		preweight[j]=Alldistance/num;
	}
	for(int i=0;i<num;i++){
		Allpreweight+=preweight[i];
		}
	for(int i=0;i<num;i++){
		
		weight[i]=Allpreweight/preweight[i];
		instances.instance(i).setWeight(weight[i]);
	}
  }
  //index为属性索引，instance为所要加权的属性
  protected double wMean(Instances instance,int index){
	  double weightmean=0.0,awmean=0.0,sumweight=0.0;
	  int [] counts;
	  if(instance.attribute(index).isNumeric()){
	  for(int i=0;i<instance.numInstances();i++){
		  sumweight += instance.instance(i).weight();
		weightmean+=instance.instance(i).weight()*instance.instance(i).value(index);
	    awmean=weightmean/sumweight;
	  }
	  }
	  //如果属性是名词性的
	 
	  else if (instance.attribute(index).isNominal()) {
	    	
	    	 counts = new int[instance.attribute(index).numValues()];
	      for (int j = 0; j < instance.numInstances(); j++) {
		if (!instance.instance(j).isMissing(index)) {//如果是名词性属性则直接求权值和
		  counts[(int) instance.instance(j).value(index)] += instance.instance(j).weight();
		   
			}
		  }
	      awmean= (double)Utils.maxIndex(counts);//返回权值最大的属性值的权重作为均值
	     
	    }else{awmean=0.0;}
	 
	  return awmean;
	    
  }
  /**
   * Normalizes a given value of a numeric attribute.
   *
   * @param x the value to be normalized
   * @param i the attribute's index
   * @return the normalized value
   */
  private double norm(double x, int i) {

    if (Double.isNaN(m_Min[i]) || Utils.eq(m_Max[i],m_Min[i])) {
      return 0;
    } else {
      return (x - m_Min[i]) / (m_Max[i] - m_Min[i]);
    }
  }

  /**
   * Updates the minimum and maximum values for all the attributes
   * based on a new instance.
   *
   * @param instance the new instance
   */
  private void updateMinMax(Instance instance) {  

    for (int j = 0;j < m_ClusterCentroids.numAttributes(); j++) {
      if (!instance.isMissing(j)) {
	if (Double.isNaN(m_Min[j])) {
	  m_Min[j] = instance.value(j);
	  m_Max[j] = instance.value(j);
	} else {
	  if (instance.value(j) < m_Min[j]) {
	    m_Min[j] = instance.value(j);
	  } else {
	    if (instance.value(j) > m_Max[j]) {
	      m_Max[j] = instance.value(j);
	    }
	  }
	}
      }
    }
  }
  
  /**
   * Returns the number of clusters.
   *
   * @return the number of clusters generated for a training dataset.
   * @throws Exception if number of clusters could not be returned
   * successfully
   */
  public int numberOfClusters() throws Exception {
    return m_NumClusters;
  }

  /**
   * Returns an enumeration describing the available options.
   *
   * @return an enumeration of all the available options.
   */
  public Enumeration listOptions () {
    Vector result = new Vector();

    result.addElement(new Option(
	"\tnumber of clusters.\n"
	+ "\t(default 2).", 
	"N", 1, "-N <num>"));

    Enumeration en = super.listOptions();
    while (en.hasMoreElements())
      result.addElement(en.nextElement());

     return  result.elements();
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String numClustersTipText() {
    return "set number of clusters";
  }

  /**
   * set the number of clusters to generate
   *
   * @param n the number of clusters to generate
   * @throws Exception if number of clusters is negative
   */
  public void setNumClusters(int n) throws Exception {
    if (n <= 0) {
      throw new Exception("Number of clusters must be > 0");
    }
    m_NumClusters = n;
  }

  /**
   * gets the number of clusters to generate
   *
   * @return the number of clusters to generate
   */
  public int getNumClusters() {
    return m_NumClusters;
  }

  /**
   * Parses a given list of options. <p/>
   * 
   <!-- options-start -->
   * Valid options are: <p/>
   * 
   * <pre> -N &lt;num&gt;
   *  number of clusters.
   *  (default 2).</pre>
   * 
   * <pre> -S &lt;num&gt;
   *  Random number seed.
   *  (default 10)</pre>
   * 
   <!-- options-end -->
   *
   * @param options the list of options as an array of strings
   * @throws Exception if an option is not supported
   */
  public void setOptions (String[] options)
    throws Exception {

    String optionString = Utils.getOption('N', options);

    if (optionString.length() != 0) {
      setNumClusters(Integer.parseInt(optionString));
    }
    
    super.setOptions(options);
  }

  /**
   * Gets the current settings of SimpleKMeans
   *
   * @return an array of strings suitable for passing to setOptions()
   */
  public String[] getOptions () {
    int       	i;
    Vector    	result;
    String[]  	options;

    result = new Vector();

    result.add("-N");
    result.add("" + getNumClusters());

    options = super.getOptions();
    for (i = 0; i < options.length; i++)
      result.add(options[i]);

    return (String[]) result.toArray(new String[result.size()]);	  
  }

  /**
   * return a string describing this clusterer
   *
   * @return a description of the clusterer as a string
   */
  public String toString() {
    int maxWidth = 0;
    for (int i = 0; i < m_NumClusters; i++) {
      for (int j = 0 ;j < m_ClusterCentroids.numAttributes(); j++) {
	if (m_ClusterCentroids.attribute(j).isNumeric()) {
	  double width = Math.log(Math.abs(m_ClusterCentroids.instance(i).value(j))) /
	    Math.log(10.0);
	  width += 1.0;
	  if ((int)width > maxWidth) {
	    maxWidth = (int)width;
	  }
	}
      }
    }
    StringBuffer temp = new StringBuffer();
    String naString = "N/A";
    for (int i = 0; i < maxWidth+2; i++) {
      naString += " ";
    }
    temp.append("\nkMeans\n======\n");
    temp.append("\nNumber of iterations: " + m_Iterations+"\n");
    temp.append("Within cluster sum of squared errors: " + Utils.sum(m_squaredErrors));

    temp.append("\n\nCluster centroids:\n");
    for (int i = 0; i < m_NumClusters; i++) {
      temp.append("\nCluster "+i+"\n\t");
      temp.append("Mean/Mode: ");
      for (int j = 0; j < m_ClusterCentroids.numAttributes(); j++) {
	if (m_ClusterCentroids.attribute(j).isNominal()) {
	  temp.append(" "+m_ClusterCentroids.attribute(j).
		      value((int)m_ClusterCentroids.instance(i).value(j)));
	} else {
	  temp.append(" "+Utils.doubleToString(m_ClusterCentroids.instance(i).value(j),
					       maxWidth+5, 4));
	}
      }
      temp.append("\n\tStd Devs:  ");
      for (int j = 0; j < m_ClusterStdDevs.numAttributes(); j++) {
	if (m_ClusterStdDevs.attribute(j).isNumeric()) {
	  temp.append(" "+Utils.doubleToString(m_ClusterStdDevs.instance(i).value(j), 
					       maxWidth+5, 4));
	} else {
	  temp.append(" "+naString);
	}
      }
    }
    temp.append("\n\n");
    return temp.toString();
  }

  /**
   * Gets the the cluster centroids
   * 
   * @return		the cluster centroids
   */
  public Instances getClusterCentroids() {
    return m_ClusterCentroids;
  }

  /**
   * Gets the standard deviations of the numeric attributes in each cluster
   * 
   * @return		the standard deviations of the numeric attributes 
   * 			in each cluster
   */
  public Instances getClusterStandardDevs() {
    return m_ClusterStdDevs;
  }

  /**
   * Returns for each cluster the frequency counts for the values of each 
   * nominal attribute
   * 
   * @return		the counts
   */
  public int [][][] getClusterNominalCounts() {
    return m_ClusterNominalCounts;
  }

  /**
   * Gets the squared error for all clusters
   * 
   * @return		the squared error
   */
  public double getSquaredError() {
    return Utils.sum(m_squaredErrors);
  }

  /**
   * Gets the number of instances in each cluster
   * 
   * @return		The number of instances in each cluster
   */
  public int [] getClusterSizes() {
    return m_ClusterSizes;
  }

  /**
   * Main method for testing this class.
   *
   * @param argv should contain the following arguments: <p>
   * -t training file [-N number of clusters]
   */
  public static void main (String[] argv) {
    runClusterer(new KwMeans(), argv);
  }
}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -