📄 simplekmeans.java

📁 数据挖掘聚类算法：SimpleKMeans源代码
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
    }
    if (updateErrors) {
      m_squaredErrors[bestCluster] += minDist;
    }
    return bestCluster;
  }

  /**
   * Assigns a given instance to a cluster.
   *
   * Unless missing-value replacement has been disabled, the instance is
   * first passed through the replace-missing-values filter; the (possibly
   * filtered) instance is then handed to clusterProcessedInstance without
   * updating the squared-error statistics.
   *
   * @param instance the instance to be assigned to a cluster
   * @return the index of the cluster the instance is assigned to
   * @throws Exception if instance could not be clustered successfully
   */
  public int clusterInstance(Instance instance) throws Exception {
    Instance inst = null;
    if (!m_dontReplaceMissing) {
      m_ReplaceMissingFilter.input(instance);
      m_ReplaceMissingFilter.batchFinished();
      inst = m_ReplaceMissingFilter.output();
    } else {
      inst = instance;
    }

    return clusterProcessedInstance(inst, false);
  }

  /**
   * Calculates the distance between two instances.
   *
   * Both instances are walked in parallel over their stored (sparse)
   * values. An attribute that is stored in only one of the two instances
   * is compared against the value 0 for the other one.
   *
   * @param first the first instance
   * @param second the second instance
   * @return the sum of the squared per-attribute differences; no square
   * root is taken and the sum is not divided by the number of attributes
   */
  private double distance(Instance first, Instance second) {

    double distance = 0;
    int firstI, secondI;

    for (int p1 = 0, p2 = 0;
         p1 < first.numValues() || p2 < second.numValues();) {
      // Once an instance is exhausted, use numAttributes() as a sentinel
      // index that is larger than any real attribute index, so that the
      // remaining values of the other instance are consumed one by one.
      if (p1 >= first.numValues()) {
        firstI = m_ClusterCentroids.numAttributes();
      } else {
        firstI = first.index(p1);
      }
      if (p2 >= second.numValues()) {
        secondI = m_ClusterCentroids.numAttributes();
      } else {
        secondI = second.index(p2);
      }
      // Disabled: skipping of the class attribute.
      /*
      if (firstI == m_ClusterCentroids.classIndex()) {
        p1++; continue;
      }
      if (secondI == m_ClusterCentroids.classIndex()) {
        p2++; continue;
      }
      */
      double diff;
      if (firstI == secondI) {
        // Attribute stored in both instances.
        diff = difference(firstI,
                          first.valueSparse(p1),
                          second.valueSparse(p2));
        p1++;
        p2++;
      } else if (firstI > secondI) {
        // Attribute stored only in the second instance.
        diff = difference(secondI,
                          0, second.valueSparse(p2));
        p2++;
      } else {
        // Attribute stored only in the first instance.
        diff = difference(firstI,
                          first.valueSparse(p1), 0);
        p1++;
      }
      distance += diff * diff;
    }

    // Disabled: normalised Euclidean distance.
    //return Math.sqrt(distance / m_ClusterCentroids.numAttributes());
    return distance;
  }

  /**
   * Computes the difference between two given attribute values.
   *
   * For a nominal attribute the result is 1 if either value is missing or
   * the two values differ, and 0 otherwise. For a numeric attribute the
   * result is the signed difference of the normalized values when both are
   * present, 1 when both are missing, and otherwise the larger of d and
   * (1 - d), where d is the normalized present value. Any other attribute
   * type yields 0.
   *
   * @param index the attribute index
   * @param val1 the first value
   * @param val2 the second value
   * @return the difference
   */
  private double difference(int index, double val1, double val2) {

    switch (m_ClusterCentroids.attribute(index).type()) {
    case Attribute.NOMINAL:

      // If attribute is nominal
      if (Instance.isMissingValue(val1) ||
          Instance.isMissingValue(val2) ||
          ((int)val1 != (int)val2)) {
        return 1;
      } else {
        return 0;
      }
    case Attribute.NUMERIC:

      // If attribute is numeric
      if (Instance.isMissingValue(val1) ||
          Instance.isMissingValue(val2)) {
        if (Instance.isMissingValue(val1) &&
            Instance.isMissingValue(val2)) {
          return 1;
        } else {
          double diff;
          if (Instance.isMissingValue(val2)) {
            diff = norm(val1, index);
          } else {
            diff = norm(val2, index);
          }
          // Take the larger of diff and (1 - diff) for the one-missing case.
          if (diff < 0.5) {
            diff = 1.0 - diff;
          }
          return diff;
        }
      } else {
        return norm(val1, index) - norm(val2, index);
      }
    default:
      return 0;
    }
  }

  /**
   * Normalizes a given value of a numeric attribute using the recorded
   * minimum and maximum of that attribute.
   *
   * @param x the value to be normalized
   * @param i the attribute's index
   * @return (x - min) / (max - min), or 0 if no minimum has been recorded
   * yet (NaN) or the minimum and maximum are equal
   */
  private double norm(double x, int i) {

    if (Double.isNaN(m_Min[i]) || Utils.eq(m_Max[i], m_Min[i])) {
      return 0;
    } else {
      return (x - m_Min[i]) / (m_Max[i] - m_Min[i]);
    }
  }

  /**
   * Updates the minimum and maximum values for all the attributes
   * based on a new instance. Missing values are ignored.
   *
   * @param instance the new instance
   */
  private void updateMinMax(Instance instance) {

    for (int j = 0; j < m_ClusterCentroids.numAttributes(); j++) {
      if (!instance.isMissing(j)) {
        if (Double.isNaN(m_Min[j])) {
          // First value seen for this attribute.
          m_Min[j] = instance.value(j);
          m_Max[j] = instance.value(j);
        } else {
          if (instance.value(j) < m_Min[j]) {
            m_Min[j] = instance.value(j);
          } else {
            if (instance.value(j) > m_Max[j]) {
              m_Max[j] = instance.value(j);
            }
          }
        }
      }
    }
  }

  /**
   * Returns the number of clusters.
   *
   * @return the configured number of clusters
   * @throws Exception declared by the signature; not thrown by this
   * implementation
   */
  public int numberOfClusters() throws Exception {
    return m_NumClusters;
  }

  /**
   * Returns an enumeration describing the available options: the options
   * defined here followed by those of the superclass.
   *
   * @return an enumeration of all the available options.
   */
  public Enumeration listOptions () {
    Vector result = new Vector();

    result.addElement(new Option(
        "\tnumber of clusters.\n"
        + "\t(default 2).",
        "N", 1, "-N <num>"));

    result.addElement(new Option(
        "\tDisplay std. deviations for centroids.\n",
        "V", 0, "-V"));

    // -M sets m_dontReplaceMissing, i.e. it switches replacement OFF.
    result.addElement(new Option(
        "\tDon't replace missing values with mean/mode.\n",
        "M", 0, "-M"));

    Enumeration en = super.listOptions();
    while (en.hasMoreElements()) {
      result.addElement(en.nextElement());
    }

    return result.elements();
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String numClustersTipText() {
    return "set number of clusters";
  }

  /**
   * set the number of clusters to generate
   *
   * @param n the number of clusters to generate
   * @throws Exception if number of clusters is not positive (n &lt;= 0)
   */
  public void setNumClusters(int n) throws Exception {
    if (n <= 0) {
      throw new Exception("Number of clusters must be > 0");
    }
    m_NumClusters = n;
  }

  /**
   * gets the number of clusters to generate
   *
   * @return the number of clusters to generate
   */
  public int getNumClusters() {
    return m_NumClusters;
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String displayStdDevsTipText() {
    return "Display std deviations of numeric attributes "
      + "and counts of nominal attributes.";
  }

  /**
   * Sets whether standard deviations and nominal count
   * Should be displayed in the clustering output
   *
   * @param stdD true if std. devs and counts should be
   * displayed
   */
  public void setDisplayStdDevs(boolean stdD) {
    m_displayStdDevs = stdD;
  }

  /**
   * Gets whether standard deviations and nominal count
   * Should be displayed in the clustering output
   *
   * @return true if std. devs and counts should be
   * displayed
   */
  public boolean getDisplayStdDevs() {
    return m_displayStdDevs;
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String dontReplaceMissingValuesTipText() {
    // NOTE(review): the property is "dontReplaceMissingValues" (true means
    // replacement is skipped), so this text reads as the opposite of what
    // enabling the property does -- confirm the intended wording.
    return "Replace missing values globally with mean/mode.";
  }

  /**
   * Sets whether replacement of missing values is to be skipped
   *
   * @param r true if missing values are NOT to be
   * replaced
   */
  public void setDontReplaceMissingValues(boolean r) {
    m_dontReplaceMissing = r;
  }

  /**
   * Gets whether replacement of missing values is skipped
   *
   * @return true if missing values are NOT to be
   * replaced
   */
  public boolean getDontReplaceMissingValues() {
    return m_dontReplaceMissing;
  }

  /**
   * Parses a given list of options. <p/>
   *
   <!-- options-start -->
   * Valid options are: <p/>
   *
   * <pre> -N &lt;num&gt;
   *  number of clusters.
   *  (default 2).</pre>
   *
   * <pre> -V
   *  Display std. deviations for centroids.
   * </pre>
   *
   * <pre> -M
   *  Don't replace missing values with mean/mode.
   * </pre>
   *
   * <pre> -S &lt;num&gt;
   *  Random number seed.
   *  (default 10)</pre>
   *
   <!-- options-end -->
   *
   * @param options the list of options as an array of strings
   * @throws Exception if an option is not supported
   */
  public void setOptions (String[] options)
    throws Exception {

    m_displayStdDevs = Utils.getFlag("V", options);
    m_dontReplaceMissing = Utils.getFlag("M", options);

    // Only override the number of clusters when -N was actually supplied.
    String optionString = Utils.getOption('N', options);

    if (optionString.length() != 0) {
      setNumClusters(Integer.parseInt(optionString));
    }

    super.setOptions(options);
  }

  /**
   * Gets the current settings of SimpleKMeans
   *
   * @return an array of strings suitable for passing to setOptions()
   */
  public String[] getOptions () {
    int       i;
    Vector    result;
    String[]  options;

    result = new Vector();

    if (m_displayStdDevs) {
      result.add("-V");
    }

    if (m_dontReplaceMissing) {
      result.add("-M");
    }

    result.add("-N");
    result.add("" + getNumClusters());

    // Append the options of the superclass.
    options = super.getOptions();
    for (i = 0; i < options.length; i++) {
      result.add(options[i]);
    }

    return (String[]) result.toArray(new String[result.size()]);
  }

  /**
   * return a string describing this clusterer
   *
   * @return a description of the clusterer as a string
   */
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -