📄 simplekmeans.java
字号:
} if (updateErrors) { m_squaredErrors[bestCluster] += minDist; } return bestCluster; } /** * Classifies a given instance. * * @param instance the instance to be assigned to a cluster * @return the number of the assigned cluster as an interger * if the class is enumerated, otherwise the predicted value * @throws Exception if instance could not be classified * successfully */ public int clusterInstance(Instance instance) throws Exception { Instance inst = null; if (!m_dontReplaceMissing) { m_ReplaceMissingFilter.input(instance); m_ReplaceMissingFilter.batchFinished(); inst = m_ReplaceMissingFilter.output(); } else { inst = instance; } return clusterProcessedInstance(inst, false); } /** * Calculates the distance between two instances * * @param first the first instance * @param second the second instance * @return the distance between the two given instances, between 0 and 1 */ private double distance(Instance first, Instance second) { double distance = 0; int firstI, secondI; for (int p1 = 0, p2 = 0; p1 < first.numValues() || p2 < second.numValues();) { if (p1 >= first.numValues()) { firstI = m_ClusterCentroids.numAttributes(); } else { firstI = first.index(p1); } if (p2 >= second.numValues()) { secondI = m_ClusterCentroids.numAttributes(); } else { secondI = second.index(p2); } /* if (firstI == m_ClusterCentroids.classIndex()) { p1++; continue; } if (secondI == m_ClusterCentroids.classIndex()) { p2++; continue; } */ double diff; if (firstI == secondI) { diff = difference(firstI, first.valueSparse(p1), second.valueSparse(p2)); p1++; p2++; } else if (firstI > secondI) { diff = difference(secondI, 0, second.valueSparse(p2)); p2++; } else { diff = difference(firstI, first.valueSparse(p1), 0); p1++; } distance += diff * diff; } //return Math.sqrt(distance / m_ClusterCentroids.numAttributes()); return distance; } /** * Computes the difference between two given attribute * values. * * @param index the attribute index * @param val1 the first value * @param val2 the second value * @return the difference */ private double difference(int index, double val1, double val2) { switch (m_ClusterCentroids.attribute(index).type()) { case Attribute.NOMINAL: // If attribute is nominal if (Instance.isMissingValue(val1) || Instance.isMissingValue(val2) || ((int)val1 != (int)val2)) { return 1; } else { return 0; } case Attribute.NUMERIC: // If attribute is numeric if (Instance.isMissingValue(val1) || Instance.isMissingValue(val2)) { if (Instance.isMissingValue(val1) && Instance.isMissingValue(val2)) { return 1; } else { double diff; if (Instance.isMissingValue(val2)) { diff = norm(val1, index); } else { diff = norm(val2, index); } if (diff < 0.5) { diff = 1.0 - diff; } return diff; } } else { return norm(val1, index) - norm(val2, index); } default: return 0; } } /** * Normalizes a given value of a numeric attribute. * * @param x the value to be normalized * @param i the attribute's index * @return the normalized value */ private double norm(double x, int i) { if (Double.isNaN(m_Min[i]) || Utils.eq(m_Max[i],m_Min[i])) { return 0; } else { return (x - m_Min[i]) / (m_Max[i] - m_Min[i]); } } /** * Updates the minimum and maximum values for all the attributes * based on a new instance. * * @param instance the new instance */ private void updateMinMax(Instance instance) { for (int j = 0;j < m_ClusterCentroids.numAttributes(); j++) { if (!instance.isMissing(j)) { if (Double.isNaN(m_Min[j])) { m_Min[j] = instance.value(j); m_Max[j] = instance.value(j); } else { if (instance.value(j) < m_Min[j]) { m_Min[j] = instance.value(j); } else { if (instance.value(j) > m_Max[j]) { m_Max[j] = instance.value(j); } } } } } } /** * Returns the number of clusters. * * @return the number of clusters generated for a training dataset. * @throws Exception if number of clusters could not be returned * successfully */ public int numberOfClusters() throws Exception { return m_NumClusters; } /** * Returns an enumeration describing the available options. * * @return an enumeration of all the available options. */ public Enumeration listOptions () { Vector result = new Vector(); result.addElement(new Option( "\tnumber of clusters.\n" + "\t(default 2).", "N", 1, "-N <num>")); result.addElement(new Option( "\tDisplay std. deviations for centroids.\n", "V", 0, "-V")); result.addElement(new Option( "\tReplace missing values with mean/mode.\n", "M", 0, "-M")); Enumeration en = super.listOptions(); while (en.hasMoreElements()) result.addElement(en.nextElement()); return result.elements(); } /** * Returns the tip text for this property * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String numClustersTipText() { return "set number of clusters"; } /** * set the number of clusters to generate * * @param n the number of clusters to generate * @throws Exception if number of clusters is negative */ public void setNumClusters(int n) throws Exception { if (n <= 0) { throw new Exception("Number of clusters must be > 0"); } m_NumClusters = n; } /** * gets the number of clusters to generate * * @return the number of clusters to generate */ public int getNumClusters() { return m_NumClusters; } /** * Returns the tip text for this property * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String displayStdDevsTipText() { return "Display std deviations of numeric attributes " + "and counts of nominal attributes."; } /** * Sets whether standard deviations and nominal count * Should be displayed in the clustering output * * @param stdD true if std. devs and counts should be * displayed */ public void setDisplayStdDevs(boolean stdD) { m_displayStdDevs = stdD; } /** * Gets whether standard deviations and nominal count * Should be displayed in the clustering output * * @return true if std. devs and counts should be * displayed */ public boolean getDisplayStdDevs() { return m_displayStdDevs; } /** * Returns the tip text for this property * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String dontReplaceMissingValuesTipText() { return "Replace missing values globally with mean/mode."; } /** * Sets whether missing values are to be replaced * * @param r true if missing values are to be * replaced */ public void setDontReplaceMissingValues(boolean r) { m_dontReplaceMissing = r; } /** * Gets whether missing values are to be replaced * * @return true if missing values are to be * replaced */ public boolean getDontReplaceMissingValues() { return m_dontReplaceMissing; } /** * Parses a given list of options. <p/> * <!-- options-start --> * Valid options are: <p/> * * <pre> -N <num> * number of clusters. * (default 2).</pre> * * <pre> -V * Display std. deviations for centroids. * </pre> * * <pre> -M * Replace missing values with mean/mode. * </pre> * * <pre> -S <num> * Random number seed. * (default 10)</pre> * <!-- options-end --> * * @param options the list of options as an array of strings * @throws Exception if an option is not supported */ public void setOptions (String[] options) throws Exception { m_displayStdDevs = Utils.getFlag("V", options); m_dontReplaceMissing = Utils.getFlag("M", options); String optionString = Utils.getOption('N', options); if (optionString.length() != 0) { setNumClusters(Integer.parseInt(optionString)); } super.setOptions(options); } /** * Gets the current settings of SimpleKMeans * * @return an array of strings suitable for passing to setOptions() */ public String[] getOptions () { int i; Vector result; String[] options; result = new Vector(); if (m_displayStdDevs) { result.add("-V"); } if (m_dontReplaceMissing) { result.add("-M"); } result.add("-N"); result.add("" + getNumClusters()); options = super.getOptions(); for (i = 0; i < options.length; i++) result.add(options[i]); return (String[]) result.toArray(new String[result.size()]); } /** * return a string describing this clusterer * * @return a description of the clusterer as a string */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -