📄 checkclusterer.java

📁 数据挖掘中聚类的算法
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
12 3 下一页
/* *    This program is free software; you can redistribute it and/or modify *    it under the terms of the GNU General Public License as published by *    the Free Software Foundation; either version 2 of the License, or *    (at your option) any later version. * *    This program is distributed in the hope that it will be useful, *    but WITHOUT ANY WARRANTY; without even the implied warranty of *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the *    GNU General Public License for more details. * *    You should have received a copy of the GNU General Public License *    along with this program; if not, write to the Free Software *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//* * CheckClusterer.java * Copyright (C) 2006 University of Waikato, Hamilton, New Zealand * */package weka.clusterers;import weka.core.CheckScheme;import weka.core.FastVector;import weka.core.Instance;import weka.core.Instances;import weka.core.MultiInstanceCapabilitiesHandler;import weka.core.Option;import weka.core.OptionHandler;import weka.core.SerializationHelper;import weka.core.TestInstances;import weka.core.Utils;import weka.core.WeightedInstancesHandler;import java.util.Enumeration;import java.util.Random;import java.util.Vector;/** * Class for examining the capabilities and finding problems with  * clusterers. If you implement a clusterer using the WEKA.libraries, * you should run the checks on it to ensure robustness and correct * operation. Passing all the tests of this object does not mean * bugs in the clusterer don't exist, but this will help find some * common ones. 
<p/> *  * Typical usage: <p/> * <code>java weka.clusterers.CheckClusterer -W clusterer_name  * -- clusterer_options </code><p/> *  * CheckClusterer reports on the following: * <ul> *    <li> Clusterer abilities  *      <ul> *         <li> Possible command line options to the clusterer </li> *         <li> Whether the clusterer can predict nominal, numeric, string,  *              date or relational class attributes.</li> *         <li> Whether the clusterer can handle numeric predictor attributes </li> *         <li> Whether the clusterer can handle nominal predictor attributes </li> *         <li> Whether the clusterer can handle string predictor attributes </li> *         <li> Whether the clusterer can handle date predictor attributes </li> *         <li> Whether the clusterer can handle relational predictor attributes </li> *         <li> Whether the clusterer can handle multi-instance data </li> *         <li> Whether the clusterer can handle missing predictor values </li> *         <li> Whether the clusterer can handle instance weights </li> *      </ul> *    </li> *    <li> Correct functioning  *      <ul> *         <li> Correct initialisation during buildClusterer (i.e. 
no result *              changes when buildClusterer called repeatedly) </li> *         <li> Whether the clusterer alters the data passed to it  *              (number of instances, instance order, instance weights, etc) </li> *      </ul> *    </li> *    <li> Degenerate cases  *      <ul> *         <li> building clusterer with zero training instances </li> *         <li> all but one predictor attribute values missing </li> *         <li> all predictor attribute values missing </li> *         <li> all but one class values missing </li> *         <li> all class values missing </li> *      </ul> *    </li> * </ul> * Running CheckClusterer with the debug option set will output the  * training dataset for any failed tests.<p/> * * The <code>weka.clusterers.AbstractClustererTest</code> uses this * class to test all the clusterers. Any changes here have to be  * checked in that abstract test class, too. <p/> * <!-- options-start --> * Valid options are: <p/> *  * <pre> -D *  Turn on debugging output.</pre> *  * <pre> -S *  Silent mode - prints nothing to stdout.</pre> *  * <pre> -N &lt;num&gt; *  The number of instances in the datasets (default 20).</pre> *  * <pre> -nominal &lt;num&gt; *  The number of nominal attributes (default 2).</pre> *  * <pre> -nominal-values &lt;num&gt; *  The number of values for nominal attributes (default 1).</pre> *  * <pre> -numeric &lt;num&gt; *  The number of numeric attributes (default 1).</pre> *  * <pre> -string &lt;num&gt; *  The number of string attributes (default 1).</pre> *  * <pre> -date &lt;num&gt; *  The number of date attributes (default 1).</pre> *  * <pre> -relational &lt;num&gt; *  The number of relational attributes (default 1).</pre> *  * <pre> -num-instances-relational &lt;num&gt; *  The number of instances in relational/bag attributes (default 10).</pre> *  * <pre> -words &lt;comma-separated-list&gt; *  The words to use in string attributes.</pre> *  * <pre> -word-separators &lt;chars&gt; *  The word separators to use 
in string attributes.</pre>
 * 
 * <pre> -W
 *  Full name of the clusterer analyzed.
 *  eg: weka.clusterers.SimpleKMeans
 *  (default weka.clusterers.SimpleKMeans)</pre>
 * 
 * <pre> 
 * Options specific to clusterer weka.clusterers.SimpleKMeans:
 * </pre>
 * 
 * <pre> -N &lt;num&gt;
 *  number of clusters. (default = 2).</pre>
 * 
 * <pre> -S &lt;num&gt;
 *  random number seed.
 *  (default 10)</pre>
 * 
 <!-- options-end -->
 *
 * Options after -- are passed to the designated clusterer.<p/>
 *
 * @author Len Trigg (trigg@cs.waikato.ac.nz)
 * @author FracPete (fracpete at waikato dot ac dot nz)
 * @version $Revision: 1.8 $
 * @see TestInstances
 */
public class CheckClusterer 
  extends CheckScheme {

  /*
   * Note about test methods:
   * - methods return array of booleans
   * - first index: success or not
   * - second index: acceptable or not (e.g., Exception is OK)
   *
   * FracPete (fracpete at waikato dot ac dot nz)
   */
  
  /** The clusterer to be examined; initialised to a SimpleKMeans instance. */
  protected Clusterer m_Clusterer = new SimpleKMeans();
  
  /**
   * Default constructor. Invokes the superclass constructor and then sets
   * the number of instances used for the test datasets to 40.
   */
  public CheckClusterer() {
    super();
    
    setNumInstances(40);
  }
  
  /**
   * Returns an enumeration describing the available options.
   *
   * @return an enumeration of all the available options.
*/
  public Enumeration listOptions() {
    Vector collected = new Vector();

    // Options contributed by the superclass come first.
    for (Enumeration inherited = super.listOptions(); inherited.hasMoreElements(); ) {
      collected.addElement(inherited.nextElement());
    }

    // The -W switch that names the clusterer to analyse.
    String clustererSwitchHelp =
        "\tFull name of the clusterer analyzed.\n"
        + "\teg: weka.clusterers.SimpleKMeans\n"
        + "\t(default weka.clusterers.SimpleKMeans)";
    collected.addElement(new Option(clustererSwitchHelp, "W", 1, "-W"));

    // instanceof evaluates to false for null, so this also covers the
    // "no clusterer set" case.
    if (m_Clusterer instanceof OptionHandler) {
      String sectionTitle =
          "\nOptions specific to clusterer "
          + m_Clusterer.getClass().getName()
          + ":";
      collected.addElement(new Option("", "", 0, sectionTitle));

      OptionHandler handler = (OptionHandler) m_Clusterer;
      for (Enumeration specific = handler.listOptions(); specific.hasMoreElements(); ) {
        collected.addElement(specific.nextElement());
      }
    }

    return collected.elements();
  }

  /**
   * Parses a given list of options. <p/>
   *
   <!-- options-start -->
   * Valid options are: <p/>
   * 
   * <pre> -D
   *  Turn on debugging output.</pre>
   * 
   * <pre> -S
   *  Silent mode - prints nothing to stdout.</pre>
   * 
   * <pre> -N &lt;num&gt;
   *  The number of instances in the datasets (default 20).</pre>
   * 
   * <pre> -nominal &lt;num&gt;
   *  The number of nominal attributes (default 2).</pre>
   * 
   * <pre> -nominal-values &lt;num&gt;
   *  The number of values for nominal attributes (default 1).</pre>
   * 
   * <pre> -numeric &lt;num&gt;
   *  The number of numeric attributes (default 1).</pre>
   * 
   * <pre> -string &lt;num&gt;
   *  The number of string attributes (default 1).</pre>
   * 
   * <pre> -date &lt;num&gt;
   *  The number of date attributes (default 1).</pre>
   * 
   * <pre> -relational &lt;num&gt;
   *  The number of relational attributes (default 1).</pre>
   * 
   * <pre> -num-instances-relational &lt;num&gt;
   *  The number of instances in relational/bag attributes (default 10).</pre>
   * 
   * <pre> -words &lt;comma-separated-list&gt;
   *  The words to use in string attributes.</pre>
   * 
   * <pre> -word-separators 
&lt;chars&gt;   *  The word separators to use in string attributes.</pre>   *    * <pre> -W   *  Full name of the clusterer analyzed.   *  eg: weka.clusterers.SimpleKMeans   *  (default weka.clusterers.SimpleKMeans)</pre>   *    * <pre>    * Options specific to clusterer weka.clusterers.SimpleKMeans:   * </pre>   *    * <pre> -N &lt;num&gt;   *  number of clusters. (default = 2).</pre>   *    * <pre> -S &lt;num&gt;   *  random number seed.   *  (default 10)</pre>   *    <!-- options-end -->   *   * @param options the list of options as an array of strings   * @throws Exception if an option is not supported   */  public void setOptions(String[] options) throws Exception {    String      tmpStr;        tmpStr = Utils.getOption('N', options);        super.setOptions(options);        if (tmpStr.length() != 0)      setNumInstances(Integer.parseInt(tmpStr));    else      setNumInstances(40);    tmpStr = Utils.getOption('W', options);    if (tmpStr.length() == 0)      tmpStr = weka.clusterers.SimpleKMeans.class.getName();    setClusterer(	(Clusterer) forName(	    "weka.clusterers", 	    Clusterer.class, 	    tmpStr, 	    Utils.partitionOptions(options)));  }    /**   * Gets the current settings of the CheckClusterer.   
*   * @return an array of strings suitable for passing to setOptions   */  public String[] getOptions() {    Vector        result;    String[]      options;    int           i;        result = new Vector();        options = super.getOptions();    for (i = 0; i < options.length; i++)      result.add(options[i]);        if (getClusterer() != null) {      result.add("-W");      result.add(getClusterer().getClass().getName());    }        if ((m_Clusterer != null) && (m_Clusterer instanceof OptionHandler))      options = ((OptionHandler) m_Clusterer).getOptions();    else      options = new String[0];        if (options.length > 0) {      result.add("--");      for (i = 0; i < options.length; i++)        result.add(options[i]);    }        return (String[]) result.toArray(new String[result.size()]);  }    /**   * Begin the tests, reporting results to System.out   */  public void doTests() {        if (getClusterer() == null) {      println("\n=== No clusterer set ===");      return;    }    println("\n=== Check on Clusterer: "        + getClusterer().getClass().getName()        + " ===\n");        // Start tests    println("--> Checking for interfaces");    canTakeOptions();    boolean updateable = updateableClusterer()[0];    boolean weightedInstancesHandler = weightedInstancesHandler()[0];    boolean multiInstanceHandler = multiInstanceHandler()[0];    println("--> Clusterer tests");    declaresSerialVersionUID();    runTests(weightedInstancesHandler, multiInstanceHandler, updateable);  }    /**   * Set the clusterer for testing.    *   * @param newClusterer the Clusterer to use.   
*/
  public void setClusterer(Clusterer newClusterer) {
    this.m_Clusterer = newClusterer;
  }

  /**
   * Returns the clusterer that is currently set for testing.
   *
   * @return the clusterer under test
   */
  public Clusterer getClusterer() {
    return this.m_Clusterer;
  }

  /**
   * Runs the battery of tests. Nothing is run unless at least one of the
   * canPredict probes succeeds.
   *
   * @param weighted true if the clusterer says it handles weights
   * @param multiInstance true if the clusterer is a multi-instance clusterer
   * @param updateable true if the clusterer is updateable
   */
  protected void runTests(boolean weighted, boolean multiInstance, boolean updateable) {

    // One probe per attribute type; only the first result flag is used.
    boolean nominalOk = canPredict(true,  false, false, false, false, multiInstance)[0];
    boolean numericOk = canPredict(false, true,  false, false, false, multiInstance)[0];
    boolean stringOk  = canPredict(false, false, true,  false, false, multiInstance)[0];
    boolean dateOk    = canPredict(false, false, false, true,  false, multiInstance)[0];

    // The relational probe is skipped entirely for multi-instance clusterers.
    boolean relationalOk = !multiInstance
        && canPredict(false, false, false, false, true, multiInstance)[0];

    boolean anySupported = nominalOk || numericOk || stringOk || dateOk || relationalOk;
    if (!anySupported) {
      return;
    }

    if (weighted) {
      instanceWeights(nominalOk, numericOk, stringOk, dateOk, relationalOk, multiInstance);
    }

    canHandleZeroTraining(nominalOk, numericOk, stringOk, dateOk, relationalOk, multiInstance);

    // NOTE(review): 20 and 100 look like the level of missing values used
    // for the test data - confirm against canHandleMissing.
    boolean missingPredictorsOk = canHandleMissing(
        nominalOk, numericOk, stringOk, dateOk, relationalOk, multiInstance, true, 20)[0];
    if (missingPredictorsOk) {
      canHandleMissing(
          nominalOk, numericOk, stringOk, dateOk, relationalOk, multiInstance, true, 100);
    }

    correctBuildInitialisation(
        nominalOk, numericOk, stringOk, dateOk, relationalOk, multiInstance);
    datasetIntegrity(
        nominalOk, numericOk, stringOk, dateOk, relationalOk, multiInstance,
        missingPredictorsOk);

    if (updateable) {
      updatingEquality(nominalOk, numericOk, stringOk, dateOk, relationalOk, multiInstance);
    }
  }

  /**
   * Checks whether the scheme can take command line options.
*   * @return index 0 is true if the clusterer can take options   */  protected boolean[] canTakeOptions() {        boolean[] result = new boolean[2];        print("options...");    if (m_Clusterer instanceof OptionHandler) {      println("yes");      if (m_Debug) {        println("\n=== Full report ===");        Enumeration enu = ((OptionHandler)m_Clusterer).listOptions();        while (enu.hasMoreElements()) {          Option option = (Option) enu.nextElement();          print(option.synopsis() + "\n"               + option.description() + "\n");        }        println("\n");      }      result[0] = true;    }    else {      println("no");      result[0] = false;    }
12 3 下一页
💿 文件大小 124 K
👤 上传用户 wuseyue
📂 所属分类数学计算
🏷️ 相关标签

#数据挖掘 #聚类 #算法
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -