birchcluster.java

来自「wekaUT是 university texas austin 开发的基于wek」· Java 代码 · 共 1,095 行 · 第 1/2 页

JAVA
1,095
字号
/*
 *    BIRCHCluster.java
 *    Copyright (C) 2001 Gabi Schmidberger.
 *
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

package weka.datagenerators;

import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.OptionHandler;
import weka.core.Option;
import weka.core.Utils;

import java.io.Serializable;
import java.util.Random;
import java.util.Enumeration;
import java.util.Vector;

/**
 * Cluster data generator designed for the BIRCH System
 *
 * Dataset is generated with instances in K clusters.
 * Instances are 2-d data points.
 * Each cluster is characterized by the number of data points in it,
 * its radius and its center. The location of the cluster centers is
 * determined by the pattern parameter. Three patterns are currently
 * supported: grid, sine and random.
 * todo:
 *
 * (out of: BIRCH: An Efficient Data Clustering Method for Very Large
 * Databases; T. Zhang, R. Ramakrishnan, M. Livny; 1996 ACM)
 *
 * NOTE(review): the next three paragraphs (decision list, option -V,
 * boolean/numeric rules) do not match the cluster generator described
 * above, and -V is not among the valid options listed below. They look
 * like text left over from a different data generator -- confirm and
 * remove.
 *
 * Class to generate data randomly by producing a decision list.
 * The decision list consists of rules.
 * Instances are generated randomly one by one. If decision list fails
 * to classify the current instance, a new rule according to this current
 * instance is generated and added to the decision list.<p>
 *
 * The option -V switches on voting, which means that at the end
 * of the generation all instances are
 * reclassified to the class value that is supported by the most rules.<p>
 *
 * This data generator can generate 'boolean' attributes (= nominal with
 * the values {true, false}) and numeric attributes. The rules can be
 * 'A' or 'NOT A' for boolean values and 'B < random_value' or
 * 'B >= random_value' for numeric values.<p>
 *
 * Valid options are:<p>
 *
 * -G <br>
 * The pattern for instance generation is grid.<br>
 * This flag cannot be used at the same time as flag I.
 * The pattern is random, if neither flag G nor flag I is set.<p>
 *
 * -I <br>
 * The pattern for instance generation is sine.<br>
 * This flag cannot be used at the same time as flag G.
 * The pattern is random, if neither flag G nor flag I is set.<p>
 *
 * -N num .. num <br>
 * The range of the number of instances in each cluster (default 1..50).<br>
 * Lower number must be between 0 and 2500, upper number must be between
 * 50 and 2500.<p>
 *
 * -R num .. num <br>
 * The range of the radius of the clusters (default 0.1 .. SQRT(2)).<br>
 * Lower number must be between 0 and SQRT(2), upper number must be between<br>
 * SQRT(2) and SQRT(32).<p>
 *
 * -M num <br>
 * Distance multiplier, only used if pattern is grid (default 4). <p>
 *
 * -C num <br>
 * Number of cycles, only used if pattern is sine (default 4). <p>
 *
 * -O <br>
 * Flag for input order is ordered. If flag is not set then input
 * order is randomized.<p>
 *
 * -P num<br>
 * Noise rate in percent. Can be between 0% and 30% (default 0%).<br>
 * (Remark: The original algorithm only allows noise up to 10%.)<p>
 *
 * -S seed <br>
 * Random number seed for random function used (default 1). <p>
 *
 * @author Gabi Schmidberger (gabi@cs.waikato.ac.nz)
 * @version $Revision: 1.1.1.1 $
 **/
public class BIRCHCluster extends ClusterGenerator
  implements OptionHandler,
	     Serializable {

  /**@serial minimal number of instances per cluster (option N)*/
  private int m_MinInstNum = 1;

  /**@serial maximal number of instances per cluster (option N)*/
  private int m_MaxInstNum = 50;

  /**@serial minimum radius (option R)*/
  private double m_MinRadius= 0.1;

  /**@serial maximum radius (option R)*/
  private double m_MaxRadius = Math.sqrt(2.0);

  /** Constant set for choice of pattern. (option G)*/
  public static final int GRID = 0;

  /** Constant set for choice of pattern. (option I)*/
  public static final int SINE = 1;

  /** Constant set for choice of pattern. (default)*/
  public static final int RANDOM = 2;

  /**@serial pattern (changed with options G or I)*/
  private int m_Pattern = RANDOM;

  /**@serial distance multiplier (option M)*/
  private double m_DistMult = 4.0;

  /**@serial number of cycles (option C)*/
  private int m_NumCycles = 4;

  /** Constant set for input order (option O)*/
  public static final int ORDERED = 0;

  /** Constant set for input order (default)*/
  public static final int RANDOMIZED = 1;

  /**@serial input order (changed with option O)*/
  private int m_InputOrder = RANDOMIZED;

  /**@serial noise rate in percent (option P,  between 0 and 30)*/
  private double m_NoiseRate = 0.0;

  /**@serial random number generator seed (option S)*/
  private int m_Seed = 1;

  /**@serial dataset format*/
  private Instances m_DatasetFormat = null;

  /**@serial random number generator*/
  private Random m_Random = null;

  /**@serial debug flag*/
  private int m_Debug = 0;

  /**@serial cluster list */
  private FastVector m_ClusterList;

  // the following fields are used when the pattern is GRID

  /**@serial grid size*/
  private int m_GridSize;

  /**@serial grid width*/
  private double m_GridWidth;

/**********************************************************************   * class to represent cluster   */  private class Cluster implements Serializable {    // number of instances for this cluster    private int m_InstNum;    // radius of cluster    //   variance is radius ** 2 / 2    private double m_Radius;    // center of cluster = array of Double values    private double [] m_Center;    /*     * Constructor, used for pattern = RANDOM     *     * @param instNum the number of instances     * @param radius radius of the cluster     * @param center      */    private Cluster(int instNum, double radius, Random random) {      m_InstNum = instNum;      m_Radius = radius;      m_Center = new double[m_NumAttributes];      for (int i = 0; i < m_NumAttributes; i++) {	m_Center[i] = random.nextDouble() * (double) m_NumClusters;      }    }    /*     * Constructor, used for pattern = GRID     *     * @param instNum the number of instances     * @param radius radius of the cluster     * @param gridVector vector for grid positions     * @param gridWidth factor for grid position     */      // center is defined in the constructor of cluster    private Cluster(int instNum,		    double radius,		    int [] gridVector,		    double gridWidth) {      m_InstNum = instNum;      m_Radius = radius;      m_Center = new double[m_NumAttributes];      for (int i = 0; i < m_NumAttributes; i++) {	m_Center[i] = ((double) gridVector[i] + 1.0) * gridWidth;      }          }       private int getInstNum () { return m_InstNum; }    private double getRadius () { return m_Radius; }    private double getVariance () { return Math.pow(m_Radius, 2.0) / 2.0; }    private double getStdDev () { return (m_Radius / Math.pow(2.0, 0.5)); }    private double [] getCenter () { return m_Center; }    private double getCenterValue (int dimension) throws Exception {      if (dimension >= m_Center.length)	throw new Exception("Current system has only " +			    m_Center.length + " dimensions.");      return 
m_Center[dimension];    }      } // end class Cluster  /**********************************************************************   * class to represent Vector for placement of the center in space   */  private class GridVector implements Serializable {    // array of integer    private int [] m_GridVector;    //  one higher then the highest possible integer value    //  in any of the integers in the gridvector    private int m_Base;    // size of vector    private int m_Size;    /*     * Constructor     *     * @param numDim number of dimensions = number of attributes     * @param base is one higher then the highest possible integer value     * in any of the integers in the gridvector     */    private GridVector(int numDim, int base) {      m_Size = numDim;      m_Base = base;      m_GridVector = new int [numDim];      for (int i = 0; i < numDim; i++) {	m_GridVector[i] = 0;      }    }    /*     * returns the integer array     *     * @return the integer array     */    private int [] getGridVector() {      return m_GridVector;    }    /*     * Overflow has occurred when integer is zero.     *     *@param digit the input integer     *@return true if digit is 0     */    private boolean overflow(int digit) {      return (digit == 0);    }    /*     * Adds one to integer and sets to zero, if new value was     * equal m_Base.     *     *@param digit the input integer     *@return new integer object     */    private int addOne(int digit) {      int value = digit + 1;      if (value >= m_Base) value = 0;      return value;    }    /*     * add 1 to vector     */    private void addOne() {      m_GridVector[0] = addOne(m_GridVector[0]);      int i = 1;      while (overflow(m_GridVector[i - 1]) && i < m_Size) {        m_GridVector[i] = addOne(m_GridVector[i]);	i++;      }	    }        } // end class GridVector    /**   * Returns a string describing this data generator.   
*   * @return a description of the data generator suitable for   * displaying in the explorer/experimenter gui   */  public String globalInfo() {        return "A data generator that produces data points in "           + "clusters.";  }  /**   * Sets the upper and lower boundary for instances per cluster.   *   * @param newToFrom the string containing the upper and lower boundary for   * instances per cluster separated by ..   */  public void setInstNums(String fromTo) {    int i = fromTo.indexOf("..");    String from = fromTo.substring(0, i);    setMinInstNum(Integer.parseInt(from));    String to = fromTo.substring(i + 2, fromTo.length());    setMaxInstNum(Integer.parseInt(to));      }    /**   * Gets the upper and lower boundary for instances per cluster.   *   * @return the string containing the upper and lower boundary for   * instances per cluster separated by ..   */  public String getInstNums() {    String fromTo = "" +       getMinInstNum() + ".." +      getMaxInstNum();    return fromTo;  }     /**   * Gets the lower boundary for instances per cluster.   *   * @return the the lower boundary for instances per cluster   */  public int getMinInstNum() { return m_MinInstNum; }    /**   * Sets the lower boundary for instances per cluster.   *   * @param newMinInstNum new lower boundary for instances per cluster   */  public void setMinInstNum(int newMinInstNum) {    m_MinInstNum = newMinInstNum;  }  /**   * Gets the upper boundary for instances per cluster.   *   * @return the upper boundary for instances per cluster   */  public int getMaxInstNum() { return m_MaxInstNum; }    /**   * Sets the upper boundary for instances per cluster.   *   * @param newMaxInstNum new upper boundary for instances per cluster   */  public void setMaxInstNum(int newMaxInstNum) {    m_MaxInstNum = newMaxInstNum;  }  /**   * Sets the upper and lower boundary for the radius of the clusters.   
*   * @param newToFrom the string containing the upper and lower boundary for   * the radius  of the clusters, separated by ..   */  public void setRadiuses(String fromTo) {    int i = fromTo.indexOf("..");    String from = fromTo.substring(0, i);    setMinRadius(Double.valueOf(from).doubleValue());    String to = fromTo.substring(i + 2, fromTo.length());    setMaxRadius(Double.valueOf(to).doubleValue());  }  /**   * Gets the upper and lower boundary for the radius of the clusters.   *   * @return the string containing the upper and lower boundary for   * the radius  of the clusters, separated by ..   */  public String getRadiuses() {    String fromTo = "" +       Utils.doubleToString(getMinRadius(), 2) + ".." +      Utils.doubleToString(getMaxRadius(), 2);    return fromTo;  }  /**   * Gets the lower boundary for the radiuses of the clusters.   *   * @return the lower boundary for the radiuses of the clusters   */  public double getMinRadius() { return m_MinRadius; }    /**   * Sets the lower boundary for the radiuses of the clusters.   *   * @param newMinRadius new lower boundary for the radiuses of the clusters   */  public void setMinRadius(double newMinRadius) {    m_MinRadius = newMinRadius;  }  /**   * Gets the upper boundary for the radiuses of the clusters.   *   * @return the upper boundary for the radiuses of the clusters   */  public double getMaxRadius() { return m_MaxRadius; }    /**   * Sets the upper boundary for the radiuses of the clusters.   *   * @param newMaxRadius new upper boundary for the radiuses of the clusters   */  public void setMaxRadius(double newMaxRadius) {    m_MaxRadius = newMaxRadius;  }  /**   * Gets the grid flag (option G).   *   * @return true if grid flag is set   */  public boolean getGridFlag() { return m_Pattern == GRID; }    /**   * Gets the sine flag (option S).   *   * @return true if sine flag is set   */  public boolean getSineFlag() { return m_Pattern == SINE; }    /**   * Gets the pattern type.   
*
   * @return the current pattern type
   */
  public int getPattern() {
    return this.m_Pattern;
  }

  /**
   * Stores a new pattern type; the value is taken over unchecked.
   *
   * @param newPattern new pattern type
   */
  public void setPattern(int newPattern) {
    this.m_Pattern = newPattern;
  }

  /**
   * Reports the distance multiplier.
   *
   * @return the distance multiplier
   */
  public double getDistMult() {
    return this.m_DistMult;
  }

  /**
   * Stores a new distance multiplier.
   *
   * @param newDistMult new distance multiplier
   */
  public void setDistMult(double newDistMult) {
    this.m_DistMult = newDistMult;
  }

  /**
   * Reports the number of cycles.
   *
   * @return the number of cycles
   */
  public int getNumCycles() {
    return this.m_NumCycles;
  }

  /**
   * Stores a new number of cycles.
   *
   * @param newNumCycles new number of cycles
   */
  public void setNumCycles(int newNumCycles) {
    this.m_NumCycles = newNumCycles;
  }

  /**
   * Reports the input order.
   *
   * @return the current input order
   */
  public int getInputOrder() {
    return this.m_InputOrder;
  }

  /**
   * Stores a new input order; the value is taken over unchecked.
   *
   * @param newInputOrder new input order
   */
  public void setInputOrder(int newInputOrder) {
    this.m_InputOrder = newInputOrder;
  }

  /**
   * Reports the ordered flag (option O).
   *
   * @return true if the input order equals ORDERED
   */
  public boolean getOrderedFlag() {
    return this.m_InputOrder == ORDERED;
  }

  /**
   * Reports the percentage of noise set.
   *
   * @return the percentage of noise set
   */
  public double getNoiseRate() {
    return this.m_NoiseRate;
  }

  /**

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?