⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 birchcluster.java

📁 一个数据挖掘软件ALPHAMINERR的整个过程的JAVA版源代码
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
/*
 *    BIRCHCluster.java
 *    Copyright (C) 2001 Gabi Schmidberger.
 *
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

package weka.datagenerators;

import java.io.Serializable;
import java.util.Enumeration;
import java.util.Random;
import java.util.Vector;

import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Utils;

/**
 * Cluster data generator designed for the BIRCH System
 *
 * Dataset is generated with instances in K clusters.
 * Instances are 2-d data points.
 * Each cluster is characterized by the number of data points in it
 * its radius and its center. The location of the cluster centers is
 * determined by the pattern parameter. Three patterns are currently
 * supported grid, sine and random.
 * todo:
 *
 * (out of: BIRCH: An Efficient Data Clustering Method for Very Large
 * Databases; T. Zhang, R. Ramkrishnan, M. Livny; 1996 ACM)
 *
 * 
 * Class to generate data randomly by producing a decision list.
 * The decision list consists of rules.
 * Instances are generated randomly one by one. If decision list fails
 * to classify the current instance, a new rule according to this current
 * instance is generated and added to the decision list.<p>
 *
 * The option -V switches on voting, which means that at the end
 * of the generation all instances are
 * reclassified to the class value that is supported by the most rules.<p>
 *
 * This data generator can generate 'boolean' attributes (= nominal with
 * the values {true, false}) and numeric attributes. The rules can be
 * 'A' or 'NOT A' for boolean values and 'B < random_value' or
 * 'B >= random_value' for numeric values.<p> 
 *
 * Valid options are:<p>
 *
 * -G <br>
 * The pattern for instance generation is grid.<br>
 * This flag cannot be used at the same time as flag I.
 * The pattern is random, if neither flag G nor flag I is set.<p>
 *
 * -I <br>
 * The pattern for instance generation is sine.<br>
 * This flag cannot be used at the same time as flag G.
 * The pattern is random, if neither flag G nor flag I is set.<p>
 *
 * -N num .. num <br>
 * The range of the number of instances in each cluster (default 1..50).<br>
 * Lower number must be between 0 and 2500, upper number must be between
 * 50 and 2500.<p>
 *
 * -R num .. num <br>
 * The range of the radius of the clusters (default 0.1 .. SQRT(2)).<br>
 * Lower number must be between 0 and SQRT(2), upper number must be between<br>
 * SQRT(2) and SQRT(32).<p>
 *
 * -M num <br>
 * Distance multiplier, only used if pattern is grid (default 4). <p>
 *
 * -C num <br>
 * Number of cycles, only used if pattern is sine (default 4). <p>
 *
 * -O <br>
 * Flag for input order is ordered. If flag is not set then input
 * order is randomized.<p>
 *
 * -P num<br>
 * Noise rate in percent. Can be between 0% and 30% (default 0%).<br>
 * (Remark: The original algorithm only allows noise up to 10%.)<p>
 *
 * -S seed <br>
 * Random number seed for random function used (default 1). <p>
 *
 * @author Gabi Schmidberger (gabi@cs.waikato.ac.nz)
 * @version $Revision$ 
 **/
public class BIRCHCluster extends ClusterGenerator
  implements OptionHandler,
	     Serializable {


  /**@serial minimal number of instances per cluster (option N)*/ 
  private int m_MinInstNum = 1;
  
  /**@serial maximal number of instances per cluster (option N)*/ 
  private int m_MaxInstNum = 50;
  
  /**@serial minimum radius (option R)*/ 
  private double m_MinRadius= 0.1;
  
  /**@serial maximum radius (option R)*/ 
  private double m_MaxRadius = Math.sqrt(2.0);
  
  /**@serial  Constant set for choice of pattern. (option G)*/
  public static final int GRID = 0;
  /**@serial  Constant set for choice of pattern. (option I)*/
  public static final int SINE = 1;
  /**@serial  Constant set for choice of pattern. (default)*/
  public static final int RANDOM = 2;
  
  /**@serial pattern (changed with options G or S)*/ 
  private int m_Pattern = RANDOM;
  
  /**@serial distance multiplier (option M)*/
  private double m_DistMult = 4.0;

  /**@serial number of cycles (option C)*/
  private int m_NumCycles = 4;

  /**@serial  Constant set for input order (option O)*/
  public static final int ORDERED = 0;
  /**@serial  Constant set for input order (default)*/
  public static final int RANDOMIZED = 1;

  /**@serial input order (changed with option O)*/ 
  private int m_InputOrder = RANDOMIZED;

  /**@serial noise rate in percent (option P,  between 0 and 30)*/ 
  private double m_NoiseRate = 0.0;

  /**@serial random number generator seed (option S)*/ 
  private int m_Seed = 1;
 
  /**@serial dataset format*/ 
  private Instances m_DatasetFormat = null;

  /**@serial random number generator*/ 
  private Random m_Random = null;

  /**@serial debug flag*/ 
  private int m_Debug = 0;

  /**@serial cluster list */
  private FastVector m_ClusterList;

  // following are used for pattern is GRID
  /**@serial grid size*/
  private int m_GridSize;

  /**@serial grid width*/
  private double m_GridWidth;
  
  /**********************************************************************
   * class to represent cluster
   */
  private class Cluster implements Serializable {

    // number of instances for this cluster
    private int m_InstNum;

    // radius of cluster
    //   variance is radius ** 2 / 2
    private double m_Radius;

    // center of cluster = array of Double values
    private double [] m_Center;

    /*
     * Constructor, used for pattern = RANDOM
     *
     * @param instNum the number of instances
     * @param radius radius of the cluster
     * @param center 
     */
    private Cluster(int instNum, double radius, Random random) {
      m_InstNum = instNum;
      m_Radius = radius;
      m_Center = new double[m_NumAttributes];
      for (int i = 0; i < m_NumAttributes; i++) {
	m_Center[i] = random.nextDouble() * (double) m_NumClusters;
      }
    }

    /*
     * Constructor, used for pattern = GRID
     *
     * @param instNum the number of instances
     * @param radius radius of the cluster
     * @param gridVector vector for grid positions
     * @param gridWidth factor for grid position
     */
      // center is defined in the constructor of cluster
    private Cluster(int instNum,
		    double radius,
		    int [] gridVector,
		    double gridWidth) {
      m_InstNum = instNum;
      m_Radius = radius;
      m_Center = new double[m_NumAttributes];
      for (int i = 0; i < m_NumAttributes; i++) {
	m_Center[i] = ((double) gridVector[i] + 1.0) * gridWidth;
      }
      
    }
   
    private int getInstNum () { return m_InstNum; }
    private double getRadius () { return m_Radius; }
    private double getVariance () { return Math.pow(m_Radius, 2.0) / 2.0; }
    private double getStdDev () { return (m_Radius / Math.pow(2.0, 0.5)); }
    private double [] getCenter () { return m_Center; }
    private double getCenterValue (int dimension) throws Exception {
      if (dimension >= m_Center.length)
	throw new Exception("Current system has only " +
			    m_Center.length + " dimensions.");
      return m_Center[dimension];
    }
    
  } // end class Cluster

  /**********************************************************************
   * class to represent Vector for placement of the center in space
   */
  private class GridVector implements Serializable {

    // array of integer
    private int [] m_GridVector;

    //  one higher then the highest possible integer value
    //  in any of the integers in the gridvector
    private int m_Base;

    // size of vector
    private int m_Size;

    /*
     * Constructor
     *
     * @param numDim number of dimensions = number of attributes
     * @param base is one higher then the highest possible integer value
     * in any of the integers in the gridvector
     */
    private GridVector(int numDim, int base) {

      m_Size = numDim;
      m_Base = base;
      m_GridVector = new int [numDim];
      for (int i = 0; i < numDim; i++) {
	m_GridVector[i] = 0;
      }
    }

    /*
     * returns the integer array
     *
     * @return the integer array
     */
    private int [] getGridVector() {
      return m_GridVector;
    }

    /*
     * Overflow has occurred when integer is zero.
     *
     *@param digit the input integer
     *@return true if digit is 0
     */
    private boolean overflow(int digit) {
      return (digit == 0);
    }
    /*
     * Adds one to integer and sets to zero, if new value was
     * equal m_Base.
     *
     *@param digit the input integer
     *@return new integer object
     */
    private int addOne(int digit) {

      int value = digit + 1;
      if (value >= m_Base) value = 0;
      return value;
    }

    /*
     * add 1 to vector
     */
    private void addOne() {

      m_GridVector[0] = addOne(m_GridVector[0]);
      int i = 1;
      while (overflow(m_GridVector[i - 1]) && i < m_Size) {
        m_GridVector[i] = addOne(m_GridVector[i]);
	i++;
      }
	
    }

      
  } // end class GridVector


  
  /**
   * Returns a string describing this data generator.
   *
   * @return a description of the data generator suitable for
   * displaying in the explorer/experimenter gui
   */
  public String globalInfo() {
    
    return "A data generator that produces data points in "
           + "clusters.";
  }

  /**
   * Sets the upper and lower boundary for instances per cluster.
   *
   * @param newToFrom the string containing the upper and lower boundary for
   * instances per cluster separated by ..
   */
  public void setInstNums(String fromTo) {

    int i = fromTo.indexOf("..");
    String from = fromTo.substring(0, i);
    setMinInstNum(Integer.parseInt(from));
    String to = fromTo.substring(i + 2, fromTo.length());
    setMaxInstNum(Integer.parseInt(to));
    
  }
  
  /**
   * Gets the upper and lower boundary for instances per cluster.
   *
   * @return the string containing the upper and lower boundary for
   * instances per cluster separated by ..
   */
  public String getInstNums() {
    String fromTo = "" + 
      getMinInstNum() + ".." +
      getMaxInstNum();
    return fromTo;
  }
  
   /**
   * Gets the lower boundary for instances per cluster.

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -