📄 birchcluster.java
字号:
/*
* BIRCHCluster.java
* Copyright (C) 2001 Gabi Schmidberger.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
package weka.datagenerators;
import java.io.Serializable;
import java.util.Enumeration;
import java.util.Random;
import java.util.Vector;
import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Utils;
/**
* Cluster data generator designed for the BIRCH System
*
* Dataset is generated with instances in K clusters.
* Instances are 2-d data points.
* Each cluster is characterized by the number of data points in it
* its radius and its center. The location of the cluster centers is
* determined by the pattern parameter. Three patterns are currently
* supported grid, sine and random.
* todo:
*
* (out of: BIRCH: An Efficient Data Clustering Method for Very Large
* Databases; T. Zhang, R. Ramkrishnan, M. Livny; 1996 ACM)
*
*
* Class to generate data randomly by producing a decision list.
* The decision list consists of rules.
* Instances are generated randomly one by one. If decision list fails
* to classify the current instance, a new rule according to this current
* instance is generated and added to the decision list.<p>
*
* The option -V switches on voting, which means that at the end
* of the generation all instances are
* reclassified to the class value that is supported by the most rules.<p>
*
* This data generator can generate 'boolean' attributes (= nominal with
* the values {true, false}) and numeric attributes. The rules can be
* 'A' or 'NOT A' for boolean values and 'B < random_value' or
* 'B >= random_value' for numeric values.<p>
*
* Valid options are:<p>
*
* -G <br>
* The pattern for instance generation is grid.<br>
* This flag cannot be used at the same time as flag I.
* The pattern is random, if neither flag G nor flag I is set.<p>
*
* -I <br>
* The pattern for instance generation is sine.<br>
* This flag cannot be used at the same time as flag G.
* The pattern is random, if neither flag G nor flag I is set.<p>
*
* -N num .. num <br>
* The range of the number of instances in each cluster (default 1..50).<br>
* Lower number must be between 0 and 2500, upper number must be between
* 50 and 2500.<p>
*
* -R num .. num <br>
* The range of the radius of the clusters (default 0.1 .. SQRT(2)).<br>
* Lower number must be between 0 and SQRT(2), upper number must be between<br>
* SQRT(2) and SQRT(32).<p>
*
* -M num <br>
* Distance multiplier, only used if pattern is grid (default 4). <p>
*
* -C num <br>
* Number of cycles, only used if pattern is sine (default 4). <p>
*
* -O <br>
* Flag for input order is ordered. If flag is not set then input
* order is randomized.<p>
*
* -P num<br>
* Noise rate in percent. Can be between 0% and 30% (default 0%).<br>
* (Remark: The original algorithm only allows noise up to 10%.)<p>
*
* -S seed <br>
* Random number seed for random function used (default 1). <p>
*
* @author Gabi Schmidberger (gabi@cs.waikato.ac.nz)
* @version $Revision$
**/
public class BIRCHCluster extends ClusterGenerator
implements OptionHandler,
Serializable {
/**@serial minimal number of instances per cluster (option N)*/
private int m_MinInstNum = 1;
/**@serial maximal number of instances per cluster (option N)*/
private int m_MaxInstNum = 50;
/**@serial minimum radius (option R)*/
private double m_MinRadius= 0.1;
/**@serial maximum radius (option R)*/
private double m_MaxRadius = Math.sqrt(2.0);
/**@serial Constant set for choice of pattern. (option G)*/
public static final int GRID = 0;
/**@serial Constant set for choice of pattern. (option I)*/
public static final int SINE = 1;
/**@serial Constant set for choice of pattern. (default)*/
public static final int RANDOM = 2;
/**@serial pattern (changed with options G or S)*/
private int m_Pattern = RANDOM;
/**@serial distance multiplier (option M)*/
private double m_DistMult = 4.0;
/**@serial number of cycles (option C)*/
private int m_NumCycles = 4;
/**@serial Constant set for input order (option O)*/
public static final int ORDERED = 0;
/**@serial Constant set for input order (default)*/
public static final int RANDOMIZED = 1;
/**@serial input order (changed with option O)*/
private int m_InputOrder = RANDOMIZED;
/**@serial noise rate in percent (option P, between 0 and 30)*/
private double m_NoiseRate = 0.0;
/**@serial random number generator seed (option S)*/
private int m_Seed = 1;
/**@serial dataset format*/
private Instances m_DatasetFormat = null;
/**@serial random number generator*/
private Random m_Random = null;
/**@serial debug flag*/
private int m_Debug = 0;
/**@serial cluster list */
private FastVector m_ClusterList;
// following are used for pattern is GRID
/**@serial grid size*/
private int m_GridSize;
/**@serial grid width*/
private double m_GridWidth;
/**********************************************************************
* class to represent cluster
*/
private class Cluster implements Serializable {
// number of instances for this cluster
private int m_InstNum;
// radius of cluster
// variance is radius ** 2 / 2
private double m_Radius;
// center of cluster = array of Double values
private double [] m_Center;
/*
* Constructor, used for pattern = RANDOM
*
* @param instNum the number of instances
* @param radius radius of the cluster
* @param center
*/
private Cluster(int instNum, double radius, Random random) {
m_InstNum = instNum;
m_Radius = radius;
m_Center = new double[m_NumAttributes];
for (int i = 0; i < m_NumAttributes; i++) {
m_Center[i] = random.nextDouble() * (double) m_NumClusters;
}
}
/*
* Constructor, used for pattern = GRID
*
* @param instNum the number of instances
* @param radius radius of the cluster
* @param gridVector vector for grid positions
* @param gridWidth factor for grid position
*/
// center is defined in the constructor of cluster
private Cluster(int instNum,
double radius,
int [] gridVector,
double gridWidth) {
m_InstNum = instNum;
m_Radius = radius;
m_Center = new double[m_NumAttributes];
for (int i = 0; i < m_NumAttributes; i++) {
m_Center[i] = ((double) gridVector[i] + 1.0) * gridWidth;
}
}
private int getInstNum () { return m_InstNum; }
private double getRadius () { return m_Radius; }
private double getVariance () { return Math.pow(m_Radius, 2.0) / 2.0; }
private double getStdDev () { return (m_Radius / Math.pow(2.0, 0.5)); }
private double [] getCenter () { return m_Center; }
private double getCenterValue (int dimension) throws Exception {
if (dimension >= m_Center.length)
throw new Exception("Current system has only " +
m_Center.length + " dimensions.");
return m_Center[dimension];
}
} // end class Cluster
/**********************************************************************
* class to represent Vector for placement of the center in space
*/
private class GridVector implements Serializable {
// array of integer
private int [] m_GridVector;
// one higher then the highest possible integer value
// in any of the integers in the gridvector
private int m_Base;
// size of vector
private int m_Size;
/*
* Constructor
*
* @param numDim number of dimensions = number of attributes
* @param base is one higher then the highest possible integer value
* in any of the integers in the gridvector
*/
private GridVector(int numDim, int base) {
m_Size = numDim;
m_Base = base;
m_GridVector = new int [numDim];
for (int i = 0; i < numDim; i++) {
m_GridVector[i] = 0;
}
}
/*
* returns the integer array
*
* @return the integer array
*/
private int [] getGridVector() {
return m_GridVector;
}
/*
* Overflow has occurred when integer is zero.
*
*@param digit the input integer
*@return true if digit is 0
*/
private boolean overflow(int digit) {
return (digit == 0);
}
/*
* Adds one to integer and sets to zero, if new value was
* equal m_Base.
*
*@param digit the input integer
*@return new integer object
*/
private int addOne(int digit) {
int value = digit + 1;
if (value >= m_Base) value = 0;
return value;
}
/*
* add 1 to vector
*/
private void addOne() {
m_GridVector[0] = addOne(m_GridVector[0]);
int i = 1;
while (overflow(m_GridVector[i - 1]) && i < m_Size) {
m_GridVector[i] = addOne(m_GridVector[i]);
i++;
}
}
} // end class GridVector
/**
* Returns a string describing this data generator.
*
* @return a description of the data generator suitable for
* displaying in the explorer/experimenter gui
*/
public String globalInfo() {
return "A data generator that produces data points in "
+ "clusters.";
}
/**
* Sets the upper and lower boundary for instances per cluster.
*
* @param newToFrom the string containing the upper and lower boundary for
* instances per cluster separated by ..
*/
public void setInstNums(String fromTo) {
int i = fromTo.indexOf("..");
String from = fromTo.substring(0, i);
setMinInstNum(Integer.parseInt(from));
String to = fromTo.substring(i + 2, fromTo.length());
setMaxInstNum(Integer.parseInt(to));
}
/**
* Gets the upper and lower boundary for instances per cluster.
*
* @return the string containing the upper and lower boundary for
* instances per cluster separated by ..
*/
public String getInstNums() {
String fromTo = "" +
getMinInstNum() + ".." +
getMaxInstNum();
return fromTo;
}
/**
* Gets the lower boundary for instances per cluster.
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -