birchcluster.java
来自「wekaUT是 university texas austin 开发的基于wek」· Java 代码 · 共 1,095 行 · 第 1/2 页
JAVA
1,095 行
/* * BIRCHCluster.java * Copyright (C) 2001 Gabi Schmidberger. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */package weka.datagenerators;import weka.core.Attribute;import weka.core.FastVector;import weka.core.Instance;import weka.core.Instances;import weka.core.OptionHandler;import weka.core.Option;import weka.core.Utils;import java.io.Serializable;import java.util.Random;import java.util.Enumeration;import java.util.Vector;/** * Cluster data generator designed for the BIRCH System * * Dataset is generated with instances in K clusters. * Instances are 2-d data points. * Each cluster is characterized by the number of data points in it * its radius and its center. The location of the cluster centers is * determined by the pattern parameter. Three patterns are currently * supported grid, sine and random. * todo: * * (out of: BIRCH: An Efficient Data Clustering Method for Very Large * Databases; T. Zhang, R. Ramkrishnan, M. Livny; 1996 ACM) * * * Class to generate data randomly by producing a decision list. * The decision list consists of rules. * Instances are generated randomly one by one. If decision list fails * to classify the current instance, a new rule according to this current * instance is generated and added to the decision list.<p> * * The option -V switches on voting, which means that at the end * of the generation all instances are * reclassified to the class value that is supported by the most rules.<p> * * This data generator can generate 'boolean' attributes (= nominal with * the values {true, false}) and numeric attributes. The rules can be * 'A' or 'NOT A' for boolean values and 'B < random_value' or * 'B >= random_value' for numeric values.<p> * * Valid options are:<p> * * -G <br> * The pattern for instance generation is grid.<br> * This flag cannot be used at the same time as flag I. * The pattern is random, if neither flag G nor flag I is set.<p> * * -I <br> * The pattern for instance generation is sine.<br> * This flag cannot be used at the same time as flag G. * The pattern is random, if neither flag G nor flag I is set.<p> * * -N num .. num <br> * The range of the number of instances in each cluster (default 1..50).<br> * Lower number must be between 0 and 2500, upper number must be between * 50 and 2500.<p> * * -R num .. num <br> * The range of the radius of the clusters (default 0.1 .. SQRT(2)).<br> * Lower number must be between 0 and SQRT(2), upper number must be between<br> * SQRT(2) and SQRT(32).<p> * * -M num <br> * Distance multiplier, only used if pattern is grid (default 4). <p> * * -C num <br> * Number of cycles, only used if pattern is sine (default 4). <p> * * -O <br> * Flag for input order is ordered. If flag is not set then input * order is randomized.<p> * * -P num<br> * Noise rate in percent. Can be between 0% and 30% (default 0%).<br> * (Remark: The original algorithm only allows noise up to 10%.)<p> * * -S seed <br> * Random number seed for random function used (default 1). <p> * * @author Gabi Schmidberger (gabi@cs.waikato.ac.nz) * @version $Revision: 1.1.1.1 $ **/public class BIRCHCluster extends ClusterGenerator implements OptionHandler, Serializable { /**@serial minimal number of instances per cluster (option N)*/ private int m_MinInstNum = 1; /**@serial maximal number of instances per cluster (option N)*/ private int m_MaxInstNum = 50; /**@serial minimum radius (option R)*/ private double m_MinRadius= 0.1; /**@serial maximum radius (option R)*/ private double m_MaxRadius = Math.sqrt(2.0); /**@serial Constant set for choice of pattern. (option G)*/ public static final int GRID = 0; /**@serial Constant set for choice of pattern. (option I)*/ public static final int SINE = 1; /**@serial Constant set for choice of pattern. (default)*/ public static final int RANDOM = 2; /**@serial pattern (changed with options G or S)*/ private int m_Pattern = RANDOM; /**@serial distance multiplier (option M)*/ private double m_DistMult = 4.0; /**@serial number of cycles (option C)*/ private int m_NumCycles = 4; /**@serial Constant set for input order (option O)*/ public static final int ORDERED = 0; /**@serial Constant set for input order (default)*/ public static final int RANDOMIZED = 1; /**@serial input order (changed with option O)*/ private int m_InputOrder = RANDOMIZED; /**@serial noise rate in percent (option P, between 0 and 30)*/ private double m_NoiseRate = 0.0; /**@serial random number generator seed (option S)*/ private int m_Seed = 1; /**@serial dataset format*/ private Instances m_DatasetFormat = null; /**@serial random number generator*/ private Random m_Random = null; /**@serial debug flag*/ private int m_Debug = 0; /**@serial cluster list */ private FastVector m_ClusterList; // following are used for pattern is GRID /**@serial grid size*/ private int m_GridSize; /**@serial grid width*/ private double m_GridWidth; /********************************************************************** * class to represent cluster */ private class Cluster implements Serializable { // number of instances for this cluster private int m_InstNum; // radius of cluster // variance is radius ** 2 / 2 private double m_Radius; // center of cluster = array of Double values private double [] m_Center; /* * Constructor, used for pattern = RANDOM * * @param instNum the number of instances * @param radius radius of the cluster * @param center */ private Cluster(int instNum, double radius, Random random) { m_InstNum = instNum; m_Radius = radius; m_Center = new double[m_NumAttributes]; for (int i = 0; i < m_NumAttributes; i++) { m_Center[i] = random.nextDouble() * (double) m_NumClusters; } } /* * Constructor, used for pattern = GRID * * @param instNum the number of instances * @param radius radius of the cluster * @param gridVector vector for grid positions * @param gridWidth factor for grid position */ // center is defined in the constructor of cluster private Cluster(int instNum, double radius, int [] gridVector, double gridWidth) { m_InstNum = instNum; m_Radius = radius; m_Center = new double[m_NumAttributes]; for (int i = 0; i < m_NumAttributes; i++) { m_Center[i] = ((double) gridVector[i] + 1.0) * gridWidth; } } private int getInstNum () { return m_InstNum; } private double getRadius () { return m_Radius; } private double getVariance () { return Math.pow(m_Radius, 2.0) / 2.0; } private double getStdDev () { return (m_Radius / Math.pow(2.0, 0.5)); } private double [] getCenter () { return m_Center; } private double getCenterValue (int dimension) throws Exception { if (dimension >= m_Center.length) throw new Exception("Current system has only " + m_Center.length + " dimensions."); return m_Center[dimension]; } } // end class Cluster /********************************************************************** * class to represent Vector for placement of the center in space */ private class GridVector implements Serializable { // array of integer private int [] m_GridVector; // one higher then the highest possible integer value // in any of the integers in the gridvector private int m_Base; // size of vector private int m_Size; /* * Constructor * * @param numDim number of dimensions = number of attributes * @param base is one higher then the highest possible integer value * in any of the integers in the gridvector */ private GridVector(int numDim, int base) { m_Size = numDim; m_Base = base; m_GridVector = new int [numDim]; for (int i = 0; i < numDim; i++) { m_GridVector[i] = 0; } } /* * returns the integer array * * @return the integer array */ private int [] getGridVector() { return m_GridVector; } /* * Overflow has occurred when integer is zero. * *@param digit the input integer *@return true if digit is 0 */ private boolean overflow(int digit) { return (digit == 0); } /* * Adds one to integer and sets to zero, if new value was * equal m_Base. * *@param digit the input integer *@return new integer object */ private int addOne(int digit) { int value = digit + 1; if (value >= m_Base) value = 0; return value; } /* * add 1 to vector */ private void addOne() { m_GridVector[0] = addOne(m_GridVector[0]); int i = 1; while (overflow(m_GridVector[i - 1]) && i < m_Size) { m_GridVector[i] = addOne(m_GridVector[i]); i++; } } } // end class GridVector /** * Returns a string describing this data generator. * * @return a description of the data generator suitable for * displaying in the explorer/experimenter gui */ public String globalInfo() { return "A data generator that produces data points in " + "clusters."; } /** * Sets the upper and lower boundary for instances per cluster. * * @param newToFrom the string containing the upper and lower boundary for * instances per cluster separated by .. */ public void setInstNums(String fromTo) { int i = fromTo.indexOf(".."); String from = fromTo.substring(0, i); setMinInstNum(Integer.parseInt(from)); String to = fromTo.substring(i + 2, fromTo.length()); setMaxInstNum(Integer.parseInt(to)); } /** * Gets the upper and lower boundary for instances per cluster. * * @return the string containing the upper and lower boundary for * instances per cluster separated by .. */ public String getInstNums() { String fromTo = "" + getMinInstNum() + ".." + getMaxInstNum(); return fromTo; } /** * Gets the lower boundary for instances per cluster. * * @return the the lower boundary for instances per cluster */ public int getMinInstNum() { return m_MinInstNum; } /** * Sets the lower boundary for instances per cluster. * * @param newMinInstNum new lower boundary for instances per cluster */ public void setMinInstNum(int newMinInstNum) { m_MinInstNum = newMinInstNum; } /** * Gets the upper boundary for instances per cluster. * * @return the upper boundary for instances per cluster */ public int getMaxInstNum() { return m_MaxInstNum; } /** * Sets the upper boundary for instances per cluster. * * @param newMaxInstNum new upper boundary for instances per cluster */ public void setMaxInstNum(int newMaxInstNum) { m_MaxInstNum = newMaxInstNum; } /** * Sets the upper and lower boundary for the radius of the clusters. * * @param newToFrom the string containing the upper and lower boundary for * the radius of the clusters, separated by .. */ public void setRadiuses(String fromTo) { int i = fromTo.indexOf(".."); String from = fromTo.substring(0, i); setMinRadius(Double.valueOf(from).doubleValue()); String to = fromTo.substring(i + 2, fromTo.length()); setMaxRadius(Double.valueOf(to).doubleValue()); } /** * Gets the upper and lower boundary for the radius of the clusters. * * @return the string containing the upper and lower boundary for * the radius of the clusters, separated by .. */ public String getRadiuses() { String fromTo = "" + Utils.doubleToString(getMinRadius(), 2) + ".." + Utils.doubleToString(getMaxRadius(), 2); return fromTo; } /** * Gets the lower boundary for the radiuses of the clusters. * * @return the lower boundary for the radiuses of the clusters */ public double getMinRadius() { return m_MinRadius; } /** * Sets the lower boundary for the radiuses of the clusters. * * @param newMinRadius new lower boundary for the radiuses of the clusters */ public void setMinRadius(double newMinRadius) { m_MinRadius = newMinRadius; } /** * Gets the upper boundary for the radiuses of the clusters. * * @return the upper boundary for the radiuses of the clusters */ public double getMaxRadius() { return m_MaxRadius; } /** * Sets the upper boundary for the radiuses of the clusters. * * @param newMaxRadius new upper boundary for the radiuses of the clusters */ public void setMaxRadius(double newMaxRadius) { m_MaxRadius = newMaxRadius; } /** * Gets the grid flag (option G). * * @return true if grid flag is set */ public boolean getGridFlag() { return m_Pattern == GRID; } /** * Gets the sine flag (option S). * * @return true if sine flag is set */ public boolean getSineFlag() { return m_Pattern == SINE; } /** * Gets the pattern type. * * @return the current pattern type */ public int getPattern() { return m_Pattern; } /** * Sets the pattern type. * * @param newPattern new pattern type */ public void setPattern(int newPattern) { m_Pattern = newPattern; } /** * Gets the distance multiplier. * * @return the distance multiplier */ public double getDistMult() { return m_DistMult; } /** * Sets the distance multiplier. * * @param newDistMult new distance multiplier */ public void setDistMult(double newDistMult) { m_DistMult = newDistMult; } /** * Gets the number of cycles. * * @return the number of cycles */ public int getNumCycles() { return m_NumCycles; } /** * Sets the the number of cycles. * * @param newNumCycles new number of cycles */ public void setNumCycles(int newNumCycles) { m_NumCycles = newNumCycles; } /** * Gets the input order. * * @return the current input order */ public int getInputOrder() { return m_InputOrder; } /** * Sets the input order. * * @param newInputOrder new input order */ public void setInputOrder(int newInputOrder) { m_InputOrder = newInputOrder; } /** * Gets the ordered flag (option O). * * @return true if ordered flag is set */ public boolean getOrderedFlag() { return m_InputOrder == ORDERED; } /** * Gets the percentage of noise set. * * @return the percentage of noise set */ public double getNoiseRate() { return m_NoiseRate; } /**
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?