📄 dbscan.java

📁 数据挖掘中聚类的算法
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页
/* *    This program is free software; you can redistribute it and/or modify *    it under the terms of the GNU General Public License as published by *    the Free Software Foundation; either version 2 of the License, or *    (at your option) any later version. * *    This program is distributed in the hope that it will be useful, *    but WITHOUT ANY WARRANTY; without even the implied warranty of *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the *    GNU General Public License for more details. * *    You should have received a copy of the GNU General Public License *    along with this program; if not, write to the Free Software *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//* *    Copyright (C) 2004 *    & Matthias Schubert (schubert@dbs.ifi.lmu.de) *    & Zhanna Melnikova-Albrecht (melnikov@cip.ifi.lmu.de) *    & Rainer Holzmann (holzmann@cip.ifi.lmu.de) */package weka.clusterers;import weka.clusterers.forOPTICSAndDBScan.DataObjects.DataObject;import weka.clusterers.forOPTICSAndDBScan.Databases.Database;import weka.core.Capabilities;import weka.core.Instance;import weka.core.Instances;import weka.core.Option;import weka.core.OptionHandler;import weka.core.TechnicalInformation;import weka.core.TechnicalInformationHandler;import weka.core.Utils;import weka.core.Capabilities.Capability;import weka.core.TechnicalInformation.Field;import weka.core.TechnicalInformation.Type;import weka.filters.Filter;import weka.filters.unsupervised.attribute.ReplaceMissingValues;import java.lang.reflect.Constructor;import java.lang.reflect.InvocationTargetException;import java.text.DecimalFormat;import java.util.Enumeration;import java.util.Iterator;import java.util.List;import java.util.Vector;/** <!-- globalinfo-start --> * Martin Ester, Hans-Peter Kriegel, Joerg Sander, Xiaowei Xu: A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise. In: Second International Conference on Knowledge Discovery and Data Mining, 226-231, 1996. * <p/> <!-- globalinfo-end --> * <!-- technical-bibtex-start --> * BibTeX: * <pre> * &#64;inproceedings{Ester1996, *    author = {Martin Ester and Hans-Peter Kriegel and Joerg Sander and Xiaowei Xu}, *    booktitle = {Second International Conference on Knowledge Discovery and Data Mining}, *    editor = {Evangelos Simoudis and Jiawei Han and Usama M. Fayyad}, *    pages = {226-231}, *    publisher = {AAAI Press}, *    title = {A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise}, *    year = {1996} * } * </pre> * <p/> <!-- technical-bibtex-end --> * <!-- options-start --> * Valid options are: <p/> *  * <pre> -E &lt;double&gt; *  epsilon (default = 0.9)</pre> *  * <pre> -M &lt;int&gt; *  minPoints (default = 6)</pre> *  * <pre> -I &lt;String&gt; *  index (database) used for DBScan (default = weka.clusterers.forOPTICSAndDBScan.Databases.SequentialDatabase)</pre> *  * <pre> -D &lt;String&gt; *  distance-type (default = weka.clusterers.forOPTICSAndDBScan.DataObjects.EuclidianDataObject)</pre> *  <!-- options-end --> * * @author Matthias Schubert (schubert@dbs.ifi.lmu.de) * @author Zhanna Melnikova-Albrecht (melnikov@cip.ifi.lmu.de) * @author Rainer Holzmann (holzmann@cip.ifi.lmu.de) * @version $Revision: 1.7 $ */public class DBScan     extends Clusterer     implements OptionHandler, TechnicalInformationHandler {    /** for serialization */    static final long serialVersionUID = -1666498248451219728L;      /**     * Specifies the radius for a range-query     */    private double epsilon = 0.9;    /**     * Specifies the density (the range-query must contain at least minPoints DataObjects)     */    private int minPoints = 6;    /**     * Replace missing values in training instances     */    private ReplaceMissingValues replaceMissingValues_Filter;    /**     * Holds the number of clusters generated     */    private int numberOfGeneratedClusters;    /**     * Holds the distance-type that is used     * (default = weka.clusterers.forOPTICSAndDBScan.DataObjects.EuclidianDataObject)     */    private String database_distanceType = "weka.clusterers.forOPTICSAndDBScan.DataObjects.EuclidianDataObject";    /**     * Holds the type of the used database     * (default = weka.clusterers.forOPTICSAndDBScan.Databases.SequentialDatabase)     */    private String database_Type = "weka.clusterers.forOPTICSAndDBScan.Databases.SequentialDatabase";    /**     * The database that is used for DBScan     */    private Database database;    /**     * Holds the current clusterID     */    private int clusterID;    /**     * Counter for the processed instances     */    private int processed_InstanceID;    /**     * Holds the time-value (seconds) for the duration of the clustering-process     */    private double elapsedTime;    /**     * Returns default capabilities of the clusterer.     *     * @return      the capabilities of this clusterer     */    public Capabilities getCapabilities() {      Capabilities result = super.getCapabilities();      // attributes      result.enable(Capability.NOMINAL_ATTRIBUTES);      result.enable(Capability.NUMERIC_ATTRIBUTES);      result.enable(Capability.DATE_ATTRIBUTES);      result.enable(Capability.MISSING_VALUES);      return result;    }    // *****************************************************************************************************************    // constructors    // *****************************************************************************************************************    // *****************************************************************************************************************    // methods    // *****************************************************************************************************************    /**     * Generate Clustering via DBScan     * @param instances The instances that need to be clustered     * @throws java.lang.Exception If clustering was not successful     */    public void buildClusterer(Instances instances) throws Exception {        // can clusterer handle the data?        getCapabilities().testWithFail(instances);        long time_1 = System.currentTimeMillis();        processed_InstanceID = 0;        numberOfGeneratedClusters = 0;        clusterID = 0;        replaceMissingValues_Filter = new ReplaceMissingValues();        replaceMissingValues_Filter.setInputFormat(instances);        Instances filteredInstances = Filter.useFilter(instances, replaceMissingValues_Filter);        database = databaseForName(getDatabase_Type(), filteredInstances);        for (int i = 0; i < database.getInstances().numInstances(); i++) {            DataObject dataObject = dataObjectForName(getDatabase_distanceType(),                    database.getInstances().instance(i),                    Integer.toString(i),                    database);            database.insert(dataObject);        }        database.setMinMaxValues();        Iterator iterator = database.dataObjectIterator();        while (iterator.hasNext()) {            DataObject dataObject = (DataObject) iterator.next();            if (dataObject.getClusterLabel() == DataObject.UNCLASSIFIED) {                if (expandCluster(dataObject)) {                    clusterID++;                    numberOfGeneratedClusters++;                }            }        }        long time_2 = System.currentTimeMillis();        elapsedTime = (double) (time_2 - time_1) / 1000.0;    }    /**     * Assigns this dataObject to a cluster or remains it as NOISE     * @param dataObject The DataObject that needs to be assigned     * @return true, if the DataObject could be assigned, else false     */    private boolean expandCluster(DataObject dataObject) {        List seedList = database.epsilonRangeQuery(getEpsilon(), dataObject);        /** dataObject is NO coreObject */        if (seedList.size() < getMinPoints()) {            dataObject.setClusterLabel(DataObject.NOISE);            return false;        }        /** dataObject is coreObject */        for (int i = 0; i < seedList.size(); i++) {            DataObject seedListDataObject = (DataObject) seedList.get(i);            /** label this seedListDataObject with the current clusterID, because it is in epsilon-range */            seedListDataObject.setClusterLabel(clusterID);            if (seedListDataObject.equals(dataObject)) {                seedList.remove(i);                i--;            }        }        /** Iterate the seedList of the startDataObject */        for (int j = 0; j < seedList.size(); j++) {            DataObject seedListDataObject = (DataObject) seedList.get(j);            List seedListDataObject_Neighbourhood = database.epsilonRangeQuery(getEpsilon(), seedListDataObject);            /** seedListDataObject is coreObject */            if (seedListDataObject_Neighbourhood.size() >= getMinPoints()) {                for (int i = 0; i < seedListDataObject_Neighbourhood.size(); i++) {                    DataObject p = (DataObject) seedListDataObject_Neighbourhood.get(i);                    if (p.getClusterLabel() == DataObject.UNCLASSIFIED || p.getClusterLabel() == DataObject.NOISE) {                        if (p.getClusterLabel() == DataObject.UNCLASSIFIED) {                            seedList.add(p);                        }                        p.setClusterLabel(clusterID);                    }                }            }            seedList.remove(j);            j--;        }        return true;    }    /**     * Classifies a given instance.     *     * @param instance The instance to be assigned to a cluster     * @return int The number of the assigned cluster as an integer     * @throws java.lang.Exception If instance could not be clustered     * successfully     */    public int clusterInstance(Instance instance) throws Exception {        if (processed_InstanceID >= database.size()) processed_InstanceID = 0;        int cnum = (database.getDataObject(Integer.toString(processed_InstanceID++))).getClusterLabel();        if (cnum == DataObject.NOISE)            throw new Exception();        else            return cnum;    }    /**     * Returns the number of clusters.     *     * @return int The number of clusters generated for a training dataset.     * @throws java.lang.Exception if number of clusters could not be returned     * successfully     */    public int numberOfClusters() throws Exception {        return numberOfGeneratedClusters;    }    /**     * Returns an enumeration of all the available options..     *     * @return Enumeration An enumeration of all available options.     */    public Enumeration listOptions() {        Vector vector = new Vector();        vector.addElement(                new Option("\tepsilon (default = 0.9)",                        "E",                        1,                        "-E <double>"));        vector.addElement(                new Option("\tminPoints (default = 6)",                        "M",                        1,                        "-M <int>"));        vector.addElement(                new Option("\tindex (database) used for DBScan (default = weka.clusterers.forOPTICSAndDBScan.Databases.SequentialDatabase)",                        "I",
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -