📄 cobweb.java

📁 数据挖掘中聚类的算法
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
12 3 下一页
/* *    This program is free software; you can redistribute it and/or modify *    it under the terms of the GNU General Public License as published by *    the Free Software Foundation; either version 2 of the License, or *    (at your option) any later version. * *    This program is distributed in the hope that it will be useful, *    but WITHOUT ANY WARRANTY; without even the implied warranty of *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the *    GNU General Public License for more details. * *    You should have received a copy of the GNU General Public License *    along with this program; if not, write to the Free Software *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//* *    Cobweb.java *    Copyright (C) 2001 University of Waikato, Hamilton, New Zealand * */package weka.clusterers;import weka.core.AttributeStats;import weka.core.Capabilities;import weka.core.Drawable;import weka.core.FastVector;import weka.core.Instance;import weka.core.Instances;import weka.core.Option;import weka.core.TechnicalInformation;import weka.core.TechnicalInformationHandler;import weka.core.Utils;import weka.core.Capabilities.Capability;import weka.core.TechnicalInformation.Field;import weka.core.TechnicalInformation.Type;import weka.experiment.Stats;import weka.filters.Filter;import weka.filters.unsupervised.attribute.Add;import java.io.Serializable;import java.util.Enumeration;import java.util.Random;import java.util.Vector;/** <!-- globalinfo-start --> * Class implementing the Cobweb and Classit clustering algorithms.<br/> * <br/> * Note: the application of node operators (merging, splitting etc.) in terms of ordering and priority differs (and is somewhat ambiguous) between the original Cobweb and Classit papers. This algorithm always compares the best host, adding a new leaf, merging the two best hosts, and splitting the best host when considering where to place a new instance.<br/> * <br/> * For more information see:<br/> * <br/> * D. Fisher (1987). Knowledge acquisition via incremental conceptual clustering. Machine Learning. 2(2):139-172.<br/> * <br/> * J. H. Gennari, P. Langley, D. Fisher (1990). Models of incremental concept formation. Artificial Intelligence. 40:11-61. * <p/> <!-- globalinfo-end --> * <!-- technical-bibtex-start --> * BibTeX: * <pre> * &#64;article{Fisher1987, *    author = {D. Fisher}, *    journal = {Machine Learning}, *    number = {2}, *    pages = {139-172}, *    title = {Knowledge acquisition via incremental conceptual clustering}, *    volume = {2}, *    year = {1987} * } *  * &#64;article{Gennari1990, *    author = {J. H. Gennari and P. Langley and D. Fisher}, *    journal = {Artificial Intelligence}, *    pages = {11-61}, *    title = {Models of incremental concept formation}, *    volume = {40}, *    year = {1990} * } * </pre> * <p/> <!-- technical-bibtex-end --> * <!-- options-start --> * Valid options are: <p/> *  * <pre> -A &lt;acuity&gt; *  Acuity. *  (default=1.0)</pre> *  * <pre> -C &lt;cutoff&gt; *  Cutoff. *  (default=0.002)</pre> *  * <pre> -S &lt;num&gt; *  Random number seed. *  (default 42)</pre> *  <!-- options-end --> * * @author <a href="mailto:mhall@cs.waikato.ac.nz">Mark Hall</a> * @version $Revision: 1.24 $ * @see RandomizableClusterer * @see Drawable */public class Cobweb   extends RandomizableClusterer  implements Drawable, TechnicalInformationHandler, UpdateableClusterer {  /** for serialization */  static final long serialVersionUID = 928406656495092318L;    /**   * Inner class handling node operations for Cobweb.   *   * @see Serializable   */  private class CNode     implements Serializable {    /** for serialization */    static final long serialVersionUID = 3452097436933325631L;        /**     * Within cluster attribute statistics     */    private AttributeStats[] m_attStats;    /**     * Number of attributes     */    private int m_numAttributes;        /**     * Instances at this node     */    protected Instances m_clusterInstances = null;    /**     * Children of this node     */    private FastVector m_children = null;    /**     * Total instances at this node     */    private double m_totalInstances = 0.0;    /**     * Cluster number of this node     */    private int m_clusterNum = -1;    /**     * Creates an empty <code>CNode</code> instance.     *     * @param numAttributes the number of attributes in the data     */    public CNode(int numAttributes) {            m_numAttributes = numAttributes;    }    /**     * Creates a new leaf <code>CNode</code> instance.     *     * @param numAttributes the number of attributes in the data     * @param leafInstance the instance to store at this leaf     */    public CNode(int numAttributes, Instance leafInstance) {      this(numAttributes);      if (m_clusterInstances == null) {	m_clusterInstances = new Instances(leafInstance.dataset(), 1);      }      m_clusterInstances.add(leafInstance);      updateStats(leafInstance, false);    }        /**     * Adds an instance to this cluster.     *     * @param newInstance the instance to add     * @throws Exception if an error occurs     */    protected void addInstance(Instance newInstance) throws Exception {      // Add the instance to this cluster      if (m_clusterInstances == null) {	m_clusterInstances = new Instances(newInstance.dataset(), 1);	m_clusterInstances.add(newInstance);	updateStats(newInstance, false);	return;      } else if (m_children == null) {	/* we are a leaf, so make our existing instance(s) into a child	   and then add the new instance as a child */	m_children = new FastVector();	CNode tempSubCluster = new CNode(m_numAttributes, 					 m_clusterInstances.instance(0)); 	//	System.out.println("Dumping "+m_clusterInstances.numInstances());	for (int i = 1; i < m_clusterInstances.numInstances(); i++) {	  tempSubCluster.m_clusterInstances.	    add(m_clusterInstances.instance(i));	  tempSubCluster.updateStats(m_clusterInstances.instance(i), false);	}	m_children = new FastVector();	m_children.addElement(tempSubCluster);	m_children.addElement(new CNode(m_numAttributes, newInstance));		m_clusterInstances.add(newInstance);	updateStats(newInstance, false);	// here is where we check against cutoff (also check cutoff	// in findHost)	if (categoryUtility() < m_cutoff) {	  //	  System.out.println("Cutting (leaf add) ");	  m_children = null;	}	return;      }            // otherwise, find the best host for this instance      CNode bestHost = findHost(newInstance, false);      if (bestHost != null) {		// now add to the best host	bestHost.addInstance(newInstance);      }    }    /**     * Temporarily adds a new instance to each of this nodes children     * in turn and computes the category utility.     *     * @param newInstance the new instance to evaluate     * @return an array of category utility values---the result of considering     * each child in turn as a host for the new instance     * @throws Exception if an error occurs     */    private double[] cuScoresForChildren(Instance newInstance)       throws Exception {      // look for a host in existing children      double[] categoryUtils = new double [m_children.size()];            // look for a home for this instance in the existing children      for (int i = 0; i < m_children.size(); i++) {	CNode temp = (CNode) m_children.elementAt(i);	// tentitively add the new instance to this child	temp.updateStats(newInstance, false);	categoryUtils[i] = categoryUtility();		// remove the new instance from this child	temp.updateStats(newInstance, true);      }      return categoryUtils;    }    private double cuScoreForBestTwoMerged(CNode merged, 					   CNode a, CNode b,					   Instance newInstance)       throws Exception {      double mergedCU = -Double.MAX_VALUE;      // consider merging the best and second      // best.      merged.m_clusterInstances = new Instances(m_clusterInstances, 1);            merged.addChildNode(a);      merged.addChildNode(b);      merged.updateStats(newInstance, false); // add new instance to stats      // remove the best and second best nodes      m_children.removeElementAt(m_children.indexOf(a));      m_children.removeElementAt(m_children.indexOf(b));	      m_children.addElement(merged);      mergedCU = categoryUtility();      // restore the status quo      merged.updateStats(newInstance, true);      m_children.removeElementAt(m_children.indexOf(merged));      m_children.addElement(a);      m_children.addElement(b);      return mergedCU;    }    /**     * Finds a host for the new instance in this nodes children. Also     * considers merging the two best hosts and splitting the best host.     *     * @param newInstance the instance to find a host for     * @param structureFrozen true if the instance is not to be added to     * the tree and instead the best potential host is to be returned     * @return the best host     * @throws Exception if an error occurs     */    private CNode findHost(Instance newInstance, 			   boolean structureFrozen) throws Exception {      if (!structureFrozen) {	updateStats(newInstance, false);      }            // look for a host in existing children and also consider as a new leaf      double[] categoryUtils = cuScoresForChildren(newInstance);            // make a temporary new leaf for this instance and get CU      CNode newLeaf = new CNode(m_numAttributes, newInstance);      m_children.addElement(newLeaf);      double bestHostCU = categoryUtility();      CNode finalBestHost = newLeaf;            // remove new leaf when seaching for best and second best nodes to      // consider for merging and splitting      m_children.removeElementAt(m_children.size()-1);      // now determine the best host (and the second best)      int best = 0;      int secondBest = 0;      for (int i = 0; i < categoryUtils.length; i++) {	if (categoryUtils[i] > categoryUtils[secondBest]) {	  if (categoryUtils[i] > categoryUtils[best]) {	    secondBest = best;	    best = i;	  } else {	    secondBest = i;	  }	}       }            CNode a = (CNode) m_children.elementAt(best);      CNode b = (CNode) m_children.elementAt(secondBest);      if (categoryUtils[best] > bestHostCU) {	bestHostCU = categoryUtils[best];	finalBestHost = a;	//	System.out.println("Node is best");      }      if (structureFrozen) {	if (finalBestHost == newLeaf) {	  return null; // *this* node is the best host	} else {	  return finalBestHost;	}      }      double mergedCU = -Double.MAX_VALUE;      CNode merged = new CNode(m_numAttributes);      if (a != b) {	mergedCU = cuScoreForBestTwoMerged(merged, a, b, newInstance);	if (mergedCU > bestHostCU) {	  bestHostCU = mergedCU;	  finalBestHost = merged;	}      }      // Consider splitting the best      double splitCU = -Double.MAX_VALUE;      double splitBestChildCU = -Double.MAX_VALUE;      double splitPlusNewLeafCU = -Double.MAX_VALUE;      double splitPlusMergeBestTwoCU = -Double.MAX_VALUE;      if (a.m_children != null) {	FastVector tempChildren = new FastVector();	for (int i = 0; i < m_children.size(); i++) {	  CNode existingChild = (CNode)m_children.elementAt(i);	  if (existingChild != a) {	    tempChildren.addElement(existingChild);	  }	}	for (int i = 0; i < a.m_children.size(); i++) {	  CNode promotedChild = (CNode)a.m_children.elementAt(i);	  tempChildren.addElement(promotedChild);	}	// also add the new leaf	tempChildren.addElement(newLeaf);	FastVector saveStatusQuo = m_children;	m_children = tempChildren;	splitPlusNewLeafCU = categoryUtility(); // split + new leaf	// remove the new leaf	tempChildren.removeElementAt(tempChildren.size()-1);	// now look for best and second best	categoryUtils = cuScoresForChildren(newInstance);	// now determine the best host (and the second best)	best = 0;	secondBest = 0;	for (int i = 0; i < categoryUtils.length; i++) {	  if (categoryUtils[i] > categoryUtils[secondBest]) {	    if (categoryUtils[i] > categoryUtils[best]) {	      secondBest = best;	      best = i;	    } else {	      secondBest = i;	    }	  } 	}	CNode sa = (CNode) m_children.elementAt(best);	CNode sb = (CNode) m_children.elementAt(secondBest);	splitBestChildCU = categoryUtils[best];	// now merge best and second best	CNode mergedSplitChildren = new CNode(m_numAttributes);	if (sa != sb) {	  splitPlusMergeBestTwoCU = 	    cuScoreForBestTwoMerged(mergedSplitChildren, sa, sb, newInstance);	}	splitCU = (splitBestChildCU > splitPlusNewLeafCU) ?	  splitBestChildCU : splitPlusNewLeafCU;	splitCU = (splitCU > splitPlusMergeBestTwoCU) ? 	  splitCU : splitPlusMergeBestTwoCU;	if (splitCU > bestHostCU) {	  bestHostCU = splitCU;	  finalBestHost = this;
12 3 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -