⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 cobweb.java

📁 一个数据挖掘软件ALPHAMINERR的整个过程的JAVA版源代码
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    Cobweb.java
 *    Copyright (C) 2001 Mark Hall
 *
 */

package weka.clusterers;

import java.io.Serializable;
import java.util.Enumeration;
import java.util.Random;
import java.util.Vector;

import weka.core.AttributeStats;
import weka.core.Drawable;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Utils;
import weka.experiment.Stats;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.Add;

/**
 * Class implementing the Cobweb and Classit clustering algorithms.<p><p>
 *
 * Note: the application of node operators (merging, splitting etc.) in
 * terms of ordering and priority differs (and is somewhat ambiguous)
 * between the original Cobweb and Classit papers. This algorithm always
 * compares the best host, adding a new leaf, merging the two best hosts, and
 * splitting the best host when considering where to place a new instance.<p>
 *
 * Valid options are:<p>
 *
 * -A <acuity> <br>
 * Acuity. <p>
 *
 * -C <cutoff> <br>
 * Cutoff. <p>
 *
 * @author <a href="mailto:mhall@cs.waikato.ac.nz">Mark Hall</a>
 * @version $Revision$
 * @see Clusterer
 * @see OptionHandler
 * @see Drawable
 */
public class Cobweb extends Clusterer implements OptionHandler, Drawable {

  /**
   * Inner class handling node operations for Cobweb.
   *
   * @see Serializable
   */
  private class CNode implements Serializable {
    
    /**
     * Within cluster attribute statistics
     */
    private AttributeStats [] m_attStats;

    /**
     * Number of attributes
     */
    private int m_numAttributes;
    
    /**
     * Instances at this node
     */
    protected Instances m_clusterInstances = null;

    /**
     * Children of this node
     */
    private FastVector m_children = null;

    /**
     * Total instances at this node
     */
    private double m_totalInstances = 0.0;

    /**
     * Cluster number of this node
     */
    private int m_clusterNum = -1;

    /**
     * Creates an empty <code>CNode</code> instance.
     *
     * @param numAttributes the number of attributes in the data
     */
    public CNode(int numAttributes) {      
      m_numAttributes = numAttributes;
    }

    /**
     * Creates a new leaf <code>CNode</code> instance.
     *
     * @param numAttributes the number of attributes in the data
     * @param leafInstance the instance to store at this leaf
     */
    public CNode(int numAttributes, Instance leafInstance) {
      this(numAttributes);
      if (m_clusterInstances == null) {
	m_clusterInstances = new Instances(leafInstance.dataset(), 1);
      }
      m_clusterInstances.add(leafInstance);
      updateStats(leafInstance, false);
    }
    
    /**
     * Adds an instance to this cluster.
     *
     * @param newInstance the instance to add
     * @exception Exception if an error occurs
     */
    protected void addInstance(Instance newInstance) throws Exception {
      // Add the instance to this cluster

      if (m_clusterInstances == null) {
	m_clusterInstances = new Instances(newInstance.dataset(), 1);
	m_clusterInstances.add(newInstance);
	updateStats(newInstance, false);
	return;
      } else if (m_children == null) {
	/* we are a leaf, so make our existing instance(s) into a child
	   and then add the new instance as a child */
	m_children = new FastVector();
	CNode tempSubCluster = new CNode(m_numAttributes, 
					 m_clusterInstances.instance(0)); 

	//	System.out.println("Dumping "+m_clusterInstances.numInstances());
	for (int i = 1; i < m_clusterInstances.numInstances(); i++) {
	  tempSubCluster.m_clusterInstances.
	    add(m_clusterInstances.instance(i));
	  tempSubCluster.updateStats(m_clusterInstances.instance(i), false);
	}
	m_children = new FastVector();
	m_children.addElement(tempSubCluster);
	m_children.addElement(new CNode(m_numAttributes, newInstance));
	
	m_clusterInstances.add(newInstance);
	updateStats(newInstance, false);

	// here is where we check against cutoff (also check cutoff
	// in findHost)
	if (categoryUtility() < m_cutoff) {
	  //	  System.out.println("Cutting (leaf add) ");
	  m_children = null;
	}
	return;
      }
      
      // otherwise, find the best host for this instance
      CNode bestHost = findHost(newInstance, false);
      if (bestHost != null) {	
	// now add to the best host
	bestHost.addInstance(newInstance);
      }
    }

    /**
     * Temporarily adds a new instance to each of this nodes children
     * in turn and computes the category utility.
     *
     * @param newInstance the new instance to evaluate
     * @return an array of category utility values---the result of considering
     * each child in turn as a host for the new instance
     * @exception Exception if an error occurs
     */
    private double [] cuScoresForChildren(Instance newInstance) 
      throws Exception {
      // look for a host in existing children
      double [] categoryUtils = new double [m_children.size()];
      
      // look for a home for this instance in the existing children
      for (int i = 0; i < m_children.size(); i++) {
	CNode temp = (CNode) m_children.elementAt(i);
	// tentitively add the new instance to this child
	temp.updateStats(newInstance, false);
	categoryUtils[i] = categoryUtility();
	
	// remove the new instance from this child
	temp.updateStats(newInstance, true);
      }
      return categoryUtils;
    }

    private double cuScoreForBestTwoMerged(CNode merged, 
					   CNode a, CNode b,
					   Instance newInstance) 
      throws Exception {

      double mergedCU = -Double.MAX_VALUE;
      // consider merging the best and second
      // best.
      merged.m_clusterInstances = new Instances(m_clusterInstances, 1);
      
      merged.addChildNode(a);
      merged.addChildNode(b);
      merged.updateStats(newInstance, false); // add new instance to stats
      // remove the best and second best nodes
      m_children.removeElementAt(m_children.indexOf(a));
      m_children.removeElementAt(m_children.indexOf(b));	
      m_children.addElement(merged);
      mergedCU = categoryUtility();
      // restore the status quo
      merged.updateStats(newInstance, true);
      m_children.removeElementAt(m_children.indexOf(merged));
      m_children.addElement(a);
      m_children.addElement(b);
      return mergedCU;
    }

    /**
     * Finds a host for the new instance in this nodes children. Also
     * considers merging the two best hosts and splitting the best host.
     *
     * @param newInstance the instance to find a host for
     * @param structureFrozen true if the instance is not to be added to
     * the tree and instead the best potential host is to be returned
     * @return the best host
     * @exception Exception if an error occurs
     */
    private CNode findHost(Instance newInstance, 
			   boolean structureFrozen) throws Exception {

      if (!structureFrozen) {
	updateStats(newInstance, false);
      }
      
      // look for a host in existing children and also consider as a new leaf
      double [] categoryUtils = cuScoresForChildren(newInstance);
      
      // make a temporary new leaf for this instance and get CU
      CNode newLeaf = new CNode(m_numAttributes, newInstance);
      m_children.addElement(newLeaf);
      double bestHostCU = categoryUtility();
      CNode finalBestHost = newLeaf;
      
      // remove new leaf when seaching for best and second best nodes to
      // consider for merging and splitting
      m_children.removeElementAt(m_children.size()-1);

      // now determine the best host (and the second best)
      int best = 0;
      int secondBest = 0;
      for (int i = 0; i < categoryUtils.length; i++) {
	if (categoryUtils[i] > categoryUtils[secondBest]) {
	  if (categoryUtils[i] > categoryUtils[best]) {
	    secondBest = best;
	    best = i;
	  } else {
	    secondBest = i;
	  }
	} 
      }
      
      CNode a = (CNode) m_children.elementAt(best);
      CNode b = (CNode) m_children.elementAt(secondBest);
      if (categoryUtils[best] > bestHostCU) {
	bestHostCU = categoryUtils[best];
	finalBestHost = a;
	//	System.out.println("Node is best");
      }

      if (structureFrozen) {
	if (finalBestHost == newLeaf) {
	  return null; // *this* node is the best host
	} else {
	  return finalBestHost;
	}
      }

      double mergedCU = -Double.MAX_VALUE;
      CNode merged = new CNode(m_numAttributes);
      if (a != b) {
	mergedCU = cuScoreForBestTwoMerged(merged, a, b, newInstance);

	if (mergedCU > bestHostCU) {
	  bestHostCU = mergedCU;
	  finalBestHost = merged;
	}
      }

      // Consider splitting the best
      double splitCU = -Double.MAX_VALUE;
      double splitBestChildCU = -Double.MAX_VALUE;
      double splitPlusNewLeafCU = -Double.MAX_VALUE;
      double splitPlusMergeBestTwoCU = -Double.MAX_VALUE;
      if (a.m_children != null) {
	FastVector tempChildren = new FastVector();

	for (int i = 0; i < m_children.size(); i++) {
	  CNode existingChild = (CNode)m_children.elementAt(i);
	  if (existingChild != a) {
	    tempChildren.addElement(existingChild);
	  }
	}
	for (int i = 0; i < a.m_children.size(); i++) {
	  CNode promotedChild = (CNode)a.m_children.elementAt(i);
	  tempChildren.addElement(promotedChild);
	}
	// also add the new leaf
	tempChildren.addElement(newLeaf);

	FastVector saveStatusQuo = m_children;
	m_children = tempChildren;
	splitPlusNewLeafCU = categoryUtility(); // split + new leaf
	// remove the new leaf
	tempChildren.removeElementAt(tempChildren.size()-1);
	// now look for best and second best
	categoryUtils = cuScoresForChildren(newInstance);

	// now determine the best host (and the second best)
	best = 0;
	secondBest = 0;
	for (int i = 0; i < categoryUtils.length; i++) {
	  if (categoryUtils[i] > categoryUtils[secondBest]) {
	    if (categoryUtils[i] > categoryUtils[best]) {
	      secondBest = best;
	      best = i;
	    } else {
	      secondBest = i;
	    }
	  } 
	}
	CNode sa = (CNode) m_children.elementAt(best);
	CNode sb = (CNode) m_children.elementAt(secondBest);
	splitBestChildCU = categoryUtils[best];

	// now merge best and second best
	CNode mergedSplitChildren = new CNode(m_numAttributes);
	if (sa != sb) {
	  splitPlusMergeBestTwoCU = 
	    cuScoreForBestTwoMerged(mergedSplitChildren, sa, sb, newInstance);
	}
	splitCU = (splitBestChildCU > splitPlusNewLeafCU) ?
	  splitBestChildCU : splitPlusNewLeafCU;
	splitCU = (splitCU > splitPlusMergeBestTwoCU) ? 
	  splitCU : splitPlusMergeBestTwoCU;

	if (splitCU > bestHostCU) {
	  bestHostCU = splitCU;
	  finalBestHost = this;
	  //	  tempChildren.removeElementAt(tempChildren.size()-1);
	} else {
	  // restore the status quo
	  m_children = saveStatusQuo;
	}
      }

      if (finalBestHost != this) {
	// can commit the instance to the set of instances at this node
	m_clusterInstances.add(newInstance);
      } else {
	m_numberSplits++;
      }

      if (finalBestHost == merged) {
	m_numberMerges++;
	m_children.removeElementAt(m_children.indexOf(a));
	m_children.removeElementAt(m_children.indexOf(b));	
	m_children.addElement(merged);
      }

      if (finalBestHost == newLeaf) {
	finalBestHost = new CNode(m_numAttributes);
	m_children.addElement(finalBestHost);
      }

      if (bestHostCU < m_cutoff) {
	if (finalBestHost == this) {
	  // splitting was the best, but since we are cutting all children
	  // recursion is aborted and we still need to add the instance
	  // to the set of instances at this node
	  m_clusterInstances.add(newInstance);
	}
	m_children = null;
	finalBestHost = null;
      }

      if (finalBestHost == this) {
	// splitting is still the best, so downdate the stats as 
	// we'll be recursively calling on this node
	updateStats(newInstance, true);
      }

      return finalBestHost;
    }
    
    /**
     * Adds the supplied node as a child of this node. All of the child's
     * instances are added to this nodes instances
     *
     * @param child the child to add
     */
    protected void addChildNode(CNode child) {
      for (int i = 0; i < child.m_clusterInstances.numInstances(); i++) {
	Instance temp = child.m_clusterInstances.instance(i);
	m_clusterInstances.add(temp);
	updateStats(temp, false);
      }

      if (m_children == null) {
	m_children = new FastVector();
      }
      m_children.addElement(child);
    }

    /**
     * Computes the utility of all children with respect to this node
     *
     * @return the category utility of the children with respect to this node.
     */
    protected double categoryUtility() throws Exception {
      
      if (m_children == null) {
	throw new Exception("categoryUtility: No children!");
      }

      double totalCU = 0;
     
      for (int i = 0; i < m_children.size(); i++) {
	CNode child = (CNode) m_children.elementAt(i);
	totalCU += categoryUtilityChild(child);
      }

      totalCU /= (double)m_children.size();
      return totalCU;
    }

    /**
     * Computes the utility of a single child with respect to this node
     *
     * @param child the child for which to compute the utility
     * @return the utility of the child with respect to this node
     * @exception Exception if something goes wrong
     */
    protected double categoryUtilityChild(CNode child) throws Exception {
      
      double sum = 0;
      for (int i = 0; i < m_numAttributes; i++) {
	if (m_clusterInstances.attribute(i).isNominal()) {
	  for (int j = 0; 
	       j < m_clusterInstances.attribute(i).numValues(); j++) {
	    double x = child.getProbability(i, j);
	    double y = getProbability(i, j);
	    sum += (x * x) - (y * y);
	  }
	} else {
	  // numeric attribute
	  sum += ((m_normal / child.getStandardDev(i)) - 
		  (m_normal / getStandardDev(i)));
	  
	}
      }
      return (child.m_totalInstances / m_totalInstances) * sum;
    }

    /**
     * Returns the probability of a value of a nominal attribute in this node
     *
     * @param attIndex the index of the attribute
     * @param valueIndex the index of the value of the attribute
     * @return the probability
     * @exception Exception if the requested attribute is not nominal
     */
    protected double getProbability(int attIndex, int valueIndex) 
      throws Exception {
      
      if (!m_clusterInstances.attribute(attIndex).isNominal()) {
	throw new Exception("getProbability: attribute is not nominal");
      }

      if (m_attStats[attIndex].totalCount <= 0) {
	return 0;
      }

      return (double) m_attStats[attIndex].nominalCounts[valueIndex] / 
	(double) m_attStats[attIndex].totalCount;
    }

    /**
     * Returns the standard deviation of a numeric attribute
     *
     * @param attIndex the index of the attribute
     * @return the standard deviation
     * @exception Exception if an error occurs
     */
    protected double getStandardDev(int attIndex) throws Exception {
      if (!m_clusterInstances.attribute(attIndex).isNumeric()) {
	throw new Exception("getStandardDev: attribute is not numeric");
      }

      m_attStats[attIndex].numericStats.calculateDerived();
      double stdDev = m_attStats[attIndex].numericStats.stdDev;
      if (Double.isNaN(stdDev) || Double.isInfinite(stdDev)) {
	return m_acuity;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -