📄 cobweb.java
字号:
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
* Cobweb.java
* Copyright (C) 2001 Mark Hall
*
*/
package weka.clusterers;
import java.io.Serializable;
import java.util.Enumeration;
import java.util.Random;
import java.util.Vector;
import weka.core.AttributeStats;
import weka.core.Drawable;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Utils;
import weka.experiment.Stats;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.Add;
/**
* Class implementing the Cobweb and Classit clustering algorithms.<p><p>
*
* Note: the application of node operators (merging, splitting etc.) in
* terms of ordering and priority differs (and is somewhat ambiguous)
* between the original Cobweb and Classit papers. This algorithm always
* compares the best host, adding a new leaf, merging the two best hosts, and
* splitting the best host when considering where to place a new instance.<p>
*
* Valid options are:<p>
*
* -A <acuity> <br>
* Acuity. <p>
*
* -C <cutoff> <br>
* Cutoff. <p>
*
* @author <a href="mailto:mhall@cs.waikato.ac.nz">Mark Hall</a>
* @version $Revision$
* @see Clusterer
* @see OptionHandler
* @see Drawable
*/
public class Cobweb extends Clusterer implements OptionHandler, Drawable {
/**
* Inner class handling node operations for Cobweb.
*
* @see Serializable
*/
private class CNode implements Serializable {
/**
* Within cluster attribute statistics
*/
private AttributeStats [] m_attStats;
/**
* Number of attributes
*/
private int m_numAttributes;
/**
* Instances at this node
*/
protected Instances m_clusterInstances = null;
/**
* Children of this node
*/
private FastVector m_children = null;
/**
* Total instances at this node
*/
private double m_totalInstances = 0.0;
/**
* Cluster number of this node
*/
private int m_clusterNum = -1;
/**
* Creates an empty <code>CNode</code> instance.
*
* @param numAttributes the number of attributes in the data
*/
public CNode(int numAttributes) {
m_numAttributes = numAttributes;
}
/**
* Creates a new leaf <code>CNode</code> instance.
*
* @param numAttributes the number of attributes in the data
* @param leafInstance the instance to store at this leaf
*/
public CNode(int numAttributes, Instance leafInstance) {
this(numAttributes);
if (m_clusterInstances == null) {
m_clusterInstances = new Instances(leafInstance.dataset(), 1);
}
m_clusterInstances.add(leafInstance);
updateStats(leafInstance, false);
}
/**
* Adds an instance to this cluster.
*
* @param newInstance the instance to add
* @exception Exception if an error occurs
*/
protected void addInstance(Instance newInstance) throws Exception {
// Add the instance to this cluster
if (m_clusterInstances == null) {
m_clusterInstances = new Instances(newInstance.dataset(), 1);
m_clusterInstances.add(newInstance);
updateStats(newInstance, false);
return;
} else if (m_children == null) {
/* we are a leaf, so make our existing instance(s) into a child
and then add the new instance as a child */
m_children = new FastVector();
CNode tempSubCluster = new CNode(m_numAttributes,
m_clusterInstances.instance(0));
// System.out.println("Dumping "+m_clusterInstances.numInstances());
for (int i = 1; i < m_clusterInstances.numInstances(); i++) {
tempSubCluster.m_clusterInstances.
add(m_clusterInstances.instance(i));
tempSubCluster.updateStats(m_clusterInstances.instance(i), false);
}
m_children = new FastVector();
m_children.addElement(tempSubCluster);
m_children.addElement(new CNode(m_numAttributes, newInstance));
m_clusterInstances.add(newInstance);
updateStats(newInstance, false);
// here is where we check against cutoff (also check cutoff
// in findHost)
if (categoryUtility() < m_cutoff) {
// System.out.println("Cutting (leaf add) ");
m_children = null;
}
return;
}
// otherwise, find the best host for this instance
CNode bestHost = findHost(newInstance, false);
if (bestHost != null) {
// now add to the best host
bestHost.addInstance(newInstance);
}
}
/**
* Temporarily adds a new instance to each of this nodes children
* in turn and computes the category utility.
*
* @param newInstance the new instance to evaluate
* @return an array of category utility values---the result of considering
* each child in turn as a host for the new instance
* @exception Exception if an error occurs
*/
private double [] cuScoresForChildren(Instance newInstance)
throws Exception {
// look for a host in existing children
double [] categoryUtils = new double [m_children.size()];
// look for a home for this instance in the existing children
for (int i = 0; i < m_children.size(); i++) {
CNode temp = (CNode) m_children.elementAt(i);
// tentitively add the new instance to this child
temp.updateStats(newInstance, false);
categoryUtils[i] = categoryUtility();
// remove the new instance from this child
temp.updateStats(newInstance, true);
}
return categoryUtils;
}
private double cuScoreForBestTwoMerged(CNode merged,
CNode a, CNode b,
Instance newInstance)
throws Exception {
double mergedCU = -Double.MAX_VALUE;
// consider merging the best and second
// best.
merged.m_clusterInstances = new Instances(m_clusterInstances, 1);
merged.addChildNode(a);
merged.addChildNode(b);
merged.updateStats(newInstance, false); // add new instance to stats
// remove the best and second best nodes
m_children.removeElementAt(m_children.indexOf(a));
m_children.removeElementAt(m_children.indexOf(b));
m_children.addElement(merged);
mergedCU = categoryUtility();
// restore the status quo
merged.updateStats(newInstance, true);
m_children.removeElementAt(m_children.indexOf(merged));
m_children.addElement(a);
m_children.addElement(b);
return mergedCU;
}
/**
* Finds a host for the new instance in this nodes children. Also
* considers merging the two best hosts and splitting the best host.
*
* @param newInstance the instance to find a host for
* @param structureFrozen true if the instance is not to be added to
* the tree and instead the best potential host is to be returned
* @return the best host
* @exception Exception if an error occurs
*/
private CNode findHost(Instance newInstance,
boolean structureFrozen) throws Exception {
if (!structureFrozen) {
updateStats(newInstance, false);
}
// look for a host in existing children and also consider as a new leaf
double [] categoryUtils = cuScoresForChildren(newInstance);
// make a temporary new leaf for this instance and get CU
CNode newLeaf = new CNode(m_numAttributes, newInstance);
m_children.addElement(newLeaf);
double bestHostCU = categoryUtility();
CNode finalBestHost = newLeaf;
// remove new leaf when seaching for best and second best nodes to
// consider for merging and splitting
m_children.removeElementAt(m_children.size()-1);
// now determine the best host (and the second best)
int best = 0;
int secondBest = 0;
for (int i = 0; i < categoryUtils.length; i++) {
if (categoryUtils[i] > categoryUtils[secondBest]) {
if (categoryUtils[i] > categoryUtils[best]) {
secondBest = best;
best = i;
} else {
secondBest = i;
}
}
}
CNode a = (CNode) m_children.elementAt(best);
CNode b = (CNode) m_children.elementAt(secondBest);
if (categoryUtils[best] > bestHostCU) {
bestHostCU = categoryUtils[best];
finalBestHost = a;
// System.out.println("Node is best");
}
if (structureFrozen) {
if (finalBestHost == newLeaf) {
return null; // *this* node is the best host
} else {
return finalBestHost;
}
}
double mergedCU = -Double.MAX_VALUE;
CNode merged = new CNode(m_numAttributes);
if (a != b) {
mergedCU = cuScoreForBestTwoMerged(merged, a, b, newInstance);
if (mergedCU > bestHostCU) {
bestHostCU = mergedCU;
finalBestHost = merged;
}
}
// Consider splitting the best
double splitCU = -Double.MAX_VALUE;
double splitBestChildCU = -Double.MAX_VALUE;
double splitPlusNewLeafCU = -Double.MAX_VALUE;
double splitPlusMergeBestTwoCU = -Double.MAX_VALUE;
if (a.m_children != null) {
FastVector tempChildren = new FastVector();
for (int i = 0; i < m_children.size(); i++) {
CNode existingChild = (CNode)m_children.elementAt(i);
if (existingChild != a) {
tempChildren.addElement(existingChild);
}
}
for (int i = 0; i < a.m_children.size(); i++) {
CNode promotedChild = (CNode)a.m_children.elementAt(i);
tempChildren.addElement(promotedChild);
}
// also add the new leaf
tempChildren.addElement(newLeaf);
FastVector saveStatusQuo = m_children;
m_children = tempChildren;
splitPlusNewLeafCU = categoryUtility(); // split + new leaf
// remove the new leaf
tempChildren.removeElementAt(tempChildren.size()-1);
// now look for best and second best
categoryUtils = cuScoresForChildren(newInstance);
// now determine the best host (and the second best)
best = 0;
secondBest = 0;
for (int i = 0; i < categoryUtils.length; i++) {
if (categoryUtils[i] > categoryUtils[secondBest]) {
if (categoryUtils[i] > categoryUtils[best]) {
secondBest = best;
best = i;
} else {
secondBest = i;
}
}
}
CNode sa = (CNode) m_children.elementAt(best);
CNode sb = (CNode) m_children.elementAt(secondBest);
splitBestChildCU = categoryUtils[best];
// now merge best and second best
CNode mergedSplitChildren = new CNode(m_numAttributes);
if (sa != sb) {
splitPlusMergeBestTwoCU =
cuScoreForBestTwoMerged(mergedSplitChildren, sa, sb, newInstance);
}
splitCU = (splitBestChildCU > splitPlusNewLeafCU) ?
splitBestChildCU : splitPlusNewLeafCU;
splitCU = (splitCU > splitPlusMergeBestTwoCU) ?
splitCU : splitPlusMergeBestTwoCU;
if (splitCU > bestHostCU) {
bestHostCU = splitCU;
finalBestHost = this;
// tempChildren.removeElementAt(tempChildren.size()-1);
} else {
// restore the status quo
m_children = saveStatusQuo;
}
}
if (finalBestHost != this) {
// can commit the instance to the set of instances at this node
m_clusterInstances.add(newInstance);
} else {
m_numberSplits++;
}
if (finalBestHost == merged) {
m_numberMerges++;
m_children.removeElementAt(m_children.indexOf(a));
m_children.removeElementAt(m_children.indexOf(b));
m_children.addElement(merged);
}
if (finalBestHost == newLeaf) {
finalBestHost = new CNode(m_numAttributes);
m_children.addElement(finalBestHost);
}
if (bestHostCU < m_cutoff) {
if (finalBestHost == this) {
// splitting was the best, but since we are cutting all children
// recursion is aborted and we still need to add the instance
// to the set of instances at this node
m_clusterInstances.add(newInstance);
}
m_children = null;
finalBestHost = null;
}
if (finalBestHost == this) {
// splitting is still the best, so downdate the stats as
// we'll be recursively calling on this node
updateStats(newInstance, true);
}
return finalBestHost;
}
/**
* Adds the supplied node as a child of this node. All of the child's
* instances are added to this nodes instances
*
* @param child the child to add
*/
protected void addChildNode(CNode child) {
for (int i = 0; i < child.m_clusterInstances.numInstances(); i++) {
Instance temp = child.m_clusterInstances.instance(i);
m_clusterInstances.add(temp);
updateStats(temp, false);
}
if (m_children == null) {
m_children = new FastVector();
}
m_children.addElement(child);
}
/**
* Computes the utility of all children with respect to this node
*
* @return the category utility of the children with respect to this node.
*/
protected double categoryUtility() throws Exception {
if (m_children == null) {
throw new Exception("categoryUtility: No children!");
}
double totalCU = 0;
for (int i = 0; i < m_children.size(); i++) {
CNode child = (CNode) m_children.elementAt(i);
totalCU += categoryUtilityChild(child);
}
totalCU /= (double)m_children.size();
return totalCU;
}
/**
* Computes the utility of a single child with respect to this node
*
* @param child the child for which to compute the utility
* @return the utility of the child with respect to this node
* @exception Exception if something goes wrong
*/
protected double categoryUtilityChild(CNode child) throws Exception {
double sum = 0;
for (int i = 0; i < m_numAttributes; i++) {
if (m_clusterInstances.attribute(i).isNominal()) {
for (int j = 0;
j < m_clusterInstances.attribute(i).numValues(); j++) {
double x = child.getProbability(i, j);
double y = getProbability(i, j);
sum += (x * x) - (y * y);
}
} else {
// numeric attribute
sum += ((m_normal / child.getStandardDev(i)) -
(m_normal / getStandardDev(i)));
}
}
return (child.m_totalInstances / m_totalInstances) * sum;
}
/**
* Returns the probability of a value of a nominal attribute in this node
*
* @param attIndex the index of the attribute
* @param valueIndex the index of the value of the attribute
* @return the probability
* @exception Exception if the requested attribute is not nominal
*/
protected double getProbability(int attIndex, int valueIndex)
throws Exception {
if (!m_clusterInstances.attribute(attIndex).isNominal()) {
throw new Exception("getProbability: attribute is not nominal");
}
if (m_attStats[attIndex].totalCount <= 0) {
return 0;
}
return (double) m_attStats[attIndex].nominalCounts[valueIndex] /
(double) m_attStats[attIndex].totalCount;
}
/**
* Returns the standard deviation of a numeric attribute
*
* @param attIndex the index of the attribute
* @return the standard deviation
* @exception Exception if an error occurs
*/
protected double getStandardDev(int attIndex) throws Exception {
if (!m_clusterInstances.attribute(attIndex).isNumeric()) {
throw new Exception("getStandardDev: attribute is not numeric");
}
m_attStats[attIndex].numericStats.calculateDerived();
double stdDev = m_attStats[attIndex].numericStats.stdDev;
if (Double.isNaN(stdDev) || Double.isInfinite(stdDev)) {
return m_acuity;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -