📄 cobweb.java
字号:
/*
0002: * This program is free software; you can redistribute it and/or modify
0003: * it under the terms of the GNU General Public License as published by
0004: * the Free Software Foundation; either version 2 of the License, or
0005: * (at your option) any later version.
0006: *
0007: * This program is distributed in the hope that it will be useful,
0008: * but WITHOUT ANY WARRANTY; without even the implied warranty of
0009: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
0010: * GNU General Public License for more details.
0011: *
0012: * You should have received a copy of the GNU General Public License
0013: * along with this program; if not, write to the Free Software
0014: * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
0015: */
0016:
0017: /*
0018: * Cobweb.java
0019: * Copyright (C) 2001 University of Waikato, Hamilton, New Zealand
0020: *
0021: */
0022:
0023: package weka.clusterers;
0024:
0025: import weka.core.AttributeStats;
0026: import weka.core.Capabilities;
0027: import weka.core.Drawable;
0028: import weka.core.FastVector;
0029: import weka.core.Instance;
0030: import weka.core.Instances;
0031: import weka.core.Option;
0032: import weka.core.TechnicalInformation;
0033: import weka.core.TechnicalInformationHandler;
0034: import weka.core.Utils;
0035: import weka.core.Capabilities.Capability;
0036: import weka.core.TechnicalInformation.Field;
0037: import weka.core.TechnicalInformation.Type;
0038: import weka.experiment.Stats;
0039: import weka.filters.Filter;
0040: import weka.filters.unsupervised.attribute.Add;
0041:
0042: import java.io.Serializable;
0043: import java.util.Enumeration;
0044: import java.util.Random;
0045: import java.util.Vector;
0046:
0047: /**
0048: <!-- globalinfo-start -->
0049: * Class implementing the Cobweb and Classit clustering algorithms.<br/>
0050: * <br/>
0051: * Note: the application of node operators (merging, splitting etc.) in terms of ordering and priority differs (and is somewhat ambiguous) between the original Cobweb and Classit papers. This algorithm always compares the best host, adding a new leaf, merging the two best hosts, and splitting the best host when considering where to place a new instance.<br/>
0052: * <br/>
0053: * For more information see:<br/>
0054: * <br/>
0055: * D. Fisher (1987). Knowledge acquisition via incremental conceptual clustering. Machine Learning. 2(2):139-172.<br/>
0056: * <br/>
0057: * J. H. Gennari, P. Langley, D. Fisher (1990). Models of incremental concept formation. Artificial Intelligence. 40:11-61.
0058: * <p/>
0059: <!-- globalinfo-end -->
0060: *
0061: <!-- technical-bibtex-start -->
0062: * BibTeX:
0063: * <pre>
0064: * @article{Fisher1987,
0065: * author = {D. Fisher},
0066: * journal = {Machine Learning},
0067: * number = {2},
0068: * pages = {139-172},
0069: * title = {Knowledge acquisition via incremental conceptual clustering},
0070: * volume = {2},
0071: * year = {1987}
0072: * }
0073: *
0074: * @article{Gennari1990,
0075: * author = {J. H. Gennari and P. Langley and D. Fisher},
0076: * journal = {Artificial Intelligence},
0077: * pages = {11-61},
0078: * title = {Models of incremental concept formation},
0079: * volume = {40},
0080: * year = {1990}
0081: * }
0082: * </pre>
0083: * <p/>
0084: <!-- technical-bibtex-end -->
0085: *
0086: <!-- options-start -->
0087: * Valid options are: <p/>
0088: *
0089: * <pre> -A <acuity>
0090: * Acuity.
0091: * (default=1.0)</pre>
0092: *
0093: * <pre> -C <cutoff>
0094: * Cutoff.
0095: * (default=0.002)</pre>
0096: *
0097: * <pre> -S <num>
0098: * Random number seed.
0099: * (default 42)</pre>
0100: *
0101: <!-- options-end -->
0102: *
0103: * @author <a href="mailto:mhall@cs.waikato.ac.nz">Mark Hall</a>
0104: * @version $Revision: 1.23 $
0105: * @see RandomizableClusterer
0106: * @see Drawable
0107: */
0108: public class Cobweb extends RandomizableClusterer implements Drawable,
0109: TechnicalInformationHandler, UpdateableClusterer {
0110:
0111: /** for serialization */
0112: static final long serialVersionUID = 928406656495092318L;
0113:
0114: /**
0115: * Inner class handling node operations for Cobweb.
0116: *
0117: * @see Serializable
0118: */
0119: private class CNode implements Serializable {
0120:
0121: /** for serialization */
0122: static final long serialVersionUID = 3452097436933325631L;
0123: /**
0124: * Within cluster attribute statistics
0125: */
0126: private AttributeStats[] m_attStats;
0127:
0128: /**
0129: * Number of attributes
0130: */
0131: private int m_numAttributes;
0132:
0133: /**
0134: * Instances at this node
0135: */
0136: protected Instances m_clusterInstances = null;
0137:
0138: /**
0139: * Children of this node
0140: */
0141: private FastVector m_children = null;
0142:
0143: /**
0144: * Total instances at this node
0145: */
0146: private double m_totalInstances = 0.0;
0147:
0148: /**
0149: * Cluster number of this node
0150: */
0151: private int m_clusterNum = -1;
0152:
0153: /**
0154: * Creates an empty <code>CNode</code> instance.
0155: *
0156: * @param numAttributes the number of attributes in the data
0157: */
0158: public CNode(int numAttributes) {
0159: m_numAttributes = numAttributes;
0160: }
0161:
0162: /**
0163: * Creates a new leaf <code>CNode</code> instance.
0164: *
0165: * @param numAttributes the number of attributes in the data
0166: * @param leafInstance the instance to store at this leaf
0167: */
0168: public CNode(int numAttributes, Instance leafInstance) {
0169: this (numAttributes);
0170: if (m_clusterInstances == null) {
0171: m_clusterInstances = new Instances(leafInstance
0172: .dataset(), 1);
0173: }
0174: m_clusterInstances.add(leafInstance);
0175: updateStats(leafInstance, false);
0176: }
0177:
0178: /**
0179: * Adds an instance to this cluster.
0180: *
0181: * @param newInstance the instance to add
0182: * @throws Exception if an error occurs
0183: */
0184: protected void addInstance(Instance newInstance)
0185: throws Exception {
0186: // Add the instance to this cluster
0187:
0188: if (m_clusterInstances == null) {
0189: m_clusterInstances = new Instances(newInstance
0190: .dataset(), 1);
0191: m_clusterInstances.add(newInstance);
0192: updateStats(newInstance, false);
0193: return;
0194: } else if (m_children == null) {
0195: /* we are a leaf, so make our existing instance(s) into a child
0196: and then add the new instance as a child */
0197: m_children = new FastVector();
0198: CNode tempSubCluster = new CNode(m_numAttributes,
0199: m_clusterInstances.instance(0));
0200:
0201: // System.out.println("Dumping "+m_clusterInstances.numInstances());
0202: for (int i = 1; i < m_clusterInstances.numInstances(); i++) {
0203: tempSubCluster.m_clusterInstances
0204: .add(m_clusterInstances.instance(i));
0205: tempSubCluster.updateStats(m_clusterInstances
0206: .instance(i), false);
0207: }
0208: m_children = new FastVector();
0209: m_children.addElement(tempSubCluster);
0210: m_children.addElement(new CNode(m_numAttributes,
0211: newInstance));
0212:
0213: m_clusterInstances.add(newInstance);
0214: updateStats(newInstance, false);
0215:
0216: // here is where we check against cutoff (also check cutoff
0217: // in findHost)
0218: if (categoryUtility() < m_cutoff) {
0219: // System.out.println("Cutting (leaf add) ");
0220: m_children = null;
0221: }
0222: return;
0223: }
0224:
0225: // otherwise, find the best host for this instance
0226: CNode bestHost = findHost(newInstance, false);
0227: if (bestHost != null) {
0228: // now add to the best host
0229: bestHost.addInstance(newInstance);
0230: }
0231: }
0232:
0233: /**
0234: * Temporarily adds a new instance to each of this nodes children
0235: * in turn and computes the category utility.
0236: *
0237: * @param newInstance the new instance to evaluate
0238: * @return an array of category utility values---the result of considering
0239: * each child in turn as a host for the new instance
0240: * @throws Exception if an error occurs
0241: */
0242: private double[] cuScoresForChildren(Instance newInstance)
0243: throws Exception {
0244: // look for a host in existing children
0245: double[] categoryUtils = new double[m_children.size()];
0246:
0247: // look for a home for this instance in the existing children
0248: for (int i = 0; i < m_children.size(); i++) {
0249: CNode temp = (CNode) m_children.elementAt(i);
0250: // tentitively add the new instance to this child
0251: temp.updateStats(newInstance, false);
0252: categoryUtils[i] = categoryUtility();
0253:
0254: // remove the new instance from this child
0255: temp.updateStats(newInstance, true);
0256: }
0257: return categoryUtils;
0258: }
0259:
0260: private double cuScoreForBestTwoMerged(CNode merged, CNode a,
0261: CNode b, Instance newInstance) throws Exception {
0262:
0263: double mergedCU = -Double.MAX_VALUE;
0264: // consider merging the best and second
0265: // best.
0266: merged.m_clusterInstances = new Instances(
0267: m_clusterInstances, 1);
0268:
0269: merged.addChildNode(a);
0270: merged.addChildNode(b);
0271: merged.updateStats(newInstance, false); // add new instance to stats
0272: // remove the best and second best nodes
0273: m_children.removeElementAt(m_children.indexOf(a));
0274: m_children.removeElementAt(m_children.indexOf(b));
0275: m_children.addElement(merged);
0276: mergedCU = categoryUtility();
0277: // restore the status quo
0278: merged.updateStats(newInstance, true);
0279: m_children.removeElementAt(m_children.indexOf(merged));
0280: m_children.addElement(a);
0281: m_children.addElement(b);
0282: return mergedCU;
0283: }
0284:
0285: /**
0286: * Finds a host for the new instance in this nodes children. Also
0287: * considers merging the two best hosts and splitting the best host.
0288: *
0289: * @param newInstance the instance to find a host for
0290: * @param structureFrozen true if the instance is not to be added to
0291: * the tree and instead the best potential host is to be returned
0292: * @return the best host
0293: * @throws Exception if an error occurs
0294: */
0295: private CNode findHost(Instance newInstance,
0296: boolean structureFrozen) throws Exception {
0297:
0298: if (!structureFrozen) {
0299: updateStats(newInstance, false);
0300: }
0301:
0302: // look for a host in existing children and also consider as a new leaf
0303: double[] categoryUtils = cuScoresForChildren(newInstance);
0304:
0305: // make a temporary new leaf for this instance and get CU
0306: CNode newLeaf = new CNode(m_numAttributes, newInstance);
0307: m_children.addElement(newLeaf);
0308: double bestHostCU = categoryUtility();
0309: CNode finalBestHost = newLeaf;
0310:
0311: // remove new leaf when seaching for best and second best nodes to
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -