📄 removemisclassified.java
字号:
/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//* * RemoveMisclassified.java * Copyright (C) 2002 Richard Kirkby * */package weka.filters.unsupervised.instance;import weka.filters.*;import weka.classifiers.Classifier;import weka.core.*;import java.util.Enumeration;import java.util.Vector;/** * A filter that removes instances which are incorrectly classified. * Useful for removing outliers. <p> * * Valid filter-specific options are: <p> * * -W classifier string <br> * Full class name of classifier to use, followed by scheme options. (required)<p> * * -C class index <br> * Attribute on which misclassifications are based. If < 0 will use any current set * class or default to the last attribute. * * -F number of folds <br> * The number of folds to use for cross-validation cleansing. * (<2 = no cross-validation - default)<p> * * -T threshold <br> * Threshold for the max error when predicting numeric class. * (Value should be >= 0, default = 0.1)<p> * * -I max iterations <br> * The maximum number of cleansing iterations to perform. 
* (<1 = until fully cleansed - default)<p>
 *
 * -V <br>
 * Invert the match so that correctly classified instances are discarded.<p>
 *
 * @author Richard Kirkby (rkirkby@cs.waikato.ac.nz)
 * @author Malcolm Ware (mfw4@cs.waikato.ac.nz)
 * @version $Revision: 1.1.1.1 $
 */
public class RemoveMisclassified extends Filter implements UnsupervisedFilter, OptionHandler {

  /** The classifier used to do the cleansing; a ZeroR instance until another one is set. */
  protected Classifier m_cleansingClassifier = new weka.classifiers.rules.ZeroR();

  /**
   * The attribute to treat as the class for purposes of cleansing.
   * A negative value means "not set explicitly".
   */
  protected int m_classIndex = -1;

  /** The number of cross validation folds to perform (<2 = no cross validation) */
  protected int m_numOfCrossValidationFolds = 0;

  /** The maximum number of cleansing iterations to perform (<1 = until fully cleansed) */
  protected int m_numOfCleansingIterations = 0;

  /**
   * The threshold for deciding when a numeric value is correctly classified:
   * the largest absolute difference between prediction and actual class value
   * that still counts as correct.
   */
  protected double m_numericClassifyThreshold = 0.1;

  /** Whether to invert the match so the correctly classified instances are discarded */
  protected boolean m_invertMatching = false;

  /**
   * Sets the format of the input instances. The structure is handed to the
   * superclass and then declared, unchanged, as the output format.
   *
   * @param instanceInfo an Instances object containing the input instance
   * structure (any instances contained in the object are ignored - only the
   * structure is required).
   * @return true if the outputFormat may be collected immediately
   * (this implementation always returns true)
   * @exception Exception if the inputFormat can't be set successfully
   */
  public boolean setInputFormat(Instances instanceInfo) throws Exception {

    super.setInputFormat(instanceInfo);
    // The filter only removes rows, so the output structure equals the input structure.
    setOutputFormat(instanceInfo);
    return true;
  }

  /**
   * Cleanses the data based on misclassifications when evaluated on the training data itself.
* * @param data the data to train with and cleanse */ private Instances cleanseTrain(Instances data) throws Exception { Instance inst; Instances buildSet = new Instances(data); Instances temp = new Instances(data, data.numInstances()); Instances inverseSet = new Instances(data, data.numInstances()); int count = 0; double ans; int iterations = 0; int classIndex = m_classIndex; if (classIndex < 0) classIndex = data.classIndex(); if (classIndex < 0) classIndex = data.numAttributes()-1; // loop until perfect while(count != buildSet.numInstances()) { // check if hit maximum number of iterations iterations++; if (m_numOfCleansingIterations > 0 && iterations > m_numOfCleansingIterations) break; // build classifier count = buildSet.numInstances(); buildSet.setClassIndex(classIndex); m_cleansingClassifier.buildClassifier(buildSet); temp = new Instances(buildSet, buildSet.numInstances()); // test on training data for (int i = 0; i < buildSet.numInstances(); i++) { inst = buildSet.instance(i); ans = m_cleansingClassifier.classifyInstance(inst); if (buildSet.classAttribute().isNumeric()) { if (ans >= inst.classValue() - m_numericClassifyThreshold && ans <= inst.classValue() + m_numericClassifyThreshold) { temp.add(inst); } else if (m_invertMatching) { inverseSet.add(inst); } } else { //class is nominal if (ans == inst.classValue()) { temp.add(inst); } else if (m_invertMatching) { inverseSet.add(inst); } } } buildSet = temp; } if (m_invertMatching) { inverseSet.setClassIndex(data.classIndex()); return inverseSet; } else { buildSet.setClassIndex(data.classIndex()); return buildSet; } } /** * Cleanses the data based on misclassifications when performing cross-validation. 
* * @param data the data to train with and cleanse */ private Instances cleanseCross(Instances data) throws Exception { Instance inst; Instances crossSet = new Instances(data); Instances temp = new Instances(data, data.numInstances()); Instances inverseSet = new Instances(data, data.numInstances()); int count = 0; double ans; int iterations = 0; int classIndex = m_classIndex; if (classIndex < 0) classIndex = data.classIndex(); if (classIndex < 0) classIndex = data.numAttributes()-1; // loop until perfect while (count != crossSet.numInstances() && crossSet.numInstances() >= m_numOfCrossValidationFolds) { count = crossSet.numInstances(); // check if hit maximum number of iterations iterations++; if (m_numOfCleansingIterations > 0 && iterations > m_numOfCleansingIterations) break; crossSet.setClassIndex(classIndex); if (crossSet.classAttribute().isNominal()) { crossSet.stratify(m_numOfCrossValidationFolds); } // do the folds temp = new Instances(crossSet, crossSet.numInstances()); for (int fold = 0; fold < m_numOfCrossValidationFolds; fold++) { Instances train = crossSet.trainCV(m_numOfCrossValidationFolds, fold); m_cleansingClassifier.buildClassifier(train); Instances test = crossSet.testCV(m_numOfCrossValidationFolds, fold); //now test for (int i = 0; i < test.numInstances(); i++) { inst = test.instance(i); ans = m_cleansingClassifier.classifyInstance(inst); if (crossSet.classAttribute().isNumeric()) { if (ans >= inst.classValue() - m_numericClassifyThreshold && ans <= inst.classValue() + m_numericClassifyThreshold) { temp.add(inst); } else if (m_invertMatching) { inverseSet.add(inst); } } else { //class is nominal if (ans == inst.classValue()) { temp.add(inst); } else if (m_invertMatching) { inverseSet.add(inst); } } } } crossSet = temp; } if (m_invertMatching) { inverseSet.setClassIndex(data.classIndex()); return inverseSet; } else { crossSet.setClassIndex(data.classIndex()); return crossSet; } } /** * Signify that this batch of input to the filter is finished. 
* * @return true if there are instances pending output * @exception IllegalStateException if no input structure has been defined */ public boolean batchFinished() throws Exception { if (getInputFormat() == null) { throw new IllegalStateException("No input instance format defined"); } Instances filtered; if (m_numOfCrossValidationFolds < 2) { filtered = cleanseTrain(getInputFormat()); } else { filtered = cleanseCross(getInputFormat()); } for (int i=0; i<filtered.numInstances(); i++) { push(filtered.instance(i)); } return (numPendingOutput() != 0); } /** * Returns an enumeration describing the available options. * * @return an enumeration of all the available options. */ public Enumeration listOptions() { Vector newVector = new Vector(6); newVector.addElement(new Option( "\tFull class name of classifier to use, followed\n" + "\tby scheme options. (required)\n" + "\teg: \"weka.classifiers.bayes.NaiveBayes -D\"", "W", 1, "-W <classifier specification>")); newVector.addElement(new Option( "\tAttribute on which misclassifications are based.\n" + "\tIf < 0 will use any current set class or default to the last attribute.", "C", 1, "-C <class index>")); newVector.addElement(new Option( "\tThe number of folds to use for cross-validation cleansing.\n" +"\t(<2 = no cross-validation - default).", "F", 1, "-F <number of folds>")); newVector.addElement(new Option( "\tThreshold for the max error when predicting numeric class.\n" +"\t(Value should be >= 0, default = 0.1).", "T", 1, "-T <threshold>")); newVector.addElement(new Option( "\tThe maximum number of cleansing iterations to perform.\n" +"\t(<1 = until fully cleansed - default)", "I", 1,"-I")); newVector.addElement(new Option( "\tInvert the match so that correctly classified instances are discarded.\n", "V", 0,"-V")); return newVector.elements(); } /** * Parses the options for this object. Valid options are: <p> * * -W classifier string <br> * Full class name of classifier to use, followed by scheme options. 
(required)<p> * * -C class index <br> * Attribute on which misclassifications are based. If < 0 will use any current * set class or default to the last attribute. * * -F number of folds <br> * The number of folds to use for cross-validation cleansing. * (<2 = no cross-validation - default)<p> * * -T threshold <br> * Threshold for the max error when predicting numeric class. * (Value should be >= 0, default = 0.1)<p> * * -I max iterations <br> * The maximum number of cleansing iterations to perform.
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -