📄 filter.java
字号:
/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//* * Filter.java * Copyright (C) 1999 Len Trigg * */package weka.filters;import java.io.BufferedReader;import java.io.FileOutputStream;import java.io.FileReader;import java.io.InputStreamReader;import java.io.PrintWriter;import java.io.Reader;import java.io.Serializable;import java.util.Enumeration;import weka.core.Attribute;import weka.core.Instance;import weka.core.Instances;import weka.core.Option;import weka.core.OptionHandler;import weka.core.Queue;import weka.core.Utils;/** * An abstract class for instance filters: objects that take instances * as input, carry out some transformation on the instance and then * output the instance. The method implementations in this class * assume that most of the work will be done in the methods overridden * by subclasses.<p> * * A simple example of filter use. This example doesn't remove * instances from the output queue until all instances have been * input, so has higher memory consumption than an approach that * uses output instances as they are made available:<p> * * <code> <pre> * Filter filter = ..some type of filter.. * Instances instances = ..some instances.. 
* for (int i = 0; i < instances.numInstances(); i++) {
 *   filter.input(instances.instance(i));
 * }
 * filter.batchFinished();
 * Instances newData = filter.getOutputFormat();
 * Instance processed;
 * while ((processed = filter.output()) != null) {
 *   newData.add(processed);
 * }
 * ..do something with newData..
 * </pre> </code>
 *
 * @author Len Trigg (trigg@cs.waikato.ac.nz)
 * @version $Revision: 1.1.1.1 $
 */
public abstract class Filter implements Serializable {

  /*
   * Filter refactoring TODO:
   *
   * - Update all filters to use getOutputFormat and setInputFormat
   *   instead of outputFormat, outputFormatPeek and inputFormat.
   * - Update users of filters to use getOutputFormat and setInputFormat
   * - remove outputFormat, outputFormatPeek and inputFormat
   *
   */

  /** Debugging mode */
  private boolean m_Debug = false;

  /**
   * The output format for instances. Stored by setOutputFormat() as a
   * string-free copy of the supplied structure; null until an output
   * format has been set, and reset to null by setInputFormat().
   */
  private Instances m_OutputFormat = null;

  /**
   * The output instance queue. Replaced with a fresh Queue by
   * setOutputFormat(), resetQueue(), setInputFormat() and by input() at
   * the start of a new batch.
   */
  private Queue m_OutputQueue = null;

  /**
   * Indices of string attributes in the output format. Computed by
   * setOutputFormat() when a non-null format is supplied.
   */
  private int [] m_OutputStringAtts = null;

  /**
   * Indices of string attributes in the input format. Computed by
   * setInputFormat().
   */
  private int [] m_InputStringAtts = null;

  /**
   * The input format for instances. bufferInput() also adds input
   * instances to this dataset, so it may hold buffered instances.
   */
  private Instances m_InputFormat = null;

  /**
   * Record whether the filter is at the start of a batch. Set to true by
   * setInputFormat() and batchFinished(); cleared by input().
   */
  protected boolean m_NewBatch = true;

  /**
   * Sets the format of output instances. The derived class should use this
   * method once it has determined the outputformat. The
   * output queue is cleared.
* * @param outputFormat the new output format */ protected void setOutputFormat(Instances outputFormat) { if (outputFormat != null) { m_OutputFormat = outputFormat.stringFreeStructure(); m_OutputStringAtts = getStringIndices(m_OutputFormat); // Rename the attribute String relationName = outputFormat.relationName() + "-" + this.getClass().getName(); if (this instanceof OptionHandler) { String [] options = ((OptionHandler)this).getOptions(); for (int i = 0; i < options.length; i++) { relationName += options[i].trim(); } } m_OutputFormat.setRelationName(relationName); } else { m_OutputFormat = null; } m_OutputQueue = new Queue(); } /** * Gets the currently set inputformat instances. This dataset may contain * buffered instances. * * @return the input Instances. */ protected Instances getInputFormat() { return m_InputFormat; } /** * Returns a reference to the current output format without * copying it. * * @return a reference to the current output format */ protected Instances outputFormatPeek() { return m_OutputFormat; } /** * Adds an output instance to the queue. The derived class should use this * method for each output instance it makes available. * * @param instance the instance to be added to the queue */ protected void push(Instance instance) { if (instance != null) { copyStringValues(instance, m_OutputFormat, m_OutputStringAtts); instance.setDataset(m_OutputFormat); m_OutputQueue.push(instance); } } /** * Clears the output queue. */ protected void resetQueue() { m_OutputQueue = new Queue(); } /** * Adds the supplied input instance to the inputformat dataset for * later processing. Use this method rather than * getInputFormat().add(instance). Or else. * * @param instance the <code>Instance</code> to buffer. 
*/ protected void bufferInput(Instance instance) { if (instance != null) { copyStringValues(instance, m_InputFormat, m_InputStringAtts); instance.setDataset(m_InputFormat); m_InputFormat.add(instance); } } /** * Returns an array containing the indices of all string attributes in the * input format. This index is created during setInputFormat() * * @return an array containing the indices of string attributes in the * input dataset. */ protected int [] getInputStringIndex() { return m_InputStringAtts; } /** * Returns an array containing the indices of all string attributes in the * output format. This index is created during setOutputFormat() * * @return an array containing the indices of string attributes in the * output dataset. */ protected int [] getOutputStringIndex() { return m_OutputStringAtts; } /** * Copies string values contained in the instance copied to a new * dataset. The Instance must already be assigned to a dataset. This * dataset and the destination dataset must have the same structure. * * @param instance the Instance containing the string values to copy. * @param destDataset the destination set of Instances * @param strAtts an array containing the indices of any string attributes * in the dataset. */ private void copyStringValues(Instance inst, Instances destDataset, int []strAtts) { if (strAtts.length == 0) { return; } if (inst.dataset() == null) { throw new IllegalArgumentException("Instance has no dataset assigned!!"); } else if (inst.dataset().numAttributes() != destDataset.numAttributes()) { throw new IllegalArgumentException("Src and Dest differ in # of attributes!!"); } copyStringValues(inst, true, inst.dataset(), strAtts, destDataset, strAtts); } /** * Takes string values referenced by an Instance and copies them from a * source dataset to a destination dataset. The instance references are * updated to be valid for the destination dataset. The instance may have the * structure (i.e. 
number and attribute position) of either dataset (this * affects where references are obtained from). The source dataset must * have the same structure as the filter input format and the destination * must have the same structure as the filter output format. * * @param instance the instance containing references to strings in the source * dataset that will have references updated to be valid for the destination * dataset. * @param instSrcCompat true if the instance structure is the same as the * source, or false if it is the same as the destination * @param srcDataset the dataset for which the current instance string * references are valid (after any position mapping if needed) * @param destDataset the dataset for which the current instance string * references need to be inserted (after any position mapping if needed) */ protected void copyStringValues(Instance instance, boolean instSrcCompat, Instances srcDataset, Instances destDataset) { copyStringValues(instance, instSrcCompat, srcDataset, m_InputStringAtts, destDataset, m_OutputStringAtts); } /** * Takes string values referenced by an Instance and copies them from a * source dataset to a destination dataset. The instance references are * updated to be valid for the destination dataset. The instance may have the * structure (i.e. number and attribute position) of either dataset (this * affects where references are obtained from). Only works if the number * of string attributes is the same in both indices (implicitly these string * attributes should be semantically same but just with shifted positions). * * @param instance the instance containing references to strings in the source * dataset that will have references updated to be valid for the destination * dataset. * @param instSrcCompat true if the instance structure is the same as the * source, or false if it is the same as the destination (i.e. which of the * string attribute indices contains the correct locations for this instance). 
* @param srcDataset the dataset for which the current instance string * references are valid (after any position mapping if needed) * @param srcStrAtts an array containing the indices of string attributes * in the source datset. * @param destDataset the dataset for which the current instance string * references need to be inserted (after any position mapping if needed) * @param destStrAtts an array containing the indices of string attributes * in the destination datset. */ protected void copyStringValues(Instance instance, boolean instSrcCompat, Instances srcDataset, int []srcStrAtts, Instances destDataset, int []destStrAtts) { if (srcDataset == destDataset) { return; } if (srcStrAtts.length != destStrAtts.length) { throw new IllegalArgumentException("Src and Dest string indices differ in length!!"); } for (int i = 0; i < srcStrAtts.length; i++) { int instIndex = instSrcCompat ? srcStrAtts[i] : destStrAtts[i]; Attribute src = srcDataset.attribute(srcStrAtts[i]); Attribute dest = destDataset.attribute(destStrAtts[i]); if (!instance.isMissing(instIndex)) { //System.err.println(instance.value(srcIndex) // + " " + src.numValues() // + " " + dest.numValues()); int valIndex = dest.addStringValue(src, (int)instance.value(instIndex)); // setValue here shouldn't be too slow here unless your dataset has // squillions of string attributes instance.setValue(instIndex, (double)valIndex); } } } /** * This will remove all buffered instances from the inputformat dataset. * Use this method rather than getInputFormat().delete(); */ protected void flushInput() { if (m_InputStringAtts.length > 0) { m_InputFormat = m_InputFormat.stringFreeStructure(); } else { // This more efficient than new Instances(m_InputFormat, 0); m_InputFormat.delete(); } } /** * @deprecated use <code>setInputFormat(Instances)</code> instead. */ public boolean inputFormat(Instances instanceInfo) throws Exception { return setInputFormat(instanceInfo); } /** * Sets the format of the input instances. 
If the filter is able to * determine the output format before seeing any input instances, it * does so here. This default implementation clears the output format * and output queue, and the new batch flag is set. Overriders should * call <code>super.setInputFormat(Instances)</code> * * @param instanceInfo an Instances object containing the input instance * structure (any instances contained in the object are ignored - only the * structure is required). * @return true if the outputFormat may be collected immediately * @exception Exception if the inputFormat can't be set successfully */ public boolean setInputFormat(Instances instanceInfo) throws Exception { m_InputFormat = instanceInfo.stringFreeStructure(); m_InputStringAtts = getStringIndices(instanceInfo); m_OutputFormat = null; m_OutputQueue = new Queue(); m_NewBatch = true; return false; } /** * @deprecated use <code>getOutputFormat()</code> instead. */ public final Instances outputFormat() { return getOutputFormat(); } /** * Gets the format of the output instances. This should only be called * after input() or batchFinished() has returned true. The relation * name of the output instances should be changed to reflect the * action of the filter (eg: add the filter name and options). * * @return an Instances object containing the output instance * structure only. * @exception NullPointerException if no input structure has been * defined (or the output format hasn't been determined yet) */ public final Instances getOutputFormat() { if (m_OutputFormat == null) { throw new NullPointerException("No output format defined."); } return new Instances(m_OutputFormat, 0); } /** * Input an instance for filtering. Ordinarily the instance is * processed and made available for output immediately. Some filters * require all instances be read before producing output, in which * case output instances should be collected after calling * batchFinished(). If the input marks the start of a new batch, the * output queue is cleared. 
This default implementation assumes all * instance conversion will occur when batchFinished() is called. * * @param instance the input instance * @return true if the filtered instance may now be * collected with output(). * @exception NullPointerException if the input format has not been * defined. * @exception Exception if the input instance was not of the correct * format or if there was a problem with the filtering. */ public boolean input(Instance instance) throws Exception { if (m_InputFormat == null) { throw new NullPointerException("No input instance format defined"); } if (m_NewBatch) { m_OutputQueue = new Queue(); m_NewBatch = false; } bufferInput(instance); return false; } /** * Signify that this batch of input to the filter is finished. If * the filter requires all instances prior to filtering, output() * may now be called to retrieve the filtered instances. Any * subsequent instances filtered should be filtered based on setting * obtained from the first batch (unless the inputFormat has been * re-assigned or new options have been set). This default * implementation assumes all instance processing occurs during * inputFormat() and input(). * * @return true if there are instances pending output * @exception NullPointerException if no input structure has been defined, * @exception Exception if there was a problem finishing the batch. */ public boolean batchFinished() throws Exception { if (m_InputFormat == null) { throw new NullPointerException("No input instance format defined"); } flushInput(); m_NewBatch = true; return (numPendingOutput() != 0); } /** * Output an instance after filtering and remove from the output queue. * * @return the instance that has most recently been filtered (or null if * the queue is empty). 
* @exception NullPointerException if no output structure has been defined */ public Instance output() { if (m_OutputFormat == null) { throw new NullPointerException("No output instance format defined"); } if (m_OutputQueue.empty()) { return null; } Instance result = (Instance)m_OutputQueue.pop(); // Clear out references to old strings occasionally if (m_OutputQueue.empty() && m_NewBatch) { if (m_OutputStringAtts.length > 0) { m_OutputFormat = m_OutputFormat.stringFreeStructure(); } }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -