filter.java
来自「Java 编写的多种数据挖掘算法 包括聚类、分类、预处理等」· Java 代码 · 共 921 行 · 第 1/2 页
JAVA
921 行
/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    Filter.java
 *    Copyright (C) 1999 Len Trigg
 *
 */

package weka.filters;

import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Queue;
import weka.core.RelationalLocator;
import weka.core.SerializedObject;
import weka.core.StringLocator;
import weka.core.Utils;

import java.io.BufferedReader;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.Serializable;
import java.util.Enumeration;

/**
 * An abstract class for instance filters: objects that take instances
 * as input, carry out some transformation on the instance and then
 * output the instance. The method implementations in this class
 * assume that most of the work will be done in the methods overridden
 * by subclasses.<p>
 *
 * A simple example of filter use. This example doesn't remove
 * instances from the output queue until all instances have been
 * input, so has higher memory consumption than an approach that
 * uses output instances as they are made available:<p>
 *
 * <code> <pre>
 * Filter filter = ..some type of filter..
 * Instances instances = ..some instances..
 * filter.setInputFormat(instances);
 * for (int i = 0; i < instances.numInstances(); i++) {
 *   filter.input(instances.instance(i));
 * }
 * filter.batchFinished();
 * Instances newData = filter.getOutputFormat();
 * Instance processed;
 * while ((processed = filter.output()) != null) {
 *   newData.add(processed);
 * }
 * ..do something with newData..
 * </pre> </code>
 *
 * Note that <code>input(Instance)</code> throws a NullPointerException
 * unless <code>setInputFormat(Instances)</code> has been called first.
 *
 * @author Len Trigg (trigg@cs.waikato.ac.nz)
 * @version $Revision: 1.29 $
 */
public abstract class Filter implements Serializable {

  /** The output format for instances; null until an output format is set */
  private Instances m_OutputFormat = null;

  /** The output instance queue; null until an input or output format is set */
  private Queue m_OutputQueue = null;

  /** Indices of string attributes in the output format */
  protected StringLocator m_OutputStringAtts = null;

  /** Indices of string attributes in the input format */
  protected StringLocator m_InputStringAtts = null;

  /** Indices of relational attributes in the output format */
  protected RelationalLocator m_OutputRelAtts = null;

  /** Indices of relational attributes in the input format */
  protected RelationalLocator m_InputRelAtts = null;

  /** The input format for instances; also buffers input instances */
  private Instances m_InputFormat = null;

  /** Record whether the filter is at the start of a batch */
  protected boolean m_NewBatch = true;

  /**
   * Sets the format of output instances. The derived class should use this
   * method once it has determined the outputformat. The
   * output queue is cleared.
* * @param outputFormat the new output format */ protected void setOutputFormat(Instances outputFormat) { if (outputFormat != null) { m_OutputFormat = outputFormat.stringFreeStructure(); initOutputLocators(m_OutputFormat, null); // Rename the relation String relationName = outputFormat.relationName() + "-" + this.getClass().getName(); if (this instanceof OptionHandler) { String [] options = ((OptionHandler)this).getOptions(); for (int i = 0; i < options.length; i++) { relationName += options[i].trim(); } } m_OutputFormat.setRelationName(relationName); } else { m_OutputFormat = null; } m_OutputQueue = new Queue(); } /** * Gets the currently set inputformat instances. This dataset may contain * buffered instances. * * @return the input Instances. */ protected Instances getInputFormat() { return m_InputFormat; } /** * Returns a reference to the current input format without * copying it. * * @return a reference to the current input format */ protected Instances inputFormatPeek() { return m_InputFormat; } /** * Returns a reference to the current output format without * copying it. * * @return a reference to the current output format */ protected Instances outputFormatPeek() { return m_OutputFormat; } /** * Adds an output instance to the queue. The derived class should use this * method for each output instance it makes available. * * @param instance the instance to be added to the queue. */ protected void push(Instance instance) { if (instance != null) { if (instance.dataset() != null) copyValues(instance, false); instance.setDataset(m_OutputFormat); m_OutputQueue.push(instance); } } /** * Clears the output queue. */ protected void resetQueue() { m_OutputQueue = new Queue(); } /** * Adds the supplied input instance to the inputformat dataset for * later processing. Use this method rather than * getInputFormat().add(instance). Or else. Note that the provided * instance gets copied when buffered. * * @param instance the <code>Instance</code> to buffer. 
*/ protected void bufferInput(Instance instance) { if (instance != null) { copyValues(instance, true); m_InputFormat.add(instance); } } /** * Initializes the input attribute locators. If indices is null then all * attributes of the data will be considered, otherwise only the ones * that were provided. * * @param data the data to initialize the locators with * @param indices if not null, the indices to which to restrict * the locating */ protected void initInputLocators(Instances data, int[] indices) { if (indices == null) { m_InputStringAtts = new StringLocator(data); m_InputRelAtts = new RelationalLocator(data); } else { m_InputStringAtts = new StringLocator(data, indices); m_InputRelAtts = new RelationalLocator(data, indices); } } /** * Initializes the output attribute locators. If indices is null then all * attributes of the data will be considered, otherwise only the ones * that were provided. * * @param data the data to initialize the locators with * @param indices if not null, the indices to which to restrict * the locating */ protected void initOutputLocators(Instances data, int[] indices) { if (indices == null) { m_OutputStringAtts = new StringLocator(data); m_OutputRelAtts = new RelationalLocator(data); } else { m_OutputStringAtts = new StringLocator(data, indices); m_OutputRelAtts = new RelationalLocator(data, indices); } } /** * Copies string/relational values contained in the instance copied to a new * dataset. The Instance must already be assigned to a dataset. This * dataset and the destination dataset must have the same structure. * * @param instance the Instance containing the string/relational * values to copy. * @param isInput if true the input format and input attribute * locators are used otherwise the output format * and output locators */ protected void copyValues(Instance instance, boolean isInput) { RelationalLocator.copyRelationalValues( instance, (isInput) ? m_InputFormat : m_OutputFormat, (isInput) ? 
m_InputRelAtts : m_OutputRelAtts); StringLocator.copyStringValues( instance, (isInput) ? m_InputFormat : m_OutputFormat, (isInput) ? m_InputStringAtts : m_OutputStringAtts); } /** * Takes string/relational values referenced by an Instance and copies them * from a source dataset to a destination dataset. The instance references are * updated to be valid for the destination dataset. The instance may have the * structure (i.e. number and attribute position) of either dataset (this * affects where references are obtained from). Only works if the number * of string/relational attributes is the same in both indices (implicitly * these string/relational attributes should be semantically same but just * with shifted positions). * * @param instance the instance containing references to strings/ * relational values in the source dataset that * will have references updated to be valid for * the destination dataset. * @param instSrcCompat true if the instance structure is the same as * the source, or false if it is the same as the * destination (i.e. which of the string/relational * attribute indices contains the correct locations * for this instance). * @param srcDataset the dataset for which the current instance * string/relational value references are valid * (after any position mapping if needed) * @param destDataset the dataset for which the current instance * string/relational value references need to be * inserted (after any position mapping if needed) */ protected void copyValues(Instance instance, boolean instSrcCompat, Instances srcDataset, Instances destDataset) { RelationalLocator.copyRelationalValues( instance, instSrcCompat, srcDataset, m_InputRelAtts, destDataset, m_OutputRelAtts); StringLocator.copyStringValues( instance, instSrcCompat, srcDataset, m_InputStringAtts, getOutputFormat(), m_OutputStringAtts); } /** * This will remove all buffered instances from the inputformat dataset. 
* Use this method rather than getInputFormat().delete(); */ protected void flushInput() { if ( (m_InputStringAtts.getAttributeIndices().length > 0) || (m_InputRelAtts.getAttributeIndices().length > 0) ) { m_InputFormat = m_InputFormat.stringFreeStructure(); } else { // This more efficient than new Instances(m_InputFormat, 0); m_InputFormat.delete(); } } /** * Sets the format of the input instances. If the filter is able to * determine the output format before seeing any input instances, it * does so here. This default implementation clears the output format * and output queue, and the new batch flag is set. Overriders should * call <code>super.setInputFormat(Instances)</code> * * @param instanceInfo an Instances object containing the input instance * structure (any instances contained in the object are ignored - only the * structure is required). * @return true if the outputFormat may be collected immediately * @throws Exception if the inputFormat can't be set successfully */ public boolean setInputFormat(Instances instanceInfo) throws Exception { m_InputFormat = instanceInfo.stringFreeStructure(); m_OutputFormat = null; m_OutputQueue = new Queue(); m_NewBatch = true; initInputLocators(instanceInfo, null); return false; } /** * Gets the format of the output instances. This should only be called * after input() or batchFinished() has returned true. The relation * name of the output instances should be changed to reflect the * action of the filter (eg: add the filter name and options). * * @return an Instances object containing the output instance * structure only. * @throws NullPointerException if no input structure has been * defined (or the output format hasn't been determined yet) */ public Instances getOutputFormat() { if (m_OutputFormat == null) { throw new NullPointerException("No output format defined."); } return new Instances(m_OutputFormat, 0); } /** * Input an instance for filtering. 
Ordinarily the instance is * processed and made available for output immediately. Some filters * require all instances be read before producing output, in which * case output instances should be collected after calling * batchFinished(). If the input marks the start of a new batch, the * output queue is cleared. This default implementation assumes all * instance conversion will occur when batchFinished() is called. * * @param instance the input instance * @return true if the filtered instance may now be * collected with output(). * @throws NullPointerException if the input format has not been * defined. * @throws Exception if the input instance was not of the correct * format or if there was a problem with the filtering. */ public boolean input(Instance instance) throws Exception { if (m_InputFormat == null) { throw new NullPointerException("No input instance format defined"); } if (m_NewBatch) { m_OutputQueue = new Queue(); m_NewBatch = false; } bufferInput(instance); return false; } /** * Signify that this batch of input to the filter is finished. If * the filter requires all instances prior to filtering, output() * may now be called to retrieve the filtered instances. Any * subsequent instances filtered should be filtered based on setting * obtained from the first batch (unless the inputFormat has been * re-assigned or new options have been set). This default * implementation assumes all instance processing occurs during * inputFormat() and input(). * * @return true if there are instances pending output * @throws NullPointerException if no input structure has been defined, * @throws Exception if there was a problem finishing the batch. */ public boolean batchFinished() throws Exception { if (m_InputFormat == null) { throw new NullPointerException("No input instance format defined"); } flushInput(); m_NewBatch = true; return (numPendingOutput() != 0); } /** * Output an instance after filtering and remove from the output queue. 
*
   * @return the instance that has most recently been filtered (or null if
   * the queue is empty).
   * @throws NullPointerException if no output structure has been defined
   */
  public Instance output() {

    if (null == m_OutputFormat) {
      throw new NullPointerException("No output instance format defined");
    }
    if (m_OutputQueue.empty()) {
      return null;
    }

    Instance next = (Instance) m_OutputQueue.pop();

    // Once the queue has been drained and a batch has just ended, swap the
    // output format for its string-free structure so references to old
    // strings/relationals are dropped. This is only done when string or
    // relational attributes are present.
    boolean releaseOldValues =
        m_OutputQueue.empty()
        && m_NewBatch
        && ((m_OutputStringAtts.getAttributeIndices().length > 0)
            || (m_OutputRelAtts.getAttributeIndices().length > 0));
    if (releaseOldValues) {
      m_OutputFormat = m_OutputFormat.stringFreeStructure();
    }

    return next;
  }

  /**
   * Output an instance after filtering but do not remove from the
   * output queue.
   *
   * @return the instance that has most recently been filtered (or null if
   * the queue is empty).
   * @throws NullPointerException if no output structure has been defined
   */
  public Instance outputPeek() {

    if (m_OutputFormat == null) {
      throw new NullPointerException("No output instance format defined");
    }
    if (m_OutputQueue.empty()) {
      return null;
    }
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?