⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 filter.java

📁 一个数据挖掘软件ALPHAMINERR的整个过程的JAVA版源代码
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */


/*
 *    Filter.java
 *    Copyright (C) 1999 Len Trigg
 *
 */


package weka.filters;

import java.io.BufferedReader;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.Serializable;
import java.util.Enumeration;

import weka.core.Attribute;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Queue;
import weka.core.Utils;

/** 
 * An abstract class for instance filters: objects that take instances
 * as input, carry out some transformation on the instance and then
 * output the instance. The method implementations in this class
 * assume that most of the work will be done in the methods overridden
 * by subclasses.<p>
 *
 * A simple example of filter use. This example doesn't remove
 * instances from the output queue until all instances have been
 * input, so has higher memory consumption than an approach that
 * uses output instances as they are made available:<p>
 *
 * <code> <pre>
 *  Filter filter = ..some type of filter..
 *  Instances data = ..some instances..
 *  for (int i = 0; i < data.numInstances(); i++) {
 *    filter.input(data.instance(i));
 *  }
 *  filter.batchFinished();
 *  Instances newData = filter.outputFormat();
 *  Instance processed;
 *  while ((processed = filter.output()) != null) {
 *    newData.add(processed);
 *  }
 *  ..do something with newData..
 * </pre> </code>
 *
 * @author Len Trigg (trigg@cs.waikato.ac.nz)
 * @version $Revision$
 */
public abstract class Filter implements Serializable {

  /*
   * Filter refactoring TODO:
   *
   * - Update all filters to use getOutputFormat and setInputFormat
   * instead of outputFormat, outputFormatPeek and inputFormat.
   * - Update users of filters to use getOutputFormat and setInputFormat
   * - remove outputFormat, outputFormatPeek and inputFormat
   *
   */

  /** Debugging mode flag; off by default. */
  private boolean m_Debug = false;

  /**
   * The output format for instances. Null until setOutputFormat() is called
   * with a non-null format; it then holds the structure returned by
   * stringFreeStructure() on the supplied format, with a relation name
   * derived from the filter class and its options.
   */
  private Instances m_OutputFormat = null;

  /**
   * The output instance queue. A fresh queue is created by
   * setOutputFormat() and resetQueue(); push() appends to it.
   */
  private Queue m_OutputQueue = null;

  /**
   * Indices of string attributes in the output format. Computed by
   * setOutputFormat() when a non-null format is supplied.
   */
  private int [] m_OutputStringAtts = null;

  /**
   * Indices of string attributes in the input format. According to the
   * accessor documentation this index is built during setInputFormat().
   */
  private int [] m_InputStringAtts = null;

  /**
   * The input format for instances. Instances passed to bufferInput() are
   * added to this dataset.
   */
  private Instances m_InputFormat = null;

  /** Record whether the filter is at the start of a batch */
  protected boolean m_NewBatch = true;

  /**
   * Installs the format that output instances will conform to. A derived
   * class should call this once it has determined its output format. The
   * output queue is always replaced by an empty one, whether or not a
   * format is supplied.<p>
   *
   * For a non-null format the filter keeps the structure returned by
   * <code>outputFormat.stringFreeStructure()</code>, recomputes the output
   * string attribute indices from it, and renames the relation to the
   * original relation name followed by "-", the filter's class name and,
   * if the filter is an OptionHandler, each of its option strings trimmed
   * and appended without separators.<p>
   *
   * For a null format only the stored format is cleared; the output string
   * attribute indices are left as they were.
   *
   * @param outputFormat the new output format, or null to clear the
   * current one
   */
  protected void setOutputFormat(Instances outputFormat) {

    if (outputFormat == null) {
      m_OutputFormat = null;
    } else {
      m_OutputFormat = outputFormat.stringFreeStructure();
      m_OutputStringAtts = getStringIndices(m_OutputFormat);

      // Rename the relation so that it records which filter (and which
      // option settings) produced the data. append() is used for every
      // part so that a null relation name is rendered as "null".
      StringBuffer name = new StringBuffer();
      name.append(outputFormat.relationName());
      name.append("-");
      name.append(this.getClass().getName());
      if (this instanceof OptionHandler) {
        String [] opts = ((OptionHandler)this).getOptions();
        for (int k = 0; k < opts.length; k++) {
          name.append(opts[k].trim());
        }
      }
      m_OutputFormat.setRelationName(name.toString());
    }
    m_OutputQueue = new Queue();
  }

  /**
   * Gets the currently set input format. The stored dataset itself is
   * returned, not a copy, so it may contain instances that were buffered
   * with bufferInput(), and it is null if no input format has been stored.
   *
   * @return the input Instances, possibly null.
   */
  protected Instances getInputFormat() {

    return m_InputFormat;
  }

  /**
   * Returns a reference to the current input format without
   * copying it. This returns exactly the same object as
   * getInputFormat().
   *
   * @return a reference to the current input format, possibly null
   */
  protected Instances inputFormatPeek() {

    return m_InputFormat;
  }

  /**
   * Returns a reference to the current output format without
   * copying it. The result is null if no output format has been set, or
   * if setOutputFormat() was last called with null. The refactoring notes
   * in this class list this method for eventual removal in favour of
   * getOutputFormat.
   *
   * @return a reference to the current output format, possibly null
   */
  protected Instances outputFormatPeek() {

    return m_OutputFormat;
  }

  /**
   * Makes an instance available as filter output by appending it to the
   * output queue. The derived class should call this for each output
   * instance it produces. A null instance is silently ignored.<p>
   *
   * Before queueing, any string values the instance refers to are copied
   * into the output format, and the instance itself (not a copy) is
   * re-assigned to the output format dataset.
   *
   * @param instance the instance to be added to the queue.
   * @exception IllegalArgumentException if the output format has string
   * attributes and the instance either has no dataset assigned or belongs
   * to a dataset with a different number of attributes
   */
  protected void push(Instance instance) {

    if (instance == null) {
      return;
    }
    copyStringValues(instance, m_OutputFormat, m_OutputStringAtts);
    instance.setDataset(m_OutputFormat);
    m_OutputQueue.push(instance);
  }

  /**
   * Clears the output queue by replacing it with a new, empty queue. Any
   * instances that were pushed but not yet collected are dropped.
   */
  protected void resetQueue() {

    m_OutputQueue = new Queue();
  }

  /**
   * Adds the supplied input instance to the input format dataset for
   * later processing. Always use this method instead of calling
   * getInputFormat().add(instance) directly: it first copies the string
   * values the instance refers to into the input format, which a direct
   * add would skip. A null instance is silently ignored. As documented
   * originally, the provided instance gets copied when buffered.
   *
   * @param instance the <code>Instance</code> to buffer.
   * @exception IllegalArgumentException if the input format has string
   * attributes and the instance either has no dataset assigned or belongs
   * to a dataset with a different number of attributes
   */
  protected void bufferInput(Instance instance) {

    if (instance == null) {
      return;
    }
    copyStringValues(instance, m_InputFormat, m_InputStringAtts);
    m_InputFormat.add(instance);
  }

  /**
   * Returns an array containing the indices of all string attributes in the
   * input format. This index is created during setInputFormat(). The
   * filter's own array is returned, not a copy, so callers should treat it
   * as read-only.
   *
   * @return an array containing the indices of string attributes in the 
   * input dataset.
   */
  protected int [] getInputStringIndex() {

    return m_InputStringAtts;
  }

  /**
   * Returns an array containing the indices of all string attributes in the
   * output format. This index is created during setOutputFormat() when a
   * non-null format is supplied. The filter's own array is returned, not a
   * copy, so callers should treat it as read-only.
   *
   * @return an array containing the indices of string attributes in the 
   * output dataset.
   */
  protected int [] getOutputStringIndex() {

    return m_OutputStringAtts;
  }

  /**
   * Copies the string values referenced by an instance from the dataset it
   * currently belongs to into a destination dataset. The instance must
   * already be assigned to a dataset, and that dataset and the destination
   * must have the same structure, so a single index array describes the
   * string attributes of both. Nothing is done when there are no string
   * attributes.
   *
   * @param inst the Instance containing the string values to copy.
   * @param destDataset the destination set of Instances
   * @param strAtts an array containing the indices of any string attributes
   * in the dataset.
   * @exception IllegalArgumentException if the instance has no dataset
   * assigned, or its dataset and the destination differ in the number of
   * attributes
   */
  private void copyStringValues(Instance inst, Instances destDataset, 
                                int []strAtts) {

    // No string attributes means there is nothing to copy, and in that
    // case the instance is not required to have a dataset at all.
    if (strAtts.length == 0) {
      return;
    }

    final Instances source = inst.dataset();
    if (source == null) {
      throw new IllegalArgumentException("Instance has no dataset assigned!!");
    }
    if (source.numAttributes() != destDataset.numAttributes()) {
      throw new IllegalArgumentException("Src and Dest differ in # of attributes!!");
    }

    // Source and destination share one structure, so the same index array
    // serves as both the source and the destination string index.
    copyStringValues(inst, true, source, strAtts, destDataset, strAtts);
  }

  /**
   * Takes string values referenced by an Instance and copies them from a
   * source dataset to a destination dataset. The instance references are
   * updated to be valid for the destination dataset. The instance may have the 
   * structure (i.e. number and attribute position) of either dataset (this
   * affects where references are obtained from). The source dataset must
   * have the same structure as the filter input format and the destination
   * must have the same structure as the filter output format.<p>
   *
   * This is a convenience overload: it delegates to the six-argument
   * version, supplying the filter's input string attribute indices for the
   * source and its output string attribute indices for the destination.
   *
   * @param instance the instance containing references to strings in the source
   * dataset that will have references updated to be valid for the destination
   * dataset.
   * @param instSrcCompat true if the instance structure is the same as the
   * source, or false if it is the same as the destination
   * @param srcDataset the dataset for which the current instance string
   * references are valid (after any position mapping if needed)
   * @param destDataset the dataset for which the current instance string
   * references need to be inserted (after any position mapping if needed)
   */
  protected void copyStringValues(Instance instance, boolean instSrcCompat,
                                  Instances srcDataset, Instances destDataset) {

    copyStringValues(instance, instSrcCompat, srcDataset, m_InputStringAtts,
                     destDataset, m_OutputStringAtts);
  }

  /**
   * Takes string values referenced by an Instance and copies them from a
   * source dataset to a destination dataset. The instance references are
   * updated to be valid for the destination dataset. The instance may have the 
   * structure (i.e. number and attribute position) of either dataset (this
   * affects where references are obtained from). Only works if the number
   * of string attributes is the same in both indices (implicitly these string
   * attributes should be semantically same but just with shifted positions).
   *
   * @param instance the instance containing references to strings in the source
   * dataset that will have references updated to be valid for the destination
   * dataset.
   * @param instSrcCompat true if the instance structure is the same as the
   * source, or false if it is the same as the destination (i.e. which of the
   * string attribute indices contains the correct locations for this instance).
   * @param srcDataset the dataset for which the current instance string
   * references are valid (after any position mapping if needed)
   * @param srcStrAtts an array containing the indices of string attributes
   * in the source dataset.
   * @param destDataset the dataset for which the current instance string
   * references need to be inserted (after any position mapping if needed)
   * @param destStrAtts an array containing the indices of string attributes
   * in the destination dataset.
   */
  protected void copyStringValues(Instance instance, boolean instSrcCompat,
                                  Instances srcDataset, int []srcStrAtts,
                                  Instances destDataset, int []destStrAtts) {
    if (srcDataset == destDataset) {
      return;
    }
    if (srcStrAtts.length != destStrAtts.length) {
      throw new IllegalArgumentException("Src and Dest string indices differ in length!!");
    }
    for (int i = 0; i < srcStrAtts.length; i++) {

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -