⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 splitdatasetfilter.java

📁 一个数据挖掘系统的源码
💻 JAVA
字号:

/**
 *
 *   AgentAcademy - an open source Data Mining framework for
 *   training intelligent agents
 *
 *   Copyright (C)   2001-2003 AA Consortium.
 *
 *   This library is open source software; you can redistribute it
 *   and/or modify it under the terms of the GNU Lesser General
 *   Public License as published by the Free Software Foundation;
 *   either version 2.0 of the License, or (at your option) any later
 *   version.
 *
 *   This library is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU Lesser General Public
 *   License along with this library; if not, write to the Free
 *   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
 *   MA  02111-1307 USA
 *
 */

package org.agentacademy.modules.dataminer.filters;

/**
 * <p>Title: The Data Miner prototype</p>
 * <p>Description: A prototype for the DataMiner (DM), the Agent Academy (AA) module responsible for performing data mining on the contents of the Agent Use Repository (AUR). The extracted knowledge is to be sent back to the AUR in the form of a PMML document.</p>
 * <p>Copyright: Copyright (c) 2002</p>
 * <p>Company: CERTH</p>
 * @author asymeon
 * @version 0.3
 */


import org.agentacademy.modules.dataminer.core.*;
import java.util.*;
import org.apache.log4j.Logger;

/**
 * This filter takes a dataset and outputs a subset of it. If a class
 * attribute is assigned, the dataset is stratified before fold-based
 * splitting is performed (unless -A is given).
 *
 * Valid options are: <p>
 *
 * -R inst1,inst2-inst4,... <br>
 * Specifies list of instances to select. First
 * and last are valid indexes. (default fold-based splitting)<p>
 *
 * -V <br>
 * Specifies if inverse of selection is to be output.<p>
 *
 * -N number of folds <br>
 * Specifies number of folds dataset is split into (default 10). <p>
 *
 * -F fold <br>
 * Specifies which fold is selected. (default 1)<p>
 *
 * -S seed <br>
 * Specifies a random number seed for shuffling the dataset.
 * (default 0, don't randomize)<p>
 *
 * -A <br>
 * If set, data is not being stratified even if class index is set. <p>
 *
 */
public class SplitDatasetFilter extends Filter implements OptionHandler {

  /** Logger used by {@link #main(String[])} to report failures. */
  public static Logger                log = Logger.getLogger(SplitDatasetFilter.class);

  /** Range of instances provided by user; null means fold-based selection. */
  private Range m_Range = null;

  /** Indicates if inverse of selection is to be output. */
  private boolean m_Inverse = false;

  /** Number of folds to split dataset into. */
  private int m_NumFolds = 10;

  /** Fold to output (1-based; converted to 0-based when selecting). */
  private int m_Fold = 1;

  /** Random number seed; the data is shuffled only when this is positive. */
  private long m_Seed = 0;

  /** Don't stratify data if class index is set? */
  private boolean m_DontStratifyData = false;

  /**
   * Gets an enumeration describing the available options.
   *
   * @return an enumeration of all the available options (R, V, N, F, S, A).
   */
  public Enumeration listOptions() {

    Vector newVector = new Vector(6);

    newVector.addElement(new Option(
              "\tSpecifies list of instances to select. First and last\n"
              + "\tare valid indexes. (default fold-based splitting)\n",
              "R", 1, "-R <inst1,inst2-inst4,...>"));

    newVector.addElement(new Option(
              "\tSpecifies if inverse of selection is to be output.\n",
              "V", 0, "-V"));

    newVector.addElement(new Option(
              "\tSpecifies number of folds dataset is split into. \n"
              + "\t(default 10)\n",
              "N", 1, "-N <number of folds>"));

    newVector.addElement(new Option(
              "\tSpecifies which fold is selected. (default 1)\n",
              "F", 1, "-F <fold>"));

    newVector.addElement(new Option(
              "\tSpecifies random number seed. (default 0, no randomizing)\n",
              "S", 1, "-S <seed>"));

    newVector.addElement(new Option(
              "\tIf set, data is not being stratified even if class index is set.\n",
              "A", 0, "-A"));

    return newVector.elements();
  }

  /**
   * Parses the options for this object. Valid options are: <p>
   *
   * -R inst1,inst2-inst4,... <br>
   * Specifies list of instances to select. First
   * and last are valid indexes. (default fold-based splitting)<p>
   *
   * -V <br>
   * Specifies if inverse of selection is to be output.<p>
   *
   * -N number of folds <br>
   * Specifies number of folds dataset is split into (default 10). <p>
   *
   * -F fold <br>
   * Specifies which fold is selected. (default 1)<p>
   *
   * -S seed <br>
   * Specifies a random number seed for shuffling the dataset.
   * (default 0, no randomizing)<p>
   *
   * -A <br>
   * If set, data is not being stratified even if class index is set. <p>
   *
   * Options that are absent are reset to their defaults. If an input
   * format has already been set, it is re-applied so that the new
   * fold settings are validated.
   *
   * @param options the list of options as an array of strings
   * @exception Exception if an option is not supported or a numeric
   * option cannot be parsed
   */
  public void setOptions(String[] options) throws Exception {

    setInstancesIndices(Utils.getOption('R', options));
    setInvertSelection(Utils.getFlag('V', options));
    setDontStratifyData(Utils.getFlag('A', options));

    String numFolds = Utils.getOption('N', options);
    if (numFolds.length() != 0) {
      setNumFolds(Integer.parseInt(numFolds));
    } else {
      setNumFolds(10);
    }

    String fold = Utils.getOption('F', options);
    if (fold.length() != 0) {
      setFold(Integer.parseInt(fold));
    } else {
      setFold(1);
    }

    String seed = Utils.getOption('S', options);
    if (seed.length() != 0) {
      // The seed is a long (see getSeed/getOptions); parsing it as an int
      // would reject values that getOptions itself can produce.
      setSeed(Long.parseLong(seed));
    } else {
      setSeed(0);
    }

    if (getInputFormat() != null) {
      setInputFormat(getInputFormat());
    }
  }

  /**
   * Gets the current settings of the filter.
   *
   * The returned array always has eight entries; unused trailing slots
   * are filled with empty strings. The fold options (-N, -F) are only
   * emitted when no instance range is set.
   *
   * @return an array of strings suitable for passing to setOptions
   */
  public String [] getOptions() {

    String [] options = new String [8];
    int current = 0;

    options[current++] = "-S"; options[current++] = "" + getSeed();
    if (getInvertSelection()) {
      options[current++] = "-V";
    }
    if (!getInstancesIndices().equals("")) {
      options[current++] = "-R"; options[current++] = getInstancesIndices();
    } else {
      options[current++] = "-N"; options[current++] = "" + getNumFolds();
      options[current++] = "-F"; options[current++] = "" + getFold();
    }
    if (getDontStratifyData()) {
      options[current++] = "-A";
    }
    while (current < options.length) {
      options[current++] = "";
    }
    return options;
  }

  /**
   * Gets ranges of instances selected.
   *
   * @return a string containing a comma-separated list of ranges, or the
   * empty string if no range has been set
   */
  public String getInstancesIndices() {

    if (m_Range == null) {
      return "";
    } else {
      return m_Range.getRanges();
    }
  }

  /**
   * Sets the ranges of instances to be selected. If provided string
   * is null or empty, ranges won't be used for selecting instances.
   *
   * @param rangeList a string representing the list of instances.
   * eg: first-3,5,6-last
   * @exception IllegalArgumentException if an invalid range list is supplied
   */
  public void setInstancesIndices(String rangeList) {

    if ((rangeList == null) || (rangeList.length() == 0)) {
      m_Range = null;
    } else {
      m_Range = new Range();
      m_Range.setRanges(rangeList);
    }
  }

  /**
   * Gets if selection is to be inverted.
   *
   * @return true if the selection is to be inverted
   */
  public boolean getInvertSelection() {

    return m_Inverse;
  }

  /**
   * Sets if selection is to be inverted.
   *
   * @param inverse true if inversion is to be performed
   */
  public void setInvertSelection(boolean inverse) {

    m_Inverse = inverse;
  }

  /**
   * Gets the number of folds in which dataset is to be split into.
   *
   * @return the number of folds the dataset is to be split into.
   */
  public int getNumFolds() {

    return m_NumFolds;
  }

  /**
   * Sets the number of folds the dataset is split into.
   *
   * NOTE(review): zero is accepted here, but batchFinished passes the
   * value unchanged to stratify/testCV/trainCV; whether those accept
   * zero folds is not visible from this class - confirm.
   *
   * @param numFolds number of folds dataset is to be split into
   * @exception IllegalArgumentException if number of folds is negative
   */
  public void setNumFolds(int numFolds) {

    if (numFolds < 0) {
      throw new IllegalArgumentException("Number of folds has to be positive or zero.");
    }
    m_NumFolds = numFolds;
  }

  /**
   * Gets the fold which is selected.
   *
   * @return the fold which is selected
   */
  public int getFold() {

    return m_Fold;
  }

  /**
   * Selects a fold. The upper bound (number of folds) is not checked
   * here; it is checked in setInputFormat.
   *
   * @param fold the fold to be selected (1-based).
   * @exception IllegalArgumentException if fold's index is smaller than 1
   */
  public void setFold(int fold) {

    if (fold < 1) {
      throw new IllegalArgumentException("Fold's index has to be greater than 0.");
    }
    m_Fold = fold;
  }

  /**
   * Gets the random number seed used for shuffling the dataset.
   *
   * @return the random number seed
   */
  public long getSeed() {

    return m_Seed;
  }

  /**
   * Sets the random number seed for shuffling the dataset. If seed
   * is zero or negative, shuffling won't be performed.
   *
   * @param seed the random number seed
   */
  public void setSeed(long seed) {

    m_Seed = seed;
  }

  /**
   * Sets whether stratification is not performed.
   *
   * @param flag true to skip stratification even if a class index is set
   */
  public void setDontStratifyData(boolean flag) {

    m_DontStratifyData = flag;
  }

  /**
   * Gets whether stratification is not performed.
   *
   * @return true if stratification is skipped
   */
  public boolean getDontStratifyData() {

    return m_DontStratifyData;
  }

  /**
   * Sets the format of the input instances. The output format is the
   * same as the input format.
   *
   * @param instanceInfo an Instances object containing the input instance
   * structure (any instances contained in the object are ignored - only the
   * structure is required).
   * @return true because outputFormat can be collected immediately
   * @exception IllegalArgumentException if the selected fold is larger
   * than a positive number of folds
   * @exception Exception if the input format can't be set successfully
   */
  public boolean setInputFormat(Instances instanceInfo) throws Exception {

    if ((m_NumFolds > 0) && (m_NumFolds < m_Fold)) {
      throw new IllegalArgumentException("Fold has to be smaller or equal to "+
                                         "number of folds.");
    }
    super.setInputFormat(instanceInfo);
    setOutputFormat(instanceInfo);
    return true;
  }

  /**
   * Signify that this batch of input to the filter is
   * finished. Output() may now be called to retrieve the filtered
   * instances.
   *
   * If the seed is positive the data is shuffled first. Then either the
   * instances inside the user-supplied range (or outside it, when
   * inverted) are pushed to the output queue, or - when no range is
   * set - one fold is selected with testCV (trainCV when inverted),
   * after stratifying if a class index is set and stratification has
   * not been disabled.
   *
   * @return true if there are instances pending output
   * @exception IllegalStateException if no input structure has been defined
   */
  public boolean batchFinished() throws Exception{

    Instances data = getInputFormat();
    if (data == null) {
      throw new IllegalStateException("No input instance format defined");
    }
    if (m_Seed > 0) {
      // User has provided a random number seed.
      data.randomize(new Random(m_Seed));
    }
    // Push instances for output into output queue
    if (m_Range != null) {
      // User has provided a range
      int numInstances = data.numInstances();
      m_Range.setInvert(m_Inverse);
      m_Range.setUpper(numInstances - 1);
      for (int i = 0; i < numInstances; i++) {
        if (m_Range.isInRange(i)) {
          push(data.instance(i));
        }
      }
    } else {
      // Select out a fold
      if ((data.classIndex() >= 0) && !m_DontStratifyData) {
        data.stratify(m_NumFolds);
      }
      // m_Fold is 1-based for the user; the CV helpers take a 0-based index.
      Instances instances;
      if (!m_Inverse) {
        instances = data.testCV(m_NumFolds, m_Fold - 1);
      } else {
        instances = data.trainCV(m_NumFolds, m_Fold - 1);
      }
      for (int i = 0; i < instances.numInstances(); i++) {
        push(instances.instance(i));
      }
    }
    m_NewBatch = true;
    return (numPendingOutput() != 0);
  }

  /**
   * Main method for testing this class. Any exception is logged
   * (message and stack trace) instead of being propagated.
   *
   * @param argv should contain arguments to the filter: use -h for help
   */
  public static void main(String [] argv) {

    try {
      if (Utils.getFlag('b', argv)) {
        Filter.batchFilterFile(new SplitDatasetFilter(), argv);
      } else {
        Filter.filterFile(new SplitDatasetFilter(), argv);
      }
    } catch (Exception ex) {
      // Pass the throwable as well so the stack trace and cause are kept;
      // the message alone may be null.
      log.error(ex.getMessage(), ex);
    }
  }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -