reservoirsample.java

来自「Weka」· Java 代码 · 共 398 行

JAVA
398
字号
/* *    This program is free software; you can redistribute it and/or modify *    it under the terms of the GNU General Public License as published by *    the Free Software Foundation; either version 2 of the License, or *    (at your option) any later version. * *    This program is distributed in the hope that it will be useful, *    but WITHOUT ANY WARRANTY; without even the implied warranty of *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the *    GNU General Public License for more details. * *    You should have received a copy of the GNU General Public License *    along with this program; if not, write to the Free Software *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//* *    ReservoirSample.java *    Copyright (C) 2007 Pentaho Corporation * */package weka.filters.unsupervised.instance;import weka.core.Capabilities;import weka.core.Instance;import weka.core.Instances;import weka.core.Option;import weka.core.OptionHandler;import weka.core.Utils;import weka.core.Capabilities.Capability;import weka.filters.Filter;import weka.filters.UnsupervisedFilter;import java.util.Collections;import java.util.Enumeration;import java.util.Random;import java.util.Vector;/**  <!-- globalinfo-start --> * Produces a random subsample of a dataset using the reservoir sampling  * algorithm "R" of Vitter. The original data set does not have to fit  * into main memory, only the resevoir does. * <br/> * For more information, see:<br/> * <br/> *  Vitter, J. S. Random Sampling with a Reservoir. ACM * Transactions on Mathematical Software, Vol. 11, No. 1, * March 1985. Pages 37-57. * <p/> <!-- globalinfo-end --> *  <!-- technical-bibtex-start --> * BibTeX: * <pre> * &#64;article{Vitter1985, *    author = {J. S. Vitter}, *    journal = {ACM Transactions on Mathematical Software}, *    number = {1} *    volume = {11} *    pages = {37-57}, *    title = {Random Sampling with a Reservoir}, *    year = {1985} * } * </pre> * </p> <!-- options-start --> * Valid options are: <p/> *  * <pre> -S &lt;num&gt; *  Specify the random number seed (default 1)</pre> *  * <pre> -Z &lt;num&gt; *  The size of the output dataset, as a percentage of *  the input dataset (default 100)</pre> *  *  <!-- options-end --> * * @author Mark Hall (mhall{[at]}pentaho{[dot]}org) * @version $Revision: 1.2 $  */public class ReservoirSample   extends Filter   implements UnsupervisedFilter, OptionHandler {    /** for serialization */  static final long serialVersionUID = 3119607037607101160L;  /** The subsample size, number of instances% */  protected int m_SampleSize = 100;  /** Holds the sub-sample (reservoir) */  protected Instance[] m_subSample;  /** The current instance being processed */  protected int m_currentInst;    /** The random number generator seed */  protected int m_RandomSeed = 1;  /** The random number generator */  protected Random m_random;    /**   * Returns a string describing this filter   * @return a description of the classifier suitable for   * displaying in the explorer/experimenter gui   */  public String globalInfo() {    return         "Produces a random subsample of a dataset using the reservoir sampling "      + "Algorithm \"R\" by Vitter. The original data set does not have to fit "      + "into main memory, but the reservoir does. ";  }  /**   * Returns an enumeration describing the available options.   *   * @return an enumeration of all the available options.   */  public Enumeration listOptions() {    Vector result = new Vector();    result.addElement(new Option(	"\tSpecify the random number seed (default 1)",	"S", 1, "-S <num>"));    result.addElement(new Option(	"\tThe size of the output dataset - number of instances\n"	+"\t(default 100)",	"Z", 1, "-Z <num>"));    return result.elements();  }  /**   * Parses a given list of options. <p/>   *    <!-- options-start -->   * Valid options are: <p/>   *    * <pre> -S &lt;num&gt;   *  Specify the random number seed (default 1)</pre>   *    * <pre> -Z &lt;num&gt;   *  The size of the output dataset, as a percentage of   *  the input dataset (default 100)</pre>   *   <!-- options-end -->   *   * @param options the list of options as an array of strings   * @throws Exception if an option is not supported   */  public void setOptions(String[] options) throws Exception {    String	tmpStr;        tmpStr = Utils.getOption('S', options);    if (tmpStr.length() != 0) {      setRandomSeed(Integer.parseInt(tmpStr));    } else {      setRandomSeed(1);    }    tmpStr = Utils.getOption('Z', options);    if (tmpStr.length() != 0) {      setSampleSize(Integer.parseInt(tmpStr));    } else {      setSampleSize(100);    }  }  /**   * Gets the current settings of the filter.   *   * @return an array of strings suitable for passing to setOptions   */  public String [] getOptions() {    Vector<String>	result;    result = new Vector<String>();    result.add("-S");    result.add("" + getRandomSeed());    result.add("-Z");    result.add("" + getSampleSize());        return result.toArray(new String[result.size()]);  }  /**   * Returns the tip text for this property   *    * @return tip text for this property suitable for   * displaying in the explorer/experimenter gui   */  public String randomSeedTipText() {    return "The seed used for random sampling.";  }  /**   * Gets the random number seed.   *   * @return the random number seed.   */  public int getRandomSeed() {    return m_RandomSeed;  }    /**   * Sets the random number seed.   *   * @param newSeed the new random number seed.   */  public void setRandomSeed(int newSeed) {    m_RandomSeed = newSeed;  }    /**   * Returns the tip text for this property   *    * @return tip text for this property suitable for   * displaying in the explorer/experimenter gui   */  public String sampleSizeTipText() {    return "Size of the subsample (reservoir). i.e. the number of instances.";  }  /**   * Gets the subsample size.   *   * @return the subsample size   */  public int getSampleSize() {    return m_SampleSize;  }    /**   * Sets the size of the subsample.   *   * @param newSampleSize size of the subsample.   */  public void setSampleSize(int newSampleSize) {    m_SampleSize = newSampleSize;  }      /**    * Returns the Capabilities of this filter.   *   * @return            the capabilities of this object   * @see               Capabilities   */  public Capabilities getCapabilities() {    Capabilities result = super.getCapabilities();    // attributes    result.enableAllAttributes();    result.enable(Capability.MISSING_VALUES);        // class    result.enableAllClasses();    result.enable(Capability.MISSING_CLASS_VALUES);    result.enable(Capability.NO_CLASS);        return result;  }  /**   * Sets the format of the input instances.   *   * @param instanceInfo an Instances object containing the input    * instance structure (any instances contained in the object are    * ignored - only the structure is required).   * @return true if the outputFormat may be collected immediately   * @throws Exception if the input format can't be set    * successfully   */  public boolean setInputFormat(Instances instanceInfo)        throws Exception {    super.setInputFormat(instanceInfo);    setOutputFormat(instanceInfo);    m_subSample = new Instance[m_SampleSize];    m_currentInst = 0;    m_random = new Random(m_RandomSeed);    return true;  }  /**   * Decides whether the current instance gets retained in the   * reservoir.   *   * @param instance the Instance to potentially retain   */  protected void processInstance(Instance instance) {    if (m_currentInst < m_SampleSize) {      m_subSample[m_currentInst] = (Instance)instance.copy();    } else {      double r = m_random.nextDouble();      if (r < ((double)m_SampleSize / (double)m_currentInst)) {        r = m_random.nextDouble();        int replace = (int)((double)m_SampleSize * r);        m_subSample[replace] = (Instance)instance.copy();      }    }    m_currentInst++;  }  /**   * Input an instance for filtering. Filter requires all   * training instances be read before producing output.   *   * @param instance the input instance   * @return true if the filtered instance may now be   * collected with output().   * @throws IllegalStateException if no input structure has been defined   */  public boolean input(Instance instance) {    if (getInputFormat() == null) {      throw new IllegalStateException("No input instance format defined");    }    if (m_NewBatch) {      resetQueue();      m_NewBatch = false;    }    if (isFirstBatchDone()) {      push(instance);      return true;    } else {      //      bufferInput(instance);      copyValues(instance, false);      processInstance(instance);      return false;    }  }  /**   * Signify that this batch of input to the filter is finished.    * If the filter requires all instances prior to filtering,   * output() may now be called to retrieve the filtered instances.   *   * @return true if there are instances pending output   * @throws IllegalStateException if no input structure has been defined   */  public boolean batchFinished() {    if (getInputFormat() == null) {      throw new IllegalStateException("No input instance format defined");    }    if (!isFirstBatchDone()) {      // Do the subsample, and clear the input instances.      createSubsample();    }    flushInput();    m_NewBatch = true;    m_FirstBatchDone = true;    return (numPendingOutput() != 0);  }    /**   * Creates a subsample of the current set of input instances. The output   * instances are pushed onto the output queue for collection.   */  protected void createSubsample() {    for (int i = 0; i < m_SampleSize; i++) {      if (m_subSample[i] != null) {        Instance copy = (Instance) m_subSample[i].copy();        push(copy);      } else {        // less data in the original than was asked for        // as a sample.        break;      }    }    m_subSample = null;  }    /**   * Main method for testing this class.   *   * @param argv should contain arguments to the filter:    * use -h for help   */  public static void main(String [] argv) {    runFilter(new ReservoirSample(), argv);  }}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?