📄 stringtowordvector.java

📁 wekaUT 是德克萨斯大学奥斯汀分校(University of Texas at Austin)开发的基于 Weka 的半监督学习(semi-supervised learning)分类器
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页
/* *    This program is free software; you can redistribute it and/or modify *    it under the terms of the GNU General Public License as published by *    the Free Software Foundation; either version 2 of the License, or *    (at your option) any later version. * *    This program is distributed in the hope that it will be useful, *    but WITHOUT ANY WARRANTY; without even the implied warranty of *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the *    GNU General Public License for more details. * *    You should have received a copy of the GNU General Public License *    along with this program; if not, write to the Free Software *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//* *    StringToWordVector.java *    Copyright (C) 2002 University of Waikato * *    Updated 12/Dec/2001 by Gordon Paynter (gordon.paynter@ucr.edu) *                        Added parameters for delimiter set, *                        number of words to add, and input range. */package weka.filters.unsupervised.attribute;import java.io.Serializable;import java.util.Enumeration;import java.util.Iterator;import java.util.Random;import java.util.StringTokenizer;import java.util.TreeMap;import java.util.Vector;import weka.core.Attribute;import weka.core.FastVector;import weka.core.Instance;import weka.core.Instances;import weka.core.Option;import weka.core.OptionHandler;import weka.core.Range;import weka.core.SparseInstance;import weka.core.Utils;import weka.filters.Filter;import weka.filters.UnsupervisedFilter;/**  * Converts String attributes into a set of attributes representing word * occurrence information from the text contained in the strings. The set of * words (attributes) is determined by the first batch filtered (typically * training data). 
* * @author Len Trigg (len@reeltwo.com) * @author Stuart Inglis (stuart@reeltwo.com) * @version $Revision: 1.1.1.1 $  */public class StringToWordVector extends Filter  implements UnsupervisedFilter, OptionHandler {  /** Delimiters used in tokenization */  private String delimiters = " \n\t.,:'\"()?!";  /** Range of columns to convert to word vectors */  protected Range m_SelectedRange = null;  /** Contains a mapping of valid words to attribute indexes */  private TreeMap m_Dictionary = new TreeMap();  /** True if the first batch has been done */  private boolean m_FirstBatchDone = false;  /** True if output instances should contain word frequency rather than boolean 0 or 1. */  private boolean m_OutputCounts = false;  /**   * The default number of words (per class if there is a class attribute   * assigned) to attempt to keep.   */  private int m_WordsToKeep = 1000;  /**   * Returns an enumeration describing the available options   *   * @return an enumeration of all the available options   */  public Enumeration listOptions() {    Vector newVector = new Vector(3);    newVector.addElement(new Option(				    "\tOutput word counts rather than boolean word presence.\n",				    "C", 0, "-C"));    newVector.addElement(new Option(				    "\tString containing the set of delimiter characters\n"				    + "\t(default: \" \\n\\t.,:'\\\"()?!\")",				    "D", 1, "-D <delimiter set>"));    newVector.addElement(new Option(				    "\tSpecify list of string attributes to convert to words (as weka Range).\n"				    + "\t(default: select all string attributes)",				    "R", 1, "-R <index1,index2-index4,...>"));    newVector.addElement(new Option(				    "\tSpecify approximate number of word fields to create.\n"				    + "\tSurplus words will be discarded..\n"				    + "\t(default: 1000)",				    "W", 1, "-W <number of words to keep>"));    return newVector.elements();  }  /**   * Parses a given list of options controlling the behaviour of this object.   
* Valid options are:<p>   *   * -C<br>   * Output word counts rather than boolean word presence.<p>   *    * -D delimiter_charcters <br>   * Specify set of delimiter characters   * (default: " \n\t.,:'\\\"()?!\"<p>   *   * -R index1,index2-index4,...<br>   * Specify list of string attributes to convert to words.   * (default: all string attributes)<p>   *   * -W number_of_words_to_keep <br>   * Specify number of word fields to create.   * Other, less useful words will be discarded.   * (default: 1000)<p>   *   * @param options the list of options as an array of strings   * @exception Exception if an option is not supported   */  public void setOptions(String[] options) throws Exception {    String value = Utils.getOption('D', options);    if (value.length() != 0) {      setDelimiters(value);    }    value = Utils.getOption('R', options);    if (value.length() != 0) {      setSelectedRange(value);    }    value = Utils.getOption('W', options);    if (value.length() != 0) {      setWordsToKeep(Integer.valueOf(value).intValue());    }    setOutputWordCounts(Utils.getFlag('C', options));  }  /**   * Gets the current settings of the filter.   *   * @return an array of strings suitable for passing to setOptions   */  public String [] getOptions() {    String [] options = new String [11];    int current = 0;    options[current++] = "-D";     options[current++] = getDelimiters();    if (getSelectedRange() != null) {      options[current++] = "-R";       m_SelectedRange.setUpper(getInputFormat().numAttributes() - 1);      options[current++] = getSelectedRange().getRanges();    }    options[current++] = "-W";     options[current++] = String.valueOf(getWordsToKeep());    if (getOutputWordCounts()) {      options[current++] = "-C";    }    while (current < options.length) {      options[current++] = "";    }    return options;  }  /**   * Default constructor. Targets 1000 words in the output.   
*/  public StringToWordVector() {  }  /**   * Constructor that allows specification of the target number of words   * in the output.   *   * @param wordsToKeep the number of words in the output vector (per class   * if assigned).   */  public StringToWordVector(int wordsToKeep) {    m_WordsToKeep = wordsToKeep;  }    /**    * Used to store word counts for dictionary selection based on    * a threshold.   */  private class Count implements Serializable {    public int count;    public Count(int c) { count = c; }  }  /**   * Sets the format of the input instances.   *   * @param instanceInfo an Instances object containing the input    * instance structure (any instances contained in the object are    * ignored - only the structure is required).   * @return true if the outputFormat may be collected immediately   * @exception Exception if the input format can't be set    * successfully   */  public boolean setInputFormat(Instances instanceInfo)     throws Exception {    super.setInputFormat(instanceInfo);    m_FirstBatchDone = false;    return false;  }  /**   * Input an instance for filtering. Filter requires all   * training instances be read before producing output.   *   * @param instance the input instance.   * @return true if the filtered instance may now be   * collected with output().   * @exception IllegalStateException if no input structure has been defined.   */  public boolean input(Instance instance) {    if (getInputFormat() == null) {      throw new IllegalStateException("No input instance format defined");    }    if (m_NewBatch) {      resetQueue();      m_NewBatch = false;    }    if (m_FirstBatchDone) {      convertInstance(instance);      return true;    } else {      bufferInput(instance);      return false;    }  }  /**   * Signify that this batch of input to the filter is finished.    * If the filter requires all instances prior to filtering,   * output() may now be called to retrieve the filtered instances.   
*
   * @return true if there are instances pending output.
   * @exception IllegalStateException if no input structure has been defined.
   */
  public boolean batchFinished() {

    if (getInputFormat() == null) {
      throw new IllegalStateException("No input instance format defined");
    }

    // Determine the dictionary (only once, when the first batch ends)
    if (!m_FirstBatchDone) {
      determineDictionary();
    }

    // Convert pending input instances.
    for (int i = 0; i < getInputFormat().numInstances(); i++) {
      convertInstance(getInputFormat().instance(i));
    }
    flushInput();

    m_NewBatch = true;
    m_FirstBatchDone = true;
    return (numPendingOutput() != 0);
  }

  /**
   * Gets whether output instances contain 0 or 1 indicating word
   * presence, or word counts.
   *
   * @return true if word counts should be output.
   */
  public boolean getOutputWordCounts() {
    return m_OutputCounts;
  }

  /**
   * Sets whether output instances contain 0 or 1 indicating word
   * presence, or word counts.
   *
   * @param outputWordCounts true if word counts should be output.
   */
  public void setOutputWordCounts(boolean outputWordCounts) {
    m_OutputCounts = outputWordCounts;
  }

  /**
   * Get the value of delimiters.
   *
   * @return Value of delimiters.
   */
  public String getDelimiters() {
    return delimiters;
  }

  /**
   * Set the value of delimiters.
   *
   * @param newDelimiters Value to assign to delimiters.
   */
  public void setDelimiters(String newDelimiters) {
    delimiters = newDelimiters;
  }

  /**
   * Get the value of m_SelectedRange.
   *
   * @return Value of m_SelectedRange.
   */
  public Range getSelectedRange() {
    return m_SelectedRange;
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -