⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 stringtowordvector.java

📁 一个数据挖掘软件ALPHAMINERR的整个过程的JAVA版源代码
💻 JAVA
📖 第 1 页 / 共 4 页
字号:
/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    StringToWordVector.java
 *    Copyright (C) 2002 University of Waikato
 *
 *    Updated 12/Dec/2001 by Gordon Paynter (gordon.paynter@ucr.edu)
 *                        Added parameters for delimiter set,
 *                        number of words to add, and input range.
 *    Updated 27/Nov/2003 by Ashraf M. Kibriya (amk14@cs.waikato.ac.nz)
 *                        Added options for TF/IDF transforms, length 
 *                        normalization and down casing the tokens. Also 
 *                        added another onlyAlphabeticStringTokenizer and
 *                        support for using a list of stopwords.
 */


package weka.filters.unsupervised.attribute;

import java.io.Serializable;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.StringTokenizer;
import java.util.TreeMap;
import java.util.Vector;

import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Range;
import weka.core.SparseInstance;
import weka.core.Utils;
import weka.filters.Filter;
import weka.filters.UnsupervisedFilter;


/** 
 * Converts String attributes into a set of attributes representing word
 * occurrence information from the text contained in the strings. The set of
 * words (attributes) is determined by the first batch filtered (typically
 * training data).
 *
 * @author Len Trigg (len@reeltwo.com)
 * @author Stuart Inglis (stuart@reeltwo.com)
 * @version $Revision$ 
 */
public class StringToWordVector extends Filter
  implements UnsupervisedFilter, OptionHandler {

  /**
   * Characters that separate tokens when a string is tokenized. This set is
   * ignored when m_onlyAlphabeticTokens is true (see that field).
   */
  private String delimiters = " \n\t.,:'\"()?!";

  /**
   * Range of columns to convert to word vectors. Starts out as null; the
   * -R option help text states that all string attributes are selected by
   * default.
   */
  protected Range m_SelectedRange = null;

  /** Contains a mapping of valid words to attribute indexes. */
  private TreeMap m_Dictionary = new TreeMap();
  
  /** True if the first batch has been done. */
  private boolean m_FirstBatchDone = false;

  /**
   * True if output instances should contain word frequencies rather than
   * boolean 0 or 1 (word presence).
   */
  private boolean m_OutputCounts = false;

  /** A String prefix for the created attribute names (empty by default). */
  private String m_Prefix = "";
  
  /**
   * Contains the number of documents (instances) a particular word appears
   * in. The counts are stored with the same indexing as given by
   * m_Dictionary.
   */
  private int [] docsCounts;
  
  /**
   * Contains the number of documents (instances) in the input format from
   * which the dictionary is created. It is used in the IDF transform.
   * Initialized to -1.
   */
  private int numInstances = -1;

  /**
   * Contains the average length of documents (among the first batch of
   * instances, a.k.a. training data). This is used in length normalization
   * of documents, which are normalized to the average document length.
   * Initialized to -1.
   */
  private double avgDocLength = -1;
  
  /**
   * The number of words (per class if there is a class attribute
   * assigned) to attempt to keep. Defaults to 1000.
   */
  private int m_WordsToKeep = 1000;

  /**
   * True if word frequencies should be transformed into log(1+fi),
   * where fi is the frequency of word i. No initializer, so false by
   * default.
   */
  private boolean m_TFTransform;
  
  /**
   * True if a document's (instance's) word frequencies are to be
   * normalized. They are normalized to the average length of the documents
   * specified as input format. No initializer, so false by default.
   */
  private boolean m_normalizeDocLength;
  
  /**
   * True if word frequencies should be transformed into
   * fij*log(numOfDocs/numOfDocsWithWordi). No initializer, so false by
   * default.
   */
  private boolean m_IDFTransform;
  
  /**
   * True if tokens are to be formed only from alphabetic sequences of
   * characters. (The delimiters string property is ignored if this
   * is true.) No initializer, so false by default.
   */
  private boolean m_onlyAlphabeticTokens;  
  
  /** True if all tokens should be downcased. False by default. */
  private boolean m_lowerCaseTokens;
  
  /** True if tokens that are on a stoplist are to be ignored. False by default. */
  private boolean m_useStoplist;  
  
  
  /**
   * Returns an enumeration describing the available options.
   * Each element is an Option carrying the help text, the option name,
   * the number of arguments it takes and its synopsis.
   *
   * @return an enumeration of all the available options
   */
  public Enumeration listOptions() {

    // Eleven options are registered below; size the vector accordingly
    // so it never has to grow.
    Vector newVector = new Vector(11);

    newVector.addElement(new Option(
        "\tOutput word counts rather than boolean word presence.\n",
        "C", 0, "-C"));
    newVector.addElement(new Option(
        "\tString containing the set of delimiter characters\n"
        + "\t(default: \" \\n\\t.,:'\\\"()?!\")",
        "D", 1, "-D <delimiter set>"));
    newVector.addElement(new Option(
        "\tSpecify list of string attributes to convert to words (as weka Range).\n"
        + "\t(default: select all string attributes)",
        "R", 1, "-R <index1,index2-index4,...>"));
    newVector.addElement(new Option(
        "\tSpecify a prefix for the created attribute names.\n"
        + "\t(default: \"\")",
        "P", 1, "-P <attribute name prefix>"));
    newVector.addElement(new Option(
        "\tSpecify approximate number of word fields to create.\n"
        + "\tSurplus words will be discarded.\n"
        + "\t(default: 1000)",
        "W", 1, "-W <number of words to keep>"));
    newVector.addElement(new Option(
        "\tTransform the word frequencies into log(1+fij)\n"
        + "\twhere fij is the frequency of word i in jth "
        + "document(instance).\n",
        "T", 0, "-T"));
    // Help text fixed: "fij if frequency" -> "fij is the frequency", and
    // the doubled spaces produced by the old string concatenation removed.
    newVector.addElement(new Option(
        "\tTransform each word frequency into:\n"
        + "\tfij*log(num of Documents/num of "
        + "documents containing word i)\n"
        + "\t  where fij is the frequency of word i in "
        + "jth document(instance)",
        "I", 0, "-I"));
    newVector.addElement(new Option(
        "\tNormalize word frequencies of each "
        + "document(instance) to average length of "
        + "documents.",
        "N", 0, "-N"));
    newVector.addElement(new Option(
        "\tOnly form tokens from contiguous "
        + "alphabetic sequences (The delimiter "
        + "string is ignored if this is set).",
        "A", 0, "-A"));
    newVector.addElement(new Option(
        "\tConvert all tokens to lowercase before "
        + "adding to the dictionary.",
        "L", 0, "-L"));
    newVector.addElement(new Option(
        "\tIgnore words that are in the stoplist.",
        "S", 0, "-S"));

    return newVector.elements();
  }

  /**
   * Configures this filter from a list of command-line style options.
   * Recognised options:<p>
   *
   * -C<br>
   * Output word counts rather than boolean word presence.<p>
   *
   * -D delimiter_characters <br>
   * Specify the set of delimiter characters
   * (default: " \n\t.,:'\"()?!")<p>
   *
   * -R index1,index2-index4,...<br>
   * Specify list of string attributes to convert to words.
   * (default: all string attributes)<p>
   *
   * -P attribute_name_prefix <br>
   * Specify a prefix for the created attribute names.
   * (default: "")<p>
   *
   * -W number_of_words_to_keep <br>
   * Specify number of word fields to create.
   * Other, less useful words will be discarded.
   * (default: 1000)<p>
   *
   * -A <br>
   * Only tokenize contiguous alphabetic sequences. <p>
   *
   * -L <br>
   * Convert all tokens to lower case before adding to the dictionary. <p>
   *
   * -S <br>
   * Do not add words to the dictionary which are on the stop list. <p>
   *
   * -T <br>
   * Transform word frequencies to log(1+fij) where fij is the frequency of
   * word i in document j. <p>
   *
   * -I <br>
   * Transform word frequencies to fij*log(numOfDocs/numOfDocsWithWordi)
   * where fij is the frequency of word i in document j. <p>
   *
   * -N <br>
   * Normalize word frequencies for each document(instance). The frequencies
   * are normalized to the average length of the documents specified in the
   * input format. <p>
   *
   * A valued option (-D, -R, -P, -W) whose value comes back empty is
   * skipped, leaving the corresponding setting as it was. The flag
   * options are always applied, so an absent flag switches its setting off.
   *
   * @param options the list of options as an array of strings
   * @exception Exception if an option is not supported
   */
  public void setOptions(String[] options) throws Exception {

    // NOTE(review): the parsing order below mirrors the original exactly.
    // Presumably Utils consumes the entries it matches, so reordering could
    // change which option claims an argument -- confirm before touching.
    String delimiterSet = Utils.getOption('D', options);
    if (delimiterSet.length() > 0) {
      setDelimiters(delimiterSet);
    }

    String rangeSpec = Utils.getOption('R', options);
    if (rangeSpec.length() > 0) {
      setSelectedRange(rangeSpec);
    }

    String namePrefix = Utils.getOption('P', options);
    if (namePrefix.length() > 0) {
      setAttributeNamePrefix(namePrefix);
    }

    String wordLimit = Utils.getOption('W', options);
    if (wordLimit.length() > 0) {
      // A non-numeric value surfaces as a NumberFormatException.
      setWordsToKeep(Integer.parseInt(wordLimit));
    }

    // Boolean switches: each setting simply follows the flag's presence.
    boolean outputCounts = Utils.getFlag('C', options);
    setOutputWordCounts(outputCounts);

    boolean tfTransform = Utils.getFlag('T', options);
    setTFTransform(tfTransform);

    boolean idfTransform = Utils.getFlag('I', options);
    setIDFTransform(idfTransform);

    boolean normalizeLength = Utils.getFlag('N', options);
    setNormalizeDocLength(normalizeLength);

    boolean lowerCase = Utils.getFlag('L', options);
    setLowerCaseTokens(lowerCase);

    boolean alphabeticOnly = Utils.getFlag('A', options);
    setOnlyAlphabeticTokens(alphabeticOnly);

    boolean stoplist = Utils.getFlag('S', options);
    setUseStoplist(stoplist);
  }

  /**
   * Gets the current settings of the filter.
   *
   * The result always has 16 entries; slots not needed for an option are
   * filled with empty strings. At most 15 are used (four valued options
   * taking two slots each, plus seven flags).
   *
   * @return an array of strings suitable for passing to setOptions
   */
  public String [] getOptions() {

    String [] options = new String [16];
    int current = 0;

    options[current++] = "-D";
    options[current++] = getDelimiters();

    if (getSelectedRange() != null) {
      options[current++] = "-R";
      // The upper bound can only be derived once an input format exists.
      // Previously this dereferenced getInputFormat() unconditionally, so
      // asking for the options before the input format was set failed
      // with a NullPointerException.
      // NOTE(review): assumes getRanges() does not require the upper bound
      // to have been set -- confirm against the Range implementation.
      if (getInputFormat() != null) {
        m_SelectedRange.setUpper(getInputFormat().numAttributes() - 1);
      }
      options[current++] = getSelectedRange().getRanges();
    }

    // The prefix is only emitted when it differs from the empty default.
    if (!"".equals(getAttributeNamePrefix())) {
      options[current++] = "-P";
      options[current++] = getAttributeNamePrefix();
    }

    options[current++] = "-W";
    options[current++] = String.valueOf(getWordsToKeep());

    if (getOutputWordCounts()) {
      options[current++] = "-C";
    }

    if (getTFTransform()) {
      options[current++] = "-T";
    }

    if (getIDFTransform()) {
      options[current++] = "-I";
    }

    if (getNormalizeDocLength()) {
      options[current++] = "-N";
    }

    if (getLowerCaseTokens()) {
      options[current++] = "-L";
    }

    if (getOnlyAlphabeticTokens()) {
      options[current++] = "-A";
    }

    if (getUseStoplist()) {
      options[current++] = "-S";
    }

    // Pad the remaining slots so no entry is left null.
    while (current < options.length) {
      options[current++] = "";
    }
    return options;
  }

  /**
   * Creates a filter with the default settings. The number of words to
   * keep stays at its field default of 1000.
   */
  public StringToWordVector() {
    super();
  }

  /**
   * Creates a filter with a caller-chosen target for the number of words
   * in the output.
   *
   * @param wordsToKeep the number of words in the output vector (per class
   * if assigned).
   */
  public StringToWordVector(int wordsToKeep) {
    super();
    this.m_WordsToKeep = wordsToKeep;
  }
  
  /**
   * Holder for word counts, used when selecting dictionary words
   * based on a threshold.
   */
  private class Count implements Serializable {

    /** Count value; set from the constructor argument. */
    public int count;

    /**
     * Second counter, not touched by the constructor, so it starts at 0.
     * Presumably the number of documents containing the word -- confirm
     * against the code that updates it.
     */
    public int docCount;

    /**
     * Creates a counter holder.
     *
     * @param c the initial value of count
     */
    public Count(int c) {
      this.count = c;
    }
  }

  /**
   * Sets the format of the input instances.
   *
   * @param instanceInfo an Instances object containing the input 
   * instance structure (any instances contained in the object are 
   * ignored - only the structure is required).
   * @return true if the outputFormat may be collected immediately
   * @exception Exception if the input format can't be set 
   * successfully
   */
  public boolean setInputFormat(Instances instanceInfo) 

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -