📄 stringtowordvector.java
字号:
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
* StringToWordVector.java
* Copyright (C) 2002 University of Waikato
*
* Updated 12/Dec/2001 by Gordon Paynter (gordon.paynter@ucr.edu)
* Added parameters for delimiter set,
* number of words to add, and input range.
* Updated 27/Nov/2003 by Ashraf M. Kibriya (amk14@cs.waikato.ac.nz)
* Added options for TF/IDF transforms, length
* normalization and down casing the tokens. Also
* added another onlyAlphabeticStringTokenizer and
* support for using a list of stopwords.
*/
package weka.filters.unsupervised.attribute;
import java.io.Serializable;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.StringTokenizer;
import java.util.TreeMap;
import java.util.Vector;
import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Range;
import weka.core.SparseInstance;
import weka.core.Utils;
import weka.filters.Filter;
import weka.filters.UnsupervisedFilter;
/**
* Converts String attributes into a set of attributes representing word
* occurrence information from the text contained in the strings. The set of
* words (attributes) is determined by the first batch filtered (typically
* training data).
*
* @author Len Trigg (len@reeltwo.com)
* @author Stuart Inglis (stuart@reeltwo.com)
* @version $Revision$
*/
public class StringToWordVector extends Filter
implements UnsupervisedFilter, OptionHandler {
/**
 * Characters treated as delimiters during tokenization. According to the
 * note on m_onlyAlphabeticTokens, this set is ignored when that flag is true.
 */
private String delimiters = " \n\t.,:'\"()?!";
/**
 * Range of columns to convert to word vectors. Starts out null; the -R help
 * text states the default is to select all string attributes (presumably
 * how a null range is interpreted - confirm against the code that uses it).
 */
protected Range m_SelectedRange = null;
/** Contains a mapping of valid words to attribute indexes. */
private TreeMap m_Dictionary = new TreeMap();
/** True if the first batch has been done. */
private boolean m_FirstBatchDone = false;
/**
 * True if output instances should contain word frequencies rather than
 * boolean 0 or 1 (word presence).
 */
private boolean m_OutputCounts = false;
/** A String prefix for the names of the created attributes. */
private String m_Prefix = "";
/**
 * Contains the number of documents (instances) a particular word appears in.
 * The counts are stored with the same indexing as given by m_Dictionary.
 */
private int [] docsCounts;
/**
 * Contains the number of documents (instances) in the input format from
 * which the dictionary is created. It is used in the IDF transform.
 * Initialized to -1.
 */
private int numInstances = -1;
/**
 * Contains the average length of documents (among the first batch of
 * instances, aka training data). This is used in length normalization of
 * documents, which will be normalized to the average document length.
 * Initialized to -1.
 */
private double avgDocLength = -1;
/**
 * The default number of words (per class if there is a class attribute
 * assigned) to attempt to keep.
 */
private int m_WordsToKeep = 1000;
/**
 * True if word frequencies should be transformed into log(1+fi),
 * where fi is the frequency of word i.
 */
private boolean m_TFTransform;
/**
 * True if a document's (instance's) word frequencies are to be normalized.
 * They are normalized to the average length of the documents specified as
 * input format.
 */
private boolean m_normalizeDocLength;
/**
 * True if word frequencies should be transformed into
 * fij*log(numOfDocs/numOfDocsWithWordi).
 */
private boolean m_IDFTransform;
/**
 * True if tokens are to be formed only from alphabetic sequences of
 * characters. (The delimiters string property is ignored if this
 * is true.)
 */
private boolean m_onlyAlphabeticTokens;
/** True if all tokens should be downcased. */
private boolean m_lowerCaseTokens;
/** True if tokens that are on a stoplist are to be ignored. */
private boolean m_useStoplist;
/**
 * Lists the command-line options understood by this filter.
 *
 * The options are reported in this order: -C, -D, -R, -P, -W, -T, -I, -N,
 * -A, -L and -S. Each one is wrapped in an Option together with its help
 * text and synopsis.
 *
 * @return an enumeration of all the available options
 */
public Enumeration listOptions() {
  Vector optionList = new Vector(11);

  // Output representation.
  String countsHelp =
    "\tOutput word counts rather than boolean word presence.\n";
  optionList.add(new Option(countsHelp, "C", 0, "-C"));

  // Options that carry a value.
  String delimiterHelp =
    "\tString containing the set of delimiter characters\n"
    + "\t(default: \" \\n\\t.,:'\\\"()?!\")";
  optionList.add(new Option(delimiterHelp, "D", 1, "-D <delimiter set>"));

  String rangeHelp =
    "\tSpecify list of string attributes to convert to words (as weka Range).\n"
    + "\t(default: select all string attributes)";
  optionList.add(
    new Option(rangeHelp, "R", 1, "-R <index1,index2-index4,...>"));

  String prefixHelp =
    "\tSpecify a prefix for the created attribute names.\n"
    + "\t(default: \"\")";
  optionList.add(
    new Option(prefixHelp, "P", 1, "-P <attribute name prefix>"));

  String wordsHelp =
    "\tSpecify approximate number of word fields to create.\n"
    + "\tSurplus words will be discarded..\n"
    + "\t(default: 1000)";
  optionList.add(
    new Option(wordsHelp, "W", 1, "-W <number of words to keep>"));

  // Frequency transforms and normalization.
  String tfHelp =
    "\tTransform the word frequencies into log(1+fij)\n"
    + "\twhere fij is the frequency of word i in jth document(instance).\n";
  optionList.add(new Option(tfHelp, "T", 0, "-T"));

  String idfHelp =
    "\tTransform each word frequency into:\n"
    + "\tfij*log(num of Documents/num of  documents containing word i)\n"
    + "\t where fij if frequency of word i in  jth document(instance)";
  optionList.add(new Option(idfHelp, "I", 0, "-I"));

  String normalizeHelp =
    "\tNormalize word frequencies of each document(instance) to average "
    + "length of documents.";
  optionList.add(new Option(normalizeHelp, "N", 0, "-N"));

  // Tokenization behaviour.
  String alphabeticHelp =
    "\tOnly form tokens from contiguous alphabetic sequences "
    + "(The delimiter string is ignored if this is set).";
  optionList.add(new Option(alphabeticHelp, "A", 0, "-A"));

  String lowerCaseHelp =
    "\tConvert all tokens to lowercase before adding to the dictionary.";
  optionList.add(new Option(lowerCaseHelp, "L", 0, "-L"));

  String stoplistHelp = "\tIgnore words that are in the stoplist.";
  optionList.add(new Option(stoplistHelp, "S", 0, "-S"));

  return optionList.elements();
}
/**
 * Parses a given list of options controlling the behaviour of this object.
 * Valid options are:<p>
 *
 * -C <br>
 * Output word counts rather than boolean word presence.<p>
 *
 * -D delimiter_characters <br>
 * Specify the set of delimiter characters
 * (default: " \n\t.,:'\"()?!").<p>
 *
 * -R index1,index2-index4,... <br>
 * Specify the list of string attributes to convert to words
 * (default: all string attributes).<p>
 *
 * -P attribute_name_prefix <br>
 * Specify a prefix for the created attribute names
 * (default: "").<p>
 *
 * -W number_of_words_to_keep <br>
 * Specify the approximate number of word fields to create; surplus words
 * are discarded (default: 1000).<p>
 *
 * -T <br>
 * Transform word frequencies to log(1+fij), where fij is the frequency of
 * word i in document j.<p>
 *
 * -I <br>
 * Transform word frequencies to fij*log(numOfDocs/numOfDocsWithWordi),
 * where fij is the frequency of word i in document j.<p>
 *
 * -N <br>
 * Normalize the word frequencies of each document (instance) to the average
 * length of the documents specified in the input format.<p>
 *
 * -A <br>
 * Only tokenize contiguous alphabetic sequences.<p>
 *
 * -L <br>
 * Convert all tokens to lower case before adding them to the dictionary.<p>
 *
 * -S <br>
 * Do not add words to the dictionary which are on the stop list.<p>
 *
 * A valued option (-D, -R, -P, -W) whose parsed value is empty leaves the
 * corresponding setting as it was. Every flag setter is always invoked with
 * whatever Utils.getFlag reports for that flag.
 *
 * @param options the list of options as an array of strings
 * @exception Exception if an option is not supported
 */
public void setOptions(String[] options) throws Exception {
  // Valued options: parse and apply one at a time, in the original order.
  String delimiterSet = Utils.getOption('D', options);
  if (delimiterSet.length() > 0) {
    setDelimiters(delimiterSet);
  }

  String rangeSpec = Utils.getOption('R', options);
  if (rangeSpec.length() > 0) {
    setSelectedRange(rangeSpec);
  }

  String namePrefix = Utils.getOption('P', options);
  if (namePrefix.length() > 0) {
    setAttributeNamePrefix(namePrefix);
  }

  String wordLimit = Utils.getOption('W', options);
  if (wordLimit.length() > 0) {
    setWordsToKeep(Integer.parseInt(wordLimit));
  }

  // Plain flags.
  boolean outputCounts = Utils.getFlag('C', options);
  setOutputWordCounts(outputCounts);

  boolean tfTransform = Utils.getFlag('T', options);
  setTFTransform(tfTransform);

  boolean idfTransform = Utils.getFlag('I', options);
  setIDFTransform(idfTransform);

  boolean normalizeLength = Utils.getFlag('N', options);
  setNormalizeDocLength(normalizeLength);

  boolean lowerCase = Utils.getFlag('L', options);
  setLowerCaseTokens(lowerCase);

  boolean alphabeticOnly = Utils.getFlag('A', options);
  setOnlyAlphabeticTokens(alphabeticOnly);

  boolean useStoplist = Utils.getFlag('S', options);
  setUseStoplist(useStoplist);
}
/**
 * Gets the current settings of the filter.
 *
 * -D and -W are always emitted. -R is emitted only when a range has been
 * selected, -P only when the attribute name prefix is non-empty, and each
 * of the flags -C, -T, -I, -N, -L, -A and -S only when it is switched on.
 * The returned array always has 16 entries; unused trailing slots are
 * filled with empty strings.
 *
 * @return an array of strings suitable for passing to setOptions
 */
public String [] getOptions() {
  String [] options = new String [16];
  int current = 0;

  options[current++] = "-D";
  options[current++] = getDelimiters();

  if (getSelectedRange() != null) {
    options[current++] = "-R";
    // The upper limit of the range can only be derived from the input
    // format. Previously this dereferenced getInputFormat() unconditionally
    // and so threw a NullPointerException whenever the options were queried
    // with a range set but no input format assigned yet.
    // NOTE(review): assumes Range.getRanges() can render the range without
    // the upper limit having been set - confirm against weka.core.Range.
    Instances inputFormat = getInputFormat();
    if (inputFormat != null) {
      m_SelectedRange.setUpper(inputFormat.numAttributes() - 1);
    }
    options[current++] = getSelectedRange().getRanges();
  }

  if (!"".equals(getAttributeNamePrefix())) {
    options[current++] = "-P";
    options[current++] = getAttributeNamePrefix();
  }

  options[current++] = "-W";
  options[current++] = String.valueOf(getWordsToKeep());

  if (getOutputWordCounts()) {
    options[current++] = "-C";
  }
  if (getTFTransform()) {
    options[current++] = "-T";
  }
  if (getIDFTransform()) {
    options[current++] = "-I";
  }
  if (getNormalizeDocLength()) {
    options[current++] = "-N";
  }
  if (getLowerCaseTokens()) {
    options[current++] = "-L";
  }
  if (getOnlyAlphabeticTokens()) {
    options[current++] = "-A";
  }
  if (getUseStoplist()) {
    options[current++] = "-S";
  }

  // Pad the remaining slots so the array contains no null entries.
  while (current < options.length) {
    options[current++] = "";
  }
  return options;
}
/**
 * Creates a filter with every setting at its field default; in particular
 * the target number of words in the output stays at 1000.
 */
public StringToWordVector() {
  super();
}
/**
 * Creates a filter with an explicit target for the number of words in the
 * output. The value is stored as given; no validation is performed here.
 *
 * @param wordsToKeep the number of words in the output vector (per class
 * if assigned).
 */
public StringToWordVector(int wordsToKeep) {
  super();
  this.m_WordsToKeep = wordsToKeep;
}
/**
* Used to store word counts for dictionary selection based on
* a threshold.
*/
private class Count implements Serializable {
public int count, docCount;
public Count(int c) { count = c; }
}
/**
* Sets the format of the input instances.
*
* @param instanceInfo an Instances object containing the input
* instance structure (any instances contained in the object are
* ignored - only the structure is required).
* @return true if the outputFormat may be collected immediately
* @exception Exception if the input format can't be set
* successfully
*/
public boolean setInputFormat(Instances instanceInfo)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -