📄 stringtowordvector.java
字号:
/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//* * StringToWordVector.java * Copyright (C) 2002 University of Waikato * * Updated 12/Dec/2001 by Gordon Paynter (gordon.paynter@ucr.edu) * Added parameters for delimiter set, * number of words to add, and input range. * Updated 27/Nov/2003 by Ashraf M. Kibriya (amk14@cs.waikato.ac.nz) * Added options for TF/IDF transforms, length * normalization and down casing the tokens. Also * added another onlyAlphabeticStringTokenizer and * support for using a list of stopwords. 
*/package weka.filters.unsupervised.attribute;import weka.core.Attribute;import weka.core.Capabilities;import weka.core.FastVector;import weka.core.Instance;import weka.core.Instances;import weka.core.Option;import weka.core.OptionHandler;import weka.core.Range;import weka.core.SelectedTag;import weka.core.SparseInstance;import weka.core.Tag;import weka.core.Utils;import weka.core.Capabilities.Capability;import weka.core.stemmers.NullStemmer;import weka.core.stemmers.Stemmer;import weka.filters.Filter;import weka.filters.UnsupervisedFilter;import java.io.Serializable;import java.util.Enumeration;import java.util.Hashtable;import java.util.Iterator;import java.util.NoSuchElementException;import java.util.StringTokenizer;import java.util.TreeMap;import java.util.Vector;/** <!-- globalinfo-start --> * Converts String attributes into a set of attributes representing word occurrence information from the text contained in the strings. The set of words (attributes) is determined by the first batch filtered (typically training data). * <p/> <!-- globalinfo-end --> * <!-- options-start --> * Valid options are: <p/> * * <pre> -C * Output word counts rather than boolean word presence. * </pre> * * <pre> -D <delimiter set> * String containing the set of delimiter characters * (default: " \n\t.,:'\"()?!")</pre> * * <pre> -R <index1,index2-index4,...> * Specify list of string attributes to convert to words (as weka Range). * (default: select all string attributes)</pre> * * <pre> -P <attribute name prefix> * Specify a prefix for the created attribute names. * (default: "")</pre> * * <pre> -W <number of words to keep> * Specify approximate number of word fields to create. * Surplus words will be discarded.. * (default: 1000)</pre> * * <pre> -T * Transform the word frequencies into log(1+fij) * where fij is the frequency of word i in jth document(instance). 
* </pre> * * <pre> -I * Transform each word frequency into: * fij*log(num of Documents/num of documents containing word i) * where fij if frequency of word i in jth document(instance)</pre> * * <pre> -N * Whether to 0=not normalize/1=normalize all data/2=normalize test data only * to average length of training documents (default 0=don't normalize).</pre> * * <pre> -A * Only form tokens from contiguous alphabetic sequences * (The delimiter string is ignored if this is set).</pre> * * <pre> -L * Convert all tokens to lowercase before adding to the dictionary.</pre> * * <pre> -S * Ignore words that are in the stoplist.</pre> * * <pre> -stemmer <spec> * The stemmering algorihtm (classname plus parameters) to use.</pre> * * <pre> -M <int> * The minimum term frequency (default = 1).</pre> * * <pre> -O * If this is set, the maximum number of words and the * minimum term frequency is not enforced on a per-class * basis but based on the documents in all the classes * (even if a class attribute is set).</pre> * <!-- options-end --> * * @author Len Trigg (len@reeltwo.com) * @author Stuart Inglis (stuart@reeltwo.com) * @version $Revision: 1.15 $ */public class StringToWordVector extends Filter implements UnsupervisedFilter, OptionHandler { /** for serialization */ static final long serialVersionUID = 8249106275278565424L; /** Delimiters used in tokenization */ private String delimiters = " \n\t.,:'\"()?!"; /** Range of columns to convert to word vectors (initialized to null) */ protected Range m_SelectedRange = null; /** Contains a mapping of valid words to attribute indexes (kept in a TreeMap) */ private TreeMap m_Dictionary = new TreeMap(); /** True if output instances should contain word frequency rather than boolean 0 or 1. */ private boolean m_OutputCounts = false; /** A String prefix for the attribute names (empty by default) */ private String m_Prefix = ""; /** Contains the number of documents (instances) a particular word appears in. The counts are stored with the same indexing as given by m_Dictionary. 
*/ private int [] docsCounts; /** Contains the number of documents (instances) in the input format from which the dictionary is created. It is used in IDF transform. Initialized to -1. */ private int numInstances = -1; /** * Contains the average length of documents (among the first batch of * instances aka training data). This is used in length normalization of * documents which will be normalized to average document length. * Initialized to -1. */ private double avgDocLength = -1; /** * The number of words (per class if there is a class attribute * assigned) to attempt to keep; initialized to 1000. */ private int m_WordsToKeep = 1000; /** True if word frequencies should be transformed into log(1+fi) where fi is the frequency of word i */ private boolean m_TFTransform; /** The normalization to apply; initialized to FILTER_NONE. */ protected int m_filterType = FILTER_NONE; /** normalization: No normalization */ public static final int FILTER_NONE = 0; /** normalization: Normalize all data */ public static final int FILTER_NORMALIZE_ALL = 1; /** normalization: Normalize test data only */ public static final int FILTER_NORMALIZE_TEST_ONLY = 2; /** Tags pairing each FILTER_* constant with a readable label. They specify * whether a document's (instance's) word frequencies are to be normalized. * They are normalized to the average length of the documents specified as * input format. */ public static final Tag [] TAGS_FILTER = { new Tag(FILTER_NONE, "No normalization"), new Tag(FILTER_NORMALIZE_ALL, "Normalize all data"), new Tag(FILTER_NORMALIZE_TEST_ONLY, "Normalize test data only"), }; /** True if word frequencies should be transformed into fij*log(numOfDocs/numOfDocsWithWordi) */ private boolean m_IDFTransform; /** True if tokens are to be formed only from alphabetic sequences of characters. (The delimiters string property is ignored if this is true). */ private boolean m_onlyAlphabeticTokens; /** True if all tokens should be converted to lower case */ private boolean m_lowerCaseTokens; /** True if tokens that are on a stoplist are to be ignored. 
*/
  private boolean m_useStoplist;

  /** the stemming algorithm */
  private Stemmer m_Stemmer = new NullStemmer();

  /** the minimum (per-class) word frequency */
  private int m_minTermFreq = 1;

  /** whether to NOT operate on a per-class basis (the flag is negated, as its name says) */
  private boolean m_doNotOperateOnPerClassBasis = false;

  /**
   * Lists the command-line options declared by this filter.
   * The options are added in a fixed order: -C, -D, -R, -P, -W, -T, -I, -N,
   * -A, -L, -S, -stemmer, -M and -O.
   *
   * @return an enumeration over the declared Option objects, in the order
   *         listed above
   */
  public Enumeration listOptions() {
    Vector options = new Vector();

    options.add(new Option(
        "\tOutput word counts rather than boolean word presence.\n",
        "C", 0, "-C"));

    options.add(new Option(
        "\tString containing the set of delimiter characters\n"
        + "\t(default: \" \\n\\t.,:'\\\"()?!\")",
        "D", 1, "-D <delimiter set>"));

    options.add(new Option(
        "\tSpecify list of string attributes to convert to words (as weka Range).\n"
        + "\t(default: select all string attributes)",
        "R", 1, "-R <index1,index2-index4,...>"));

    options.add(new Option(
        "\tSpecify a prefix for the created attribute names.\n"
        + "\t(default: \"\")",
        "P", 1, "-P <attribute name prefix>"));

    options.add(new Option(
        "\tSpecify approximate number of word fields to create.\n"
        + "\tSurplus words will be discarded..\n"
        + "\t(default: 1000)",
        "W", 1, "-W <number of words to keep>"));

    options.add(new Option(
        "\tTransform the word frequencies into log(1+fij)\n"
        + "\twhere fij is the frequency of word i in jth document(instance).\n",
        "T", 0, "-T"));

    // The literal fragments below are kept separate on purpose: their
    // boundaries carry the exact spacing of the help text.
    options.add(new Option(
        "\tTransform each word frequency into:\n"
        + "\tfij*log(num of Documents/num of "
        + " documents containing word i)\n"
        + "\t where fij if frequency of word i in "
        + " jth document(instance)",
        "I", 0, "-I"));

    options.add(new Option(
        "\tWhether to 0=not normalize/1=normalize all data/2=normalize test data only\n"
        + "\tto average length of training documents (default 0=don't normalize).",
        "N", 1, "-N"));

    options.add(new Option(
        "\tOnly form tokens from contiguous alphabetic sequences\n"
        + "\t(The delimiter string is ignored if this is set).",
        "A", 0, "-A"));

    options.add(new Option(
        "\tConvert all tokens to lowercase before adding to the dictionary.",
        "L", 0, "-L"));

    options.add(new Option(
        "\tIgnore words that are in the stoplist.",
        "S", 0, "-S"));

    options.add(new Option(
        "\tThe stemmering algorihtm (classname plus parameters) to use.",
        "stemmer", 1, "-stemmer <spec>"));

    options.add(new Option(
        "\tThe minimum term frequency (default = 1).",
        "M", 1, "-M <int>"));

    options.add(new Option(
        "\tIf this is set, the maximum number of words and the \n"
        + "\tminimum term frequency is not enforced on a per-class \n"
        + "\tbasis but based on the documents in all the classes \n"
        + "\t(even if a class attribute is set).",
        "O", 0, "-O"));

    return options.elements();
  }

  /** * Parses a given list of options. <p/> * <!-- options-start --> * Valid options are: <p/> * * <pre> -C * Output word counts rather than boolean word presence. * </pre> * * <pre> -D <delimiter set> * String containing the set of delimiter characters * (default: " \n\t.,:'\"()?!")</pre> * * <pre> -R <index1,index2-index4,...> * Specify list of string attributes to convert to words (as weka Range). * (default: select all string attributes)</pre> * * <pre> -P <attribute name prefix> * Specify a prefix for the created attribute names. * (default: "")</pre> * * <pre> -W <number of words to keep> * Specify approximate number of word fields to create. * Surplus words will be discarded.. * (default: 1000)</pre> * * <pre> -T * Transform the word frequencies into log(1+fij) * where fij is the frequency of word i in jth document(instance). 
* </pre> * * <pre> -I * Transform each word frequency into: * fij*log(num of Documents/num of documents containing word i) * where fij if frequency of word i in jth document(instance)</pre> * * <pre> -N * Whether to 0=not normalize/1=normalize all data/2=normalize test data only * to average length of training documents (default 0=don't normalize).</pre> * * <pre> -A * Only form tokens from contiguous alphabetic sequences * (The delimiter string is ignored if this is set).</pre> * * <pre> -L * Convert all tokens to lowercase before adding to the dictionary.</pre> * * <pre> -S * Ignore words that are in the stoplist.</pre> * * <pre> -stemmer <spec> * The stemmering algorihtm (classname plus parameters) to use.</pre> * * <pre> -M <int> * The minimum term frequency (default = 1).</pre> * * <pre> -O * If this is set, the maximum number of words and the * minimum term frequency is not enforced on a per-class * basis but based on the documents in all the classes * (even if a class attribute is set).</pre> * <!-- options-end --> * * @param options the list of options as an array of strings * @throws Exception if an option is not supported */ public void setOptions(String[] options) throws Exception { String value = Utils.getOption('D', options); if (value.length() != 0) { setDelimiters(value); } value = Utils.getOption('R', options); if (value.length() != 0) { setSelectedRange(value); } value = Utils.getOption('P', options); if (value.length() != 0) { setAttributeNamePrefix(value); } value = Utils.getOption('W', options); if (value.length() != 0) { setWordsToKeep(Integer.valueOf(value).intValue()); } value = Utils.getOption('M', options); if (value.length() != 0) { setMinTermFreq(Integer.valueOf(value).intValue()); } setOutputWordCounts(Utils.getFlag('C', options)); setTFTransform(Utils.getFlag('T', options)); setIDFTransform(Utils.getFlag('I', options));
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -