// StringToWordVector.java — Weka source, page 1 of 4 (file is 1,638 lines in total).
// NOTE(review): this span originally contained code-viewer page chrome, replaced with this comment.
/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//* * StringToWordVector.java * Copyright (C) 2002 University of Waikato, Hamilton, New Zealand * */package weka.filters.unsupervised.attribute;import weka.core.Attribute;import weka.core.Capabilities;import weka.core.FastVector;import weka.core.Instance;import weka.core.Instances;import weka.core.Option;import weka.core.OptionHandler;import weka.core.Range;import weka.core.SelectedTag;import weka.core.SparseInstance;import weka.core.Stopwords;import weka.core.Tag;import weka.core.Utils;import weka.core.Capabilities.Capability;import weka.core.stemmers.NullStemmer;import weka.core.stemmers.Stemmer;import weka.core.tokenizers.WordTokenizer;import weka.core.tokenizers.Tokenizer;import weka.filters.Filter;import weka.filters.UnsupervisedFilter;import java.io.File;import java.io.Serializable;import java.util.Enumeration;import java.util.Hashtable;import java.util.Iterator;import java.util.TreeMap;import java.util.Vector;/** <!-- globalinfo-start --> * Converts String attributes into a set of attributes representing word occurrence (depending on the tokenizer) information from the text contained in the strings. The set of words (attributes) is determined by the first batch filtered (typically training data). 
* <p/> <!-- globalinfo-end --> * <!-- options-start --> * Valid options are: <p/> * * <pre> -C * Output word counts rather than boolean word presence. * </pre> * * <pre> -R <index1,index2-index4,...> * Specify list of string attributes to convert to words (as weka Range). * (default: select all string attributes)</pre> * * <pre> -V * Invert matching sense of column indexes.</pre> * * <pre> -P <attribute name prefix> * Specify a prefix for the created attribute names. * (default: "")</pre> * * <pre> -W <number of words to keep> * Specify approximate number of word fields to create. * Surplus words will be discarded.. * (default: 1000)</pre> * * <pre> -T * Transform the word frequencies into log(1+fij) * where fij is the frequency of word i in jth document(instance). * </pre> * * <pre> -I * Transform each word frequency into: * fij*log(num of Documents/num of documents containing word i) * where fij if frequency of word i in jth document(instance)</pre> * * <pre> -N * Whether to 0=not normalize/1=normalize all data/2=normalize test data only * to average length of training documents (default 0=don't normalize).</pre> * * <pre> -L * Convert all tokens to lowercase before adding to the dictionary.</pre> * * <pre> -S * Ignore words that are in the stoplist.</pre> * * <pre> -stemmer <spec> * The stemmering algorihtm (classname plus parameters) to use.</pre> * * <pre> -M <int> * The minimum term frequency (default = 1).</pre> * * <pre> -O * If this is set, the maximum number of words and the * minimum term frequency is not enforced on a per-class * basis but based on the documents in all the classes * (even if a class attribute is set).</pre> * * <pre> -stopwords <file> * A file containing stopwords to override the default ones. * Using this option automatically sets the flag ('-S') to use the * stoplist if the file exists. 
* Format: one stopword per line, lines starting with '#' * are interpreted as comments and ignored.</pre> * * <pre> -tokenizer <spec> * The tokenizing algorihtm (classname plus parameters) to use. * (default: weka.core.tokenizers.WordTokenizer)</pre> * <!-- options-end --> * * @author Len Trigg (len@reeltwo.com) * @author Stuart Inglis (stuart@reeltwo.com) * @author Gordon Paynter (gordon.paynter@ucr.edu) * @author Asrhaf M. Kibriya (amk14@cs.waikato.ac.nz) * @version $Revision: 1.20 $ * @see Stopwords */public class StringToWordVector extends Filter implements UnsupervisedFilter, OptionHandler { /** for serialization */ static final long serialVersionUID = 8249106275278565424L; /** Range of columns to convert to word vectors */ protected Range m_SelectedRange = new Range("first-last"); /** Contains a mapping of valid words to attribute indexes */ private TreeMap m_Dictionary = new TreeMap(); /** True if output instances should contain word frequency rather than boolean 0 or 1. */ private boolean m_OutputCounts = false; /** A String prefix for the attribute names */ private String m_Prefix = ""; /** Contains the number of documents (instances) a particular word appears in. The counts are stored with the same indexing as given by m_Dictionary. */ private int [] m_DocsCounts; /** Contains the number of documents (instances) in the input format from which the dictionary is created. It is used in IDF transform. */ private int m_NumInstances = -1; /** * Contains the average length of documents (among the first batch of * instances aka training data). This is used in length normalization of * documents which will be normalized to average document length. */ private double m_AvgDocLength = -1; /** * The default number of words (per class if there is a class attribute * assigned) to attempt to keep. 
*/ private int m_WordsToKeep = 1000; /** True if word frequencies should be transformed into log(1+fi) where fi is the frequency of word i */ private boolean m_TFTransform; /** The normalization to apply. */ protected int m_filterType = FILTER_NONE; /** normalization: No normalization */ public static final int FILTER_NONE = 0; /** normalization: Normalize all data */ public static final int FILTER_NORMALIZE_ALL = 1; /** normalization: Normalize test data only */ public static final int FILTER_NORMALIZE_TEST_ONLY = 2; /** Specifies whether document's (instance's) word frequencies are * to be normalized. The are normalized to average length of * documents specified as input format. */ public static final Tag [] TAGS_FILTER = { new Tag(FILTER_NONE, "No normalization"), new Tag(FILTER_NORMALIZE_ALL, "Normalize all data"), new Tag(FILTER_NORMALIZE_TEST_ONLY, "Normalize test data only"), }; /** True if word frequencies should be transformed into fij*log(numOfDocs/numOfDocsWithWordi) */ private boolean m_IDFTransform; /** True if all tokens should be downcased */ private boolean m_lowerCaseTokens; /** True if tokens that are on a stoplist are to be ignored. */ private boolean m_useStoplist; /** the stemming algorithm */ private Stemmer m_Stemmer = new NullStemmer(); /** the minimum (per-class) word frequency */ private int m_minTermFreq = 1; /** whether to operate on a per-class basis */ private boolean m_doNotOperateOnPerClassBasis = false; /** a file containing stopwords for using others than the default Rainbow * ones */ private File m_Stopwords = new File(System.getProperty("user.dir")); /** the tokenizer algorithm to use */ private Tokenizer m_Tokenizer = new WordTokenizer(); /** * Default constructor. Targets 1000 words in the output. 
*/ public StringToWordVector() { } /** * Returns an enumeration describing the available options * * @return an enumeration of all the available options */ public Enumeration listOptions() { Vector result = new Vector(); result.addElement(new Option( "\tOutput word counts rather than boolean word presence.\n", "C", 0, "-C")); result.addElement(new Option( "\tSpecify list of string attributes to convert to words (as weka Range).\n" + "\t(default: select all string attributes)", "R", 1, "-R <index1,index2-index4,...>")); result.addElement(new Option( "\tInvert matching sense of column indexes.", "V", 0, "-V")); result.addElement(new Option( "\tSpecify a prefix for the created attribute names.\n" + "\t(default: \"\")", "P", 1, "-P <attribute name prefix>")); result.addElement(new Option( "\tSpecify approximate number of word fields to create.\n" + "\tSurplus words will be discarded..\n" + "\t(default: 1000)", "W", 1, "-W <number of words to keep>")); result.addElement(new Option( "\tTransform the word frequencies into log(1+fij)\n"+ "\twhere fij is the frequency of word i in jth document(instance).\n", "T", 0, "-T")); result.addElement(new Option( "\tTransform each word frequency into:\n"+ "\tfij*log(num of Documents/num of documents containing word i)\n"+ "\t where fij if frequency of word i in jth document(instance)", "I", 0, "-I")); result.addElement(new Option( "\tWhether to 0=not normalize/1=normalize all data/2=normalize test data only\n" + "\tto average length of training documents " + "(default 0=don\'t normalize).", "N", 1, "-N")); result.addElement(new Option( "\tConvert all tokens to lowercase before "+ "adding to the dictionary.", "L", 0, "-L")); result.addElement(new Option( "\tIgnore words that are in the stoplist.", "S", 0, "-S")); result.addElement(new Option( "\tThe stemmering algorihtm (classname plus parameters) to use.", "stemmer", 1, "-stemmer <spec>")); result.addElement(new Option( "\tThe minimum term frequency (default = 1).", "M", 1, "-M 
<int>")); result.addElement(new Option( "\tIf this is set, the maximum number of words and the \n" + "\tminimum term frequency is not enforced on a per-class \n" + "\tbasis but based on the documents in all the classes \n" + "\t(even if a class attribute is set).", "O", 0, "-O")); result.addElement(new Option( "\tA file containing stopwords to override the default ones.\n" + "\tUsing this option automatically sets the flag ('-S') to use the\n" + "\tstoplist if the file exists.\n" + "\tFormat: one stopword per line, lines starting with '#'\n" + "\tare interpreted as comments and ignored.", "stopwords", 1, "-stopwords <file>")); result.addElement(new Option( "\tThe tokenizing algorihtm (classname plus parameters) to use.\n" + "\t(default: " + WordTokenizer.class.getName() + ")", "tokenizer", 1, "-tokenizer <spec>")); return result.elements(); } /** * Parses a given list of options. <p/> * <!-- options-start --> * Valid options are: <p/> * * <pre> -C * Output word counts rather than boolean word presence. * </pre> * * <pre> -R <index1,index2-index4,...> * Specify list of string attributes to convert to words (as weka Range). * (default: select all string attributes)</pre> * * <pre> -V * Invert matching sense of column indexes.</pre> * * <pre> -P <attribute name prefix> * Specify a prefix for the created attribute names. * (default: "")</pre> * * <pre> -W <number of words to keep> * Specify approximate number of word fields to create. * Surplus words will be discarded.. * (default: 1000)</pre> * * <pre> -T * Transform the word frequencies into log(1+fij) * where fij is the frequency of word i in jth document(instance). 
* </pre> * * <pre> -I * Transform each word frequency into: * fij*log(num of Documents/num of documents containing word i) * where fij if frequency of word i in jth document(instance)</pre> * * <pre> -N * Whether to 0=not normalize/1=normalize all data/2=normalize test data only * to average length of training documents (default 0=don't normalize).</pre> * * <pre> -L * Convert all tokens to lowercase before adding to the dictionary.</pre> * * <pre> -S * Ignore words that are in the stoplist.</pre> * * <pre> -stemmer <spec> * The stemmering algorihtm (classname plus parameters) to use.</pre> * * <pre> -M <int> * The minimum term frequency (default = 1).</pre> * * <pre> -O * If this is set, the maximum number of words and the * minimum term frequency is not enforced on a per-class * basis but based on the documents in all the classes * (even if a class attribute is set).</pre> * * <pre> -stopwords <file> * A file containing stopwords to override the default ones. * Using this option automatically sets the flag ('-S') to use the * stoplist if the file exists. * Format: one stopword per line, lines starting with '#' * are interpreted as comments and ignored.</pre> * * <pre> -tokenizer <spec> * The tokenizing algorihtm (classname plus parameters) to use. * (default: weka.core.tokenizers.WordTokenizer)</pre> * <!-- options-end --> * * @param options the list of options as an array of strings * @throws Exception if an option is not supported */ public void setOptions(String[] options) throws Exception { String value; value = Utils.getOption('R', options); if (value.length() != 0) setSelectedRange(value); else setSelectedRange("first-last"); setInvertSelection(Utils.getFlag('V', options)); value = Utils.getOption('P', options); if (value.length() != 0) setAttributeNamePrefix(value); else setAttributeNamePrefix("");
// NOTE(review): code-viewer keyboard-shortcut help originally occupied this span
// (extraction artifact, not part of the Weka source); file continues on page 2/4.