📄 textsource.java
字号:
package weka.datagenerators;import weka.core.Attribute;import weka.core.FastVector;import weka.core.Instance;import weka.core.Instances;import weka.core.Option;import weka.core.OptionHandler;import weka.core.SparseInstance;import weka.core.Utils;import java.io.BufferedReader;import java.io.BufferedWriter;import java.io.File;import java.io.FileNotFoundException;import java.io.FileReader;import java.io.FileWriter;import java.io.IOException;import java.io.PrintWriter;import java.io.Serializable;import java.util.ArrayList;import java.util.Arrays;import java.util.Enumeration;import java.util.HashMap;import java.util.Iterator;import java.util.LinkedHashMap;import java.util.LinkedList;import java.util.List;import java.util.ListIterator;import java.util.Map.Entry;import java.util.Set;import java.util.TreeMap;import java.util.Vector;import java.util.regex.Pattern;/** * Reads a collection of text documents and transforms them into * sparse vectors. The sparse vectors are then put into an ARFF file * for further processing by WEKA. * * <p><b>WEKA options:</b> * <ul> * <li><code>-I</code> - Include TFIDF scores instead of TF. * * <li><code>-R <str></code> - The document reader. Now only * one is supported, namely <code>directory</code>. This parameter * has no default value and is not optional. * * <li><code>-L <str></code> - The lexer. Now only one lexer * is supported, namely <code>simple</code>. This parameter has no * default value and is not optional. * * <li><code>-F <str>[:<str>...]</code> - A * colon-separated list of filters being applied on the tokens. * Four filters are supported, namely <code>lower_case</code>, * <code>porter_stemmer</code>, <code>stop_word</code>, and * <code>word_length</code>. Order of listing is significant. For * example, if the value for <code>filters</code> is * <code>stop_word:porter_stemmer</code>, then the * <code>stop_word</code> filter is applied before * <code>porter_stemmer</code>. By default the list is empty. 
* * <li>Document readers, filters and lexers have their own * parameters. See their documentation for detail. * </ul> * * <p>The generic generator options <code>-a</code>, <code>-c</code> * and <code>-n</code> are ignored. * * <p>Here are some sample command lines: * <pre> java weka.datagenerators.TextSource -r news -R directory -D cmu-newsgroup-random-100/ -L simple -y whitespace -o news.arff </pre> * * <p>The name of the dataset is <code>news</code>. We use the * <code>directory</code> document reader. The directory being read * is <code>cmu-newsgroup-random-100/</code>. We use the * <code>simple</code> lexer and all tokens are delimited by * whitespace. The output file is <code>news.arff</code>. * <pre> java weka.datagenerators.TextSource -r news -R directory -D cmu-newsgroup-random-100/ -L simple -y alphanum -o news.arff </pre> * * <p>In this case all tokens consist of only alphanumeric characters. * <pre> java weka.datagenerators.TextSource -r news -R directory -D cmu-newsgroup-random-100/ -L simple -y alpha -o news.arff </pre> * * <p>All tokens consist of only alphabets. * <pre> java weka.datagenerators.TextSource -r news -R directory -D cmu-newsgroup-random-100/ -L simple -y alpha -F lower_case -o news.arff </pre> * * <p>All tokens are converted to lower case before being indexed. * <pre> java weka.datagenerators.TextSource -r news -R directory -D cmu-newsgroup-random-100/ -L simple -y alpha -F lower_case:stop_word -o news.arff </pre> * * <p>All stop words are removed. The default SMART stop list is used. * <pre> java weka.datagenerators.TextSource -r news -R directory -D cmu-newsgroup-random-100/ -L simple -y alpha -F lower_case:stop_word:porter_stemmer -o news.arff </pre> * * <p>After removing the stop words, we apply the Porter stemmer. 
* <pre> java weka.datagenerators.TextSource -r news -R directory -D cmu-newsgroup-random-100/ -L simple -y alpha -F lower_case:stop_word:porter_stemmer:word_length -N 5 -o news.arff </pre> * * <p>After stemming the tokens, we throw away all tokens whose length * is less than five. * <pre> java weka.datagenerators.TextSource -r news -R directory -D cmu-newsgroup-random-100/ -L simple -y alpha -F lower_case:stop_word:word_length:porter_stemmer -N 5 -o news.arff </pre> * * <p>We throw away tokens whose length is less than five before * applying the Porter stemmer. * <pre> java weka.datagenerators.TextSource -r news -R directory -D cmu-newsgroup-random-100/ -u 'talk.*' -L simple -y alpha -F lower_case:stop_word:word_length:porter_stemmer -N 5 -o news.arff </pre> * * <p>Read only documents that belong to the classes * <code>talk.*</code>. The argument for <code>-u</code> can be any * regular expression. * * @author ywwong * @version $Id: TextSource.java,v 1.1.1.1 2003/01/22 07:48:27 mbilenko Exp $ */public class TextSource extends Generator implements OptionHandler, Serializable { /** A simpler wrapper for int than Integer. */ public class Int implements Comparable { public int m_i; public Int(int i) { m_i = i; } public int compareTo(Object o) { Int n = (Int) o; if (m_i < n.m_i) return -1; else if (m_i == n.m_i) return 0; else return 1; } public boolean equals(Object n) { if (n.getClass() == Int.class) return m_i == ((Int) n).m_i; else return false; } public int hashCode() { return m_i; } public String toString() { return Integer.toString(m_i); } } /** A simpler wrapper for double than Double. */ public class Real { public double m_d; public Real(double d) { m_d = d; } public String toString() { return Double.toString(m_d); } } /** Sparse map data row structure with public hash map. 
*/
  public class DataRow {

    /** The class value of this row; set via setClass(). */
    public double m_dClass;

    /** Maps attribute index (Int) to attribute value (Real).  A
        TreeMap keeps the keys sorted, so makeInstance() emits the
        attribute indices in ascending order.
        NOTE(review): despite the class comment this is a TreeMap,
        not a hash map. */
    public TreeMap m_data;

    public DataRow() {
      m_data = new TreeMap();
    }

    /** Stores a value for one attribute.  A value of exactly 0.0
        removes the entry instead, keeping the map truly sparse. */
    public void set(Int nIndex, Real dVal) {
      if (dVal.m_d == 0.0)
        m_data.remove(nIndex);
      else
        m_data.put(nIndex, dVal);
    }

    /** Records the class value for this row. */
    public void setClass(Real dClass) {
      m_dClass = dClass.m_d;
    }

    // WEKA specific.
    /** Converts this row into a WEKA SparseInstance attached to the
        given table's data format.  The class value is appended as one
        extra (value, index) pair after the attribute values.
        NOTE(review): the first SparseInstance argument (0.0) is
        presumably the instance weight — confirm against the WEKA API. */
    public Instance makeInstance(Table table) {
      Instance inst;
      double[] aVals;
      int[] aIndices;
      Iterator it;
      Entry ent;
      int i;

      // One extra slot beyond the stored values, for the class.
      aVals = new double[m_data.size() + 1];
      aIndices = new int[m_data.size() + 1];
      it = m_data.entrySet().iterator();
      for (i = 0; it.hasNext(); i++) {
        ent = (Entry) it.next();
        aVals[i] = ((Real) ent.getValue()).m_d;
        aIndices[i] = ((Int) ent.getKey()).m_i;
      }
      // After the loop i points at the spare slot; fill in the class
      // value at the table's class index.
      aVals[i] = m_dClass;
      aIndices[i] = table.m_nIndex;
      inst = new SparseInstance(0.0, aVals, aIndices,
                                table.m_format.numAttributes());
      inst.setDataset(table.m_format);
      return inst;
    }
  }

  /** Table that allows incremental addition of attributes. */
  public class Table {

    /** The owning generator; supplies the class map and relation name
        and receives the summary counts in makeDataFormat(). */
    protected TextSource m_ts;
    /** The attributes added so far (excluding the class attribute). */
    protected FastVector m_attribs;
    /** The WEKA data format; built by makeDataFormat(). */
    protected Instances m_format;
    /** The DataRow examples, in insertion order. */
    protected LinkedList m_data;
    protected int m_nIndex; // class index
    protected ListIterator m_it; // used by getNextInstance()

    public Table(TextSource ts) {
      m_ts = ts;
      m_attribs = new FastVector();
      m_data = new LinkedList();
      m_it = null;
    }

    /** Appends one example row. */
    public void add(DataRow vector) {
      m_data.add(vector);
    }

    /** Appends one attribute; must be called before makeDataFormat(). */
    public void addAttribute(Attribute attrib) {
      m_attribs.addElement(attrib);
    }

    // WEKA specific.
    /** Builds the Instances header from the attributes collected so
        far plus a nominal "__class__" attribute, whose labels come
        from the generator's class map.  Also pushes the class,
        example and attribute counts back into the generator.
        NOTE(review): iteration order of m_hashClasses.keySet()
        determines the label order — looks nondeterministic for a
        plain HashMap; confirm callers do not depend on it. */
    public Instances makeDataFormat() throws Exception {
      FastVector attribs;
      FastVector aClasses;
      Set setKeys;
      Iterator it;

      // Add class index as one of the attributes.
      aClasses = new FastVector(m_ts.m_hashClasses.size());
      setKeys = m_ts.m_hashClasses.keySet();
      for (it = setKeys.iterator(); it.hasNext(); )
        aClasses.addElement(it.next());
      // The class attribute goes last, after all token attributes.
      m_nIndex = m_attribs.size();
      attribs = (FastVector) m_attribs.copy();
      attribs.addElement(new Attribute("__class__", aClasses));
      m_format = new Instances(m_ts.getRelationName(), attribs, 0);
      m_format.setClassIndex(m_nIndex);

      // Update generator variables.
      m_ts.setNumClasses(aClasses.size());
      m_ts.setNumExamples(m_data.size());
      m_ts.setNumExamplesAct(m_data.size());
      m_ts.setNumAttributes(m_attribs.size() + 1);
      return m_format;
    }

    // WEKA specific.
    /** Returns the next example as a WEKA Instance, starting a fresh
        iteration on the first call.  Throws NoSuchElementException
        when the rows are exhausted (no hasNext guard here — callers
        are expected to respect the example count). */
    public Instance getNextInstance() {
      if (m_it == null)
        m_it = m_data.listIterator();
      return ((DataRow) m_it.next()).makeInstance(this);
    }
  }

  /** Information about a particular token. */
  protected class Token {

    /** The token string. */
    public String m_strToken;

    /** The token ID, which is the same as the attribute index. */
    public Int m_nID;

    /** The document frequency. */
    public int m_nDF;

    public Token(String strToken, Int nID) {
      m_strToken = strToken;
      m_nID = nID;
      m_nDF = 0;
    }
  }

  /** The example table. */
  protected Table m_table;

  /** A map for looking up tokens. */
  protected HashMap m_hashTokens;

  /** An ordered list for looking up tokens. */
  protected ArrayList m_aTokens;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -