📄 textsource.java
字号:
/** The next token ID. */
protected int m_nNextToken;

/** A map for looking up classes (class name -> Real class index, in insertion order). */
protected LinkedHashMap m_hashClasses;

/** The next class ID. */
protected double m_dNextClass;

/** Collect TFIDF statistics instead of TF. */
protected boolean m_bTFIDF;

/** The document reader. */
protected DocumentReader m_reader;

/** The lexer. */
protected Lexer m_lexer;

/** The list of token filters which are applied in order. */
protected LinkedList m_lstFilters;

/**
 * Constructs a TextSource with TF (not TFIDF) weighting and no data format
 * defined yet.
 *
 * NOTE(review): m_table, m_hashTokens and m_aTokens are initialized here but
 * declared earlier in the class, outside this excerpt.
 */
public TextSource() {
    m_table = new Table(this);
    m_hashTokens = new HashMap();
    m_aTokens = new ArrayList();
    m_nNextToken = 0;
    m_hashClasses = new LinkedHashMap();
    m_dNextClass = 0.0;
    m_bTFIDF = false;
    m_bFormatDefined = false;
}

/**
 * Returns the numeric class index for a class name, assigning the next
 * sequential index (0.0, 1.0, ...) the first time a name is seen.
 * Called by document readers.
 *
 * @param strClass the class (category) name of a document
 * @return the interned Real holding this class's numeric index
 */
// Called by document readers.
public Real registerClass(String strClass) {
    Real dClass;
    dClass = (Real) m_hashClasses.get(strClass);
    if (dClass == null) {
        // First occurrence: assign the next free index and remember it.
        dClass = new Real(m_dNextClass);
        m_dNextClass += 1.0;
        m_hashClasses.put(strClass, dClass);
    }
    return dClass;
}

/**
 * Tokenizes a document and transforms it into a sparse vector.
 *
 * Reads tokens from m_lexer until exhausted, passes each through the filter
 * chain (a filter returning null drops the token), registers unseen tokens
 * as new table attributes, and accumulates term frequencies in the returned
 * row. Afterwards, the document frequency (m_nDF) of every distinct token in
 * the document is incremented by one.
 *
 * @param dClass The class index of the document to be read.
 * @return a sparse row mapping token IDs (Int) to term frequencies (Real)
 * @throws IOException if the lexer fails to read the underlying document
 */
protected DataRow getInstance(Real dClass) throws IOException {
    DataRow vector;
    String strToken;
    ListIterator itFilter;
    TokenFilter filter;
    Token token;
    Int nTokenID;
    Real dTF;
    Attribute attrib;
    Set setKeys;
    Iterator itKey;
    vector = new DataRow();
    vector.setClass(dClass);
    strToken = m_lexer.nextToken();
    while (strToken != null) {
        // Push token through the filters.
        for (itFilter = m_lstFilters.listIterator(); itFilter.hasNext(); ) {
            filter = (TokenFilter) itFilter.next();
            strToken = filter.apply(strToken);
            // A filter may reject the token by returning null.
            if (strToken == null)
                break;
        }
        if (strToken != null) {
            // Update token info.
            token = (Token) m_hashTokens.get(strToken);
            if (token != null)
                nTokenID = token.m_nID;
            else {
                // Unseen token: assign the next ID and add a matching
                // attribute column to the table.
                nTokenID = new Int(m_nNextToken++);
                token = new Token(strToken, nTokenID);
                attrib = new Attribute(strToken);
                m_table.addAttribute(attrib);
                m_hashTokens.put(strToken, token);
                // The token with ID n can be found in m_aTokens[n].
                m_aTokens.add(token);
            }
            // Update sparse vector: bump the TF in place if present,
            // otherwise start it at 1. Note the Int key is shared with
            // the Token so map lookups stay consistent.
            dTF = (Real) vector.m_data.get(nTokenID);
            if (dTF != null)
                dTF.m_d += 1.0;
            else
                vector.m_data.put(nTokenID, new Real(1.0));
        }
        strToken = m_lexer.nextToken();
    }
    // Update token info again: each distinct token in this document
    // contributes exactly one to its document frequency.
    setKeys = vector.m_data.keySet();
    for (itKey = setKeys.iterator(); itKey.hasNext(); ) {
        nTokenID = (Int) itKey.next();
        token = (Token) m_aTokens.get(nTokenID.m_i);
        ++token.m_nDF;
    }
    return vector;
}

/**
 * Reads all documents and converts them all to sparse vectors.
 *
 * When m_bTFIDF is set, every row is rescaled in place afterwards: each TF is
 * divided by the row's maximum TF and multiplied by log(N / df), where N is
 * the number of documents and df the token's document frequency.
 *
 * @throws Exception if the reader or lexer fails
 */
protected void readInstances() throws Exception {
    DataRow vector;
    while (m_reader.hasNextDocument()) {
        vector = getInstance(m_reader.nextDocument());
        m_table.add(vector);
    }
    // Convert to TFIDF if necessary.
    if (m_bTFIDF) {
        Iterator itr, itw;
        DataRow row;
        Entry ent;
        Real r;
        Token t;
        double nDocs, max, d;
        int nTokens, i;
        nDocs = m_table.m_data.size();
        nTokens = m_aTokens.size();
        for (itr = m_table.m_data.iterator(); itr.hasNext(); ) {
            row = (DataRow) itr.next();
            // First pass: find the row's maximum TF for normalization.
            max = 0.0;
            for (itw = row.m_data.entrySet().iterator(); itw.hasNext(); ) {
                ent = (Entry) itw.next();
                // NOTE(review): the `m_i < nTokens` guard presumably skips
                // a non-token entry in m_data (e.g. the class value set via
                // setClass) — confirm against DataRow.
                if (((Int) ent.getKey()).m_i < nTokens) {
                    d = ((Real) ent.getValue()).m_d;
                    if (max < d)
                        max = d;
                }
            }
            // Second pass: rewrite each TF in place as (TF / max) * IDF.
            for (itw = row.m_data.entrySet().iterator(); itw.hasNext(); ) {
                ent = (Entry) itw.next();
                i = ((Int) ent.getKey()).m_i;
                if (i < nTokens) {
                    r = (Real) ent.getValue();
                    t = ((Token) m_aTokens.get(i));
                    r.m_d /= max;
                    r.m_d *= Math.log(nDocs / t.m_nDF);
                }
            }
        }
    }
}

////// WEKA specific stuff. //////

/** The option string for document reader. */
protected String m_strDocReader;

/** The option string for lexer. */
protected String m_strLexer;

/** The option string for token filters. */
protected String m_strFilters;

/** True iff defineDataFormat() has been called. */
protected boolean m_bFormatDefined;

/**
 * Returns a short description of this data generator for the WEKA GUI.
 *
 * @return the description string
 */
public String globalInfo() {
    return "A data generator that reads a collection of text documents " +
        "and transforms them into sparse vectors.";
}

/**
 * Lists the command-line options of this generator, including those of
 * every available reader, lexer and filter.
 *
 * @return an enumeration of Option objects
 */
public Enumeration listOptions() {
    Vector aOpts;
    aOpts = new Vector();
    aOpts.add(new Option("\tCompute TFIDF instead of TF (default false)", "I", 0, "-I"));
    aOpts.add(new Option("\tDocument reader", "R", 1, "-R <str>"));
    aOpts.add(new Option("\tLexer", "L", 1, "-L <str>"));
    aOpts.add(new Option("\tFilters (default empty)", "F", 1, "-F <str>[:<str>...]"));
    aOpts.addAll(DirectoryDocumentReader.listOptions());
    aOpts.addAll(SimpleLexer.listOptions());
    aOpts.addAll(LowerCaseFilter.listOptions());
    aOpts.addAll(PorterStemmer.listOptions());
    aOpts.addAll(StopWordFilter.listOptions());
    aOpts.addAll(WordLengthFilter.listOptions());
    return aOpts.elements();
}

/**
 * Parses the command-line options: -I (TFIDF flag), -R (document reader,
 * required), -L (lexer, required) and -F (colon-separated filter chain,
 * optional). Remaining options are forwarded to the constructed reader,
 * lexer and filters.
 *
 * @param options the option strings
 * @throws Exception if a required option is missing or a value is unknown
 */
public void setOptions(String[] options) throws Exception {
    Pattern patSep;
    String[] aFilters;
    Integer n; // NOTE(review): unused local.
    m_bTFIDF = Utils.getFlag('I', options);
    m_strDocReader = Utils.getOption('R', options);
    if (m_strDocReader.length() == 0)
        throw new Exception("Document reader (-R) not set.");
    else if (m_strDocReader.equals("directory"))
        m_reader = new DirectoryDocumentReader(this, options);
    else
        throw new Exception("Invalid document reader (-R).");
    m_strLexer = Utils.getOption('L', options);
    if (m_strLexer.length() == 0)
        throw new Exception("Lexer (-L) not set.");
    else if (m_strLexer.equals("simple"))
        m_lexer = new SimpleLexer(this, m_reader, options);
    else
        throw new Exception("Invalid lexer (-L).");
    m_strFilters = Utils.getOption('F', options);
    m_lstFilters = new LinkedList();
    if (m_strFilters.length() > 0) {
        patSep = Pattern.compile(":");
        aFilters = patSep.split(m_strFilters);
        // Filters are appended in the order given so they apply in order.
        for (int i = 0; i < aFilters.length; ++i)
            if (aFilters[i].length() > 0) {
                if (aFilters[i].equals("lower_case"))
                    m_lstFilters.addLast(new LowerCaseFilter(this, options));
                else if (aFilters[i].equals("porter_stemmer"))
                    m_lstFilters.addLast(new PorterStemmer(this, options));
                else if (aFilters[i].equals("stop_word"))
                    m_lstFilters.addLast(new StopWordFilter(this, options));
                else if (aFilters[i].equals("word_length"))
                    m_lstFilters.addLast(new WordLengthFilter(this, options));
                else
                    throw new Exception("Invalid filter (-F): " + aFilters[i] + ".");
            }
    }
}

/**
 * Returns the current option settings, mirroring setOptions(): the TFIDF
 * flag plus the reader, lexer and filter selections and their own options.
 *
 * NOTE(review): assumes setOptions() has run first — m_strFilters, m_reader
 * and m_lexer are dereferenced unconditionally.
 *
 * @return the option strings as an array
 */
public String[] getOptions() {
    ArrayList aOpts;
    ListIterator it;
    TokenFilter filter;
    String[] array;
    aOpts = new ArrayList();
    if (m_bTFIDF)
        aOpts.add("-I");
    aOpts.add("-R");
    aOpts.add(m_strDocReader);
    aOpts.addAll(m_reader.getOptions());
    aOpts.add("-L");
    aOpts.add(m_strLexer);
    aOpts.addAll(m_lexer.getOptions());
    if (m_strFilters.length() > 0) {
        aOpts.add("-F");
        aOpts.add(m_strFilters);
        for (it = m_lstFilters.listIterator(); it.hasNext(); ) {
            filter = (TokenFilter) it.next();
            aOpts.addAll(filter.getOptions());
        }
    }
    array = new String[aOpts.size()];
    return (String[]) aOpts.toArray(array);
}

/**
 * Defines the dataset format. All documents are read eagerly here, because
 * the full attribute (token) set must be known before the format can be
 * built.
 *
 * @return the dataset format
 * @throws Exception if reading the documents fails
 */
public Instances defineDataFormat() throws Exception {
    m_bFormatDefined = true;
    readInstances();
    return m_table.makeDataFormat();
}

/**
 * Generates the next example from the already-read table.
 *
 * @return the next instance
 * @throws Exception if defineDataFormat() has not been called yet
 */
public Instance generateExample() throws Exception {
    if (!m_bFormatDefined)
        throw new Exception("Dataset format not defined.");
    return m_table.getNextInstance();
}

/**
 * Batch generation is not supported; this generator is single-mode only.
 *
 * @throws Exception always
 */
public Instances generateExamples() throws Exception {
    throw new Exception("Only single mode supported.");
}

/**
 * Returns the comment string printed after generation (none).
 *
 * @return the empty string
 */
public String generateFinished() throws Exception {
    return "";
}

/**
 * Reports that examples are generated one at a time.
 *
 * @return true, always
 */
public boolean getSingleModeFlag() throws Exception {
    return true;
}

/**
 * Command-line entry point; delegates to the WEKA generator driver.
 *
 * @param argv the command-line options
 */
public static void main(String[] argv) throws Exception {
    Generator.makeData(new TextSource(), argv);
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -