wordtokenizer.java

来自「wekaUT是 university texas austin 开发的基于weka的半指导学习(semi supervised learning)的分类器」· Java 代码 · 共 309 行
JAVA
309 行
/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    WordTokenizer.java
 *    Copyright (C) 2001 Mikhail Bilenko
 *
 */

package weka.deduping.metrics;

import java.util.*;
import java.io.*;
import weka.core.*;

/**
 * This class defines a tokenizer that turns strings into HashMapVectors
 * using the native Java StringTokenizer.
 *
 * <p>Each raw token is (optionally) lower-cased, then (optionally) stemmed,
 * then (optionally) dropped if it appears in the stopword set. Tokens shorter
 * than the minimum token length are dropped before any of these steps.
 *
 * @author Mikhail Bilenko
 */
public class WordTokenizer extends Tokenizer implements Serializable, OptionHandler, Cloneable {

  /** Converting all tokens to lowercase */
  protected boolean m_caseInsensitive = true;

  /** Stemming */
  protected boolean m_stemming = false;
  protected Porter m_stemmer = new Porter();

  /** Stopword removal */
  protected boolean m_stopwordRemoval = false;

  /** The file with the stopword list */
  protected static String m_stopwordFilename = "/u/mbilenko/weka/weka/deduping/metrics/stopwords.txt";

  /** Stopword hash; shared by all instances, null until stopwords are first loaded */
  protected static HashSet m_stopwordSet = null;

  /** A default set of delimiters */
  protected String m_delimiters = " \t\n\r\f\'\"\\!@#$%^&*()_-+={}<>,.;:|[]{}/*~`";

  /** The default minimum length of a token */
  protected int m_minTokenLength = 1;

  /** A default constructor */
  public WordTokenizer() {
    super();
    setStemming(m_stemming);
    setStopwordRemoval(m_stopwordRemoval);
  }

  /** Take a string and create a vector of tokens from it
   * @param string a String to tokenize
   * @return vector with individual tokens
   */
  public HashMapVector tokenize(String string) {
    StringTokenizer tokenizer = new StringTokenizer(string, m_delimiters);
    HashMapVector result = new HashMapVector();
    ensureStopwordsLoaded();

    while (tokenizer.hasMoreTokens()) {
      String token = tokenizer.nextToken();
      // The length filter is applied to the raw token, before case folding/stemming.
      if (token.length() >= m_minTokenLength) {
        token = normalizeToken(token);
        if (!isStopword(token)) {
          result.increment(token);
        }
      }
    }
    return result;
  }

  /** Take a string and create a TokenString from it.
   * Every retained token that has not been seen before is assigned the next
   * free ID in m_stringIDmap.
   * @param string a String to tokenize
   * @return a TokenString composed of individual tokens
   */
  public TokenString getTokenString(String string) {
    StringTokenizer tokenizer = new StringTokenizer(string, m_delimiters);
    TokenString ts = new TokenString(string);
    ArrayList tokenList = new ArrayList();
    ensureStopwordsLoaded();

    while (tokenizer.hasMoreTokens()) {
      String token = tokenizer.nextToken();
      if (token.length() >= m_minTokenLength) {
        // Case folding is done per token: the StringTokenizer above already
        // captured the original text, so lower-casing 'string' at this point
        // would not affect the tokens it hands out.
        token = normalizeToken(token);
        if (!isStopword(token)) {
          Object o = m_stringIDmap.get(token);
          if (o == null) {
            m_stringIDmap.put(token, new Integer(m_currIDidx++));
          }
          tokenList.add(token);
        }
      }
    }

    // convert the tokenList into the two arrays inside TokenString
    ts.tokens = (String[]) tokenList.toArray(new String[tokenList.size()]);
    ts.tokenIDs = new int[ts.tokens.length];
    for (int i = 0; i < ts.tokens.length; i++) {
      ts.tokenIDs[i] = ((Integer) m_stringIDmap.get(ts.tokens[i])).intValue();
    }
    return ts;
  }

  /** Load the stopword list on first use if stopword removal is on but no set exists yet. */
  private void ensureStopwordsLoaded() {
    if (m_stopwordRemoval && m_stopwordSet == null) {
      setStopwordRemoval(true);
    }
  }

  /** Apply case folding and stemming, in that order, according to the current settings. */
  private String normalizeToken(String token) {
    if (m_caseInsensitive) {
      token = token.toLowerCase();
    }
    if (m_stemming) {
      token = stem(token);
    }
    return token;
  }

  /** Tell whether a (normalized) token must be dropped because stopword removal is on and it is listed. */
  private boolean isStopword(String token) {
    return m_stopwordRemoval && m_stopwordSet.contains(token);
  }

  /** Specify which delimiters to use
   * @param delimiters a string containing delimiters to use
   * @throws NullPointerException if delimiters is null
   */
  public void setDelimiters(String delimiters) {
    if (delimiters == null) {
      throw new NullPointerException("delimiters");
    }
    // Strings are immutable, so no defensive copy is needed.
    m_delimiters = delimiters;
  }

  /** Get the delimiters
   * @return a string containing delimiters that are used
   */
  public String getDelimiters() {
    return m_delimiters;
  }

  /** Set the minimum token length
   * @param minTokenLength the minimum length of a token
   */
  public void setMinTokenLength(int minTokenLength) {
    m_minTokenLength = minTokenLength;
  }

  /** Get the minimum token length
   * @return the minimum length of a token
   */
  public int getMinTokenLength() {
    return m_minTokenLength;
  }

  /**
   * Gets the current settings of WordTokenizer.
   * The returned array always has 10 entries; unused trailing entries are
   * empty strings.
   *
   * @return an array of strings suitable for passing to setOptions()
   */
  public String [] getOptions() {
    String [] options = new String [10];
    int current = 0;

    if (m_caseInsensitive) {
      options[current++] = "-I";
    }
    if (m_stemming) {
      options[current++] = "-S";
    }
    if (m_stopwordRemoval) {
      options[current++] = "-R";
    }
    options[current++] = "-m";
    options[current++] = "" + m_minTokenLength;

    while (current < options.length) {
      options[current++] = "";
    }
    return options;
  }

  /**
   * Parses a given list of options. Valid options are:<p>
   *
   * -S use stemming
   * -R remove stopwords
   * -m minimum length of a token for it to be included
   *
   * <p>NOTE(review): getOptions() also emits -I when the tokenizer is
   * case-insensitive, but -I is not parsed here, so case sensitivity is left
   * untouched by this method.
   *
   * @param options the list of options as an array of strings
   * @throws Exception if an option cannot be parsed (e.g. a non-numeric -m value)
   */
  public void setOptions(String[] options) throws Exception {
    setStemming(Utils.getFlag('S', options));
    setStopwordRemoval(Utils.getFlag('R', options));

    String minTokenLengthString = Utils.getOption('m', options);
    if (minTokenLengthString.length() != 0) {
      setMinTokenLength(Integer.parseInt(minTokenLengthString));
    }
  }

  /**
   * Returns an enumeration describing the available options.
   *
   * @return an enumeration of all the available options.
   */
  public Enumeration listOptions() {
    Vector newVector = new Vector(5);

    newVector.addElement(new Option("\tUse Porter stemmer for stemming\n",
                                    "S", 0, "-S"));
    newVector.addElement(new Option("\tRemove stopwords\n",
                                    "R", 0, "-R"));
    newVector.addElement(new Option("\tMinimum length of token for it to be included\n",
                                    "m", 1, "-m"));

    return newVector.elements();
  }

  /** Turn case sensitivity on/off
   * @param caseInsensitive if true, the tokenizer is case-insensitive
   */
  public void setCaseInsensitive(boolean caseInsensitive) {
    m_caseInsensitive = caseInsensitive;
  }

  /** Find out whether the tokenizer is case-insensitive
   * @return if true, the tokenizer is case-insensitive
   */
  public boolean getCaseInsensitive() {
    return m_caseInsensitive;
  }

  /** Turn stemming on/off; turning it on installs a fresh Porter stemmer
   * @param stemming if true, stemming is used
   */
  public void setStemming(boolean stemming) {
    m_stemming = stemming;
    if (stemming) {
      m_stemmer = new Porter();
    }
  }

  /** Find out whether stemming is on/off
   * @return if true, stemming is used
   */
  public boolean getStemming() {
    return m_stemming;
  }

  /** Stem a given token
   * @param token the token to be stemmed
   * @return a new token resulting from applying the stemmer
   */
  public String stem(String token) {
    return m_stemmer.stripAffixes(token);
  }

  /** Turn stopword removal on/off and load the stopwords.
   * When turned on, the shared stopword set is rebuilt from m_stopwordFilename,
   * one stopword per line. Loading is best-effort: on failure a message is
   * printed and whatever was read so far (possibly nothing) becomes the set.
   * @param stopwordRemoval if true, stopwords from m_stopwordFilename will be removed
   */
  public void setStopwordRemoval(boolean stopwordRemoval) {
    m_stopwordRemoval = stopwordRemoval;
    if (!m_stopwordRemoval) {
      return;
    }

    // Fill a local set and assign it at the end, so the shared static field
    // is never observed half-built.
    HashSet loaded = new HashSet();
    BufferedReader in = null;
    try {
      in = new BufferedReader(new FileReader(m_stopwordFilename));
      String stopword;
      while ((stopword = in.readLine()) != null) {
        loaded.add(stopword);
      }
    } catch (Exception e) {
      System.err.println("Problems initializing the stopwords from " + m_stopwordFilename + ": " + e);
    } finally {
      if (in != null) {
        try {
          in.close();
        } catch (IOException ignored) {
          // Nothing useful can be done if closing the stopword file fails.
        }
      }
    }
    m_stopwordSet = loaded;
  }

  /** Get whether stopword removal is on or off
   * @return true if stopword removal is on
   */
  public boolean getStopwordRemoval() {
    return m_stopwordRemoval;
  }
}
wordtokenizer.java - 源码说明

本页面展示了「wekaUT是 university texas austin 开发的基于weka的半指导学习(semi supervised learning)的分类器」中的 wordtokenizer.java 源码文件，采用 Java 编程语言编写，共 309 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与university相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?