ngramtokenizer.java

来自「wekaUT是 university texas austin 开发的基于wek」· Java 代码 · 共 351 行
JAVA
351 行
/* *    This program is free software; you can redistribute it and/or modify *    it under the terms of the GNU General Public License as published by *    the Free Software Foundation; either version 2 of the License, or *    (at your option) any later version. * *    This program is distributed in the hope that it will be useful, *    but WITHOUT ANY WARRANTY; without even the implied warranty of *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the *    GNU General Public License for more details. * *    You should have received a copy of the GNU General Public License *    along with this program; if not, write to the Free Software *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//* *    NGramTokenizer.java *    Copyright (C) 2001 Mikhail Bilenko * */package weka.deduping.metrics;import java.util.*;import java.io.*;import weka.core.*;/** * This class defines a tokenizer that turns strings into HashMapVectors  * of n-grams * * @author Mikhail Bilenko */public class NGramTokenizer extends Tokenizer implements Serializable, OptionHandler {  /** Converting all tokens to lowercase */  protected boolean m_caseInsensitive = true;  /** Stemming */  protected boolean m_stemming = false;  protected Porter m_stemmer = new Porter();  /** Stopword removal */  protected boolean m_stopwordRemoval = false;  /** The with the stopword list */  protected static String m_stopwordFilename = "/u/mbilenko/weka/weka/deduping/metrics/stopwords.txt";  /** Stopword hash */  protected static HashSet m_stopwordSet = null;  /** Length of an n-gram */  protected int m_n = 3;  /** A default set of space-equivalent characters */  protected String m_spaceEquivalents = "\t\n\r\f\'\"\\!@#$%^&*()_-+={}<>,.;:|[]{}/*~`";  protected char[] m_spaceChars = null;  /** if true, all space equivalents will be replaced with a single space */   protected boolean m_replaceSpaces = false;   /** A default constructor */  public NGramTokenizer() {    super();    m_spaceChars = m_spaceEquivalents.toCharArray();    setStemming(false);    setStopwordRemoval(false);  }   /** Take a string and create a vector of n-gram tokens from it   * @param string a String to tokenize   * @returns vector with individual tokens   */  public HashMapVector tokenize(String string) {    if (m_caseInsensitive) {      string = string.toLowerCase();    }        StringBuffer filteredString = new StringBuffer();    // only need to tokenize if stemming, or removing stopwords, or replacing space equivalents    if (m_stemming || m_stopwordRemoval || m_replaceSpaces) {      StringTokenizer tokenizer = new StringTokenizer(string, m_spaceEquivalents, true);      while (tokenizer.hasMoreTokens()) {	String token = tokenizer.nextToken();	if (m_stemming) {	  token = stem(token);	}	if (m_stopwordRemoval && m_stopwordSet.contains(token)) {	  continue;	}	if (m_replaceSpaces && token.length() == 1) {	  if (m_spaceEquivalents.indexOf(token) > -1) {	    filteredString.append(" ");	  }	} else {	  filteredString.append(token);	}      }    } else {      filteredString = new StringBuffer(string);    }         char[] chars = filteredString.toString().toCharArray();       HashMapVector result = new HashMapVector();    for (int i = 0; i < chars.length - m_n; i++) {      String token = new String(chars, i, m_n);      result.increment(token);    }     return result;  }  /** Take a string and create a TokenString of overlapping n-gram tokens from it   * @param string a String to tokenize   * @returns vector with individual tokens   */  public TokenString getTokenString(String string) {    TokenString ts = new TokenString(string);    ArrayList tokenList = new ArrayList();        StringBuffer filteredString = new StringBuffer();    // only need to tokenize if stemming, or removing stopwords, or replacing space equivalents    if (m_stemming || m_stopwordRemoval || m_replaceSpaces) {      StringTokenizer tokenizer = new StringTokenizer(string, m_spaceEquivalents, true);      while (tokenizer.hasMoreTokens()) {	String token = tokenizer.nextToken();	if (m_stemming) {	  token = stem(token);	}	if (m_stopwordRemoval && m_stopwordSet.contains(token)) {	  continue;	}	if (m_replaceSpaces && token.length() == 1) {	  if (m_spaceEquivalents.indexOf(token) > -1) {	    filteredString.append(" ");	  }	} else {	  filteredString.append(token);	}      }    } else {      filteredString = new StringBuffer(string);    }         char[] chars = filteredString.toString().toCharArray();       HashMapVector result = new HashMapVector();    for (int i = 0; i < chars.length - m_n; i++) {      String token = new String(chars, i, m_n);      Object o = m_stringIDmap.get(token);      if (o == null) {	m_stringIDmap.put(token, new Integer(m_currIDidx++));      }      tokenList.add(token);    }    // convert the tokenList into the two arrays inside TokenString    ts.tokens = new String[tokenList.size()];    ts.tokens = (String[]) tokenList.toArray(ts.tokens);    ts.tokenIDs = new int[ts.tokens.length];    for (int i = 0; i < ts.tokens.length; i++) {      ts.tokenIDs[i] = ((Integer)m_stringIDmap.get(ts.tokens[i])).intValue();    }     return ts;  }  /** Set the gram length   * @param n the gram length   */  public void setN(int n) {    m_n = n;  }  /** Get the gram length   * @return the gram length   */  public int getN() {    return m_n;  }    /** Specify which characters should be treated as spaces   * @param spaceEquivalents a string containing space equivalents   */  public void setSpaceEquivalents(String spaceEquivalents) {    m_spaceEquivalents = new String(spaceEquivalents);    m_spaceChars = m_spaceEquivalents.toCharArray();  }   /** Get the haracters that should be treated as spaces    * @return a string containing space equivalents   */  public String getSpaceEquivalents() {    return m_spaceEquivalents;  }  /** Turn on/off replacing space equivalents with a single space */  public void setReplaceSpaces(boolean replaceSpaces) {    m_replaceSpaces = replaceSpaces;  }  public boolean getReplaceSpaces() {    return m_replaceSpaces;  }     /**   * Gets the current settings of NGramTokenizer.   *   * @return an array of strings suitable for passing to setOptions()   */  public String [] getOptions() {    String [] options = new String [10];    int current = 0;    if (m_stemming) {      options[current++] = "-S";    }    if (m_stopwordRemoval) {      options[current++] = "-R";    }    if (m_replaceSpaces) {      options[current++] = "-spaces";    }        options[current++] = "-N";    options[current++] = "" + m_n;        while (current < options.length) {      options[current++] = "";    }    return options;  }  /**   * Parses a given list of options. Valid options are:<p>   *   * -S use stemming   * -R remove stopwords   * -N gram size   */  public void setOptions(String[] options) throws Exception {    setStemming(Utils.getFlag('S', options));    setStopwordRemoval(Utils.getFlag('R', options));            String nString = Utils.getOption('N', options);    if (nString.length() != 0) {      setN(Integer.parseInt(nString));    }  }  /**   * Returns an enumeration describing the available options.   *   * @return an enumeration of all the available options.   */  public Enumeration listOptions() {    Vector newVector = new Vector(5);    newVector.addElement(new Option("\tUse Porter stemmer for stemming\n",				    "S", 0, "-S"));    newVector.addElement(new Option("\tRemove stopwords\n",				    "R", 0, "-R"));        newVector.addElement(new Option("\tGram size\n",				    "N", 1, "-N"));        return newVector.elements();  }    /** Turn case sensitivity on/off   * @param caseInsensitive if true, the tokenizer is case-insensitive   */  public void setCaseInsensitive(boolean caseInsensitive) {     m_caseInsensitive = caseInsensitive;  }  /** Turn case sensitivity on/off   * @return if true, the tokenizer is case-insensitive   */  public boolean getCaseInsensitive() {     return m_caseInsensitive;  }   /** Turn stemming on/off   * @param stemming if true, stemming is used   */  public void setStemming(boolean stemming) {     m_stemming = stemming;    if (stemming) {      m_stemmer = new Porter();    }  }  /** Find out whether stemming is on/off   * @return if true, stemming is used   */  public boolean getStemming() {     return m_stemming;  }  /** Stem a given token   * @param token the token to be stemmed   * @return a new token resulting from applying the stemmer   */  public String stem(String token) {    return m_stemmer.stripAffixes(token);  }    /** Turn stopword removal on/off and load the stopwords   * @param stopwordRemoval if true, stopwords from m_stopwordFile will be removed   */  public void setStopwordRemoval(boolean stopwordRemoval) {     m_stopwordRemoval = stopwordRemoval;    if (m_stopwordRemoval) {      try {	BufferedReader in = new BufferedReader(new FileReader(m_stopwordFilename));	m_stopwordSet = new HashSet();	String stopword;	while ((stopword = in.readLine()) != null) {	  m_stopwordSet.add(stopword);	}      } catch (Exception e) {	System.out.println("Problems initializing the stopwords from " + m_stopwordFilename);      }    }  }  /** Get whether stopword removal is on or off   * @return true if stopword removal is on   */  public boolean getStopwordRemoval() {    return m_stopwordRemoval;  }}
ngramtokenizer.java - 源码说明

本页面展示了「wekaUT是 university texas austin 开发的基于weka的半指导学习(semi supervised learning)的分类器」中的 ngramtokenizer.java 源码文件，采用 Java 编程语言编写，共 351 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与university相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?