⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 ngramtokenizer.java

📁 这是某个老师编写的JAVA编程软件包
💻 JAVA
字号:
  /*
  WVTool - Word Vector Tool
  Copyright (C) 2001-2007

	    Michael Wurst       

  web:   http://wvtool.sourceforge.net

  This program is free software; you can redistribute it and/or
  modify it under the terms of the GNU General Public License as 
  published by the Free Software Foundation; either version 2 of the
  License, or (at your option) any later version. 

  This program is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
  USA.
*/
package edu.udo.cs.wvtool.generic.tokenizer;

import java.io.Reader;
import java.util.ArrayList;
import java.util.List;

import edu.udo.cs.wvtool.main.WVTDocumentInfo;
import edu.udo.cs.wvtool.util.TokenEnumeration;
import edu.udo.cs.wvtool.util.WVToolException;

/**
 * Creates tokens by creating character n-grams of the tokens received from an inner tokenizer.
 * <p>
 * Every token delivered by the inner tokenizer that is longer than <code>n</code> characters is replaced by all of its
 * substrings of length <code>n</code>, in left-to-right order. Tokens of length <code>n</code> or shorter are passed
 * through unchanged.
 * <p>
 * The object returned by {@link #tokenize(Reader, WVTDocumentInfo)} is this instance itself, so one instance can only
 * serve a single enumeration at a time.
 * 
 * @author Michael Wurst
 * @version $Id$
 * 
 */
public class NGramTokenizer implements WVTTokenizer, TokenEnumeration {

    /**
     * The n-grams of the most recently read inner token which have not been handed out yet. This buffer is necessary
     * to implement the look-ahead semantic of TokenEnumeration.
     */
    private final List currentTokens;

    /** Length (in characters) of the n-grams to create. */
    private final int n;

    /** Token stream of the inner tokenizer for the current source, or null before the first call to tokenize. */
    private TokenEnumeration input;

    /** The inner tokenizer whose tokens are split into n-grams. */
    private final WVTTokenizer tokenizer;

    /**
     * Creates a new n-gram tokenizer.
     * 
     * @param n the length of the n-grams; the value is not validated here
     * @param tokenizer the inner tokenizer delivering the tokens to split
     */
    public NGramTokenizer(int n, WVTTokenizer tokenizer) {

        this.n = n;
        this.tokenizer = tokenizer;
        input = null;
        currentTokens = new ArrayList();
    }

    /**
     * Starts tokenizing a new source.
     * 
     * @param source the character stream to tokenize
     * @param d the document info, passed on to the inner tokenizer
     * @return this object as enumeration of the n-grams, or null if source is null
     * @see edu.udo.cs.wvtool.generic.tokenizer.WVTTokenizer#tokenize(Reader, WVTDocumentInfo)
     */
    public TokenEnumeration tokenize(Reader source, WVTDocumentInfo d) throws WVToolException {

        if (source == null) {
            return null;
        }

        input = tokenizer.tokenize(source, d);

        // N-grams still buffered from a previous source that was not read to
        // the end must not show up in the token stream of the new source.
        currentTokens.clear();

        readNextToken();
        return this;
    }

    /**
     * Reads one token from the inner token stream and appends its n-grams to currentTokens. If the inner stream has no
     * more tokens, currentTokens is left unchanged.
     */
    private void readNextToken() throws WVToolException {

        if (!input.hasMoreTokens()) {
            return;
        }

        String token = input.nextToken();

        if (token.length() > n) {
            // Slide a window of n characters over the token.
            int lastStart = token.length() - n;
            for (int start = 0; start <= lastStart; start++) {
                currentTokens.add(token.substring(start, start + n));
            }
        } else {
            // Too short to be split: pass the token through as it is.
            currentTokens.add(token);
        }
    }

    /**
     * @see edu.udo.cs.wvtool.util.TokenEnumeration#hasMoreTokens()
     */
    public boolean hasMoreTokens() {

        // Without an input nothing has been tokenized yet; otherwise there are
        // tokens left exactly as long as the buffer is not empty, because the
        // buffer is refilled as soon as its last element is handed out.
        return input != null && !currentTokens.isEmpty();
    }

    /**
     * Returns the next n-gram, or null if there is none left.
     * 
     * @see edu.udo.cs.wvtool.util.TokenEnumeration#nextToken()
     */
    public String nextToken() throws WVToolException {

        if (currentTokens.isEmpty()) {
            return null;
        }

        String result = (String) currentTokens.remove(0);

        // Refill the buffer eagerly, so that hasMoreTokens can answer without
        // touching the input.
        if (currentTokens.isEmpty()) {
            readNextToken();
        }

        return result;
    }

}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -