analyzerutils.java

来自「这是关于中文分词的有关程序」· Java 代码 · 共 138 行

JAVA

138 行

/*
 * Copyright 2002-2005 the original author or authors.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
 * Created on 2005-12-16
 * author 谢骋超
 * 
 */
package cn.edu.zju.dartsplitter.analysis;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;

import junit.framework.Assert;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

/**
 * Utility class for inspecting how a Lucene {@link Analyzer} tokenizes text.
 *
 * <p>The helpers run an analyzer over a string and either return the produced
 * tokens, print them to {@code System.out} in one of several formats, or
 * compare their term texts against expected strings using JUnit assertions.
 *
 * @author xiecc
 * @email xieccy@gmail.com xieccy@yahoo.com
 * homepage: http://blog.itpub.net/xiecc
 * projectpage: http://ccnt.zju.edu.cn/projects
 *
 */
public class AnalyzerUtils {
    /**
     * Runs {@code analyzer} over {@code text} (using the field name
     * {@code "contents"}) and collects every token it emits.
     *
     * @param analyzer the analyzer whose token stream is consumed
     * @param text the text to analyze
     * @return the tokens in the order the stream produced them; empty if the
     *         stream produced none
     * @throws IOException if reading from or closing the token stream fails
     */
    public static Token[] tokensFromAnalysis(Analyzer analyzer, String text)
            throws IOException {
        TokenStream stream = analyzer.tokenStream("contents", new StringReader(
                text));
        ArrayList tokenList = new ArrayList();
        try {
            // The stream signals exhaustion by returning null from next().
            for (Token token = stream.next(); token != null; token = stream.next()) {
                tokenList.add(token);
            }
        } finally {
            // Always release the stream (and the reader behind it), even when
            // tokenization fails part-way through.
            stream.close();
        }

        return (Token[]) tokenList.toArray(new Token[tokenList.size()]);
    }

    /**
     * Prints each token's term text as {@code [term] } on a single line of
     * {@code System.out}. No trailing line break is written.
     *
     * @param analyzer the analyzer to apply
     * @param text the text to analyze
     * @throws IOException if the token stream cannot be read
     */
    public static void displayTokens(Analyzer analyzer, String text)
            throws IOException {
        Token[] tokens = tokensFromAnalysis(analyzer, text);

        for (int i = 0; i < tokens.length; i++) {
            System.out.print("[" + tokens[i].termText() + "] ");
        }
    }

    /**
     * Prints the tokens grouped by position: every token with a positive
     * position increment starts a new line prefixed with {@code position: },
     * while tokens with a non-positive increment stay on the current line.
     *
     * @param analyzer the analyzer to apply
     * @param text the text to analyze
     * @throws IOException if the token stream cannot be read
     */
    public static void displayTokensWithPositions(Analyzer analyzer, String text)
            throws IOException {
        printTokensByPosition(tokensFromAnalysis(analyzer, text), false);
    }

    /**
     * Same layout as {@link #displayTokensWithPositions(Analyzer, String)},
     * but each token is printed as {@code [term:start->end:type] }, i.e. with
     * its start offset, end offset and type.
     *
     * @param analyzer the analyzer to apply
     * @param text the text to analyze
     * @throws IOException if the token stream cannot be read
     */
    public static void displayTokensWithFullDetails(Analyzer analyzer,
            String text) throws IOException {
        printTokensByPosition(tokensFromAnalysis(analyzer, text), true);
    }

    /**
     * Shared printing loop for the two position-aware display methods.
     *
     * @param tokens the tokens to print, in stream order
     * @param fullDetails {@code true} to include offsets and type for each
     *        token, {@code false} to print the term text only
     */
    private static void printTokensByPosition(Token[] tokens,
            boolean fullDetails) {
        int position = 0;

        for (int i = 0; i < tokens.length; i++) {
            Token token = tokens[i];

            int increment = token.getPositionIncrement();

            // A positive increment moves to a new position, which gets its
            // own output line; otherwise the token shares the current line.
            if (increment > 0) {
                position = position + increment;
                System.out.println();
                System.out.print(position + ": ");
            }

            if (fullDetails) {
                System.out.print("[" + token.termText() + ":"
                        + token.startOffset() + "->" + token.endOffset() + ":"
                        + token.type() + "] ");
            } else {
                System.out.print("[" + token.termText() + "] ");
            }
        }
        System.out.println();
    }

    /**
     * Asserts (via JUnit's {@code Assert}) that {@code tokens} has the same
     * length as {@code strings} and that each token's term text equals the
     * string at the same index.
     *
     * @param tokens the actual tokens
     * @param strings the expected term texts, in order
     */
    public static void assertTokensEqual(Token[] tokens, String[] strings) {
        Assert.assertEquals(strings.length, tokens.length);

        for (int i = 0; i < tokens.length; i++) {
            Assert.assertEquals("index " + i, strings[i], tokens[i].termText());
        }
    }

    /**
     * Demo entry point: prints how {@code SimpleAnalyzer} and
     * {@code StandardAnalyzer} tokenize a sample Chinese sentence, first with
     * full details and then with positions only.
     *
     * @param args ignored
     * @throws IOException if a token stream cannot be read
     */
    public static void main(String[] args) throws IOException {
        String sample = "学习外语的乐园，英语、法语、德语、俄语，专业外语等";

        System.out.println("SimpleAnalyzer");
        displayTokensWithFullDetails(new SimpleAnalyzer(), sample);
        displayTokensWithPositions(new SimpleAnalyzer(), sample);
        System.out.println("\n----");
        System.out.println("StandardAnalyzer");
        displayTokensWithFullDetails(new StandardAnalyzer(), sample);
        displayTokensWithPositions(new StandardAnalyzer(), sample);
    }
}

analyzerutils.java - 源码说明

本页面展示了「这是关于中文分词的有关程序」中的 analyzerutils.java 源码文件，采用 Java 编程语言编写，共 138 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。

虫虫下载站收录了大量与分词相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。

⌨️ 快捷键说明

复制代码Ctrl + C

搜索代码Ctrl + F

全屏模式F11

增大字号Ctrl + =

减小字号Ctrl + -

显示快捷键?