⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 chinesetokenizer.java

📁 关于 Jaoso新闻文章发布系统 --- --- --- --- --- --- --- --- --- --- --- --- --- -- 版本信息:Jaoso新闻文章发布系统 0.9.1b
💻 JAVA
字号:
package jaoso.framework.core.search.lucene.analyzer;

import org.apache.lucene.analysis.*;

import java.io.Reader;


/**
 * Tokenizer that classifies each character of the input with
 * {@link Character#getType(char)} and applies the following rules:
 * <ul>
 *   <li>decimal digits and upper/lower-case letters are accumulated into a
 *       single lower-cased token of at most {@code MAX_WORD_LEN} characters;</li>
 *   <li>every {@link Character#OTHER_LETTER} character (the category that
 *       Chinese characters fall into) is emitted as a token of its own;</li>
 *   <li>any other character acts as a delimiter and is discarded.</li>
 * </ul>
 *
 * Copyright:   Copyright (c) 2001
 *
 * @author Yiyi Sun
 * @version 1.0
 */
public final class ChineseTokenizer extends Tokenizer {

    //~ Static fields/initializers =============================================

    /** Maximum number of characters in one token; longer runs are split. */
    private static final int MAX_WORD_LEN = 255;

    /** Number of characters requested from the reader per read call. */
    private static final int IO_BUFFER_SIZE = 1024;

    //~ Instance fields ========================================================

    /** Characters of the token currently being assembled. */
    private final char[] buffer = new char[MAX_WORD_LEN];

    /** Characters most recently read from the input reader. */
    private final char[] ioBuffer = new char[IO_BUFFER_SIZE];

    /** Index in {@link #ioBuffer} of the next character to consume. */
    private int bufferIndex = 0;

    /** Number of valid characters in {@link #ioBuffer}, or -1 at end of stream. */
    private int dataLen = 0;

    /** Number of characters currently stored in {@link #buffer}. */
    private int length;

    /** Number of characters consumed from the stream so far. */
    private int offset = 0;

    /** Stream offset of the first character of the current token. */
    private int start;

    //~ Constructors ===========================================================

    /**
     * Creates a new ChineseTokenizer object.
     *
     * @param in reader supplying the text to tokenize
     */
    public ChineseTokenizer(Reader in) {

        input = in;
    }

    //~ Methods ================================================================

    /**
     * Returns the next token from the input.
     *
     * @return the next token, or {@code null} when the end of the stream is
     *         reached and no characters are pending
     *
     * @throws java.io.IOException if reading from the underlying reader fails
     */
    public final Token next() throws java.io.IOException {

        length = 0;
        start = offset;

        while (true) {

            if (bufferIndex >= dataLen) {

                // Buffer exhausted: refill it from the reader.
                dataLen = input.read(ioBuffer);
                bufferIndex = 0;
            }

            if (dataLen == -1) {

                // End of stream: emit whatever is pending (possibly nothing).
                return flush();
            }

            final char c = ioBuffer[bufferIndex++];

            // Count the character only once it has really been consumed, so
            // that hitting end of stream does not inflate the offset.
            offset++;

            switch (Character.getType(c)) {

            case Character.DECIMAL_DIGIT_NUMBER:
            case Character.LOWERCASE_LETTER:
            case Character.UPPERCASE_LETTER:
                push(c);

                if (length == MAX_WORD_LEN) {

                    return flush();
                }

                break;

            case Character.OTHER_LETTER:

                if (length > 0) {

                    // The character terminates the pending alphanumeric token
                    // and must be re-read as its own token on the next call.
                    // Un-consume it: rewind BOTH the buffer position and the
                    // stream offset, otherwise all following token offsets
                    // would be shifted by one.
                    bufferIndex--;
                    offset--;

                    return flush();
                }

                push(c);

                return flush();

            default:

                // Delimiter: ends a pending token, otherwise it is skipped.
                if (length > 0) {

                    return flush();
                }

                break;
            }
        }
    }

    /**
     * Builds a token from the characters accumulated in {@link #buffer}.
     *
     * @return a token spanning {@code start} to {@code start + length}, or
     *         {@code null} if no characters have been accumulated
     */
    private Token flush() {

        if (length == 0) {

            return null;
        }

        return new Token(new String(buffer, 0, length), start, start + length);
    }

    /**
     * Appends the lower-cased form of a character to the current token,
     * recording the token start offset when it is the first character.
     *
     * @param c character just consumed from the stream
     */
    private void push(char c) {

        if (length == 0) {

            // offset already counts c, so c itself sits at offset - 1.
            start = offset - 1;
        }

        buffer[length++] = Character.toLowerCase(c);
    }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -