cjktokenizer.java

来自「一套java版本的搜索引擎源码」· Java 代码 · 共 244 行
JAVA
244 行
package org.apache.lucene.analysis.cjk;/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements.  See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License.  You may obtain a copy of the License at * *     http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */import org.apache.lucene.analysis.Token;import org.apache.lucene.analysis.Tokenizer;import java.io.Reader;/** * CJKTokenizer was modified from StopTokenizer which does a decent job for * most European languages. It performs other token methods for double-byte * Characters: the token will return at each two charactors with overlap match.<br> * Example: "java C1C2C3C4" will be segment to: "java" "C1C2" "C2C3" "C3C4" it * also need filter filter zero length token ""<br> * for Digit: digit, '+', '#' will token as letter<br> * for more info on Asia language(Chinese Japanese Korean) text segmentation: * please search  <a * href="http://www.google.com/search?q=word+chinese+segment">google</a> * * @author Che, Dong */public final class CJKTokenizer extends Tokenizer {    //~ Static fields/initializers ---------------------------------------------    /** Max word length */    private static final int MAX_WORD_LEN = 255;    /** buffer size: */    private static final int IO_BUFFER_SIZE = 256;    //~ Instance fields --------------------------------------------------------    /** word offset, used to imply which character(in ) is parsed */    private int offset = 0;    /** the index used only for ioBuffer */    private int bufferIndex = 0;    /** data length */    private int dataLen = 0;    /**     * character buffer, store the characters which are used to compose <br>     * the returned Token     */    private final char[] buffer = new char[MAX_WORD_LEN];    /**     * I/O buffer, used to store the content of the input(one of the <br>     * members of Tokenizer)     */    private final char[] ioBuffer = new char[IO_BUFFER_SIZE];    /** word type: single=>ASCII  double=>non-ASCII word=>default */    private String tokenType = "word";    /**     * tag: previous character is a cached double-byte character  "C1C2C3C4"     * ----(set the C1 isTokened) C1C2 "C2C3C4" ----(set the C2 isTokened)     * C1C2 C2C3 "C3C4" ----(set the C3 isTokened) "C1C2 C2C3 C3C4"     */    private boolean preIsTokened = false;    //~ Constructors -----------------------------------------------------------    /**     * Construct a token stream processing the given input.     *     * @param in I/O reader     */    public CJKTokenizer(Reader in) {        input = in;    }    //~ Methods ----------------------------------------------------------------    /**     * Returns the next token in the stream, or null at EOS.     * See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html     * for detail.     *     * @return Token     *     * @throws java.io.IOException - throw IOException when read error <br>     *         hanppened in the InputStream     *     */    public final Token next() throws java.io.IOException {        /** how many character(s) has been stored in buffer */        int length = 0;        /** the position used to create Token */        int start = offset;        while (true) {            /** current charactor */            char c;            /** unicode block of current charactor for detail */            Character.UnicodeBlock ub;            offset++;            if (bufferIndex >= dataLen) {                dataLen = input.read(ioBuffer);                bufferIndex = 0;            }            if (dataLen == -1) {                if (length > 0) {                    if (preIsTokened == true) {                        length = 0;                        preIsTokened = false;                    }                    break;                } else {                    return null;                }            } else {                //get current character                c = ioBuffer[bufferIndex++];                //get the UnicodeBlock of the current character                ub = Character.UnicodeBlock.of(c);            }            //if the current character is ASCII or Extend ASCII            if ((ub == Character.UnicodeBlock.BASIC_LATIN)                    || (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS)               ) {                if (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) {                    /** convert  HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN */                    int i = (int) c;                    i = i - 65248;                    c = (char) i;                }                // if the current character is a letter or "_" "+" "#"                if (Character.isLetterOrDigit(c)                        || ((c == '_') || (c == '+') || (c == '#'))                   ) {                    if (length == 0) {                        // "javaC1C2C3C4linux" <br>                        //      ^--: the current character begin to token the ASCII                        // letter                        start = offset - 1;                    } else if (tokenType == "double") {                        // "javaC1C2C3C4linux" <br>                        //              ^--: the previous non-ASCII                        // : the current character                        offset--;                        bufferIndex--;                        tokenType = "single";                        if (preIsTokened == true) {                            // there is only one non-ASCII has been stored                            length = 0;                            preIsTokened = false;                            break;                        } else {                            break;                        }                    }                    // store the LowerCase(c) in the buffer                    buffer[length++] = Character.toLowerCase(c);                    tokenType = "single";                    // break the procedure if buffer overflowed!                    if (length == MAX_WORD_LEN) {                        break;                    }                } else if (length > 0) {                    if (preIsTokened == true) {                        length = 0;                        preIsTokened = false;                    } else {                        break;                    }                }            } else {                // non-ASCII letter, eg."C1C2C3C4"                if (Character.isLetter(c)) {                    if (length == 0) {                        start = offset - 1;                        buffer[length++] = c;                        tokenType = "double";                    } else {                        if (tokenType == "single") {                            offset--;                            bufferIndex--;                            //return the previous ASCII characters                            break;                        } else {                            buffer[length++] = c;                            tokenType = "double";                            if (length == 2) {                                offset--;                                bufferIndex--;                                preIsTokened = true;                                break;                            }                        }                    }                } else if (length > 0) {                    if (preIsTokened == true) {                        // empty the buffer                        length = 0;                        preIsTokened = false;                    } else {                        break;                    }                }            }        }        return new Token(new String(buffer, 0, length), start, start + length,                         tokenType                        );    }}
cjktokenizer.java - 源码说明

本页面展示了「一套java版本的搜索引擎源码」中的 cjktokenizer.java 源码文件，采用 Java 编程语言编写，共 244 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与java相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?