russiananalyzer.java

来自「一套java版本的搜索引擎源码」· Java 代码 · 共 261 行
JAVA
261 行
package org.apache.lucene.analysis.ru;/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements.  See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License.  You may obtain a copy of the License at * *     http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.analysis.StopFilter;import org.apache.lucene.analysis.TokenStream;import java.io.Reader;import java.util.Hashtable;import java.util.Set;import java.util.HashSet;/** * Analyzer for Russian language. Supports an external list of stopwords (words that * will not be indexed at all). * A default set of stopwords is used unless an alternative list is specified. 
* * @author  Boris Okner, b.okner@rogers.com * @version $Id: RussianAnalyzer.java 472959 2006-11-09 16:21:50Z yonik $ */public final class RussianAnalyzer extends Analyzer{    // letters (currently unused letters are commented out)    private final static char A = 0;    private final static char B = 1;    private final static char V = 2;    private final static char G = 3;    private final static char D = 4;    private final static char E = 5;    private final static char ZH = 6;    private final static char Z = 7;    private final static char I = 8;    private final static char I_ = 9;    private final static char K = 10;    private final static char L = 11;    private final static char M = 12;    private final static char N = 13;    private final static char O = 14;    private final static char P = 15;    private final static char R = 16;    private final static char S = 17;    private final static char T = 18;    private final static char U = 19;    //private final static char F = 20;    private final static char X = 21;    //private final static char TS = 22;    private final static char CH = 23;    private final static char SH = 24;    private final static char SHCH = 25;    //private final static char HARD = 26;    private final static char Y = 27;    private final static char SOFT = 28;    private final static char AE = 29;    private final static char IU = 30;    private final static char IA = 31;    /**     * List of typical Russian stopwords.     
*/    private static char[][] RUSSIAN_STOP_WORDS = {        {A},        {B, E, Z},        {B, O, L, E, E},        {B, Y},        {B, Y, L},        {B, Y, L, A},        {B, Y, L, I},        {B, Y, L, O},        {B, Y, T, SOFT},        {V},        {V, A, M},        {V, A, S},        {V, E, S, SOFT},        {V, O},        {V, O, T},        {V, S, E},        {V, S, E, G, O},        {V, S, E, X},        {V, Y},        {G, D, E},        {D, A},        {D, A, ZH, E},        {D, L, IA},        {D, O},        {E, G, O},        {E, E},        {E, I_,},        {E, IU},        {E, S, L, I},        {E, S, T, SOFT},        {E, SHCH, E},        {ZH, E},        {Z, A},        {Z, D, E, S, SOFT},        {I},        {I, Z},        {I, L, I},        {I, M},        {I, X},        {K},        {K, A, K},        {K, O},        {K, O, G, D, A},        {K, T, O},        {L, I},        {L, I, B, O},        {M, N, E},        {M, O, ZH, E, T},        {M, Y},        {N, A},        {N, A, D, O},        {N, A, SH},        {N, E},        {N, E, G, O},        {N, E, E},        {N, E, T},        {N, I},        {N, I, X},        {N, O},        {N, U},        {O},        {O, B},        {O, D, N, A, K, O},        {O, N},        {O, N, A},        {O, N, I},        {O, N, O},        {O, T},        {O, CH, E, N, SOFT},        {P, O},        {P, O, D},        {P, R, I},        {S},        {S, O},        {T, A, K},        {T, A, K, ZH, E},        {T, A, K, O, I_},        {T, A, M},        {T, E},        {T, E, M},        {T, O},        {T, O, G, O},        {T, O, ZH, E},        {T, O, I_},        {T, O, L, SOFT, K, O},        {T, O, M},        {T, Y},        {U},        {U, ZH, E},        {X, O, T, IA},        {CH, E, G, O},        {CH, E, I_},        {CH, E, M},        {CH, T, O},        {CH, T, O, B, Y},        {CH, SOFT, E},        {CH, SOFT, IA},        {AE, T, A},        {AE, T, I},        {AE, T, O},        {IA}    };    /**     * Contains the stopwords used with the StopFilter.     
*/    private Set stopSet = new HashSet();    /**     * Charset for Russian letters.     * Represents encoding for 32 lowercase Russian letters.     * Predefined charsets can be taken from RussianCharSets class     */    private char[] charset;    public RussianAnalyzer() {        charset = RussianCharsets.UnicodeRussian;        stopSet = StopFilter.makeStopSet(                    makeStopWords(RussianCharsets.UnicodeRussian));    }    /**     * Builds an analyzer.     */    public RussianAnalyzer(char[] charset)    {        this.charset = charset;        stopSet = StopFilter.makeStopSet(makeStopWords(charset));    }    /**     * Builds an analyzer with the given stop words.     */    public RussianAnalyzer(char[] charset, String[] stopwords)    {        this.charset = charset;        stopSet = StopFilter.makeStopSet(stopwords);    }    // Takes russian stop words and translates them to a String array, using    // the given charset    private static String[] makeStopWords(char[] charset)    {        String[] res = new String[RUSSIAN_STOP_WORDS.length];        for (int i = 0; i < res.length; i++)        {            char[] theStopWord = RUSSIAN_STOP_WORDS[i];            // translate the word, using the charset            StringBuffer theWord = new StringBuffer();            for (int j = 0; j < theStopWord.length; j++)            {                theWord.append(charset[theStopWord[j]]);            }            res[i] = theWord.toString();        }        return res;    }    /**     * Builds an analyzer with the given stop words.     * @todo create a Set version of this ctor     */    public RussianAnalyzer(char[] charset, Hashtable stopwords)    {        this.charset = charset;        stopSet = new HashSet(stopwords.keySet());    }    /**     * Creates a TokenStream which tokenizes all the text in the provided Reader.     
*     * @return  A TokenStream build from a RussianLetterTokenizer filtered with     *                  RussianLowerCaseFilter, StopFilter, and RussianStemFilter     */    public TokenStream tokenStream(String fieldName, Reader reader)    {        TokenStream result = new RussianLetterTokenizer(reader, charset);        result = new RussianLowerCaseFilter(result, charset);        result = new StopFilter(result, stopSet);        result = new RussianStemFilter(result, charset);        return result;    }}
russiananalyzer.java - 源码说明

本页面展示了「一套java版本的搜索引擎源码」中的 russiananalyzer.java 源码文件，采用 Java 编程语言编写，共 261 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与java相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码：Ctrl + C
搜索代码：Ctrl + F
全屏模式：F11
增大字号：Ctrl + =
减小字号：Ctrl + -
显示快捷键：?