📄 russianstemmer.java
字号:
package org.apache.lucene.analysis.ru;/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. *//** * Russian stemming algorithm implementation (see http://snowball.sourceforge.net for detailed description). * * @author Boris Okner, b.okner@rogers.com * @version $Id: RussianStemmer.java 472959 2006-11-09 16:21:50Z yonik $ */class RussianStemmer{ private char[] charset; // positions of RV, R1 and R2 respectively private int RV, R1, R2; // letters (currently unused letters are commented out) private final static char A = 0; //private final static char B = 1; private final static char V = 2; private final static char G = 3; //private final static char D = 4; private final static char E = 5; //private final static char ZH = 6; //private final static char Z = 7; private final static char I = 8; private final static char I_ = 9; //private final static char K = 10; private final static char L = 11; private final static char M = 12; private final static char N = 13; private final static char O = 14; //private final static char P = 15; //private final static char R = 16; private final static char S = 17; private final static char T = 18; private final static char U = 19; //private final static char F = 20; private final static char X = 21; //private final static char TS = 22; //private final static char CH = 23; private final static char SH = 24; private final static char SHCH = 25; //private final static char HARD = 26; private final static char Y = 27; private final static char SOFT = 28; private final static char AE = 29; private final static char IU = 30; private final static char IA = 31; // stem definitions private static char[] vowels = { A, E, I, O, U, Y, AE, IU, IA }; private static char[][] perfectiveGerundEndings1 = { { V }, { V, SH, I }, { V, SH, I, S, SOFT } }; private static char[][] perfectiveGerund1Predessors = { { A }, { IA } }; private static char[][] perfectiveGerundEndings2 = { { I, V }, { Y, V }, { I, V, SH, I }, { Y, V, SH, I }, { I, V, SH, I, S, SOFT }, { Y, V, SH, I, S, SOFT } }; private static char[][] adjectiveEndings = { { E, E }, { I, E }, { Y, E }, { O, E }, { E, I_ }, { I, I_ }, { Y, I_ }, { O, I_ }, { E, M }, { I, M }, { Y, M }, { O, M }, { I, X }, { Y, X }, { U, IU }, { IU, IU }, { A, IA }, { IA, IA }, { O, IU }, { E, IU }, { I, M, I }, { Y, M, I }, { E, G, O }, { O, G, O }, { E, M, U }, {O, M, U } }; private static char[][] participleEndings1 = { { SHCH }, { E, M }, { N, N }, { V, SH }, { IU, SHCH } }; private static char[][] participleEndings2 = { { I, V, SH }, { Y, V, SH }, { U, IU, SHCH } }; private static char[][] participle1Predessors = { { A }, { IA } }; private static char[][] reflexiveEndings = { { S, IA }, { S, SOFT } }; private static char[][] verbEndings1 = { { I_ }, { L }, { N }, { L, O }, { N, O }, { E, T }, { IU, T }, { L, A }, { N, A }, { L, I }, { E, M }, { N, Y }, { E, T, E }, { I_, T, E }, { T, SOFT }, { E, SH, SOFT }, { N, N, O } }; private static char[][] verbEndings2 = { { IU }, { U, IU }, { E, N }, { E, I_ }, { IA, T }, { U, I_ }, { I, L }, { Y, L }, { I, M }, { Y, M }, { I, T }, { Y, T }, { I, L, A }, { Y, L, A }, { E, N, A }, { I, T, E }, { I, L, I }, { Y, L, I }, { I, L, O }, { Y, L, O }, { E, N, O }, { U, E, T }, { U, IU, T }, { E, N, Y }, { I, T, SOFT }, { Y, T, SOFT }, { I, SH, SOFT }, { E, I_, T, E }, { U, I_, T, E } }; private static char[][] verb1Predessors = { { A }, { IA } }; private static char[][] nounEndings = { { A }, { U }, { I_ }, { O }, { U }, { E }, { Y }, { I }, { SOFT }, { IA }, { E, V }, { O, V }, { I, E }, { SOFT, E }, { IA, X }, { I, IU }, { E, I }, { I, I }, { E, I_ }, { O, I_ }, { E, M }, { A, M }, { O, M }, { A, X }, { SOFT, IU }, { I, IA }, { SOFT, IA }, { I, I_ }, { IA, M }, { IA, M, I }, { A, M, I }, { I, E, I_ }, { I, IA, M }, { I, E, M }, { I, IA, X }, { I, IA, M, I } }; private static char[][] superlativeEndings = { { E, I_, SH }, { E, I_, SH, E } }; private static char[][] derivationalEndings = { { O, S, T }, { O, S, T, SOFT } }; /** * RussianStemmer constructor comment. */ public RussianStemmer() { super(); } /** * RussianStemmer constructor comment. */ public RussianStemmer(char[] charset) { super(); this.charset = charset; } /** * Adjectival ending is an adjective ending, * optionally preceded by participle ending. * Creation date: (17/03/2002 12:14:58 AM) * @param stemmingZone java.lang.StringBuffer */ private boolean adjectival(StringBuffer stemmingZone) { // look for adjective ending in a stemming zone if (!findAndRemoveEnding(stemmingZone, adjectiveEndings)) return false; // if adjective ending was found, try for participle ending. // variable r is unused, we are just interested in the side effect of // findAndRemoveEnding(): boolean r = findAndRemoveEnding(stemmingZone, participleEndings1, participle1Predessors) || findAndRemoveEnding(stemmingZone, participleEndings2); return true; } /** * Derivational endings * Creation date: (17/03/2002 12:14:58 AM) * @param stemmingZone java.lang.StringBuffer */ private boolean derivational(StringBuffer stemmingZone) { int endingLength = findEnding(stemmingZone, derivationalEndings); if (endingLength == 0) // no derivational ending found return false; else { // Ensure that the ending locates in R2 if (R2 - RV <= stemmingZone.length() - endingLength) { stemmingZone.setLength(stemmingZone.length() - endingLength); return true; } else { return false; } } } /** * Finds ending among given ending class and returns the length of ending found(0, if not found). * Creation date: (17/03/2002 8:18:34 PM) */ private int findEnding(StringBuffer stemmingZone, int startIndex, char[][] theEndingClass)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -