📄 russianstemmer.java
字号:
package org.apache.lucene.analysis.ru;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Russian stemming algorithm implementation (see http://snowball.sourceforge.net for detailed description).
*
* @author Boris Okner, b.okner@rogers.com
* @version $Id: RussianStemmer.java,v 1.5 2004/03/29 22:48:01 cutting Exp $
*/
class RussianStemmer
{
// Translation table from the letter indices declared below to actual characters;
// findEnding reads it as charset[letterIndex]. It has no initializer, so it stays
// null unless a table is supplied (e.g. through the char[] constructor).
private char[] charset;
// positions of RV, R1 and R2 respectively
private int RV, R1, R2;
// Letter indices into charset (0..31). They are constants, hence final.
// The names look like a Latin transliteration of the Russian alphabet in
// alphabetical order -- NOTE(review): the concrete characters depend entirely
// on the charset table that is supplied; confirm against the caller.
private static final char A = 0;
private static final char B = 1;
private static final char V = 2;
private static final char G = 3;
private static final char D = 4;
private static final char E = 5;
private static final char ZH = 6;
private static final char Z = 7;
private static final char I = 8;
private static final char I_ = 9;
private static final char K = 10;
private static final char L = 11;
private static final char M = 12;
private static final char N = 13;
private static final char O = 14;
private static final char P = 15;
private static final char R = 16;
private static final char S = 17;
private static final char T = 18;
private static final char U = 19;
private static final char F = 20;
private static final char X = 21;
private static final char TS = 22;
private static final char CH = 23;
private static final char SH = 24;
private static final char SHCH = 25;
private static final char HARD = 26;
private static final char Y = 27;
private static final char SOFT = 28;
private static final char AE = 29;
private static final char IU = 30;
private static final char IA = 31;
// stem definitions
// Every row in the tables below is one ending, spelled as letter indices that are
// translated through charset when matched. findEnding walks a table from its last
// row to its first and returns the first row that matches, so row order is
// significant and must not be changed casually.

// Letter indices treated as vowels. Not referenced by the methods visible next to
// these tables -- presumably used when locating RV/R1/R2; confirm.
private static final char[] vowels = { A, E, I, O, U, Y, AE, IU, IA };
// Perfective gerund endings, group 1. The accompanying "Predessors" table
// suggests these only count when preceded by one of its rows -- TODO confirm
// at the call site, which is outside this view.
private static final char[][] perfectiveGerundEndings1 = {
    { V },
    { V, SH, I },
    { V, SH, I, S, SOFT }
};
// Letters expected immediately before a group 1 perfective gerund ending.
private static final char[][] perfectiveGerund1Predessors = {
    { A },
    { IA }
};
// Perfective gerund endings, group 2.
private static final char[][] perfectiveGerundEndings2 = {
    { I, V },
    { Y, V },
    { I, V, SH, I },
    { Y, V, SH, I },
    { I, V, SH, I, S, SOFT },
    { Y, V, SH, I, S, SOFT }
};
// Adjective endings, as rows of letter indices (translated through charset when
// matched). adjectival() removes one of these from the end of the stemming zone.
// findEnding scans the rows from last to first and returns the first match,
// so keep the order as is.
private static final char[][] adjectiveEndings = {
    { E, E },
    { I, E },
    { Y, E },
    { O, E },
    { E, I_ },
    { I, I_ },
    { Y, I_ },
    { O, I_ },
    { E, M },
    { I, M },
    { Y, M },
    { O, M },
    { I, X },
    { Y, X },
    { U, IU },
    { IU, IU },
    { A, IA },
    { IA, IA },
    { O, IU },
    { E, IU },
    { I, M, I },
    { Y, M, I },
    { E, G, O },
    { O, G, O },
    { E, M, U },
    { O, M, U }
};
// Participle endings, group 1. adjectival() removes one of these only when it is
// directly preceded by a row of participle1Predessors.
private static final char[][] participleEndings1 = {
    { SHCH },
    { E, M },
    { N, N },
    { V, SH },
    { IU, SHCH }
};
// Participle endings, group 2. adjectival() tries these, without any predecessor
// requirement, when no group 1 ending was removed.
private static final char[][] participleEndings2 = {
    { I, V, SH },
    { Y, V, SH },
    { U, IU, SHCH }
};
// Letters that must immediately precede a group 1 participle ending.
private static final char[][] participle1Predessors = {
    { A },
    { IA }
};
// Reflexive endings. Not referenced by the methods visible next to this table;
// presumably consumed by a reflexive-removal step elsewhere in the class.
private static final char[][] reflexiveEndings = {
    { S, IA },
    { S, SOFT }
};
// Verb endings, group 1, as rows of letter indices (translated through charset
// when matched). The accompanying verb1Predessors table suggests these only count
// when preceded by one of its rows -- TODO confirm at the call site, which is
// outside this view. Row order matters: findEnding scans from the last row to
// the first and returns the first match.
private static final char[][] verbEndings1 = {
    { I_ },
    { L },
    { N },
    { L, O },
    { N, O },
    { E, T },
    { IU, T },
    { L, A },
    { N, A },
    { L, I },
    { E, M },
    { N, Y },
    { E, T, E },
    { I_, T, E },
    { T, SOFT },
    { E, SH, SOFT },
    { N, N, O }
};
// Verb endings, group 2. Same row-order caveat as group 1.
private static final char[][] verbEndings2 = {
    { IU },
    { U, IU },
    { E, N },
    { E, I_ },
    { IA, T },
    { U, I_ },
    { I, L },
    { Y, L },
    { I, M },
    { Y, M },
    { I, T },
    { Y, T },
    { I, L, A },
    { Y, L, A },
    { E, N, A },
    { I, T, E },
    { I, L, I },
    { Y, L, I },
    { I, L, O },
    { Y, L, O },
    { E, N, O },
    { U, E, T },
    { U, IU, T },
    { E, N, Y },
    { I, T, SOFT },
    { Y, T, SOFT },
    { I, SH, SOFT },
    { E, I_, T, E },
    { U, I_, T, E }
};
// Letters expected immediately before a group 1 verb ending.
private static final char[][] verb1Predessors = {
    { A },
    { IA }
};
// Noun endings, as rows of letter indices (translated through charset when
// matched). findEnding scans the rows from last to first and returns the first
// match, so the longer endings placed further down are tried before the
// single-letter ones at the top; keep the order as is.
//
// NOTE(review): { U } appears twice (2nd and 5th row). The duplicate cannot
// change any result, because both rows match the same text and yield the same
// length. Possibly { IU } was intended for one of them -- confirm against the
// Snowball Russian ending list before editing the rows.
private static final char[][] nounEndings = {
    { A },
    { U },
    { I_ },
    { O },
    { U },
    { E },
    { Y },
    { I },
    { SOFT },
    { IA },
    { E, V },
    { O, V },
    { I, E },
    { SOFT, E },
    { IA, X },
    { I, IU },
    { E, I },
    { I, I },
    { E, I_ },
    { O, I_ },
    { E, M },
    { A, M },
    { O, M },
    { A, X },
    { SOFT, IU },
    { I, IA },
    { SOFT, IA },
    { I, I_ },
    { IA, M },
    { IA, M, I },
    { A, M, I },
    { I, E, I_ },
    { I, IA, M },
    { I, E, M },
    { I, IA, X },
    { I, IA, M, I }
};
// Superlative endings, as rows of letter indices. Not referenced by the methods
// visible next to this table; presumably consumed by a superlative step elsewhere
// in the class.
private static final char[][] superlativeEndings = {
    { E, I_, SH },
    { E, I_, SH, E }
};
// Derivational endings; derivational() removes one of these when it starts at or
// after the R2 position.
private static final char[][] derivationalEndings = {
    { O, S, T },
    { O, S, T, SOFT }
};
/**
* RussianStemmer constructor comment.
*/
public RussianStemmer()
{
super();
}
/**
 * Creates a stemmer that translates letter indices through the given table.
 *
 * @param charset table indexed by the letter constants of this class; the array
 *                is stored by reference, not copied
 */
public RussianStemmer(char[] charset)
{
    // the superclass constructor is invoked implicitly
    this.charset = charset;
}
/**
 * Removes an adjectival ending from the stemming zone. An adjectival ending is
 * an adjective ending, optionally preceded by a participle ending.
 *
 * @param stemmingZone buffer being stemmed; shortened in place when endings are found
 * @return true if an adjective ending was found and removed (whether or not a
 *         participle ending was removed as well), false if no adjective ending matched
 */
private boolean adjectival(StringBuffer stemmingZone)
{
    // look for adjective ending in a stemming zone
    if (!findAndRemoveEnding(stemmingZone, adjectiveEndings))
    {
        return false;
    }
    // The participle ending is optional, so the outcome of this step does not
    // affect the return value. Group 1 (which needs a predecessor) is tried
    // first; group 2 is tried only if group 1 removed nothing.
    if (!findAndRemoveEnding(stemmingZone, participleEndings1, participle1Predessors))
    {
        findAndRemoveEnding(stemmingZone, participleEndings2);
    }
    return true;
}
/**
 * Removes a derivational ending from the stemming zone, provided the ending
 * lies in R2.
 *
 * @param stemmingZone buffer being stemmed; shortened in place on success
 * @return true if a derivational ending was found in R2 and removed,
 *         false if none was found or the one found starts before R2
 */
private boolean derivational(StringBuffer stemmingZone)
{
    final int suffixLength = findEnding(stemmingZone, derivationalEndings);
    if (suffixLength == 0)
    {
        // no derivational ending found
        return false;
    }
    // offset inside the zone at which the ending begins
    final int suffixStart = stemmingZone.length() - suffixLength;
    // R2 - RV is the R2 boundary expressed as an offset inside the zone
    // (presumably the zone starts at RV); the ending must not begin before it
    if (suffixStart < R2 - RV)
    {
        return false;
    }
    stemmingZone.setLength(suffixStart);
    return true;
}
/**
 * Looks for one of the given endings in the stemming zone, with the ending's
 * last letter positioned at startIndex.
 * Candidates are tried from the last row of the class to the first, and the
 * first one that matches wins.
 *
 * @param stemmingZone   text to inspect; not modified
 * @param startIndex     index in stemmingZone where the ending's last letter must sit
 * @param theEndingClass candidate endings, each a row of letter indices into charset
 * @return length of the ending found, or 0 if none of the candidates matches
 */
private int findEnding(StringBuffer stemmingZone, int startIndex, char[][] theEndingClass)
{
    for (int row = theEndingClass.length - 1; row >= 0; row--)
    {
        final char[] candidate = theEndingClass[row];
        // a candidate reaching back past the start of the zone cannot match
        if (candidate.length - 1 > startIndex)
        {
            continue;
        }
        // zone index aligned with the candidate's first letter
        final int firstPos = startIndex - (candidate.length - 1);
        // compare right to left, stopping at the first mismatch
        int k = candidate.length - 1;
        while (k >= 0 && stemmingZone.charAt(firstPos + k) == charset[candidate[k]])
        {
            k--;
        }
        if (k < 0)
        {
            // every letter agreed
            return candidate.length;
        }
    }
    return 0;
}
/**
 * Looks for one of the given endings at the very end of the stemming zone.
 *
 * @param stemmingZone   text to inspect; not modified
 * @param theEndingClass candidate endings, each a row of letter indices
 * @return length of the ending found, or 0 if none matches
 */
private int findEnding(StringBuffer stemmingZone, char[][] theEndingClass)
{
    // anchor the search on the last character of the zone
    final int lastIndex = stemmingZone.length() - 1;
    return findEnding(stemmingZone, lastIndex, theEndingClass);
}
/**
 * Looks for one of the given endings at the end of the stemming zone and,
 * if one is found, cuts it off.
 *
 * @param stemmingZone   buffer being stemmed; shortened in place on success
 * @param theEndingClass candidate endings, each a row of letter indices
 * @return true if an ending was found and removed, false otherwise
 */
private boolean findAndRemoveEnding(StringBuffer stemmingZone, char[][] theEndingClass)
{
    final int matchedLength = findEnding(stemmingZone, theEndingClass);
    final boolean found = matchedLength != 0;
    if (found)
    {
        // cut the ending found
        stemmingZone.setLength(stemmingZone.length() - matchedLength);
    }
    return found;
}
/**
 * Looks for one of the given endings at the end of the stemming zone and cuts
 * it off, but only if the ending is immediately preceded by one of the given
 * predecessors. The predecessor itself is left in place.
 *
 * @param stemmingZone   buffer being stemmed; shortened in place on success
 * @param theEndingClass candidate endings, each a row of letter indices
 * @param thePredessors  candidate predecessors, each a row of letter indices
 * @return true if an ending with a valid predecessor was found and removed,
 *         false otherwise
 */
private boolean findAndRemoveEnding(StringBuffer stemmingZone,
    char[][] theEndingClass, char[][] thePredessors)
{
    final int matchedLength = findEnding(stemmingZone, theEndingClass);
    if (matchedLength == 0)
    {
        // no ending found
        return false;
    }
    // length of the zone once the ending is cut off
    final int keptLength = stemmingZone.length() - matchedLength;
    // the predecessor has to end on the last character that would remain
    final int predecessorLength = findEnding(stemmingZone, keptLength - 1, thePredessors);
    if (predecessorLength == 0)
    {
        return false;
    }
    // cut the ending found
    stemmingZone.setLength(keptLength);
    return true;
}
/**
* Marks positions of RV, R1 and R2 in a given word.
* Creation date: (16/03/2002 3:40:11 PM)
*/
private void markPositions(String word)
{
RV = 0;
R1 = 0;
R2 = 0;
int i = 0;
// find RV
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -