📄 russianstemmer.java
字号:
while (word.length() > i && !isVowel(word.charAt(i)))
{
i++;
}
if (word.length() - 1 < ++i)
return; // RV zone is empty
RV = i;
// find R1
while (word.length() > i && isVowel(word.charAt(i)))
{
i++;
}
if (word.length() - 1 < ++i)
return; // R1 zone is empty
R1 = i;
// find R2
while (word.length() > i && !isVowel(word.charAt(i)))
{
i++;
}
if (word.length() - 1 < ++i)
return; // R2 zone is empty
while (word.length() > i && isVowel(word.charAt(i)))
{
i++;
}
if (word.length() - 1 < ++i)
return; // R2 zone is empty
R2 = i;
}
/**
 * Tells whether the given character is a vowel in the current charset.
 * Every entry of {@code vowels} is used as an index into {@code charset};
 * the letter is a vowel when it equals any character looked up that way.
 *
 * @param letter the character to test
 * @return {@code true} if {@code letter} matches one of the vowels,
 *         {@code false} otherwise
 */
private boolean isVowel(char letter)
{
    int idx = 0;
    while (idx < vowels.length)
    {
        if (charset[vowels[idx]] == letter)
        {
            return true;
        }
        idx++;
    }
    return false;
}
/**
 * Noun endings step: hands the stemming zone and the {@code nounEndings}
 * table to {@code findAndRemoveEnding}.
 *
 * @param stemmingZone buffer holding the part of the word being stemmed
 * @return the result reported by {@code findAndRemoveEnding}
 */
private boolean noun(StringBuffer stemmingZone)
{
    final boolean removed = findAndRemoveEnding(stemmingZone, nounEndings);
    return removed;
}
/**
 * Perfective gerund endings step. The first group of endings is tried
 * together with its predecessor table; only if that attempt reports
 * {@code false} is the second group tried.
 *
 * @param stemmingZone buffer holding the part of the word being stemmed
 * @return {@code true} if either attempt reported {@code true}
 */
private boolean perfectiveGerund(StringBuffer stemmingZone)
{
    if (findAndRemoveEnding(
            stemmingZone,
            perfectiveGerundEndings1,
            perfectiveGerund1Predessors))
    {
        return true;
    }
    return findAndRemoveEnding(stemmingZone, perfectiveGerundEndings2);
}
/**
 * Reflexive endings step: hands the stemming zone and the
 * {@code reflexiveEndings} table to {@code findAndRemoveEnding}.
 *
 * @param stemmingZone buffer holding the part of the word being stemmed
 * @return the result reported by {@code findAndRemoveEnding}
 */
private boolean reflexive(StringBuffer stemmingZone)
{
    final boolean removed = findAndRemoveEnding(stemmingZone, reflexiveEndings);
    return removed;
}
/**
 * Drops the last character of the stemming zone when that character is
 * {@code charset[I]}. An empty buffer is left untouched.
 *
 * @param stemmingZone buffer holding the part of the word being stemmed
 * @return {@code true} if a trailing character was removed,
 *         {@code false} otherwise
 */
private boolean removeI(StringBuffer stemmingZone)
{
    final int last = stemmingZone.length() - 1;
    // Nothing to do for an empty buffer or a different final letter.
    if (last < 0 || stemmingZone.charAt(last) != charset[I])
    {
        return false;
    }
    stemmingZone.setLength(last);
    return true;
}
/**
 * Drops the last character of the stemming zone when that character is
 * {@code charset[SOFT]}. An empty buffer is left untouched.
 *
 * @param stemmingZone buffer holding the part of the word being stemmed
 * @return {@code true} if a trailing character was removed,
 *         {@code false} otherwise
 */
private boolean removeSoft(StringBuffer stemmingZone)
{
    final int last = stemmingZone.length() - 1;
    // Nothing to do for an empty buffer or a different final letter.
    if (last < 0 || stemmingZone.charAt(last) != charset[SOFT])
    {
        return false;
    }
    stemmingZone.setLength(last);
    return true;
}
/**
 * Replaces the character table used by this stemmer. The array reference
 * is stored as given; no copy is made.
 *
 * @param newCharset the character table to use from now on
 */
public void setCharset(char[] newCharset)
{
    this.charset = newCharset;
}
/**
 * Set ending definition as in Russian stemming algorithm.
 * Populates the vowel table and every ending/predecessor table read by
 * the stemming steps (perfective gerund, adjective, participle, reflexive,
 * verb, noun, superlative, derivational).
 * Creation date: (16/03/2002 11:16:36 PM)
 */
private void setEndings()
{
// Letter constants stored here are used elsewhere as indices into charset
// (isVowel reads charset[vowels[i]]).
vowels = new char[] { A, E, I, O, U, Y, AE, IU, IA };
// NOTE(review): the order of entries inside each table below may be
// significant to findEnding/findAndRemoveEnding (not visible here) --
// do not reorder without confirming.
// Perfective gerund, group 1: passed to findAndRemoveEnding together with
// perfectiveGerund1Predessors.
perfectiveGerundEndings1 = new char[][] {
{ V }, { V, SH, I }, { V, SH, I, S, SOFT }
};
perfectiveGerund1Predessors = new char[][] { { A }, { IA }
};
// Perfective gerund, group 2: passed to findAndRemoveEnding on its own.
perfectiveGerundEndings2 = new char[][] {
{ I, V },
{ Y, V },
{ I, V, SH, I },
{ Y, V, SH, I },
{ I, V, SH, I, S, SOFT },
{ Y, V, SH, I, S, SOFT }
};
// Adjective endings.
adjectiveEndings = new char[][] {
{ E, E },
{ I, E },
{ Y, E },
{ O, E },
{ E, I_ },
{ I, I_ },
{ Y, I_ },
{ O, I_ },
{ E, M },
{ I, M },
{ Y, M },
{ O, M },
{ I, X },
{ Y, X },
{ U, IU },
{ IU, IU },
{ A, IA },
{ IA, IA },
{ O, IU },
{ E, IU },
{ I, M, I },
{ Y, M, I },
{ E, G, O },
{ O, G, O },
{ E, M, U },
{ O, M, U }
};
// Participle endings, group 1 (presumably paired with
// participle1Predessors by the caller -- confirm against its user).
participleEndings1 = new char[][] {
{ SHCH },
{ E, M },
{ N, N },
{ V, SH },
{ IU, SHCH }
};
// Participle endings, group 2.
participleEndings2 = new char[][] {
{ I, V, SH },
{ Y, V, SH },
{ U, IU, SHCH }
};
participle1Predessors = new char[][] {
{ A },
{ IA }
};
// Reflexive endings, read by reflexive().
reflexiveEndings = new char[][] {
{ S, IA },
{ S, SOFT }
};
// Verb endings, group 1: passed to findAndRemoveEnding together with
// verb1Predessors by verb().
verbEndings1 = new char[][] {
{ I_ },
{ L },
{ N },
{ L, O },
{ N, O },
{ E, T },
{ IU, T },
{ L, A },
{ N, A },
{ L, I },
{ E, M },
{ N, Y },
{ E, T, E },
{ I_, T, E },
{ T, SOFT },
{ E, SH, SOFT },
{ N, N, O }
};
// Verb endings, group 2: passed to findAndRemoveEnding on its own by verb().
verbEndings2 = new char[][] {
{ IU },
{ U, IU },
{ E, N },
{ E, I_ },
{ IA, T },
{ U, I_ },
{ I, L },
{ Y, L },
{ I, M },
{ Y, M },
{ I, T },
{ Y, T },
{ I, L, A },
{ Y, L, A },
{ E, N, A },
{ I, T, E },
{ I, L, I },
{ Y, L, I },
{ I, L, O },
{ Y, L, O },
{ E, N, O },
{ U, E, T },
{ U, IU, T },
{ E, N, Y },
{ I, T, SOFT },
{ Y, T, SOFT },
{ I, SH, SOFT },
{ E, I_, T, E },
{ U, I_, T, E }
};
verb1Predessors = new char[][] {
{ A },
{ IA }
};
// Noun endings, read by noun().
nounEndings = new char[][] {
{ A },
{ IU },
{ I_ },
{ O },
{ U },
{ E },
{ Y },
{ I },
{ SOFT },
{ IA },
{ E, V },
{ O, V },
{ I, E },
{ SOFT, E },
{ IA, X },
{ I, IU },
{ E, I },
{ I, I },
{ E, I_ },
{ O, I_ },
{ E, M },
{ A, M },
{ O, M },
{ A, X },
{ SOFT, IU },
{ I, IA },
{ SOFT, IA },
{ I, I_ },
{ IA, M },
{ IA, M, I },
{ A, M, I },
{ I, E, I_ },
{ I, IA, M },
{ I, E, M },
{ I, IA, X },
{ I, IA, M, I }
};
// Superlative endings, read by superlative().
superlativeEndings = new char[][] {
{ E, I_, SH },
{ E, I_, SH, E }
};
// Derivational endings.
derivationalEndings = new char[][] {
{ O, S, T },
{ O, S, T, SOFT }
};
}
/**
 * Finds the stem for given Russian word.
 * <p>
 * {@code markPositions} is called first; if {@code RV} is still 0 afterwards
 * the input is returned unchanged. Otherwise only the part of the word
 * starting at index {@code RV} is processed, and the result is the untouched
 * prefix {@code input.substring(0, RV)} followed by the processed remainder.
 *
 * @param input the word to stem
 * @return the stemmed word, or {@code input} itself when no RV zone was found
 */
public String stem(String input)
{
    markPositions(input);
    if (RV == 0)
    {
        return input; // RV wasn't detected, nothing to stem
    }
    // Stemming goes on in RV only.
    StringBuffer stemmingZone = new StringBuffer(input.substring(RV));
    // Step 1: perfective gerund, otherwise reflexive followed by the first
    // of adjectival / verb / noun that succeeds.
    if (!perfectiveGerund(stemmingZone))
    {
        reflexive(stemmingZone);
        // Short-circuit keeps the original call order: verb is tried only
        // when adjectival fails, noun only when both fail.
        if (!adjectival(stemmingZone) && !verb(stemmingZone))
        {
            noun(stemmingZone);
        }
    }
    // Step 2
    removeI(stemmingZone);
    // Step 3
    derivational(stemmingZone);
    // Step 4
    superlative(stemmingZone);
    undoubleN(stemmingZone);
    removeSoft(stemmingZone);
    // Re-attach the prefix that precedes RV.
    return input.substring(0, RV) + stemmingZone.toString();
}
/**
 * Superlative endings step: hands the stemming zone and the
 * {@code superlativeEndings} table to {@code findAndRemoveEnding}.
 *
 * @param stemmingZone buffer holding the part of the word being stemmed
 * @return the result reported by {@code findAndRemoveEnding}
 */
private boolean superlative(StringBuffer stemmingZone)
{
    final boolean removed = findAndRemoveEnding(stemmingZone, superlativeEndings);
    return removed;
}
/**
 * Undoubles a trailing N: when {@code findEnding} reports a non-zero result
 * for the ending {@code { N, N }}, the stemming zone is shortened by one
 * character.
 *
 * @param stemmingZone buffer holding the part of the word being stemmed
 * @return {@code true} if the buffer was shortened, {@code false} otherwise
 */
private boolean undoubleN(StringBuffer stemmingZone)
{
    final char[][] doubledN = new char[][] { { N, N } };
    if (findEnding(stemmingZone, doubledN) == 0)
    {
        return false;
    }
    stemmingZone.setLength(stemmingZone.length() - 1);
    return true;
}
/**
 * Verb endings step. The first group of endings is tried together with its
 * predecessor table; only if that attempt reports {@code false} is the
 * second group tried.
 *
 * @param stemmingZone buffer holding the part of the word being stemmed
 * @return {@code true} if either attempt reported {@code true}
 */
private boolean verb(StringBuffer stemmingZone)
{
    if (findAndRemoveEnding(stemmingZone, verbEndings1, verb1Predessors))
    {
        return true;
    }
    return findAndRemoveEnding(stemmingZone, verbEndings2);
}
/**
 * Convenience entry point for stemming with a caller-supplied charset.
 * Creates a fresh {@code RussianStemmer}, installs {@code charset} on it
 * and returns the result of stemming {@code theWord}.
 *
 * @param theWord the word to stem
 * @param charset the character table the new stemmer should use
 * @return the stem produced by the new stemmer instance
 */
public static String stem(String theWord, char[] charset)
{
    final RussianStemmer instance = new RussianStemmer();
    instance.setCharset(charset);
    return instance.stem(theWord);
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -