📄 russianstemmer.java
字号:
{ boolean match = false; for (int i = theEndingClass.length - 1; i >= 0; i--) { char[] theEnding = theEndingClass[i]; // check if the ending is bigger than stemming zone if (startIndex < theEnding.length - 1) { match = false; continue; } match = true; int stemmingIndex = startIndex; for (int j = theEnding.length - 1; j >= 0; j--) { if (stemmingZone.charAt(stemmingIndex--) != charset[theEnding[j]]) { match = false; break; } } // check if ending was found if (match) { return theEndingClass[i].length; // cut ending } } return 0; } private int findEnding(StringBuffer stemmingZone, char[][] theEndingClass) { return findEnding(stemmingZone, stemmingZone.length() - 1, theEndingClass); } /** * Finds the ending among the given class of endings and removes it from stemming zone. * Creation date: (17/03/2002 8:18:34 PM) */ private boolean findAndRemoveEnding(StringBuffer stemmingZone, char[][] theEndingClass) { int endingLength = findEnding(stemmingZone, theEndingClass); if (endingLength == 0) // not found return false; else { stemmingZone.setLength(stemmingZone.length() - endingLength); // cut the ending found return true; } } /** * Finds the ending among the given class of endings, then checks if this ending was * preceded by any of given predessors, and if so, removes it from stemming zone. * Creation date: (17/03/2002 8:18:34 PM) */ private boolean findAndRemoveEnding(StringBuffer stemmingZone, char[][] theEndingClass, char[][] thePredessors) { int endingLength = findEnding(stemmingZone, theEndingClass); if (endingLength == 0) // not found return false; else { int predessorLength = findEnding(stemmingZone, stemmingZone.length() - endingLength - 1, thePredessors); if (predessorLength == 0) return false; else { stemmingZone.setLength(stemmingZone.length() - endingLength); // cut the ending found return true; } } } /** * Marks positions of RV, R1 and R2 in a given word. * Creation date: (16/03/2002 3:40:11 PM) */ private void markPositions(String word) { RV = 0; R1 = 0; R2 = 0; int i = 0; // find RV while (word.length() > i && !isVowel(word.charAt(i))) { i++; } if (word.length() - 1 < ++i) return; // RV zone is empty RV = i; // find R1 while (word.length() > i && isVowel(word.charAt(i))) { i++; } if (word.length() - 1 < ++i) return; // R1 zone is empty R1 = i; // find R2 while (word.length() > i && !isVowel(word.charAt(i))) { i++; } if (word.length() - 1 < ++i) return; // R2 zone is empty while (word.length() > i && isVowel(word.charAt(i))) { i++; } if (word.length() - 1 < ++i) return; // R2 zone is empty R2 = i; } /** * Checks if character is a vowel.. * Creation date: (16/03/2002 10:47:03 PM) * @return boolean * @param letter char */ private boolean isVowel(char letter) { for (int i = 0; i < vowels.length; i++) { if (letter == charset[vowels[i]]) return true; } return false; } /** * Noun endings. * Creation date: (17/03/2002 12:14:58 AM) * @param stemmingZone java.lang.StringBuffer */ private boolean noun(StringBuffer stemmingZone) { return findAndRemoveEnding(stemmingZone, nounEndings); } /** * Perfective gerund endings. * Creation date: (17/03/2002 12:14:58 AM) * @param stemmingZone java.lang.StringBuffer */ private boolean perfectiveGerund(StringBuffer stemmingZone) { return findAndRemoveEnding( stemmingZone, perfectiveGerundEndings1, perfectiveGerund1Predessors) || findAndRemoveEnding(stemmingZone, perfectiveGerundEndings2); } /** * Reflexive endings. * Creation date: (17/03/2002 12:14:58 AM) * @param stemmingZone java.lang.StringBuffer */ private boolean reflexive(StringBuffer stemmingZone) { return findAndRemoveEnding(stemmingZone, reflexiveEndings); } /** * Insert the method's description here. * Creation date: (17/03/2002 12:14:58 AM) * @param stemmingZone java.lang.StringBuffer */ private boolean removeI(StringBuffer stemmingZone) { if (stemmingZone.length() > 0 && stemmingZone.charAt(stemmingZone.length() - 1) == charset[I]) { stemmingZone.setLength(stemmingZone.length() - 1); return true; } else { return false; } } /** * Insert the method's description here. * Creation date: (17/03/2002 12:14:58 AM) * @param stemmingZone java.lang.StringBuffer */ private boolean removeSoft(StringBuffer stemmingZone) { if (stemmingZone.length() > 0 && stemmingZone.charAt(stemmingZone.length() - 1) == charset[SOFT]) { stemmingZone.setLength(stemmingZone.length() - 1); return true; } else { return false; } } /** * Insert the method's description here. * Creation date: (16/03/2002 10:58:42 PM) * @param newCharset char[] */ public void setCharset(char[] newCharset) { charset = newCharset; } /** * Finds the stem for given Russian word. * Creation date: (16/03/2002 3:36:48 PM) * @return java.lang.String * @param input java.lang.String */ public String stem(String input) { markPositions(input); if (RV == 0) return input; //RV wasn't detected, nothing to stem StringBuffer stemmingZone = new StringBuffer(input.substring(RV)); // stemming goes on in RV // Step 1 if (!perfectiveGerund(stemmingZone)) { reflexive(stemmingZone); // variable r is unused, we are just interested in the flow that gets // created by logical expression: apply adjectival(); if that fails, // apply verb() etc boolean r = adjectival(stemmingZone) || verb(stemmingZone) || noun(stemmingZone); } // Step 2 removeI(stemmingZone); // Step 3 derivational(stemmingZone); // Step 4 superlative(stemmingZone); undoubleN(stemmingZone); removeSoft(stemmingZone); // return result return input.substring(0, RV) + stemmingZone.toString(); } /** * Superlative endings. * Creation date: (17/03/2002 12:14:58 AM) * @param stemmingZone java.lang.StringBuffer */ private boolean superlative(StringBuffer stemmingZone) { return findAndRemoveEnding(stemmingZone, superlativeEndings); } /** * Undoubles N. * Creation date: (17/03/2002 12:14:58 AM) * @param stemmingZone java.lang.StringBuffer */ private boolean undoubleN(StringBuffer stemmingZone) { char[][] doubleN = { { N, N } }; if (findEnding(stemmingZone, doubleN) != 0) { stemmingZone.setLength(stemmingZone.length() - 1); return true; } else { return false; } } /** * Verb endings. * Creation date: (17/03/2002 12:14:58 AM) * @param stemmingZone java.lang.StringBuffer */ private boolean verb(StringBuffer stemmingZone) { return findAndRemoveEnding( stemmingZone, verbEndings1, verb1Predessors) || findAndRemoveEnding(stemmingZone, verbEndings2); } /** * Static method for stemming with different charsets */ public static String stem(String theWord, char[] charset) { RussianStemmer stemmer = new RussianStemmer(); stemmer.setCharset(charset); return stemmer.stem(theWord); }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -