📄 lovinsstemmer.java
字号:
throw new IllegalArgumentException("Fatal error.");
}
}
}
el--;
}
return word;
}
/**
 * Recodes the ending of the given word. The rules are applied one after the
 * other, each one operating on the result of the rules before it, so more
 * than one rule may fire for a single word.
 *
 * @param word the word whose ending is to be recoded
 * @return the word with its ending recoded
 */
private String recodeEnding(String word) {
    // Rule 1: drop one letter of a trailing bb, dd, gg, ll, mm, nn, pp, rr, ss or tt
    int len = word.length();
    if (len >= 2 && word.charAt(len - 1) == word.charAt(len - 2)
            && "bdglmnprst".indexOf(word.charAt(len - 1)) >= 0) {
        word = word.substring(0, len - 1);
    }

    word = recodeSuffix(word, "iev", "ief", "");     // Rule 2
    word = recodeSuffix(word, "uct", "uc", "");      // Rule 3
    word = recodeSuffix(word, "umpt", "um", "");     // Rule 4
    word = recodeSuffix(word, "rpt", "rb", "");      // Rule 5
    word = recodeSuffix(word, "urs", "ur", "");      // Rule 6
    word = recodeSuffix(word, "istr", "ister", "");  // Rule 7
    word = recodeSuffix(word, "metr", "meter", "");  // Rule 7a
    word = recodeSuffix(word, "olv", "olut", "");    // Rule 8
    word = recodeSuffix(word, "ul", "l", "aio");     // Rule 9 (not after a, i, o)
    word = recodeSuffix(word, "bex", "bic", "");     // Rule 10
    word = recodeSuffix(word, "dex", "dic", "");     // Rule 11
    word = recodeSuffix(word, "pex", "pic", "");     // Rule 12
    word = recodeSuffix(word, "tex", "tic", "");     // Rule 13
    word = recodeSuffix(word, "ax", "ac", "");       // Rule 14
    word = recodeSuffix(word, "ex", "ec", "");       // Rule 15
    word = recodeSuffix(word, "ix", "ic", "");       // Rule 16
    word = recodeSuffix(word, "lux", "luc", "");     // Rule 17
    word = recodeSuffix(word, "uad", "uas", "");     // Rule 18
    word = recodeSuffix(word, "vad", "vas", "");     // Rule 19
    word = recodeSuffix(word, "cid", "cis", "");     // Rule 20
    word = recodeSuffix(word, "lid", "lis", "");     // Rule 21
    word = recodeSuffix(word, "erid", "eris", "");   // Rule 22
    word = recodeSuffix(word, "pand", "pans", "");   // Rule 23
    word = recodeSuffix(word, "end", "ens", "s");    // Rule 24 (not after s)
    word = recodeSuffix(word, "ond", "ons", "");     // Rule 25
    word = recodeSuffix(word, "lud", "lus", "");     // Rule 26
    word = recodeSuffix(word, "rud", "rus", "");     // Rule 27
    word = recodeSuffix(word, "her", "hes", "pt");   // Rule 28 (not after p, t)
    word = recodeSuffix(word, "mit", "mis", "");     // Rule 29
    // Rule 30 must match "ent", not "end": testing "end" a second time would
    // override the 's' exception of Rule 24 (turning "send" into "sens") and
    // would leave "...ent" endings unrecoded.
    word = recodeSuffix(word, "ent", "ens", "m");    // Rule 30 (not after m)
    word = recodeSuffix(word, "ert", "ers", "");     // Rule 31
    word = recodeSuffix(word, "et", "es", "n");      // Rule 32 (not after n)
    word = recodeSuffix(word, "yt", "ys", "");       // Rule 33
    word = recodeSuffix(word, "yz", "ys", "");       // Rule 34
    return word;
}

/**
 * Replaces a trailing suffix of the word unless the character directly in
 * front of the suffix is one of the blocking characters.
 *
 * @param word the word to examine
 * @param suffix the ending to look for
 * @param replacement the text that takes the place of the suffix
 * @param blockedBy characters that, when immediately preceding the suffix,
 *          prevent the replacement; empty if the rule has no exception
 * @return the recoded word, or the unchanged word if the rule does not apply
 */
private static String recodeSuffix(String word, String suffix, String replacement, String blockedBy) {
    if (!word.endsWith(suffix)) {
        return word;
    }
    int stemLength = word.length() - suffix.length();
    // A suffix that makes up the whole word has no preceding character and
    // is therefore never blocked.
    if (stemLength > 0 && blockedBy.indexOf(word.charAt(stemLength - 1)) >= 0) {
        return word;
    }
    return word.substring(0, stemLength).concat(replacement);
}
/**
 * Returns the stemmed version of the given word. The word is converted to
 * lower case first, so callers need not lower-case it themselves. Words of
 * one or two characters are only lower-cased; longer words have their
 * ending removed and then recoded.
 *
 * @param word a string consisting of a single word
 * @return the lower-cased stem of the word
 */
public String stem(String word) {
    // Lower-case with a fixed locale: with the JVM default locale the result
    // would vary by platform (e.g. 'I' becomes a dotless i under a Turkish
    // locale), and the suffix comparisons would then fail to match.
    String lower = word.toLowerCase(java.util.Locale.ENGLISH);
    if (word.length() > 2) {
        return recodeEnding(removeEnding(lower));
    }
    return lower;
}
/**
 * Stems every word in the given string. A word is a run that starts with a
 * letter or digit and continues over letters, digits and apostrophes; each
 * such run is replaced by the result of {@link #stem(String)}. All other
 * characters are copied to the output unchanged.
 *
 * @param str the text to process
 * @return the text with each word replaced by its stem
 */
public String stemString(String str) {
    final int length = str.length();
    StringBuffer out = new StringBuffer();
    int tokenStart = -1; // index where the current word began, -1 when outside a word

    for (int i = 0; i < length; i++) {
        char ch = str.charAt(i);
        boolean insideToken = (tokenStart != -1);

        if (Character.isLetterOrDigit(ch)) {
            if (!insideToken) {
                tokenStart = i;
            }
            continue;
        }

        if (ch == '\'') {
            // An apostrophe inside a word stays part of that word; outside
            // of a word it is copied through like any other character.
            if (!insideToken) {
                out.append(ch);
            }
            continue;
        }

        // Any other character ends the current word, if there is one.
        if (insideToken) {
            out.append(stem(str.substring(tokenStart, i)));
            tokenStart = -1;
        }
        out.append(ch);
    }

    // The input may end in the middle of a word.
    if (tokenStart != -1) {
        out.append(stem(str.substring(tokenStart)));
    }
    return out.toString();
}
/**
 * Stems text coming into stdin and writes it to stdout. Runs of the ASCII
 * letters A-Z and a-z are treated as words and replaced by their stems;
 * every other character is echoed as read.
 *
 * @param ops command line arguments (not used)
 */
public static void main(String[] ops) {
    LovinsStemmer ls = new LovinsStemmer();
    try {
        int num;
        StringBuffer wordBuffer = new StringBuffer();
        while ((num = System.in.read()) != -1) {
            char c = (char) num;
            if (((num >= (int) 'A') && (num <= (int) 'Z')) || ((num >= (int) 'a') && (num <= (int) 'z'))) {
                wordBuffer.append(c);
            } else {
                if (wordBuffer.length() > 0) {
                    // Fixed locale so that lower-casing of ASCII letters does
                    // not depend on the JVM default locale.
                    System.out.print(ls.stem(wordBuffer.toString().toLowerCase(java.util.Locale.ENGLISH)));
                    wordBuffer.setLength(0);
                }
                System.out.print(c);
            }
        }
        // The input may end directly after a letter; without this the last
        // word would be silently dropped.
        if (wordBuffer.length() > 0) {
            System.out.print(ls.stem(wordBuffer.toString().toLowerCase(java.util.Locale.ENGLISH)));
        }
        System.out.flush();
    } catch (Exception e) {
        WVToolLogger.getGlobalLogger().logException("Could not build base form", e);
    }
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -