⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 lovinsstemmer.java

📁 一个很不错的词频统计程序,目前只支持英文,中文的本人正在修改中.改好后上传给大家分享
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
                        throw new IllegalArgumentException("Fatal error.");
                    }
                }
            }
            el--;
        }
        return word;
    }

    /** Consonants whose doubled occurrence at the end of a word is collapsed by rule 1. */
    private static final String DOUBLED_CONSONANTS = "bdglmnprst";

    /**
     * Recoding rules 2 to 34, in the order in which they are applied. Each row
     * is {suffix, replacement, exceptions}: the suffix is replaced unless the
     * letter immediately before it is one of the characters in
     * <code>exceptions</code>.
     */
    private static final String[][] RECODING_RULES = {
        {"iev", "ief", ""},      // Rule 2
        {"uct", "uc", ""},       // Rule 3
        {"umpt", "um", ""},      // Rule 4
        {"rpt", "rb", ""},       // Rule 5
        {"urs", "ur", ""},       // Rule 6
        {"istr", "ister", ""},   // Rule 7
        {"metr", "meter", ""},   // Rule 7a
        {"olv", "olut", ""},     // Rule 8
        {"ul", "l", "aio"},      // Rule 9: except following a, i, o
        {"bex", "bic", ""},      // Rule 10
        {"dex", "dic", ""},      // Rule 11
        {"pex", "pic", ""},      // Rule 12
        {"tex", "tic", ""},      // Rule 13
        {"ax", "ac", ""},        // Rule 14
        {"ex", "ec", ""},        // Rule 15
        {"ix", "ic", ""},        // Rule 16
        {"lux", "luc", ""},      // Rule 17
        {"uad", "uas", ""},      // Rule 18
        {"vad", "vas", ""},      // Rule 19
        {"cid", "cis", ""},      // Rule 20
        {"lid", "lis", ""},      // Rule 21
        {"erid", "eris", ""},    // Rule 22
        {"pand", "pans", ""},    // Rule 23
        {"end", "ens", "s"},     // Rule 24: except following s
        {"ond", "ons", ""},      // Rule 25
        {"lud", "lus", ""},      // Rule 26
        {"rud", "rus", ""},      // Rule 27
        {"her", "hes", "pt"},    // Rule 28: except following p, t
        {"mit", "mis", ""},      // Rule 29
        {"ent", "ens", "m"},     // Rule 30: except following m
        {"ert", "ers", ""},      // Rule 31
        {"et", "es", "n"},       // Rule 32: except following n
        {"yt", "ys", ""},        // Rule 33
        {"yz", "ys", ""}         // Rule 34
    };

    /**
     * Recodes the ending of the given word.
     * <p>
     * Rule 1 drops one letter of a trailing doubled consonant (bb, dd, gg, ll,
     * mm, nn, pp, rr, ss, tt). Afterwards every entry of
     * {@link #RECODING_RULES} is tried in order against the current form of the
     * word, so a later rule sees the result of an earlier one.
     *
     * @param word the word whose ending should be recoded
     * @return the word with its ending recoded; the input itself if no rule
     *         matches
     */
    private String recodeEnding(String word) {

        // Rule 1: undouble a trailing double consonant.
        int length = word.length();
        if (length >= 2) {
            char last = word.charAt(length - 1);
            if ((last == word.charAt(length - 2)) && (DOUBLED_CONSONANTS.indexOf(last) >= 0)) {
                word = word.substring(0, length - 1);
            }
        }

        // Rules 2 to 34, applied one after the other.
        for (int i = 0; i < RECODING_RULES.length; i++) {
            word = applyRecodingRule(word, RECODING_RULES[i]);
        }

        return word;
    }

    /**
     * Applies a single {suffix, replacement, exceptions} rule to the word.
     *
     * @param word the current form of the word
     * @param rule one row of {@link #RECODING_RULES}
     * @return the recoded word, or the unchanged word if the suffix does not
     *         match or is preceded by one of the rule's exception letters
     */
    private static String applyRecodingRule(String word, String[] rule) {

        String suffix = rule[0];
        if (!word.endsWith(suffix)) {
            return word;
        }
        int stemLength = word.length() - suffix.length();
        // An exception can only block the rule when a letter precedes the suffix.
        if ((stemLength > 0) && (rule[2].indexOf(word.charAt(stemLength - 1)) >= 0)) {
            return word;
        }
        return word.substring(0, stemLength).concat(rule[1]);
    }

    /**
     * Returns the stemmed version of the given word.
     * <p>
     * The word is lower-cased before stemming, using English casing rules so
     * that the result does not depend on the JVM's default locale (under a
     * Turkish locale, for instance, "I" would otherwise become a dotless i and
     * no longer match the suffix tables). Words of at most two characters are
     * only lower-cased and not stemmed.
     *
     * @param word a string consisting of a single word
     * @return the lower-cased word, with its ending removed and recoded if it
     *         is longer than two characters
     */
    public String stem(String word) {

        String lowered = word.toLowerCase(java.util.Locale.ENGLISH);
        if (word.length() > 2) {
            return recodeEnding(removeEnding(lowered));
        }
        return lowered;
    }

    /**
     * Stems every token found in the given string.
     * <p>
     * A token starts at a letter or digit and runs until the next character
     * that is neither a letter, a digit nor an apostrophe. Each token is
     * replaced by the result of {@link #stem(String)}; all characters outside
     * of tokens are copied to the result unchanged.
     *
     * @param str the text to process
     * @return the text with each token replaced by its stem
     */
    public String stemString(String str) {

        final int length = str.length();
        StringBuffer stemmed = new StringBuffer();
        int tokenStart = -1;

        for (int pos = 0; pos < length; pos++) {
            char current = str.charAt(pos);
            boolean insideToken = (tokenStart != -1);

            if (Character.isLetterOrDigit(current)) {
                if (!insideToken) {
                    tokenStart = pos;
                }
                continue;
            }

            // An apostrophe inside a token stays part of that token.
            if (insideToken && (current == '\'')) {
                continue;
            }

            // Any other character ends the running token, if there is one.
            if (insideToken) {
                stemmed.append(stem(str.substring(tokenStart, pos)));
                tokenStart = -1;
            }
            stemmed.append(current);
        }

        // Flush a token that runs up to the end of the input.
        if (tokenStart != -1) {
            stemmed.append(stem(str.substring(tokenStart, length)));
        }
        return stemmed.toString();
    }

    /**
     * Stems text coming into stdin and writes it to stdout.
     * <p>
     * Runs of ASCII letters (A-Z, a-z) are collected into words, lower-cased
     * and replaced by their stem; every other byte is echoed unchanged. A word
     * that is still pending when the input ends is stemmed and printed as well.
     *
     * @param ops command line arguments (not used)
     */
    public static void main(String[] ops) {

        LovinsStemmer ls = new LovinsStemmer();

        try {
            int num;
            StringBuffer wordBuffer = new StringBuffer();
            do {
                num = System.in.read();
                boolean isLetter = ((num >= (int) 'A') && (num <= (int) 'Z'))
                        || ((num >= (int) 'a') && (num <= (int) 'z'));
                if (isLetter) {
                    wordBuffer.append((char) num);
                } else {
                    // A non-letter or the end of input (-1) terminates the current word.
                    if (wordBuffer.length() > 0) {
                        System.out.print(ls.stem(wordBuffer.toString().toLowerCase(java.util.Locale.ENGLISH)));
                        wordBuffer.setLength(0);
                    }
                    if (num != -1) {
                        System.out.print((char) num);
                    }
                }
            } while (num != -1);
            System.out.flush();
        } catch (Exception e) {
            WVToolLogger.getGlobalLogger().logException("Could not build base form", e);
        }
    }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -