⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 russianstemmer.java

📁 索引aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
        while (word.length() > i && !isVowel(word.charAt(i)))
        {
            i++;
        }
        if (word.length() - 1 < ++i)
            return; // RV zone is empty
        RV = i;
        // find R1
        while (word.length() > i && isVowel(word.charAt(i)))
        {
            i++;
        }
        if (word.length() - 1 < ++i)
            return; // R1 zone is empty
        R1 = i;
        // find R2
        while (word.length() > i && !isVowel(word.charAt(i)))
        {
            i++;
        }
        if (word.length() - 1 < ++i)
            return; // R2 zone is empty
        while (word.length() > i && isVowel(word.charAt(i)))
        {
            i++;
        }
        if (word.length() - 1 < ++i)
            return; // R2 zone is empty
        R2 = i;
    }

    /**
     * Tells whether the given character is a vowel in the current charset.
     * Every entry of {@code vowels} is used as an index into {@code charset};
     * the character found there is compared with {@code letter}.
     *
     * @param letter character to test
     * @return {@code true} if {@code letter} equals {@code charset[vowels[k]]}
     *         for some {@code k}, {@code false} otherwise
     */
    private boolean isVowel(char letter)
    {
        int k = 0;
        while (k < vowels.length)
        {
            if (charset[vowels[k]] == letter)
            {
                return true;
            }
            k++;
        }
        // no vowel entry matched
        return false;
    }

    /**
     * Tries the noun endings against the stemming zone.
     * Delegates to {@code findAndRemoveEnding} with {@code nounEndings}.
     *
     * @param stemmingZone buffer handed on to {@code findAndRemoveEnding}
     * @return whatever {@code findAndRemoveEnding} reports (presumably
     *         {@code true} when an ending was removed)
     */
    private boolean noun(StringBuffer stemmingZone)
    {
        final boolean removed = findAndRemoveEnding(stemmingZone, nounEndings);
        return removed;
    }

    /**
     * Tries the perfective gerund endings against the stemming zone.
     * The first ending group is tried together with its predecessor table;
     * the second group is tried only if the first attempt reports
     * {@code false}.
     *
     * @param stemmingZone buffer handed on to {@code findAndRemoveEnding}
     * @return {@code true} if either attempt reports {@code true}
     */
    private boolean perfectiveGerund(StringBuffer stemmingZone)
    {
        final boolean firstGroupHit = findAndRemoveEnding(
            stemmingZone,
            perfectiveGerundEndings1,
            perfectiveGerund1Predessors);
        if (firstGroupHit)
        {
            return true;
        }
        // fall back to the second group, which has no predecessor table
        return findAndRemoveEnding(stemmingZone, perfectiveGerundEndings2);
    }

    /**
     * Tries the reflexive endings against the stemming zone.
     * Delegates to {@code findAndRemoveEnding} with {@code reflexiveEndings}.
     *
     * @param stemmingZone buffer handed on to {@code findAndRemoveEnding}
     * @return whatever {@code findAndRemoveEnding} reports (presumably
     *         {@code true} when an ending was removed)
     */
    private boolean reflexive(StringBuffer stemmingZone)
    {
        final boolean removed = findAndRemoveEnding(stemmingZone, reflexiveEndings);
        return removed;
    }

    /**
     * Drops the last character of the stemming zone when that character
     * equals {@code charset[I]}.
     *
     * @param stemmingZone buffer to shorten in place
     * @return {@code true} if one character was removed, {@code false} if
     *         the buffer is empty or ends with a different character
     */
    private boolean removeI(StringBuffer stemmingZone)
    {
        final int lastIndex = stemmingZone.length() - 1;
        // empty buffer, or the trailing character is not the I letter
        if (lastIndex < 0 || stemmingZone.charAt(lastIndex) != charset[I])
        {
            return false;
        }
        stemmingZone.setLength(lastIndex);
        return true;
    }

    /**
     * Drops the last character of the stemming zone when that character
     * equals {@code charset[SOFT]}.
     *
     * @param stemmingZone buffer to shorten in place
     * @return {@code true} if one character was removed, {@code false} if
     *         the buffer is empty or ends with a different character
     */
    private boolean removeSoft(StringBuffer stemmingZone)
    {
        final int lastIndex = stemmingZone.length() - 1;
        // empty buffer, or the trailing character is not the SOFT letter
        if (lastIndex < 0 || stemmingZone.charAt(lastIndex) != charset[SOFT])
        {
            return false;
        }
        stemmingZone.setLength(lastIndex);
        return true;
    }

    /**
     * Replaces the charset table used by this stemmer.
     * The array reference is stored as given; no copy is made.
     *
     * @param newCharset table that the letter constants index into
     */
    public void setCharset(char[] newCharset)
    {
        this.charset = newCharset;
    }

    /**
     * Initializes the vowel table and every ending table of the stemmer,
     * following the Russian stemming algorithm.
     * Each table row is a sequence of letter constants (A, E, I, ...); the
     * constants are used as indices into {@code charset} (see isVowel).
     * Creation date: (16/03/2002 11:16:36 PM)
     */
    private void setEndings()
    {
        // Letters treated as vowels; isVowel() compares against charset[vowels[i]].
        vowels = new char[] { A, E, I, O, U, Y, AE, IU, IA };

        // Perfective gerund, group 1: tried together with
        // perfectiveGerund1Predessors in perfectiveGerund().
        perfectiveGerundEndings1 = new char[][] {
            { V }, { V, SH, I }, { V, SH, I, S, SOFT }
        };

        // NOTE(review): presumably the letters that must precede a group-1
        // ending; the three-argument findAndRemoveEnding is not visible here.
        perfectiveGerund1Predessors = new char[][] { { A }, { IA }
        };

        // Perfective gerund, group 2: tried without a predecessor table.
        perfectiveGerundEndings2 = new char[][] {
            { I, V },
            { Y, V },
            { I, V, SH, I },
            { Y, V, SH, I },
            { I, V, SH, I, S, SOFT },
            { Y, V, SH, I, S, SOFT }
        };

        // Adjective endings (consumer not visible in this part of the file;
        // presumably adjectival()).
        adjectiveEndings = new char[][] {
            { E, E },
            { I, E },
            { Y, E },
            { O, E },
            { E, I_ },
            { I, I_ },
            { Y, I_ },
            { O, I_ },
            { E, M },
            { I, M },
            { Y, M },
            { O, M },
            { I, X },
            { Y, X },
            { U, IU },
            { IU, IU },
            { A, IA },
            { IA, IA },
            { O, IU },
            { E, IU },
            { I, M, I },
            { Y, M, I },
            { E, G, O },
            { O, G, O },
            { E, M, U },
            { O, M, U }
        };

        // Participle endings, group 1 (consumer not visible here).
        participleEndings1 = new char[][] {
            { SHCH },
            { E, M },
            { N, N },
            { V, SH },
            { IU, SHCH }
        };

        // Participle endings, group 2 (consumer not visible here).
        participleEndings2 = new char[][] {
            { I, V, SH },
            { Y, V, SH },
            { U, IU, SHCH }
        };

        // NOTE(review): presumably the predecessor letters for participle
        // group 1, by analogy with the gerund and verb tables — confirm.
        participle1Predessors = new char[][] {
            { A },
            { IA }
        };

        // Reflexive endings, used by reflexive().
        reflexiveEndings = new char[][] {
            { S, IA },
            { S, SOFT }
        };

        // Verb endings, group 1: tried together with verb1Predessors in verb().
        verbEndings1 = new char[][] {
            { I_ },
            { L },
            { N },
            { L, O },
            { N, O },
            { E, T },
            { IU, T },
            { L, A },
            { N, A },
            { L, I },
            { E, M },
            { N, Y },
            { E, T, E },
            { I_, T, E },
            { T, SOFT },
            { E, SH, SOFT },
            { N, N, O }
        };

        // Verb endings, group 2: tried without a predecessor table in verb().
        verbEndings2 = new char[][] {
            { IU },
            { U, IU },
            { E, N },
            { E, I_ },
            { IA, T },
            { U, I_ },
            { I, L },
            { Y, L },
            { I, M },
            { Y, M },
            { I, T },
            { Y, T },
            { I, L, A },
            { Y, L, A },
            { E, N, A },
            { I, T, E },
            { I, L, I },
            { Y, L, I },
            { I, L, O },
            { Y, L, O },
            { E, N, O },
            { U, E, T },
            { U, IU, T },
            { E, N, Y },
            { I, T, SOFT },
            { Y, T, SOFT },
            { I, SH, SOFT },
            { E, I_, T, E },
            { U, I_, T, E }
        };

        // Predecessor table passed alongside verbEndings1 in verb().
        verb1Predessors = new char[][] {
            { A },
            { IA }
        };

        // Noun endings, used by noun().
        nounEndings = new char[][] {
            { A },
            { IU },
            { I_ },
            { O },
            { U },
            { E },
            { Y },
            { I },
            { SOFT },
            { IA },
            { E, V },
            { O, V },
            { I, E },
            { SOFT, E },
            { IA, X },
            { I, IU },
            { E, I },
            { I, I },
            { E, I_ },
            { O, I_ },
            { E, M },
            { A, M },
            { O, M },
            { A, X },
            { SOFT, IU },
            { I, IA },
            { SOFT, IA },
            { I, I_ },
            { IA, M },
            { IA, M, I },
            { A, M, I },
            { I, E, I_ },
            { I, IA, M },
            { I, E, M },
            { I, IA, X },
            { I, IA, M, I }
        };

        // Superlative endings, used by superlative().
        superlativeEndings = new char[][] {
            { E, I_, SH },
            { E, I_, SH, E }
        };

        // Derivational endings (consumer not visible in this part of the
        // file; presumably derivational()).
        derivationalEndings = new char[][] {
            { O, S, T },
            { O, S, T, SOFT }
        };
    }

    /**
     * Finds the stem for the given Russian word.
     * <p>
     * The word is first passed to {@code markPositions}; if {@code RV} is
     * still 0 afterwards the word is returned unchanged. Otherwise the part
     * of the word starting at {@code RV} is copied into a buffer and the
     * ending-removal steps below are applied to that buffer only. The result
     * is the untouched prefix {@code input.substring(0, RV)} followed by
     * what is left of the buffer.
     *
     * @param input word to stem, expressed in the stemmer's current charset
     * @return the stemmed word, or {@code input} itself when RV was not
     *         detected
     */
    public String stem(String input)
    {
        markPositions(input);
        if (RV == 0)
        {
            return input; // RV wasn't detected, nothing to stem
        }
        // stemming goes on in RV
        StringBuffer stemmingZone = new StringBuffer(input.substring(RV));

        // Step 1: a perfective gerund ending wins outright; otherwise strip a
        // reflexive ending and then the first of adjectival / verb / noun
        // that applies (later ones are not attempted once one succeeds).
        if (!perfectiveGerund(stemmingZone))
        {
            reflexive(stemmingZone);
            if (!adjectival(stemmingZone) && !verb(stemmingZone))
            {
                noun(stemmingZone);
            }
        }
        // Step 2
        removeI(stemmingZone);
        // Step 3
        derivational(stemmingZone);
        // Step 4
        superlative(stemmingZone);
        undoubleN(stemmingZone);
        removeSoft(stemmingZone);
        // return result: prefix before RV plus the reduced stemming zone
        return input.substring(0, RV) + stemmingZone.toString();
    }

    /**
     * Tries the superlative endings against the stemming zone.
     * Delegates to {@code findAndRemoveEnding} with
     * {@code superlativeEndings}.
     *
     * @param stemmingZone buffer handed on to {@code findAndRemoveEnding}
     * @return whatever {@code findAndRemoveEnding} reports (presumably
     *         {@code true} when an ending was removed)
     */
    private boolean superlative(StringBuffer stemmingZone)
    {
        final boolean removed = findAndRemoveEnding(stemmingZone, superlativeEndings);
        return removed;
    }

    /**
     * Shortens the stemming zone by one character when {@code findEnding}
     * reports a non-zero result for the two-letter ending {@code N, N}.
     *
     * @param stemmingZone buffer to shorten in place
     * @return {@code true} if the buffer was shortened, {@code false}
     *         otherwise
     */
    private boolean undoubleN(StringBuffer stemmingZone)
    {
        final char[][] doubledN = new char[][] { { N, N } };
        if (findEnding(stemmingZone, doubledN) == 0)
        {
            // a zero result is treated as "ending not present"
            return false;
        }
        // cut only the final character so that a single N remains
        stemmingZone.setLength(stemmingZone.length() - 1);
        return true;
    }

    /**
     * Tries the verb endings against the stemming zone.
     * The first ending group is tried together with its predecessor table;
     * the second group is tried only if the first attempt reports
     * {@code false}.
     *
     * @param stemmingZone buffer handed on to {@code findAndRemoveEnding}
     * @return {@code true} if either attempt reports {@code true}
     */
    private boolean verb(StringBuffer stemmingZone)
    {
        final boolean firstGroupHit = findAndRemoveEnding(
            stemmingZone,
            verbEndings1,
            verb1Predessors);
        if (firstGroupHit)
        {
            return true;
        }
        // fall back to the second group, which has no predecessor table
        return findAndRemoveEnding(stemmingZone, verbEndings2);
    }

    /**
     * Convenience entry point for stemming a word with a given charset.
     * Creates a fresh {@code RussianStemmer}, installs the charset on it and
     * stems the word with that instance.
     *
     * @param theWord word to stem
     * @param charset charset table handed to {@code setCharset}
     * @return the result of {@code stem(theWord)} on the new instance
     */
    public static String stem(String theWord, char[] charset)
    {
        final RussianStemmer worker = new RussianStemmer();
        worker.setCharset(charset);
        final String stemmed = worker.stem(theWord);
        return stemmed;
    }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -