📄 russianstemmer.java

📁 索引aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页
package org.apache.lucene.analysis.ru;

/**
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * Russian stemming algorithm implementation (see http://snowball.sourceforge.net for detailed description).
 *
 * @author  Boris Okner, b.okner@rogers.com
 * @version $Id: RussianStemmer.java,v 1.5 2004/03/29 22:48:01 cutting Exp $
 */
class RussianStemmer
{
    private char[] charset;

    // positions of RV, R1 and R2 respectively
    private int RV, R1, R2;

    // letters
    private static char A = 0;
    private static char B = 1;
    private static char V = 2;
    private static char G = 3;
    private static char D = 4;
    private static char E = 5;
    private static char ZH = 6;
    private static char Z = 7;
    private static char I = 8;
    private static char I_ = 9;
    private static char K = 10;
    private static char L = 11;
    private static char M = 12;
    private static char N = 13;
    private static char O = 14;
    private static char P = 15;
    private static char R = 16;
    private static char S = 17;
    private static char T = 18;
    private static char U = 19;
    private static char F = 20;
    private static char X = 21;
    private static char TS = 22;
    private static char CH = 23;
    private static char SH = 24;
    private static char SHCH = 25;
    private static char HARD = 26;
    private static char Y = 27;
    private static char SOFT = 28;
    private static char AE = 29;
    private static char IU = 30;
    private static char IA = 31;

    // stem definitions
    private static char[] vowels = { A, E, I, O, U, Y, AE, IU, IA };

    private static char[][] perfectiveGerundEndings1 = {
        { V },
        { V, SH, I },
        { V, SH, I, S, SOFT }
    };

    private static char[][] perfectiveGerund1Predessors = {
        { A },
        { IA }
    };

    private static char[][] perfectiveGerundEndings2 = { { I, V }, {
        Y, V }, {
            I, V, SH, I }, {
                Y, V, SH, I }, {
                    I, V, SH, I, S, SOFT }, {
                        Y, V, SH, I, S, SOFT }
    };

    private static char[][] adjectiveEndings = {
        { E, E },
        { I, E },
        { Y, E },
        { O, E },
        { E, I_ },
        { I, I_ },
        { Y, I_ },
        { O, I_ },
        { E, M },
        { I, M },
        { Y, M },
        { O, M },
        { I, X },
        { Y, X },
        { U, IU },
        { IU, IU },
        { A, IA },
        { IA, IA },
        { O, IU },
        { E, IU },
        { I, M, I },
        { Y, M, I },
        { E, G, O },
        { O, G, O },
        { E, M, U },
        {O, M, U }
    };

    private static char[][] participleEndings1 = {
        { SHCH },
        { E, M },
        { N, N },
        { V, SH },
        { IU, SHCH }
    };

    private static char[][] participleEndings2 = {
        { I, V, SH },
        { Y, V, SH },
        { U, IU, SHCH }
    };

    private static char[][] participle1Predessors = {
        { A },
        { IA }
    };

    private static char[][] reflexiveEndings = {
        { S, IA },
        { S, SOFT }
    };

    private static char[][] verbEndings1 = {
        { I_ },
        { L },
        { N },
        { L, O },
        { N, O },
        { E, T },
        { IU, T },
        { L, A },
        { N, A },
        { L, I },
        { E, M },
        { N, Y },
        { E, T, E },
        { I_, T, E },
        { T, SOFT },
        { E, SH, SOFT },
        { N, N, O }
    };

    private static char[][] verbEndings2 = {
        { IU },
        { U, IU },
        { E, N },
        { E, I_ },
        { IA, T },
        { U, I_ },
        { I, L },
        { Y, L },
        { I, M },
        { Y, M },
        { I, T },
        { Y, T },
        { I, L, A },
        { Y, L, A },
        { E, N, A },
        { I, T, E },
        { I, L, I },
        { Y, L, I },
        { I, L, O },
        { Y, L, O },
        { E, N, O },
        { U, E, T },
        { U, IU, T },
        { E, N, Y },
        { I, T, SOFT },
        { Y, T, SOFT },
        { I, SH, SOFT },
        { E, I_, T, E },
        { U, I_, T, E }
    };

    private static char[][] verb1Predessors = {
        { A },
        { IA }
    };

    private static char[][] nounEndings = {
        { A },
        { U },
        { I_ },
        { O },
        { U },
        { E },
        { Y },
        { I },
        { SOFT },
        { IA },
        { E, V },
        { O, V },
        { I, E },
        { SOFT, E },
        { IA, X },
        { I, IU },
        { E, I },
        { I, I },
        { E, I_ },
        { O, I_ },
        { E, M },
        { A, M },
        { O, M },
        { A, X },
        { SOFT, IU },
        { I, IA },
        { SOFT, IA },
        { I, I_ },
        { IA, M },
        { IA, M, I },
        { A, M, I },
        { I, E, I_ },
        { I, IA, M },
        { I, E, M },
        { I, IA, X },
        { I, IA, M, I }
    };

    private static char[][] superlativeEndings = {
        { E, I_, SH },
        { E, I_, SH, E }
    };

    private static char[][] derivationalEndings = {
        { O, S, T },
        { O, S, T, SOFT }
    };

    /**
     * RussianStemmer constructor comment.
     */
    public RussianStemmer()
    {
        super();
    }

    /**
     * RussianStemmer constructor comment.
     */
    public RussianStemmer(char[] charset)
    {
        super();
        this.charset = charset;
    }

    /**
     * Adjectival ending is an adjective ending,
     * optionally preceded by participle ending.
     * Creation date: (17/03/2002 12:14:58 AM)
     * @param stemmingZone java.lang.StringBuffer
     */
    private boolean adjectival(StringBuffer stemmingZone)
    {
        // look for adjective ending in a stemming zone
        if (!findAndRemoveEnding(stemmingZone, adjectiveEndings))
            return false;
        // if adjective ending was found, try for participle ending
        boolean r =
            findAndRemoveEnding(stemmingZone, participleEndings1, participle1Predessors)
            ||
            findAndRemoveEnding(stemmingZone, participleEndings2);
        return true;
    }

    /**
     * Derivational endings
     * Creation date: (17/03/2002 12:14:58 AM)
     * @param stemmingZone java.lang.StringBuffer
     */
    private boolean derivational(StringBuffer stemmingZone)
    {
        int endingLength = findEnding(stemmingZone, derivationalEndings);
        if (endingLength == 0)
             // no derivational ending found
            return false;
        else
        {
            // Ensure that the ending locates in R2
            if (R2 - RV <= stemmingZone.length() - endingLength)
            {
                stemmingZone.setLength(stemmingZone.length() - endingLength);
                return true;
            }
            else
            {
                return false;
            }
        }
    }

    /**
     * Finds ending among given ending class and returns the length of ending found(0, if not found).
     * Creation date: (17/03/2002 8:18:34 PM)
     */
    private int findEnding(StringBuffer stemmingZone, int startIndex, char[][] theEndingClass)
    {
        boolean match = false;
        for (int i = theEndingClass.length - 1; i >= 0; i--)
        {
            char[] theEnding = theEndingClass[i];
            // check if the ending is bigger than stemming zone
            if (startIndex < theEnding.length - 1)
            {
                match = false;
                continue;
            }
            match = true;
            int stemmingIndex = startIndex;
            for (int j = theEnding.length - 1; j >= 0; j--)
            {
                if (stemmingZone.charAt(stemmingIndex--) != charset[theEnding[j]])
                {
                    match = false;
                    break;
                }
            }
            // check if ending was found
            if (match)
            {
                return theEndingClass[i].length; // cut ending
            }
        }
        return 0;
    }

    private int findEnding(StringBuffer stemmingZone, char[][] theEndingClass)
    {
        return findEnding(stemmingZone, stemmingZone.length() - 1, theEndingClass);
    }

    /**
     * Finds the ending among the given class of endings and removes it from stemming zone.
     * Creation date: (17/03/2002 8:18:34 PM)
     */
    private boolean findAndRemoveEnding(StringBuffer stemmingZone, char[][] theEndingClass)
    {
        int endingLength = findEnding(stemmingZone, theEndingClass);
        if (endingLength == 0)
            // not found
            return false;
        else {
            stemmingZone.setLength(stemmingZone.length() - endingLength);
            // cut the ending found
            return true;
        }
    }

    /**
     * Finds the ending among the given class of endings, then checks if this ending was
     * preceded by any of given predessors, and if so, removes it from stemming zone.
     * Creation date: (17/03/2002 8:18:34 PM)
     */
    private boolean findAndRemoveEnding(StringBuffer stemmingZone,
        char[][] theEndingClass, char[][] thePredessors)
    {
        int endingLength = findEnding(stemmingZone, theEndingClass);
        if (endingLength == 0)
            // not found
            return false;
        else
        {
            int predessorLength =
                findEnding(stemmingZone,
                    stemmingZone.length() - endingLength - 1,
                    thePredessors);
            if (predessorLength == 0)
                return false;
            else {
                stemmingZone.setLength(stemmingZone.length() - endingLength);
                // cut the ending found
                return true;
            }
        }

    }

    /**
     * Marks positions of RV, R1 and R2 in a given word.
     * Creation date: (16/03/2002 3:40:11 PM)
     */
    private void markPositions(String word)
    {
        RV = 0;
        R1 = 0;
        R2 = 0;
        int i = 0;
        // find RV
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -