📄 porterstemmer.cs

📁 Lucene.Net 版本源码测试通过
💻 CS
📖 第 1 页 / 共 2 页
字号:
12 下一页
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*

Porter stemmer in Java. The original paper is in

Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
no. 3, pp 130-137,

See also http://www.tartarus.org/~martin/PorterStemmer/index.html

Bug 1 (reported by Gonzalo Parra 16/10/99) fixed as marked below.
The words 'aed', 'eed', 'oed' leave k at 'a' for step 3, and b[k-1]
is then outside the bounds of b.

Similarly,

Bug 2 (reported by Steve Dyrdahl 22/2/00) fixed as marked below.
'ion' by itself leaves j = -1 in the test for 'ion' in step 5, and
b[j] is then outside the bounds of b.

Release 3.

[ This version is derived from Release 3, modified by Brian Goetz to
optimize for fewer object creations.  ]
*/

using System;

namespace Lucene.Net.Analysis
{
	
    /// <summary> 
    /// Stemmer, implementing the Porter Stemming Algorithm
    /// 
    /// The Stemmer class transforms a word into its root form.  The input
    /// word can be provided a character at a time (by calling add()), or at once
    /// by calling one of the various stem(something) methods.
    /// </summary>
	
    class PorterStemmer
    {
        // Character buffer holding the word being stemmed; Add() appends to it and grows it as needed.
        private char[] b;
        // i:  number of characters added to b so far (next write position in Add(); also the
        //     length reported by GetResultLength() and used by ToString()).
        // j:  set by Ends() to the index just before a matched suffix; upper bound for M() and Vowelinstem().
        // k:  index of the last character of the word as it currently stands.
        // k0: index of the first character of the word (lower bound used by the helper predicates).
        private int i, j, k, k0;
        // Set to true when Setto() or Step2() rewrites characters in b; cleared by Reset().
        private bool dirty = false;
        private const int INC = 50; /* unit of size whereby b is increased */
        // Slack used in Add()'s capacity check: b is grown while b.Length <= i + EXTRA.
        private const int EXTRA = 1;
		
        /// <summary> Creates a stemmer holding no characters, with an initial buffer
        /// of INC characters.
        /// </summary>
        public PorterStemmer()
        {
            this.i = 0;
            this.b = new char[INC];
        }
		
        /// <summary> Prepares the stemmer for a new word: the character count goes back
        /// to zero and the "buffer was modified" flag is cleared.  Call this between
        /// words when input is supplied one character at a time through Add(char).
        /// </summary>
        public virtual void Reset()
        {
            dirty = false;
            i = 0;
        }
		
        /// <summary> Appends one character to the word being stemmed.  When the buffer
        /// is nearly full (no more than EXTRA free slots remain) it is first replaced
        /// by a copy that is INC characters longer.
        /// </summary>
        /// <param name="ch">The character to append.</param>
        public virtual void Add(char ch)
        {
            if (i + EXTRA >= b.Length)
            {
                // Grow by a fixed increment, keeping everything written so far.
                char[] grown = new char[b.Length + INC];
                System.Array.Copy(b, 0, grown, 0, b.Length);
                b = grown;
            }

            b[i] = ch;
            i++;
        }
		
        /// <summary> Returns the current word as a new string built from the first
        /// GetResultLength() characters of the internal buffer.  Callers that want to
        /// avoid the allocation can read GetResultBuffer() and GetResultLength() directly.
        /// </summary>
        public override System.String ToString()
        {
            int length = i;
            return new System.String(b, 0, length);
        }
		
        /// <summary> Returns how many characters of the result buffer are in use,
        /// i.e. the length of the current word.
        /// </summary>
        public virtual int GetResultLength()
        {
            return this.i;
        }
		
        /// <summary> Returns the internal character buffer itself (not a copy).  Only
        /// the first GetResultLength() characters are meaningful.
        /// </summary>
        public virtual char[] GetResultBuffer()
        {
            return this.b;
        }
		
        /* Cons(i) is true <=> b[i] is a consonant.
           a, e, i, o, u are never consonants.  'y' is a consonant at the first
           position of the word (k0); elsewhere it is a consonant exactly when the
           character before it is not. */
        private bool Cons(int i)
        {
            char ch = b[i];

            if (ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u')
            {
                return false;
            }

            if (ch != 'y')
            {
                return true;
            }

            if (i == k0)
            {
                return true;
            }

            return !Cons(i - 1);
        }
		
        /* M() counts the vowel-run-followed-by-consonant-run pairs in b[k0..j].
           With c a consonant sequence, v a vowel sequence, and <..> meaning optional:

           <c><v>       gives 0
           <c>vc<v>     gives 1
           <c>vcvc<v>   gives 2
           <c>vcvcvc<v> gives 3
           ....
        */
        private int M()
        {
            int count = 0;
            int pos = k0;

            // Skip the optional leading consonant run.
            while (pos <= j && Cons(pos))
            {
                pos++;
            }
            if (pos > j)
            {
                return count;
            }
            pos++; // step past the first vowel

            for (;;)
            {
                // Consume the rest of the vowel run.
                while (pos <= j && !Cons(pos))
                {
                    pos++;
                }
                if (pos > j)
                {
                    return count;
                }
                pos++; // step past the consonant that ended the vowel run
                count++;

                // Consume the rest of the consonant run.
                while (pos <= j && Cons(pos))
                {
                    pos++;
                }
                if (pos > j)
                {
                    return count;
                }
                pos++; // step past the vowel that ended the consonant run
            }
        }
		
        /* Vowelinstem() is true <=> at least one position in k0..j is not a consonant. */
        private bool Vowelinstem()
        {
            int pos = k0;
            while (pos <= j)
            {
                if (!Cons(pos))
                {
                    return true;
                }
                pos++;
            }
            return false;
        }
		
        /* Doublec(j) is true <=> positions j-1 and j hold the same character and that
           character is a consonant.  Always false when j is less than k0 + 1. */
        private bool Doublec(int j)
        {
            // Short-circuit order matters: the bounds test guards the b[j - 1] access.
            return j >= k0 + 1
                && b[j] == b[j - 1]
                && Cons(j);
        }
		
        /* Cvc(i) is true <=> positions i-2, i-1, i read consonant - vowel - consonant
           and the final consonant is not w, x or y.  Used when deciding whether to
           restore an e at the end of a short word, e.g.

           cav(e), lov(e), hop(e), crim(e), but
           snow, box, tray.
        */
        private bool Cvc(int i)
        {
            // Need three characters inside the word before anything else is examined.
            if (i < k0 + 2)
            {
                return false;
            }

            bool shapeMatches = Cons(i) && !Cons(i - 1) && Cons(i - 2);
            if (!shapeMatches)
            {
                return false;
            }

            char last = b[i];
            return last != 'w' && last != 'x' && last != 'y';
        }
		
        /* Ends(s) is true <=> the word b[k0..k] ends with the string s.  On a match,
           j is set to the index just before the suffix; otherwise j is left unchanged. */
        private bool Ends(System.String s)
        {
            int len = s.Length;
            int start = k - len + 1;

            // The suffix cannot be longer than the word itself.
            if (start < k0)
            {
                return false;
            }

            int idx = 0;
            while (idx < len)
            {
                if (b[start + idx] != s[idx])
                {
                    return false;
                }
                idx++;
            }

            j = start - 1;
            return true;
        }
		
        /* Setto(s) overwrites the buffer starting at j+1 with the characters of s,
           moves k to the last character written, and marks the buffer as modified. */
        internal virtual void Setto(System.String s)
        {
            int dest = j + 1;
            foreach (char c in s)
            {
                b[dest] = c;
                dest++;
            }

            k = j + s.Length;
            dirty = true;
        }
		
        /* R(s) replaces the current suffix with s (via Setto) only when M() is greater than 0. */
        internal virtual void R(System.String s)
        {
            if (M() <= 0)
            {
                return;
            }

            Setto(s);
        }
		
        /* Step1() removes plurals and the -ed / -ing endings, e.g.

           caresses  ->  caress
           ponies    ->  poni
           ties      ->  ti
           caress    ->  caress
           cats      ->  cat

           feed      ->  feed
           agreed    ->  agree
           disabled  ->  disable

           matting   ->  mat
           mating    ->  mate
           meeting   ->  meet
           milling   ->  mill
           messing   ->  mess

           meetings  ->  meet

           The order of the Ends() calls matters: each successful call moves j.
        */
        private void Step1()
        {
            // Plural endings.
            if (b[k] == 's')
            {
                if (Ends("sses"))
                {
                    k -= 2;
                }
                else if (Ends("ies"))
                {
                    Setto("i");
                }
                else if (b[k - 1] != 's')
                {
                    k--;
                }
            }

            // -eed: drop the final d only when the stem has a positive measure.
            if (Ends("eed"))
            {
                if (M() > 0)
                {
                    k--;
                }
                return;
            }

            // -ed / -ing: only stripped when the remaining stem contains a vowel.
            bool strippable = (Ends("ed") || Ends("ing")) && Vowelinstem();
            if (!strippable)
            {
                return;
            }

            k = j;
            if (Ends("at"))
            {
                Setto("ate");
            }
            else if (Ends("bl"))
            {
                Setto("ble");
            }
            else if (Ends("iz"))
            {
                Setto("ize");
            }
            else if (Doublec(k))
            {
                // Collapse a doubled consonant, except for ll, ss and zz.
                char doubled = b[k];
                if (doubled != 'l' && doubled != 's' && doubled != 'z')
                {
                    k--;
                }
            }
            else if (M() == 1 && Cvc(k))
            {
                Setto("e");
            }
        }
		
        /* Step2() rewrites a final y as i when the part of the word before it contains
           a vowel, and marks the buffer as modified. */
        private void Step2()
        {
            if (!Ends("y") || !Vowelinstem())
            {
                return;
            }

            b[k] = 'i';
            dirty = true;
        }
		
        /* step3() maps double suffices to single ones. so -ization ( = -ize plus
        -ation) maps to -ize etc. note that the string before the suffix must give
        m() > 0. */
		
        private void  Step3()
        {
            if (k == k0)
                return ; /* For Bug 1 */
            switch (b[k - 1])
            {
				
                case 'a': 
                    if (Ends("ational"))
                    {
                        R("ate"); break;
                    }
                    if (Ends("tional"))
                    {
                        R("tion"); break;
                    }
                    break;
				
                case 'c': 
                    if (Ends("enci"))
                    {
                        R("ence"); break;
                    }
                    if (Ends("anci"))
                    {
                        R("ance"); break;
                    }
                    break;
				
                case 'e': 
                    if (Ends("izer"))
                    {
                        R("ize"); break;
                    }
                    break;
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -