📄 porter.cpp

📁 潜在语义准备
💻 CPP
📖 第 1 页 / 共 2 页
字号:
12 下一页
// IRTools Copyright (C) 2000
// This is free software.  See the file COPYING for details.

/**********************************************************************

    Copyright (C) 2000 Gregory B. Newby

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.  The file
    COPYING contains the license.

    Dr. Newby may be contacted:
	3401 Monadnock Ridge
	Efland, NC, 27243
	Email: gbnewby@ils.unc.edu

**********************************************************************/
/* GBN: Updated to run as part of abstract IR system; also changed
   prototype for ReplaceEnd; otherwise this is straight out of the
   book.  'Runs good.' */

/*******************************   stem.c   ***********************************

   Purpose:    Implementation of the Porter stemming algorithm documented 
               in: Porter, M.F., "An Algorithm For Suffix Stripping," 
               Program 14 (3), July 1980, pp. 130-137.

   Provenance: Written by B. Frakes and C. Cox, 1986.
               Changed by C. Fox, 1990.
                  - made measure function a DFA
                  - restructured structs
                  - renamed functions and variables
                  - restricted function and variable scopes
               Changed by C. Fox, July, 1991.
                  - added ANSI C declarations 
                  - branch tested to 90% coverage

   Notes:      This code will make little sense without the Porter
               article.  The stemming function converts its input to
               lower case.
**/

/************************   Standard Include Files   *************************/

#include "porter.h"
#include <stdio.h>
#include <string.h>
#include <ctype.h>

/*****************************************************************************/
/*****************   Private Defines and Data Structures   *******************/

/* Integer truth values. */
#define FALSE 0
#define TRUE  1

/* The string terminator character. */
#define EOS '\0'

/* Non-zero when c is one of the lower-case letters a, e, i, o, u ('y' is
   not tested here).  The argument appears once per comparison, so it may
   be evaluated up to five times: pass an expression with no side effects. */
#define IsVowel(c) \
   ((c) == 'a' || (c) == 'e' || (c) == 'i' || (c) == 'o' || (c) == 'u')

/* One suffix-replacement rule.  The rule tables in this file are arrays of
   these, each closed by a sentinel row whose id is 0.  In every table row
   old_offset equals strlen(old_end) - 1 and new_offset equals
   strlen(new_end) - 1 (so -1 for an empty suffix). */
typedef struct {
   int   id;                      /* rule number; returned if the rule fired */
   char *old_end;                 /* suffix that gets replaced */
   char *new_end;                 /* text that replaces the suffix */
   int   old_offset;              /* from end of word to start of suffix */
   int   new_offset;              /* from beginning to end of new suffix */
   int   min_root_size;           /* min root word size for replacement */
   int (*condition)(char *);      /* replacement test function; NULL in rows
                                     that carry no test */
} RuleList;

/* The constant empty string; the rule tables use it wherever a suffix
   (old or new) is empty. */
static char LAMBDA[1] = { '\0' };

/* Pointer to the end of the word.  File-scope state, zero-initialised.
   NOTE(review): presumably maintained by the stemming routines later in
   the file -- confirm before relying on it across calls. */
static char *end;

/*****************************************************************************/
/********************   Private Function Declarations   **********************/

/* Forward declarations of the file-local helpers.  ContainsVowel, AddAnE
   and RemoveAnE are stored as condition functions in the rule tables that
   follow, so they must be declared before those tables.

   Full prototypes are used unconditionally.  The former fallback branch
   with empty parameter lists was already disabled by `#if 1`, and such
   declarations mean "takes no arguments" when this file is compiled as
   C++.

   Each helper takes the word as a char pointer and returns an int. */
static int WordSize( char *word );
static int ContainsVowel( char *word );
static int EndsWithCVC( char *word );
static int AddAnE( char *word );
static int RemoveAnE( char *word );

/* gbn: the rule list is passed by pointer (the earlier prototype took a
   RuleList by value). */
static int ReplaceEnd( char *word, RuleList *rule );

/******************************************************************************/
/*****************   Initialized Private Data Structures   ********************/

/* gbn: added braces around each row of initializers.  Not necessary
   per K&R 2nd ed., but gcc -Wall complains otherwise.  6/14/97 */

/* Rules for step 1a (suffixes "sses", "ies", "ss", "s").
   Columns: id, old_end, new_end, old_offset, new_offset, min_root_size,
   condition.  The row with id 0 terminates the table. */
static RuleList step1a_rules[] =
{
   { 101, "sses", "ss",    3,  1, -1, NULL },
   { 102, "ies",  "i",     2,  0, -1, NULL },
   { 103, "ss",   "ss",    1,  1, -1, NULL },
   { 104, "s",    LAMBDA,  0, -1, -1, NULL },
   {   0, NULL,   NULL,    0,  0,  0, NULL },
};

/* Rules for step 1b (suffixes "eed", "ed", "ing").  The "ed" and "ing"
   rows carry ContainsVowel as their condition function.
   Columns: id, old_end, new_end, old_offset, new_offset, min_root_size,
   condition.  The row with id 0 terminates the table. */
static RuleList step1b_rules[] =
{
   { 105, "eed", "ee",    2,  1,  0, NULL },
   { 106, "ed",  LAMBDA,  1, -1, -1, ContainsVowel },
   { 107, "ing", LAMBDA,  2, -1, -1, ContainsVowel },
   {   0, NULL,  NULL,    0,  0,  0, NULL },
};

/* Rules for step 1b1: restore a final "e" after "at", "bl", "iz"; reduce
   a doubled consonant to a single one; otherwise try the empty-suffix row,
   which appends "e" subject to the AddAnE condition function.
   Columns: id, old_end, new_end, old_offset, new_offset, min_root_size,
   condition.  The row with id 0 terminates the table. */
static RuleList step1b1_rules[] =
{
   { 108, "at",   "ate",  1,  2, -1, NULL },
   { 109, "bl",   "ble",  1,  2, -1, NULL },
   { 110, "iz",   "ize",  1,  2, -1, NULL },
   { 111, "bb",   "b",    1,  0, -1, NULL },
   { 112, "dd",   "d",    1,  0, -1, NULL },
   { 113, "ff",   "f",    1,  0, -1, NULL },
   { 114, "gg",   "g",    1,  0, -1, NULL },
   { 115, "mm",   "m",    1,  0, -1, NULL },
   { 116, "nn",   "n",    1,  0, -1, NULL },
   { 117, "pp",   "p",    1,  0, -1, NULL },
   { 118, "rr",   "r",    1,  0, -1, NULL },
   { 119, "tt",   "t",    1,  0, -1, NULL },
   { 120, "ww",   "w",    1,  0, -1, NULL },
   { 121, "xx",   "x",    1,  0, -1, NULL },
   { 122, LAMBDA, "e",   -1,  0, -1, AddAnE },
   {   0, NULL,   NULL,   0,  0,  0, NULL },
};

/* Rule for step 1c: a final "y" becomes "i", subject to the ContainsVowel
   condition function.
   Columns: id, old_end, new_end, old_offset, new_offset, min_root_size,
   condition.  The row with id 0 terminates the table. */
static RuleList step1c_rules[] =
{
   { 123, "y",  "i",   0,  0, -1, ContainsVowel },
   {   0, NULL, NULL,  0,  0,  0, NULL },
};

/* Rules for step 2: map double suffixes onto single ones.
   Columns: id, old_end, new_end, old_offset, new_offset, min_root_size,
   condition.  old_offset is strlen(old_end) - 1 and new_offset is
   strlen(new_end) - 1.  The row with id 0 terminates the table.

   Rule 219 is spelled "fulness" (offset 6), as in Porter's paper
   (FULNESS -> FUL).  It was previously the truncated "fulnes" with
   offset 5, which is not an English suffix and left words such as
   "hopefulness" to be handled only by the later "ness" rule. */
static RuleList step2_rules[] =
{
   { 203, "ational", "ate",   6,  2,  0, NULL },
   { 204, "tional",  "tion",  5,  3,  0, NULL },
   { 205, "enci",    "ence",  3,  3,  0, NULL },
   { 206, "anci",    "ance",  3,  3,  0, NULL },
   { 207, "izer",    "ize",   3,  2,  0, NULL },
   { 208, "abli",    "able",  3,  3,  0, NULL },
   { 209, "alli",    "al",    3,  1,  0, NULL },
   { 210, "entli",   "ent",   4,  2,  0, NULL },
   { 211, "eli",     "e",     2,  0,  0, NULL },
   { 213, "ousli",   "ous",   4,  2,  0, NULL },
   { 214, "ization", "ize",   6,  2,  0, NULL },
   { 215, "ation",   "ate",   4,  2,  0, NULL },
   { 216, "ator",    "ate",   3,  2,  0, NULL },
   { 217, "alism",   "al",    4,  1,  0, NULL },
   { 218, "iveness", "ive",   6,  2,  0, NULL },
   { 219, "fulness", "ful",   6,  2,  0, NULL },
   { 220, "ousness", "ous",   6,  2,  0, NULL },
   { 221, "aliti",   "al",    4,  1,  0, NULL },
   { 222, "iviti",   "ive",   4,  2,  0, NULL },
   { 223, "biliti",  "ble",   5,  2,  0, NULL },
   {   0, NULL,      NULL,    0,  0,  0, NULL },
};

/* Rules for step 3 (suffixes "icate", "ative", "alize", "iciti", "ical",
   "ful", "ness").
   Columns: id, old_end, new_end, old_offset, new_offset, min_root_size,
   condition.  The row with id 0 terminates the table. */
static RuleList step3_rules[] =
{
   { 301, "icate", "ic",    4,  1,  0, NULL },
   { 302, "ative", LAMBDA,  4, -1,  0, NULL },
   { 303, "alize", "al",    4,  1,  0, NULL },
   { 304, "iciti", "ic",    4,  1,  0, NULL },
   { 305, "ical",  "ic",    3,  1,  0, NULL },
   { 308, "ful",   LAMBDA,  2, -1,  0, NULL },
   { 309, "ness",  LAMBDA,  3, -1,  0, NULL },
   {   0, NULL,    NULL,    0,  0,  0, NULL },
};

/* Rules for step 4: strip the listed suffixes.  Every row has
   min_root_size 1 and no condition function.  The "sion" and "tion" rows
   (ids 423 and 424) keep the leading "s" / "t" and drop only "ion".
   Columns: id, old_end, new_end, old_offset, new_offset, min_root_size,
   condition.  The row with id 0 terminates the table. */
static RuleList step4_rules[] =
{
   { 401, "al",    LAMBDA,  1, -1,  1, NULL },
   { 402, "ance",  LAMBDA,  3, -1,  1, NULL },
   { 403, "ence",  LAMBDA,  3, -1,  1, NULL },
   { 405, "er",    LAMBDA,  1, -1,  1, NULL },
   { 406, "ic",    LAMBDA,  1, -1,  1, NULL },
   { 407, "able",  LAMBDA,  3, -1,  1, NULL },
   { 408, "ible",  LAMBDA,  3, -1,  1, NULL },
   { 409, "ant",   LAMBDA,  2, -1,  1, NULL },
   { 410, "ement", LAMBDA,  4, -1,  1, NULL },
   { 411, "ment",  LAMBDA,  3, -1,  1, NULL },
   { 412, "ent",   LAMBDA,  2, -1,  1, NULL },
   { 423, "sion",  "s",     3,  0,  1, NULL },
   { 424, "tion",  "t",     3,  0,  1, NULL },
   { 415, "ou",    LAMBDA,  1, -1,  1, NULL },
   { 416, "ism",   LAMBDA,  2, -1,  1, NULL },
   { 417, "ate",   LAMBDA,  2, -1,  1, NULL },
   { 418, "iti",   LAMBDA,  2, -1,  1, NULL },
   { 419, "ous",   LAMBDA,  2, -1,  1, NULL },
   { 420, "ive",   LAMBDA,  2, -1,  1, NULL },
   { 421, "ize",   LAMBDA,  2, -1,  1, NULL },
   {   0, NULL,    NULL,    0,  0,  0, NULL },
};

/* Rules for step 5a: drop a final "e".  The first row has min_root_size 1
   and no condition; the second has min_root_size -1 and carries RemoveAnE
   as its condition function.
   Columns: id, old_end, new_end, old_offset, new_offset, min_root_size,
   condition.  The row with id 0 terminates the table. */
static RuleList step5a_rules[] =
{
   { 501, "e",  LAMBDA,  0, -1,  1, NULL },
   { 502, "e",  LAMBDA,  0, -1, -1, RemoveAnE },
   {   0, NULL, NULL,    0,  0,  0, NULL },
};

/* Rule for step 5b: a final "ll" becomes "l" (min_root_size 1).
   Columns: id, old_end, new_end, old_offset, new_offset, min_root_size,
   condition.  The row with id 0 terminates the table. */
static RuleList step5b_rules[] =
{
   { 503, "ll", "l",   1,  0,  1, NULL },
   {   0, NULL, NULL,  0,  0,  0, NULL },
};

/*****************************************************************************/
/*********************   Private Function Definitions   **********************/

/*FN***************************************************************************

       WordSize( word )

   Returns: int -- a weird count of word size in adjusted syllables

   Purpose: Count syllables in a special way:  count the number of
            vowel-consonant pairs in a word, disregarding initial 
            consonants and final vowels.  The letter "y" counts as a
            consonant at the beginning of a word and when it has a vowel
            in front of it; otherwise (when it follows a consonant) it
            is treated as a vowel.  For example, the WordSize of "cat" 
            is 1, of "any" is 1, of "amount" is 2, of "anything" is 3.

   Plan:    Run a DFA to compute the word size

   Notes:   The easiest and fastest way to compute this funny measure is
            with a finite state machine.  The initial state 0 checks
            the first letter.  If it is a vowel, then the machine changes
            to state 1, which is the "last letter was a vowel" state.
            If the first letter is a consonant or y, then it changes
            to state 2, the "last letter was a consonant" state.  In
            state 1, a y is treated as a consonant (since it follows
            a vowel), but in state 2, y is treated as a vowel (since
            it follows a consonant).  The result counter is incremented
            on the transition from state 1 to state 2, since this
            transition only occurs after a vowel-consonant pair, which
            is what we are counting.
**/

static int
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -