/* porter.cpp */
// IRTools Copyright (C) 2000
// This is free software. See the file COPYING for details.
/**********************************************************************
Copyright (C) 2000 Gregory B. Newby
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details. The file
COPYING contains the license.
Dr. Newby may be contacted:
3401 Monadnock Ridge
Efland, NC, 27243
Email: gbnewby@ils.unc.edu
**********************************************************************/
/* GBN: Updated to run as part of abstract IR system; also changed
prototype for ReplaceEnd; otherwise this is straight out of the
book. 'Runs good.' */
/******************************* stem.c ***********************************
Purpose: Implementation of the Porter stemming algorithm documented
in: Porter, M.F., "An Algorithm For Suffix Stripping,"
Program 14 (3), July 1980, pp. 130-137.
Provenance: Written by B. Frakes and C. Cox, 1986.
Changed by C. Fox, 1990.
- made measure function a DFA
- restructured structs
- renamed functions and variables
- restricted function and variable scopes
Changed by C. Fox, July, 1991.
- added ANSI C declarations
- branch tested to 90% coverage
Notes: This code will make little sense without the Porter
article. The stemming function converts its input to
lower case.
**/
/************************ Standard Include Files *************************/
#include "porter.h"
#include <stdio.h>
#include <string.h>
#include <ctype.h>
/*****************************************************************************/
/***************** Private Defines and Data Structures *******************/
/* Local boolean and end-of-string constants. */
#define FALSE 0
#define TRUE 1
#define EOS '\0'
/* Non-zero when c equals one of the lower-case letters a, e, i, o, u
   ('y' and upper-case letters are not matched). The argument is expanded
   up to five times, so it must not have side effects. */
#define IsVowel(c) ('a'==(c)||'e'==(c)||'i'==(c)||'o'==(c)||'u'==(c))
/* One suffix-replacement rule. The step*_rules tables in this file are
   arrays of these, each closed by a sentinel row whose id is 0 and whose
   old_end is NULL. In every table row in this file old_offset equals
   strlen(old_end) - 1 and new_offset equals strlen(new_end) - 1, so both
   are -1 when the corresponding string is empty. */
typedef struct {
int id; /* returned if rule fired */
char *old_end; /* suffix replaced */
char *new_end; /* suffix replacement */
int old_offset; /* from end of word to start of suffix */
int new_offset; /* from beginning to end of new suffix */
int min_root_size; /* min root word size for replacement; the tables use
                      -1, 0 and 1 (presumably compared against WordSize()
                      of the stem -- confirm in ReplaceEnd) */
int (*condition)(char *); /* the replacement test function; NULL in rows
                             that carry no extra test */
} RuleList;
/* LAMBDA is the shared empty string; the rule tables use it wherever a
   suffix or its replacement is empty. */
static char LAMBDA[1] = ""; /* the constant empty string */
/* NOTE(review): file-scope mutable pointer; the code that sets and reads
   it is outside this view, and is presumably not reentrant -- confirm. */
static char *end; /* pointer to the end of the word */
/*****************************************************************************/
/******************** Private Function Declarations **********************/
/* Prototypes for the file-private helpers. The "#if 1" always selects the
   ANSI prototypes; the K&R-style declarations in the #else branch are
   never compiled. ReplaceEnd takes the rule table by pointer. */
#if 1
/*#ifdef __STDC__ ryang */
static int WordSize( char *word );
static int ContainsVowel( char *word );
static int EndsWithCVC( char *word );
static int AddAnE( char *word );
static int RemoveAnE( char *word );
/* static int ReplaceEnd( char *word, RuleList rule ); gbn */
static int ReplaceEnd( char *word, RuleList *rule );
#else
static int WordSize( /* word */ );
static int ContainsVowel( /* word */ );
static int EndsWithCVC( /* word */ );
static int AddAnE( /* word */ );
static int RemoveAnE( /* word */ );
static int ReplaceEnd( /* word, rule */ );
#endif
/******************************************************************************/
/***************** Initialized Private Data Structures ********************/
/* gbn: added braces around each row of initializers. Not necessary
per K&R 2nd ed., but gcc -Wall complains otherwise. 6/14/97 */
/* Step 1a rules (ids 101-104): plural endings.
   Columns: id, old_end, new_end, old_offset, new_offset, min_root_size,
   condition. "sses" -> "ss", "ies" -> "i", "ss" -> "ss" (replacement equal
   to the suffix), "s" -> empty. The last row (id 0, NULL strings) is the
   end-of-table sentinel.
   NOTE(review): rows run from longer to shorter suffix; presumably the
   first matching row wins, which is why "ss" precedes "s" -- confirm in
   ReplaceEnd. */
static RuleList step1a_rules[] =
{
{ 101, "sses", "ss", 3, 1, -1, NULL},
{ 102, "ies", "i", 2, 0, -1, NULL},
{ 103, "ss", "ss", 1, 1, -1, NULL},
{ 104, "s", LAMBDA, 0, -1, -1, NULL},
{ 000, NULL, NULL, 0, 0, 0, NULL},
};
/* Step 1b rules (ids 105-107): "eed" -> "ee" with min_root_size 0, and
   removal of "ed" / "ing" guarded by the ContainsVowel condition
   (presumably evaluated on the remaining stem -- confirm in ReplaceEnd).
   Columns: id, old_end, new_end, old_offset, new_offset, min_root_size,
   condition. The last row (id 0, NULL strings) is the sentinel. */
static RuleList step1b_rules[] =
{
{ 105, "eed", "ee", 2, 1, 0, NULL},
{ 106, "ed", LAMBDA, 1, -1, -1, ContainsVowel},
{ 107, "ing", LAMBDA, 2, -1, -1, ContainsVowel},
{ 000, NULL, NULL, 0, 0, 0, NULL},
};
/* Step 1b follow-up rules (ids 108-122).
   Columns: id, old_end, new_end, old_offset, new_offset, min_root_size,
   condition.
   - 108-110 restore a final "e": "at" -> "ate", "bl" -> "ble", "iz" -> "ize".
   - 111-121 reduce a doubled final consonant to a single letter.
   - 122 has an empty old_end (old_offset -1) and appends "e" when the
     AddAnE condition holds.
   The last row (id 0, NULL strings) is the sentinel.
   NOTE(review): doubled-consonant rows exist only for b, d, f, g, m, n, p,
   r, t, w, x. Porter's paper undoubles any double consonant other than
   l, s, z, so cc, hh, jj, kk, qq, vv have no row here -- confirm whether
   that omission is intentional. */
static RuleList step1b1_rules[] =
{
{ 108, "at", "ate", 1, 2, -1, NULL},
{ 109, "bl", "ble", 1, 2, -1, NULL},
{ 110, "iz", "ize", 1, 2, -1, NULL},
{ 111, "bb", "b", 1, 0, -1, NULL},
{ 112, "dd", "d", 1, 0, -1, NULL},
{ 113, "ff", "f", 1, 0, -1, NULL},
{ 114, "gg", "g", 1, 0, -1, NULL},
{ 115, "mm", "m", 1, 0, -1, NULL},
{ 116, "nn", "n", 1, 0, -1, NULL},
{ 117, "pp", "p", 1, 0, -1, NULL},
{ 118, "rr", "r", 1, 0, -1, NULL},
{ 119, "tt", "t", 1, 0, -1, NULL},
{ 120, "ww", "w", 1, 0, -1, NULL},
{ 121, "xx", "x", 1, 0, -1, NULL},
{ 122, LAMBDA, "e", -1, 0, -1, AddAnE},
{ 000, NULL, NULL, 0, 0, 0, NULL},
};
/* Step 1c rule (id 123): final "y" -> "i", guarded by the ContainsVowel
   condition. Columns: id, old_end, new_end, old_offset, new_offset,
   min_root_size, condition. The last row (id 0, NULL strings) is the
   sentinel. */
static RuleList step1c_rules[] =
{
{ 123, "y", "i", 0, 0, -1, ContainsVowel},
{ 000, NULL, NULL, 0, 0, 0, NULL},
};
/* Step 2 rules (ids 203-223): map double suffixes to single ones, e.g.
   "ational" -> "ate", "iveness" -> "ive".
   Columns: id, old_end, new_end, old_offset, new_offset, min_root_size,
   condition. Every row has min_root_size 0 and no condition function.
   old_offset is strlen(old_end) - 1 and new_offset is strlen(new_end) - 1.
   The last row (id 0, NULL strings) is the end-of-table sentinel.

   Rule 219 previously read { 219, "fulnes", "ful", 5, ... }. Porter's
   Step 2 rule is FULNESS -> FUL; the truncated suffix cannot match any
   word that actually ends in "fulness", so such words were never reduced
   by this step. The suffix and its offset are corrected here to match the
   sibling "iveness" / "ousness" rows. */
static RuleList step2_rules[] =
{
{ 203, "ational", "ate", 6, 2, 0, NULL},
{ 204, "tional", "tion", 5, 3, 0, NULL},
{ 205, "enci", "ence", 3, 3, 0, NULL},
{ 206, "anci", "ance", 3, 3, 0, NULL},
{ 207, "izer", "ize", 3, 2, 0, NULL},
{ 208, "abli", "able", 3, 3, 0, NULL},
{ 209, "alli", "al", 3, 1, 0, NULL},
{ 210, "entli", "ent", 4, 2, 0, NULL},
{ 211, "eli", "e", 2, 0, 0, NULL},
{ 213, "ousli", "ous", 4, 2, 0, NULL},
{ 214, "ization", "ize", 6, 2, 0, NULL},
{ 215, "ation", "ate", 4, 2, 0, NULL},
{ 216, "ator", "ate", 3, 2, 0, NULL},
{ 217, "alism", "al", 4, 1, 0, NULL},
{ 218, "iveness", "ive", 6, 2, 0, NULL},
{ 219, "fulness", "ful", 6, 2, 0, NULL}, /* was "fulnes", 5 */
{ 220, "ousness", "ous", 6, 2, 0, NULL},
{ 221, "aliti", "al", 4, 1, 0, NULL},
{ 222, "iviti", "ive", 4, 2, 0, NULL},
{ 223, "biliti", "ble", 5, 2, 0, NULL},
{ 000, NULL, NULL, 0, 0, 0, NULL},
};
/* Step 3 rules (ids 301-309): shorten or remove suffixes such as "icate",
   "ative", "alize", "iciti", "ical", "ful" and "ness".
   Columns: id, old_end, new_end, old_offset, new_offset, min_root_size,
   condition. Every row has min_root_size 0 and no condition function.
   The last row (id 0, NULL strings) is the sentinel. */
static RuleList step3_rules[] =
{
{ 301, "icate", "ic", 4, 1, 0, NULL},
{ 302, "ative", LAMBDA, 4, -1, 0, NULL},
{ 303, "alize", "al", 4, 1, 0, NULL},
{ 304, "iciti", "ic", 4, 1, 0, NULL},
{ 305, "ical", "ic", 3, 1, 0, NULL},
{ 308, "ful", LAMBDA, 2, -1, 0, NULL},
{ 309, "ness", LAMBDA, 3, -1, 0, NULL},
{ 000, NULL, NULL, 0, 0, 0, NULL},
};
/* Step 4 rules (ids 401-424): remove remaining suffixes. Every row has
   min_root_size 1 and no condition function.
   Columns: id, old_end, new_end, old_offset, new_offset, min_root_size,
   condition.
   Rows 423 and 424 ("sion" -> "s", "tion" -> "t") drop "ion" only when it
   follows s or t; their ids are out of numeric sequence with their
   neighbours. "ement" is listed before "ment" and "ment" before "ent"
   (NOTE(review): presumably so the longest suffix is tried first --
   confirm in ReplaceEnd). The last row (id 0, NULL strings) is the
   sentinel. */
static RuleList step4_rules[] =
{
{ 401, "al", LAMBDA, 1, -1, 1, NULL},
{ 402, "ance", LAMBDA, 3, -1, 1, NULL},
{ 403, "ence", LAMBDA, 3, -1, 1, NULL},
{ 405, "er", LAMBDA, 1, -1, 1, NULL},
{ 406, "ic", LAMBDA, 1, -1, 1, NULL},
{ 407, "able", LAMBDA, 3, -1, 1, NULL},
{ 408, "ible", LAMBDA, 3, -1, 1, NULL},
{ 409, "ant", LAMBDA, 2, -1, 1, NULL},
{ 410, "ement", LAMBDA, 4, -1, 1, NULL},
{ 411, "ment", LAMBDA, 3, -1, 1, NULL},
{ 412, "ent", LAMBDA, 2, -1, 1, NULL},
{ 423, "sion", "s", 3, 0, 1, NULL},
{ 424, "tion", "t", 3, 0, 1, NULL},
{ 415, "ou", LAMBDA, 1, -1, 1, NULL},
{ 416, "ism", LAMBDA, 2, -1, 1, NULL},
{ 417, "ate", LAMBDA, 2, -1, 1, NULL},
{ 418, "iti", LAMBDA, 2, -1, 1, NULL},
{ 419, "ous", LAMBDA, 2, -1, 1, NULL},
{ 420, "ive", LAMBDA, 2, -1, 1, NULL},
{ 421, "ize", LAMBDA, 2, -1, 1, NULL},
{ 000, NULL, NULL, 0, 0, 0, NULL},
};
/* Step 5a rules (ids 501-502): removal of a final "e". Row 501 has
   min_root_size 1 and no condition; row 502 has min_root_size -1 and is
   guarded by the RemoveAnE condition.
   Columns: id, old_end, new_end, old_offset, new_offset, min_root_size,
   condition. The last row (id 0, NULL strings) is the sentinel. */
static RuleList step5a_rules[] =
{
{ 501, "e", LAMBDA, 0, -1, 1, NULL},
{ 502, "e", LAMBDA, 0, -1, -1, RemoveAnE},
{ 000, NULL, NULL, 0, 0, 0, NULL},
};
/* Step 5b rule (id 503): final "ll" -> "l" with min_root_size 1 and no
   condition function. Columns: id, old_end, new_end, old_offset,
   new_offset, min_root_size, condition. The last row (id 0, NULL strings)
   is the sentinel. */
static RuleList step5b_rules[] =
{
{ 503, "ll", "l", 1, 0, 1, NULL},
{ 000, NULL, NULL, 0, 0, 0, NULL},
};
/*****************************************************************************/
/******************** Private Function Definitions ***********************/
/*FN***************************************************************************
WordSize( word )
Returns: int -- a weird count of word size in adjusted syllables
Purpose: Count syllables in a special way: count the number of
vowel-consonant pairs in a word, disregarding initial
consonants and final vowels. The letter "y" counts as a
consonant at the beginning of a word and when it has a vowel
in front of it; otherwise (when it follows a consonant) it
is treated as a vowel. For example, the WordSize of "cat"
is 1, of "any" is 1, of "amount" is 2, of "anything" is 3.
Plan: Run a DFA to compute the word size
Notes: The easiest and fastest way to compute this funny measure is
with a finite state machine. The initial state 0 checks
the first letter. If it is a vowel, then the machine changes
to state 1, which is the "last letter was a vowel" state.
If the first letter is a consonant or y, then it changes
to state 2, the "last letter was a consonant" state. In
state 1, a y is treated as a consonant (since it follows
a vowel), but in state 2, y is treated as a vowel (since
it follows a consonant). The result counter is incremented
on the transition from state 1 to state 2, since this
transition only occurs after a vowel-consonant pair, which
is what we are counting.
**/
static int
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -