📄 stem_porter.c

📁 使用具有增量学习的监控式学习方法。包括几个不同的分类算法。
💻 C
📖 第 1 页 / 共 2 页
字号:
12 下一页
/****************************   stem_porter.c   *******************************   Purpose:    Implementation of the Porter stemming algorithm documented                in: Porter, M.F., "An Algorithm For Suffix Stripping,"                Program 14 (3), July 1980, pp. 130-137.   Provenance: Written by B. Frakes and C. Cox, 1986.               Changed by C. Fox, 1990.                  - made measure function a DFA                  - restructured structs                  - renamed functions and variables                  - restricted function and variable scopes               Changed by C. Fox, July, 1991.                  - added ANSI C declarations                   - branch tested to 90% coverage               Changed by M. Ylikoski, December, 2001.                  - made struct array initializations proper                  - cast a subscript of type char to int                  - Stem -> porter_stem                  - stem.c -> stem_porter.c                  - #include "stem_porter.h"               Changed by M. Ylikoski, August, 2002.                  - Fixed bug in EndsWithCVC: length must be >= 3   Notes:      This code will make little sense without the the Porter               article.  The stemming function converts its input to               lower case.**//************************   Standard Include Files   *************************/#include <stdio.h>#include <string.h>#include <ctype.h>#include "stem_porter.h"/*****************************************************************************//*****************   Private Defines and Data Structures   *******************/#define FALSE                         0#define TRUE                          1#define EOS                         '\0'#define IsVowel(c)        ('a'==(c)||'e'==(c)||'i'==(c)||'o'==(c)||'u'==(c))typedef struct {           int id;                 /* returned if rule fired */           char *old_end;          /* suffix replaced */           char *new_end;          /* suffix replacement */           int old_offset;         /* from end of word to start of suffix */           int new_offset;         /* from beginning to end of new suffix */           int min_root_size;      /* min root word size for replacement */           int (*condition)();     /* the replacement test function */           } RuleList;static char LAMBDA[1] = "";        /* the constant empty string */static char *end;                  /* pointer to the end of the word *//*****************************************************************************//********************   Private Function Declarations   **********************/#ifdef __STDC__static int WordSize( char *word );static int ContainsVowel( char *word );static int EndsWithCVC( char *word );static int AddAnE( char *word );static int RemoveAnE( char *word );static int ReplaceEnd( char *word, RuleList *rule );#elsestatic int WordSize( /* word */ );static int ContainsVowel( /* word */ );static int EndsWithCVC( /* word */ );static int AddAnE( /* word */ );static int RemoveAnE( /* word */ );static int ReplaceEnd( /* word, rule */ );#endif/******************************************************************************//*****************   Initialized Private Data Structures   ********************/static RuleList step1a_rules[] =           {	     { 101,  "sses",      "ss",    3,  1, -1,  NULL },             { 102,  "ies",       "i",     2,  0, -1,  NULL },             { 103,  "ss",        "ss",    1,  1, -1,  NULL },             { 104,  "s",         LAMBDA,  0, -1, -1,  NULL },             { 000,  NULL,        NULL,    0,  0,  0,  NULL },           };static RuleList step1b_rules[] =           {             { 105,  "eed",       "ee",    2,  1,  0,  NULL },             { 106,  "ed",        LAMBDA,  1, -1, -1,  ContainsVowel },             { 107,  "ing",       LAMBDA,  2, -1, -1,  ContainsVowel },             { 000,  NULL,        NULL,    0,  0,  0,  NULL },           };static RuleList step1b1_rules[] =           {             { 108,  "at",        "ate",   1,  2, -1,  NULL },             { 109,  "bl",        "ble",   1,  2, -1,  NULL },             { 110,  "iz",        "ize",   1,  2, -1,  NULL },             { 111,  "bb",        "b",     1,  0, -1,  NULL },             { 112,  "dd",        "d",     1,  0, -1,  NULL },             { 113,  "ff",        "f",     1,  0, -1,  NULL },             { 114,  "gg",        "g",     1,  0, -1,  NULL },             { 115,  "mm",        "m",     1,  0, -1,  NULL },             { 116,  "nn",        "n",     1,  0, -1,  NULL },             { 117,  "pp",        "p",     1,  0, -1,  NULL },             { 118,  "rr",        "r",     1,  0, -1,  NULL },             { 119,  "tt",        "t",     1,  0, -1,  NULL },             { 120,  "ww",        "w",     1,  0, -1,  NULL },             { 121,  "xx",        "x",     1,  0, -1,  NULL },             { 122,  LAMBDA,      "e",    -1,  0, -1,  AddAnE },             { 000,  NULL,        NULL,    0,  0,  0,  NULL },             };static RuleList step1c_rules[] =           {             { 123,  "y",         "i",      0,  0, -1,  ContainsVowel },             { 000,  NULL,        NULL,    0,  0,  0,  NULL },           };static RuleList step2_rules[] =           {             { 203,  "ational",   "ate",   6,  2,  0,  NULL },             { 204,  "tional",    "tion",  5,  3,  0,  NULL },             { 205,  "enci",      "ence",  3,  3,  0,  NULL },             { 206,  "anci",      "ance",  3,  3,  0,  NULL },             { 207,  "izer",      "ize",   3,  2,  0,  NULL },             { 208,  "abli",      "able",  3,  3,  0,  NULL },             { 209,  "alli",      "al",    3,  1,  0,  NULL },             { 210,  "entli",     "ent",   4,  2,  0,  NULL },             { 211,  "eli",       "e",     2,  0,  0,  NULL },             { 213,  "ousli",     "ous",   4,  2,  0,  NULL },             { 214,  "ization",   "ize",   6,  2,  0,  NULL },             { 215,  "ation",     "ate",   4,  2,  0,  NULL },             { 216,  "ator",      "ate",   3,  2,  0,  NULL },             { 217,  "alism",     "al",    4,  1,  0,  NULL },             { 218,  "iveness",   "ive",   6,  2,  0,  NULL },             { 219,  "fulnes",    "ful",   5,  2,  0,  NULL },             { 220,  "ousness",   "ous",   6,  2,  0,  NULL },             { 221,  "aliti",     "al",    4,  1,  0,  NULL },             { 222,  "iviti",     "ive",   4,  2,  0,  NULL },             { 223,  "biliti",    "ble",   5,  2,  0,  NULL },             { 000,  NULL,        NULL,    0,  0,  0,  NULL },           };static RuleList step3_rules[] =           {             { 301,  "icate",     "ic",    4,  1,  0,  NULL },             { 302,  "ative",     LAMBDA,  4, -1,  0,  NULL },             { 303,  "alize",     "al",    4,  1,  0,  NULL },             { 304,  "iciti",     "ic",    4,  1,  0,  NULL },             { 305,  "ical",      "ic",    3,  1,  0,  NULL },             { 308,  "ful",       LAMBDA,  2, -1,  0,  NULL },             { 309,  "ness",      LAMBDA,  3, -1,  0,  NULL },             { 000,  NULL,        NULL,    0,  0,  0,  NULL },           };static RuleList step4_rules[] =           {             { 401,  "al",        LAMBDA,  1, -1,  1,  NULL },             { 402,  "ance",      LAMBDA,  3, -1,  1,  NULL },             { 403,  "ence",      LAMBDA,  3, -1,  1,  NULL },             { 405,  "er",        LAMBDA,  1, -1,  1,  NULL },             { 406,  "ic",        LAMBDA,  1, -1,  1,  NULL },             { 407,  "able",      LAMBDA,  3, -1,  1,  NULL },             { 408,  "ible",      LAMBDA,  3, -1,  1,  NULL },             { 409,  "ant",       LAMBDA,  2, -1,  1,  NULL },             { 410,  "ement",     LAMBDA,  4, -1,  1,  NULL },             { 411,  "ment",      LAMBDA,  3, -1,  1,  NULL },             { 412,  "ent",       LAMBDA,  2, -1,  1,  NULL },             { 423,  "sion",      "s",     3,  0,  1,  NULL },             { 424,  "tion",      "t",     3,  0,  1,  NULL },             { 415,  "ou",        LAMBDA,  1, -1,  1,  NULL },             { 416,  "ism",       LAMBDA,  2, -1,  1,  NULL },             { 417,  "ate",       LAMBDA,  2, -1,  1,  NULL },             { 418,  "iti",       LAMBDA,  2, -1,  1,  NULL },             { 419,  "ous",       LAMBDA,  2, -1,  1,  NULL },             { 420,  "ive",       LAMBDA,  2, -1,  1,  NULL },             { 421,  "ize",       LAMBDA,  2, -1,  1,  NULL },             { 000,  NULL,        NULL,    0,  0,  0,  NULL },           };static RuleList step5a_rules[] =           {             { 501,  "e",         LAMBDA,  0, -1,  1,  NULL },             { 502,  "e",         LAMBDA,  0, -1, -1,  RemoveAnE },             { 000,  NULL,        NULL,    0,  0,  0,  NULL },           };static RuleList step5b_rules[] =           {             { 503,  "ll",        "l",     1,  0,  1,  NULL },             { 000,  NULL,        NULL,    0,  0,  0,  NULL },           };/*****************************************************************************//********************   Private Function Declarations   **********************//*FN***************************************************************************       WordSize( word )   Returns: int -- a weird count of word size in adjusted syllables   Purpose: Count syllables in a special way:  count the number             vowel-consonant pairs in a word, disregarding initial             consonants and final vowels.  The letter "y" counts as a            consonant at the beginning of a word and when it has a vowel            in front of it; otherwise (when it follows a consonant) it            is treated as a vowel.  For example, the WordSize of "cat"             is 1, of "any" is 1, of "amount" is 2, of "anything" is 3.   Plan:    Run a DFA to compute the word size   Notes:   The easiest and fastest way to compute this funny measure is            with a finite state machine.  The initial state 0 checks            the first letter.  If it is a vowel, then the machine changes            to state 1, which is the "last letter was a vowel" state.            If the first letter is a consonant or y, then it changes            to state 2, the "last letter was a consonant state".  In            state 1, a y is treated as a consonant (since it follows            a vowel), but in state 2, y is treated as a vowel (since            it follows a consonant.  The result counter is incremented            on the transition from state 1 to state 2, since this            transition only occurs after a vowel-consonant pair, which            is what we are counting.**/static intWordSize( word )   char *word;   /* in: word having its WordSize taken */   {   register int result;   /* WordSize of the word */   register int state;    /* current state in machine */   result = 0;   state = 0;                 /* Run a DFA to compute the word size */   while ( EOS != *word )
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -