⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 stem.c

📁 信息检索中常用的技术
💻 C
📖 第 1 页 / 共 2 页
字号:
/*******************************   stem.c   ***********************************   Purpose:    Implementation of the Porter stemming algorithm documented                in: Porter, M.F., "An Algorithm For Suffix Stripping,"                Program 14 (3), July 1980, pp. 130-137.   Provenance: Written by B. Frakes and C. Cox, 1986.               Changed by C. Fox, 1990.                  - made measure function a DFA                  - restructured structs                  - renamed functions and variables                  - restricted function and variable scopes               Changed by C. Fox, July, 1991.                  - added ANSI C declarations                   - branch tested to 90% coverage   Notes:      This code will make little sense without the the Porter               article.  The stemming function converts its input to               lower case.**//************************   Standard Include Files   *************************/#include <stdio.h>#include <string.h>#include <ctype.h>/*****************************************************************************//*****************   Private Defines and Data Structures   *******************/#define FALSE                         0#define TRUE                          1#define EOS                         '\0'#define IsVowel(c)        ('a'==(c)||'e'==(c)||'i'==(c)||'o'==(c)||'u'==(c))typedef struct {           int id;                 /* returned if rule fired */           char *old_end;          /* suffix replaced */           char *new_end;          /* suffix replacement */           int old_offset;         /* from end of word to start of suffix */           int new_offset;         /* from beginning to end of new suffix */           int min_root_size;      /* min root word size for replacement */           int (*condition)();     /* the replacement test function */           } RuleList;static char LAMBDA[1] = "";        /* the constant empty string */static char *end;                  /* pointer to the end of the word *//*****************************************************************************//********************   Private Function Declarations   **********************/#ifdef __STDC__static int WordSize( char *word );static int ContainsVowel( char *word );static int EndsWithCVC( char *word );static int AddAnE( char *word );static int RemoveAnE( char *word );static int ReplaceEnd( char *word, RuleList *rule );#elsestatic int WordSize( /* word */ );static int ContainsVowel( /* word */ );static int EndsWithCVC( /* word */ );static int AddAnE( /* word */ );static int RemoveAnE( /* word */ );static int ReplaceEnd( /* word, rule */ );#endif/******************************************************************************//*****************   Initialized Private Data Structures   ********************/static RuleList step1a_rules[] =           {             101,  "sses",      "ss",    3,  1, -1,  NULL,             102,  "ies",       "i",     2,  0, -1,  NULL,             103,  "ss",        "ss",    1,  1, -1,  NULL,             104,  "s",         LAMBDA,  0, -1, -1,  NULL,             000,  NULL,        NULL,    0,  0,  0,  NULL,           };static RuleList step1b_rules[] =           {             105,  "eed",       "ee",    2,  1,  0,  NULL,             106,  "ed",        LAMBDA,  1, -1, -1,  ContainsVowel,             107,  "ing",       LAMBDA,  2, -1, -1,  ContainsVowel,             000,  NULL,        NULL,    0,  0,  0,  NULL,           };static RuleList step1b1_rules[] =           {             108,  "at",        "ate",   1,  2, -1,  NULL,             109,  "bl",        "ble",   1,  2, -1,  NULL,             110,  "iz",        "ize",   1,  2, -1,  NULL,             111,  "bb",        "b",     1,  0, -1,  NULL,             112,  "dd",        "d",     1,  0, -1,  NULL,             113,  "ff",        "f",     1,  0, -1,  NULL,             114,  "gg",        "g",     1,  0, -1,  NULL,             115,  "mm",        "m",     1,  0, -1,  NULL,             116,  "nn",        "n",     1,  0, -1,  NULL,             117,  "pp",        "p",     1,  0, -1,  NULL,             118,  "rr",        "r",     1,  0, -1,  NULL,             119,  "tt",        "t",     1,  0, -1,  NULL,             120,  "ww",        "w",     1,  0, -1,  NULL,             121,  "xx",        "x",     1,  0, -1,  NULL,             122,  LAMBDA,      "e",    -1,  0, -1,  AddAnE,             000,  NULL,        NULL,    0,  0,  0,  NULL,             };static RuleList step1c_rules[] =           {             123,  "y",         "i",      0,  0, -1,  ContainsVowel,             000,  NULL,        NULL,    0,  0,  0,  NULL,           };static RuleList step2_rules[] =           {             203,  "ational",   "ate",   6,  2,  0,  NULL,             204,  "tional",    "tion",  5,  3,  0,  NULL,             205,  "enci",      "ence",  3,  3,  0,  NULL,             206,  "anci",      "ance",  3,  3,  0,  NULL,             207,  "izer",      "ize",   3,  2,  0,  NULL,             208,  "abli",      "able",  3,  3,  0,  NULL,             209,  "alli",      "al",    3,  1,  0,  NULL,             210,  "entli",     "ent",   4,  2,  0,  NULL,             211,  "eli",       "e",     2,  0,  0,  NULL,             213,  "ousli",     "ous",   4,  2,  0,  NULL,             214,  "ization",   "ize",   6,  2,  0,  NULL,             215,  "ation",     "ate",   4,  2,  0,  NULL,             216,  "ator",      "ate",   3,  2,  0,  NULL,             217,  "alism",     "al",    4,  1,  0,  NULL,             218,  "iveness",   "ive",   6,  2,  0,  NULL,             219,  "fulnes",    "ful",   5,  2,  0,  NULL,             220,  "ousness",   "ous",   6,  2,  0,  NULL,             221,  "aliti",     "al",    4,  1,  0,  NULL,             222,  "iviti",     "ive",   4,  2,  0,  NULL,             223,  "biliti",    "ble",   5,  2,  0,  NULL,             000,  NULL,        NULL,    0,  0,  0,  NULL,           };static RuleList step3_rules[] =           {             301,  "icate",     "ic",    4,  1,  0,  NULL,             302,  "ative",     LAMBDA,  4, -1,  0,  NULL,             303,  "alize",     "al",    4,  1,  0,  NULL,             304,  "iciti",     "ic",    4,  1,  0,  NULL,             305,  "ical",      "ic",    3,  1,  0,  NULL,             308,  "ful",       LAMBDA,  2, -1,  0,  NULL,             309,  "ness",      LAMBDA,  3, -1,  0,  NULL,             000,  NULL,        NULL,    0,  0,  0,  NULL,           };static RuleList step4_rules[] =           {             401,  "al",        LAMBDA,  1, -1,  1,  NULL,             402,  "ance",      LAMBDA,  3, -1,  1,  NULL,             403,  "ence",      LAMBDA,  3, -1,  1,  NULL,             405,  "er",        LAMBDA,  1, -1,  1,  NULL,             406,  "ic",        LAMBDA,  1, -1,  1,  NULL,             407,  "able",      LAMBDA,  3, -1,  1,  NULL,             408,  "ible",      LAMBDA,  3, -1,  1,  NULL,             409,  "ant",       LAMBDA,  2, -1,  1,  NULL,             410,  "ement",     LAMBDA,  4, -1,  1,  NULL,             411,  "ment",      LAMBDA,  3, -1,  1,  NULL,             412,  "ent",       LAMBDA,  2, -1,  1,  NULL,             423,  "sion",      "s",     3,  0,  1,  NULL,             424,  "tion",      "t",     3,  0,  1,  NULL,             415,  "ou",        LAMBDA,  1, -1,  1,  NULL,             416,  "ism",       LAMBDA,  2, -1,  1,  NULL,             417,  "ate",       LAMBDA,  2, -1,  1,  NULL,             418,  "iti",       LAMBDA,  2, -1,  1,  NULL,             419,  "ous",       LAMBDA,  2, -1,  1,  NULL,             420,  "ive",       LAMBDA,  2, -1,  1,  NULL,             421,  "ize",       LAMBDA,  2, -1,  1,  NULL,             000,  NULL,        NULL,    0,  0,  0,  NULL,           };static RuleList step5a_rules[] =           {             501,  "e",         LAMBDA,  0, -1,  1,  NULL,             502,  "e",         LAMBDA,  0, -1, -1,  RemoveAnE,             000,  NULL,        NULL,    0,  0,  0,  NULL,           };static RuleList step5b_rules[] =           {             503,  "ll",        "l",     1,  0,  1,  NULL,             000,  NULL,        NULL,    0,  0,  0,  NULL,           };/*****************************************************************************//********************   Private Function Declarations   **********************//*FN***************************************************************************       WordSize( word )   Returns: int -- a weird count of word size in adjusted syllables   Purpose: Count syllables in a special way:  count the number             vowel-consonant pairs in a word, disregarding initial             consonants and final vowels.  The letter "y" counts as a            consonant at the beginning of a word and when it has a vowel            in front of it; otherwise (when it follows a consonant) it            is treated as a vowel.  For example, the WordSize of "cat"             is 1, of "any" is 1, of "amount" is 2, of "anything" is 3.   Plan:    Run a DFA to compute the word size   Notes:   The easiest and fastest way to compute this funny measure is            with a finite state machine.  The initial state 0 checks            the first letter.  If it is a vowel, then the machine changes            to state 1, which is the "last letter was a vowel" state.            If the first letter is a consonant or y, then it changes            to state 2, the "last letter was a consonant state".  In            state 1, a y is treated as a consonant (since it follows            a vowel), but in state 2, y is treated as a vowel (since            it follows a consonant.  The result counter is incremented            on the transition from state 1 to state 2, since this            transition only occurs after a vowel-consonant pair, which            is what we are counting.**/static intWordSize( word )   char *word;   /* in: word having its WordSize taken */   {   register int result;   /* WordSize of the word */   register int state;    /* current state in machine */   result = 0;   state = 0;                 /* Run a DFA to compute the word size */   while ( EOS != *word )      {      switch ( state )         {         case 0: state = (IsVowel(*word)) ? 1 : 2;                 break;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -