📄 porter_english.dct
字号:
case 'u': if (ends(z, "ous", 3)) break; return; case 'v': if (ends(z, "ive", 3)) break; return; case 'z': if (ends(z, "ize", 3)) break; return; default: return; } if (m(z) > 1) z->k = z->j;}/* step_5(z) removes a final -e if m(z) > 1, and changes -ll to -l if m(z) > 1.*/static void step_5(struct english_stemmer * z){ z->j = z->k; if (z->p[z->k] == 'e') { int a = m(z); if (a > 1 || (a == 1 && !cvc(z, z->k - 1))) z->k--; } if (z->p[z->k] == 'l' && doublec(z, z->k) && m(z) > 1) z->k--;}static const char * english_stem(void * z_, const char * q, int i0, int i1){ struct english_stemmer * z = (struct english_stemmer *) z_; int p_size = z->p_size; if (i1 - i0 + 50 > p_size) { free(z->p); p_size = i1 - i0 + 75; /* ample */ z->p_size = p_size; z->p = (char *) malloc(p_size); } memmove(z->p, q + i0, i1 - i0 + 1); z->k = i1 - i0; { const char * t = search_pool(z->irregulars, z->k + 1, z->p); if (t != 0) { z->k = strlen(t) - 1; return t; } } if (z->k > 1) /*-DEPARTURE-*/ /* With this line, strings of length 1 or 2 don't go through the stemming process, although no mention is made of this in the published algorithm. Remove the line to match the published algorithm. */ { step_1ab(z); step_1c(z); step_2(z); step_3(z); step_4(z); step_5(z); } z->p[z->k + 1] = 0; /* C string form for now */ return z->p;}/* -NEW- This is a table of irregular forms. It is quite short, but still reflects the errors actually drawn to Martin Porter's attention over a 20 year period! Extend it as necessary. The form of the table is: "p1" "s11/s12/s13/ ... /" "p2" "s21/s22/s23/ ... /" ... "pn" "sn1/sn2/sn3/ ... /" 0, 0 String sij is mapped to paradigm form pi, and the main stemming process is then bypassed.*/static const char * irregular_forms[] = { "sky", "sky/skies/", "die", "dying/", "lie", "lying/", "tie", "tying/", "news", "news/", "inning", "innings/inning/", "outing", "outings/outing/", "canning", "cannings/canning/", "howe", "howe/", /*-NEW-*/ "proceed", "proceed/", "exceed", "exceed/", "succeed", "succeed/", /* Hiranmay Ghosh */ 0, 0 /* terminator */};/* * is_stopword part */typedef struct { unsigned char val; unsigned char flag; unsigned char right; unsigned char child;} ESWNODE;/* is exists left tree ? */#define L 0x01/* finish word flag */#define F 0x02#define ISLEFT(x) (((ESWNODE*)x)->flag & L)#define ISFINISH(x) (((ESWNODE*)x)->flag & F)static ESWNODE engstoptree[] = { {'m',L,9,126}, {'d',L,4,71}, {'b',L,2,40}, {'a',F,0,14}, {'c',0,0,62}, {'f',L,2,79}, {'e',0,0,75}, {'h',0,1,90}, {'i',F,0,108}, {'t',L,4,177}, {'o',L,2,135}, {'n',0,0,131}, {'s',0,0,156}, {'v',L,2,210}, {'u',0,0,201}, {'w',0,1,211}, {'y',0,0,237}, {'m',L|F,5,0}, {'f',L,2,12}, {'b',0,0,7}, {'g',0,1,13}, {'l',0,0,17}, {'r',L,2,19}, {'n',F,0,16}, {'s',F,1,0}, {'t',F,0,0}, {'o',0,0,1}, {'u',0,1,2}, {'v',F,0,0}, {'t',F,0,0}, {'t',0,0,1}, {'e',0,0,1}, {'r',F,0,0}, {'a',0,0,1}, {'i',0,0,1}, {'n',F,0,1}, {'s',0,0,1}, {'t',F,0,0}, {'l',F,0,0}, {'d',F,1,0}, {'i',F,0,0}, {'e',F,0,0}, {'o',L,2,21}, {'e',F,0,3}, {'u',0,1,21}, {'y',F,0,0}, {'f',L,3,9}, {'c',0,1,4}, {'e',0,0,6}, {'l',0,1,8}, {'t',0,0,9}, {'a',0,0,1}, {'u',0,0,1}, {'s',F,0,0}, {'n',F,0,0}, {'o',0,0,1}, {'r',F,0,0}, {'o',0,0,1}, {'w',F,0,0}, {'w',0,0,1}, {'e',0,0,1}, {'e',0,0,1}, {'n',F,0,0}, {'t',0,0,1}, {'h',F,0,0}, {'t',F,0,0}, {'a',0,1,2}, {'o',0,0,2}, {'n',F,0,0}, {'u',0,0,1}, {'l',0,0,1}, {'d',F,0,0}, {'o',L|F,2,4}, {'i',0,0,2}, {'u',0,0,5}, {'d',F,0,0}, {'e',F,1,0}, {'w',0,0,1}, {'n',F,0,0}, {'r',0,0,1}, {'e',F,0,0}, {'a',0,0,1}, {'c',0,0,1}, {'h',F,0,0}, {'o',L,2,5}, {'e',0,0,3}, {'r',0,1,4}, {'u',0,0,5}, {'w',F,0,0}, {'r',F,0,0}, {'o',0,0,1}, {'m',F,0,0}, {'r',0,0,1}, {'t',0,0,1}, {'h',0,0,1}, {'e',0,0,1}, {'r',F,0,0}, {'e',L|F,2,7}, {'a',F,0,3}, {'i',F,1,11}, {'o',0,0,15}, {'d',F,1,0}, {'v',0,0,1}, {'e',F,0,0}, {'r',F,0,1}, {'e',F,1,0}, {'s',0,0,1}, {'e',0,0,1}, {'l',0,0,1}, {'f',F,0,0}, {'m',F,0,1}, {'s',0,0,1}, {'e',0,0,1}, {'l',0,0,1}, {'f',F,0,0}, {'w',F,0,0}, {'n',L|F,2,4}, {'f',F,0,0}, {'s',F,1,0}, {'t',F,0,3}, {'t',0,0,1}, {'o',F,0,0}, {'s',0,0,1}, {'e',0,0,1}, {'l',0,0,1}, {'f',F,0,0}, {'o',L,3,6}, {'a',0,1,4}, {'e',F,0,0}, {'u',0,1,7}, {'y',F,0,8}, {'y',F,0,0}, {'r',0,1,2}, {'s',0,0,2}, {'e',F,0,0}, {'t',F,0,0}, {'s',0,0,1}, {'t',F,0,0}, {'s',0,0,1}, {'e',0,0,1}, {'l',0,0,1}, {'f',F,0,0}, {'o',F,0,1}, {'r',F,1,0}, {'t',F,0,0}, {'t',L,4,11}, {'n',L|F,2,7}, {'f',F,0,5}, {'r',F,0,0}, {'v',L,2,16}, {'u',0,0,9}, {'w',0,0,16}, {'f',F,0,0}, {'c',F,1,0}, {'l',0,0,1}, {'i',F,0,0}, {'h',0,0,1}, {'e',0,0,1}, {'r',F,0,0}, {'r',F,1,2}, {'t',F,0,0}, {'s',0,0,1}, {'e',0,0,1}, {'l',0,0,1}, {'v',F,0,0}, {'e',0,0,1}, {'r',F,0,0}, {'n',F,0,0}, {'h',L,2,6}, {'a',0,0,3}, {'o',F,1,12}, {'u',0,0,13}, {'m',0,0,1}, {'e',F,0,0}, {'e',L|F,2,0}, {'a',0,0,2}, {'o',0,0,3}, {'l',0,0,1}, {'l',F,0,0}, {'u',0,0,1}, {'l',0,0,1}, {'d',F,0,0}, {'m',0,0,1}, {'e',F,0,0}, {'c',0,0,1}, {'h',F,0,0}, {'h',0,1,2}, {'o',F,0,27}, {'i',L|F,3,0}, {'a',0,1,4}, {'e',F,0,5}, {'o',0,1,17}, {'r',0,0,18}, {'n',F,1,0}, {'t',F,0,0}, {'n',L|F,3,0}, {'i',0,1,5}, {'m',F,0,5}, {'s',L,2,9}, {'r',0,0,7}, {'y',F,0,0}, {'r',F,0,0}, {'s',0,0,1}, {'e',0,0,1}, {'l',0,0,1}, {'v',F,0,0}, {'e',F,0,0}, {'e',F,0,0}, {'s',0,0,1}, {'e',F,0,0}, {'o',0,0,1}, {'u',0,0,1}, {'g',0,0,1}, {'h',F,0,0}, {'o',F,0,0}, {'n',0,1,2}, {'p',F,0,0}, {'d',0,1,2}, {'t',0,0,3}, {'e',0,0,1}, {'r',F,0,0}, {'i',0,0,1}, {'l',F,0,0}, {'e',0,0,1}, {'r',0,0,1}, {'i',F,0,0}, {'h',L,3,7}, {'a',F,1,0}, {'e',F,0,3}, {'i',0,1,17}, {'o',0,0,20}, {'r',0,0,1}, {'e',F,0,0}, {'e',L,2,5}, {'a',0,0,3}, {'i',F,1,6}, {'o',F,0,9}, {'t',F,0,0}, {'n',F,1,0}, {'r',0,0,1}, {'e',F,0,0}, {'c',0,1,2}, {'l',0,0,2}, {'h',F,0,0}, {'e',F,0,0}, {'m',F,0,0}, {'l',0,1,2}, {'t',0,0,2}, {'l',F,0,0}, {'h',F,0,0}, {'u',0,0,1}, {'l',0,0,1}, {'d',F,0,0}, {'o',0,0,1}, {'u',F,0,1}, {'r',F,0,1}, {'s',0,0,1}, {'e',0,0,1}, {'l',0,0,1}, {'f',F,1,0}, {'v',F,0,0}};static unsigned intfind_english_stopword( unsigned char *buf, int len ) { ESWNODE *ptr = engstoptree; int result = 0; unsigned char *cur = buf; while( cur - buf < len ) { if ( ptr->val == *cur ) { cur++; if ( ISFINISH(ptr) ) result = cur - buf; if ( ! ptr->child ) break; ptr += ptr->child; } else if ( ptr->val > *cur ) { if ( ISLEFT(ptr) ) ptr++; else break; } else { if ( ptr->right ) ptr += ptr->right; else break; } } return result;} #undef L#undef F#undef ISLEFT#undef ISFINISHstatic intis_stopengword(void* obj,char* word,int len) { return ( len == find_english_stopword((unsigned char*)word, len) ) ? 1 : 0;}static void * setup_english_stemmer(){ struct english_stemmer * z = (struct english_stemmer *) malloc(sizeof(struct english_stemmer)); z->p = 0; z->p_size = 0; z->irregulars = create_pool(irregular_forms); return (void *) z;}static void closedown_english_stemmer(void * z_){ struct english_stemmer * z = (struct english_stemmer *) z_; free_pool(z->irregulars); free(z->p); free(z);}static char*engstemming(void* obj, char *word, int *len) { struct english_stemmer * z = (struct english_stemmer *) obj; const char* stemmed_word; char *result = word; while(result-word < *len) { *result = tolower((unsigned char) *result); result++; } stemmed_word = english_stem(obj, word, 0, *len-1); *len = z->k + 1; result = (char*)palloc( *len ); memcpy((void*)result, (void*)stemmed_word, *len); return result;}#endif /* DICT_BODY */#ifdef DICT_TABLETABLE_DICT_START "C", setup_english_stemmer, closedown_english_stemmer, engstemming, NULL, is_stopengwordTABLE_DICT_END#endif
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -