📄 porter_english.dct

📁 PostgreSQL7.4.6 for Linux
💻 DCT
📖 第 1 页 / 共 2 页
字号:
12 下一页
/* * ----START-LICENCE---- * Copyright 1999,2000 BrightStation PLC * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 * USA * -----END-LICENCE----- *//* Version 1: see http://open.muscat.com/ for further information */#ifdef DICT_BODY#include <ctype.h> /* tolower */static void * setup_english_stemmer();static const char * english_stem(void * z, const char * q, int i0, int i1);static void closedown_english_stemmer(void * z);/* To set up the english stemming process:       void * z = setup_stemmer();   to use it:       char * p = stem(z, q, i0, i1);   The word to be stemmed is in byte address q offsets i0 to i1   inclusive (i.e. from q[i0] to q[i1]). The stemmed result is the   C string at address p.   To close down the stemming process:   closedown_stemmer(z);*//* The English stemming algorithm is essentially the Porter stemming * algorithm, and has been coded up by its author. It follows the algorithm * presented in * * Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14, * no. 3, pp 130-137, * * only differing from it at the points marked -DEPARTURE- and -NEW- * below. * * For a more faithful version of the Porter algorithm, see * *     http://www.muscat.com/~martin/stem.html * *//* Later additions:   June 2000   The 'l' of the 'logi' -> 'log' rule is put with the stem, so that   short stems like 'geo' 'theo' etc work like 'archaeo' 'philo' etc.   This follows a suggestion of Barry Wilkins, reasearch student at   Birmingham.   February 2000   the cvc test for not dropping final -e now looks after vc at the   beginning of a word, so are, eve, ice, ore, use keep final -e. In this   test c is any consonant, including w, x and y. This extension was   suggested by Chris Emerson.   -fully    -> -ful   treated like  -fulness -> -ful, and   -tionally -> -tion  treated like  -tional  -> -tion   both in Step 2. These were suggested by Hiranmay Ghosh, of New Delhi.   Invariants proceed, succeed, exceed. Also suggested by Hiranmay Ghosh.*/#include <stdio.h>#include <stdlib.h>#include <string.h>struct pool {    int size;    struct pool_entry * entries;};/*  This is used as a library to resolve exceptions in the various    stemming algorithms. Typical use is,        struct pool * p = create_pool(t);        char * s_translated = search_pool(p, strlen(s), s);        ...        free_pool(p);    t is an array of strings, e.g.        static char * t[] = {            "sky",     "sky/skies/",            "die",     "dying/",            "lie",     "lying/",            "tie",     "tying/",            ....            0, 0        };    if s is "sky", "skies", "dying" etc., translated_s is becomes "sky",    "sky", "die" etc.    The code includes a sort/merge capability which may be turned into    (or replaced by) something more general later on.*//*  merge(n, p, q, r, l, k, f) repeatedly merges n-byte sequences of items of    size k from addresses p and q into r. f is the comparison routine and    l is the limit point for q.*/static void merge(int n, char * p, char * q, char * r, char * l, int k,                  int (*f)(char *, char *)){  char * q0 = q;   if (q0 > l) { memmove(r, p, l-p); return; }   while (p < q0)   {  char * pl = n+p;      char * ql = n+q;      if (ql > l) ql = l;      while(true)      {  if (p >= pl) { memmove(r, q, ql-q); r += ql-q; q = ql; break; }         if (q >= ql) { memmove(r, p, pl-p); r += pl-p; p = pl; break; }         if (f(p, q)) { memmove(r, p, k); p += k; }                else { memmove(r, q, k); q += k; }         r += k;      }   }   memmove(r, q, l-q);}/*  In sort(p, c, k, f), p+c is a byte address at which begin a sequence of    items of size k to be sorted. p+l is the address of the byte after the    last of these items, so l - c is divisible by k. f is a comparison function    for a pair of these items: f(p+i, q+j) is true if the item at p+i is before    the item at q+j, false if it is equal to or after it.*/static void sort(char * p, int c, int l, int k,                 int (*f)(char *, char *)){    char * q = malloc(l-c);  /* temporary work space */    int j = k;    int w = l-c;    while (j < w)    {   int cycle;        for (cycle = 1; cycle <= 2; cycle++)        {   int h = (w+j-1) / j / 2 * j;     /* half way */            if (cycle == 1) merge(j, p+c, p+c+h, q, p+l, k, f);                       else merge(j, q, q+h, p+c, q+w, k, f);            j *= 2;        }    }    free(q);}struct pool_entry {    const char * translation;    const char * pointer;    int length;};static void print_entry(struct pool_entry * p)    {       { int j; for (j=0;j<p->length;j++) fprintf(stderr, "%c", (p->pointer)[j]); }       fprintf(stderr, " --> %s\n", p->translation);    }/*  - debugging aid    static void print_pool(struct pool * p)    {   int i;        int size = p->size;        struct pool_entry * q = p->entries;        fprintf(stderr, "\nPool:\n");        for (i = 0; i < size; i++) print_entry(q+i);    }*//* compare(p, q) is our comparison function, used for f above*/static int compare(char * char_p, char * char_q){   struct pool_entry * p = (struct pool_entry *) char_p;    struct pool_entry * q = (struct pool_entry *) char_q;    if (p->length == q->length) return memcmp(p->pointer, q->pointer, p->length) < 0;    return p->length < q->length;}static int count_slashes(const char * s[]){   int slash_count = 0;    int i;    for (i = 1; s[i] != 0; i += 2)    {   const char * p = s[i];        int j = 0;        while (p[j] != 0) if (p[j++] == '/') slash_count++;    }    return slash_count;}static struct pool * create_pool(const char * s[]){   int size = count_slashes(s);    struct pool_entry * z = (struct pool_entry *) malloc(size * sizeof(struct pool_entry));    struct pool_entry * q = z;    int i;    for (i = 1; s[i] != 0; i += 2)    {   const char * p = s[i];        int j = 0;        int j0 = 0;        while(true)        {   if (p[j] == 0)            {  if (j0 != j) { fprintf(stderr, "%s lacks final '/'\n", p); exit(1); }               break;            }            if (p[j] == '/')            {                q->translation = s[i-1];                q->pointer = p+j0; q->length = j-j0;                q++;                j0 = j+1;            }            j++;        }    }    sort((char *) z, 0, size * sizeof(struct pool_entry), sizeof(struct pool_entry), compare);    /* now validate the contents */    for (i = 1; i < size; i++)    {   struct pool_entry * p = z+i;        struct pool_entry * q = z+i-1;        if (p->length == q->length && memcmp(p->pointer, q->pointer, p->length) == 0)        {   fprintf(stderr, "warning: "); print_entry(p);            fprintf(stderr, " and "); print_entry(q);        }    }    {   struct pool * p = (struct pool *) malloc(sizeof(struct pool));        p->entries = z;        p->size = size;        return p;    }}static int compare_to_pool(int length, const char * s, int length_p, const char * s_p){   if (length != length_p) return length-length_p;    return memcmp(s, s_p, length);}static const char * search_pool(struct pool * p, int length, char * s){   int i = 0;    int j = p->size;    struct pool_entry * q = p->entries;    if (j == 0) return 0;  /* empty pool */    if (compare_to_pool(length, s, q->length, q->pointer) < 0) return 0;    while(true)    {        int h = (i+j)/2;        int diff = compare_to_pool(length, s, (q+h)->length, (q+h)->pointer);        if (diff == 0) return (q+h)->translation;        if (j-i <= 1) return 0;        if (diff < 0) j = h; else i = h;    }}static void free_pool(struct pool * p){   free(p->entries);    free(p);}struct english_stemmer{    char * p;    int p_size;    int k;    int j;    struct pool * irregulars;};/* The main part of the stemming algorithm starts here. z->p is a buffer   holding a word to be stemmed. The letters are in z->p[0], z->p[1] ...   ending at z->p[z->k]. z->k is readjusted downwards as the stemming   progresses. Zero termination is not in fact used in the algorithm.   Note that only lower case sequences are stemmed. Forcing to lower case   should be done before english_stem(...) is called.   We will write p, k etc in place of z->p, z->k in the comments.*//* cons(z, i) is true <=> p[i] is a consonant.*/static int cons(struct english_stemmer * z, int i){   switch (z->p[i])    {   case 'a': case 'e': case 'i': case 'o': case 'u':            return false;        case 'y':            return (i==0) ? true : !cons(z, i - 1);        default: return true;    }}/* m(z) measures the number of consonant sequences between 0 and j. if c is   a consonant sequence and v a vowel sequence, and <..> indicates arbitrary   presence,      <c><v>       gives 0      <c>vc<v>     gives 1      <c>vcvc<v>   gives 2      <c>vcvcvc<v> gives 3      ....*/static int m(struct english_stemmer * z){   int n = 0;    int i = 0;    while(true)    {   if (i > z->j) return n;        if (! cons(z, i)) break; i++;    }    i++;    while(true)    {   while(true)        {   if (i > z->j) return n;            if (cons(z, i)) break;            i++;        }        i++;        n++;        while(true)        {   if (i > z->j) return n;            if (! cons(z, i)) break;            i++;        }        i++;    }}/* vowelinstem(z) is true p[0], ... p[j] contains a vowel*/static int vowelinstem(struct english_stemmer * z){   int i;    for (i = 0; i <= z->j; i++) if (! cons(z, i)) return true;    return false;}/* doublec(z, i) is true <=> p[i], p[i - 1] contain a double consonant.*/static int doublec(struct english_stemmer * z, int i){   if (i < 1) return false;    if (z->p[i] != z->p[i - 1]) return false;    return cons(z, i);}/* cvc(z, i) is true <=>   a) ( -NEW- ) i == 1, and p[0] p[1] is vowel consonant, or   b) p[i - 2], p[i - 1], p[i] has the form consonant -      vowel - consonant and also if the second c is not w, x or y. this is used      when trying to restore an e at the end of a short word. e.g.         cav(e), lov(e), hop(e), crim(e), but         snow, box, tray.*/static int cvc(struct english_stemmer * z, int i){    if (i == 0) return false;   /* i == 0 never happens perhaps */    if (i == 1) return !cons(z, 0) && cons(z, 1);    if (!cons(z, i) || cons(z, i - 1) || !cons(z, i - 2)) return false;    {   int ch = z->p[i];        if (ch == 'w' || ch == 'x' || ch == 'y') return false;    }    return true;}/* ends(z, s, length) is true <=> p[0], ... p[k] ends with the string s.*/static int ends(struct english_stemmer * z, const char * s, int length){    if (length > z->k + 1) return false;    if (memcmp(z->p + z->k - length + 1, s, length) != 0) return false;    z->j = z->k - length;    return true;}/* setto(z, s, length) sets p[j + 1] ... to the characters in the string s,   readjusting k.*/static void setto(struct english_stemmer * z, const char * s, int length){    memmove(z->p + z->j + 1, s, length);    z->k = z->j + length;}/* r(z, s, length) is used further down. */static void r(struct english_stemmer * z, const char * s, int length){    if (m(z) > 0) setto(z, s, length);}/* step_1ab(z) gets rid of plurals and -ed or -ing. e.g.       caresses  ->  caress       ponies    ->  poni       sties     ->  sti       tie       ->  tie       (-NEW-: see below)       caress    ->  caress       cats      ->  cat       feed      ->  feed       agreed    ->  agree       disabled  ->  disable       matting   ->  mat       mating    ->  mate       meeting   ->  meet       milling   ->  mill       messing   ->  mess       meetings  ->  meet*/static void step_1ab(struct english_stemmer * z){   if (z->p[z->k] == 's')    {   if (ends(z, "sses", 4)) z->k -= 2; else        if (ends(z, "ies", 3))            if (z->j == 0) z->k--; else z->k -= 2;        /* this line extends the original algorithm, so that 'flies'->'fli' but           'dies'->'die' etc */        else            if (z->p[z->k - 1] != 's') z->k--;    }    if (ends(z, "ied", 3)) { if (z->j == 0) z->k--; else z->k -= 2; } else    /* this line extends the original algorithm, so that 'spied'->'spi' but       'died'->'die' etc */    if (ends(z, "eed", 3)) { if (m(z) > 0) z->k--; } else    if ((ends(z, "ed", 2) || ends(z, "ing", 3)) && vowelinstem(z))    {   z->k = z->j;        if (ends(z, "at", 2)) setto(z, "ate", 3); else        if (ends(z, "bl", 2)) setto(z, "ble", 3); else        if (ends(z, "iz", 2)) setto(z, "ize", 3); else        if (doublec(z, z->k))        {   z->k--;            {   int ch = z->p[z->k];                if (ch == 'l' || ch == 's' || ch == 'z') z->k++;            }        }        else if (m(z) == 1 && cvc(z, z->k)) setto(z, "e", 1);    }}/* step_1c(z) turns terminal y to i when there is another vowel in the stem.   -NEW-: This has been modified from the original Porter algorithm so that y->i   is only done when y is preceded by a consonant, but not if the stem   is only a single consonant, i.e.       (*c and not c) Y -> I   So 'happy' -> 'happi', but      'enjoy' -> 'enjoy'  etc   This is a much better rule. Formerly 'enjoy'->'enjoi' and 'enjoyment'->   'enjoy'. Step 1c is perhaps done too soon; but with this modification that   no longer really matters.   Also, the removal of the vowelinstem(z) condition means that 'spy', 'fly',   'try' ... stem to 'spi', 'fli', 'tri' and conflate with 'spied', 'tried',   'flies' ...*/static void step_1c(struct english_stemmer * z){    if (ends(z, "y", 1) && z->j > 0 && cons(z, z->k - 1)) z->p[z->k] = 'i';}/* step_2(z) maps double suffices to single ones. so -ization ( = -ize plus   -ation) maps to -ize etc. Note that the string before the suffix must give   m(z) > 0.*/static void step_2(struct english_stemmer * z){   switch (z->p[z->k - 1])    {        case 'a':            if (ends(z, "ational", 7)) { r(z, "ate", 3); break; }            if (ends(z, "tional", 6)) { r(z, "tion", 4); break; }            break;        case 'c':            if (ends(z, "enci", 4)) { r(z, "ence", 4); break; }            if (ends(z, "anci", 4)) { r(z, "ance", 4); break; }            break;        case 'e':            if (ends(z, "izer", 4)) { r(z, "ize", 3); break; }            break;        case 'l':            if (ends(z, "bli", 3)) { r(z, "ble", 3); break; } /*-DEPARTURE-*/     /* To match the published algorithm, replace this line with        case 'l':            if (ends(z, "abli", 4)) { r(z, "able", 4); break; }     */            if (ends(z, "alli", 4))            {                if (m(z) > 0) { setto(z, "al", 2); step_2(z); } /*-NEW-*/                break;            }            if (ends(z, "fulli", 5)) { r(z, "ful", 3); break; } /*-NEW-*/            if (ends(z, "entli", 5)) { r(z, "ent", 3); break; }            if (ends(z, "eli", 3)) { r(z, "e", 1); break; }            if (ends(z, "ousli", 5)) { r(z, "ous", 3); break; }            break;        case 'o':            if (ends(z, "ization", 7)) { r(z, "ize", 3); break; }            if (ends(z, "ation", 5)) { r(z, "ate", 3); break; }            if (ends(z, "ator", 4)) { r(z, "ate", 3); break; }            break;        case 's':            if (ends(z, "alism", 5)) { r(z, "al", 2); break; }            if (ends(z, "iveness", 7)) { r(z, "ive", 3); break; }            if (ends(z, "fulness", 7)) { r(z, "ful", 3); break; }            if (ends(z, "ousness", 7)) { r(z, "ous", 3); break; }            break;        case 't':            if (ends(z, "aliti", 5)) { r(z, "al", 2); break; }            if (ends(z, "iviti", 5)) { r(z, "ive", 3); break; }            if (ends(z, "biliti", 6)) { r(z, "ble", 3); break; }            break;        case 'g':            if (ends(z, "logi", 4))            {   z->j++;                /*-NEW-*/ /*(Barry Wilkins)*/                r(z, "og", 2); break;            } /*-DEPARTURE-*/     /* To match the published algorithm, delete this line */    }}/* step_3(z) deals with -ic-, -full, -ness etc. Similar strategy to step_2.*/static void step_3(struct english_stemmer * z){   switch (z->p[z->k])    {        case 'e':            if (ends(z, "icate", 5)) { r(z, "ic", 2); break; }            if (ends(z, "ative", 5)) { r(z, "", 0); break; }            if (ends(z, "alize", 5)) { r(z, "al", 2); break; }            break;        case 'i':            if (ends(z, "iciti", 5)) { r(z, "ic", 2); break; }            break;        case 'l':            if (ends(z, "ical", 4)) { r(z, "ic", 2); break; }            if (ends(z, "ful", 3)) { r(z, "", 0); break; }            break;        case 's':            if (ends(z, "ness", 4)) { r(z, "", 0); break; }            break;    }}/* step_4() takes off -ant, -ence etc., in context <c>vcvc<v>.*/static void step_4(struct english_stemmer * z){   switch (z->p[z->k - 1])    {   case 'a':            if (ends(z, "al", 2)) break; return;        case 'c':            if (ends(z, "ance", 4)) break;            if (ends(z, "ence", 4)) break; return;        case 'e':            if (ends(z, "er", 2)) break; return;        case 'i':            if (ends(z, "ic", 2)) break; return;        case 'l':            if (ends(z, "able", 4)) break;            if (ends(z, "ible", 4)) break; return;        case 'n':            if (ends(z, "ant", 3)) break;            if (ends(z, "ement", 5)) break;            if (ends(z, "ment", 4)) break;            if (ends(z, "ent", 3)) break; return;        case 'o':            if (ends(z, "ion", 3) && (z->p[z->j] == 's' ||                                    z->p[z->j] == 't')) break;            if (ends(z, "ou", 2)) break; return;            /* takes care of -ous */        case 's':            if (ends(z, "ism", 3)) break; return;        case 't':            if (ends(z, "ate", 3)) break;            if (ends(z, "iti", 3)) break; return;
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -