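// File: stemming(porter edition).cpp
// (The code-viewer page this listing was copied from showed a "font size" control here; it is not part of the program.)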
//********************************************************************************************
// This program is based on the porter.c program that my teacher gave me.
//********************************************************************************************
/* ==== :INFO/STRIP.C ==== */
/*
 * AUTHOR : Stuart J. Barr
 * DATE:  : c. September 1986, give or take a few months...
 */
/*
 * to save confusion...
 * USAGE : define KEYWORDSIZE (6 is a good value, in practise)
 *         define FALSE, TRUE
 *         if you want to strip prefixes, define PREFIXES
 *         Write a wee C function to call
 *             strip_affixes(string)
 *         where string is a char *.
 */
//********************************************************************************************
#include <stdio.h>
#include <iostream.h>
#include <string.h>
#include <ctype.h>
#include <fstream.h>
//********************************************************************************************
#define TRUE 1
#define FALSE 0
#define EOS '\0'
#define KEYWORDSIZE 25
#define PREFIXES 1
#define BIG_KEYWORDSIZE (KEYWORDSIZE+20)
//********************************************************************************************
void to_lower_case(char*);
void clean(char*);
/*
 * Forward declarations for the stemmer.
 *
 * Every function now spells out its return type: the declarations of
 * isvalid, measure and has_suffix used to rely on "implicit int", which C99
 * and C++ both reject.  Parameter types are unchanged.
 */
void strip_affixes (char *string);      /* lower-case, clean, strip prefixes/suffixes, truncate */
void strip_prefixes (char *string);     /* remove one leading prefix from a fixed list */
static int isvalid (char l);            /* 0 for a letter or digit, 1 for anything else */
static int measure (char *stem);        /* number of vowel-run/consonant transitions */
static int has_suffix (char *word, char *suffix, char *stem);  /* TRUE/FALSE; fills stem on TRUE */
static void step_1 (char *string);
static void step_2 (char *string);
static void step_3 (char *string);
static void step_4 (char *string);
static void step_5 (char *string);
static void strip_suffixes (char *string);  /* runs step_1 .. step_5 in order */
static int cvc (char *string);          /* TRUE when string ends consonant-vowel-consonant (last not w/x/y) */
static int vowel (char ch, char prev);  /* TRUE when ch is a vowel, given the preceding letter */
static int contains_vowel (char *word); /* TRUE when any letter of word is a vowel */
//********************************************************************************************
/*
 * Fallbacks for the configuration macros the stemmer relies on.  Each one is
 * defined here only when the file preamble has not already defined it, so the
 * preamble's values always win.  PREFIXES is deliberately not given a
 * fallback: it is an opt-in switch.
 */
#ifndef TRUE
#define TRUE 1
#endif
#ifndef FALSE
#define FALSE 0
#endif
#ifndef KEYWORDSIZE
#define KEYWORDSIZE 25
#endif
#ifndef BIG_KEYWORDSIZE
#define BIG_KEYWORDSIZE (KEYWORDSIZE+20)
#endif

//********************************************************************************************
/*
 * Returns TRUE when ch counts as a vowel, FALSE otherwise.
 *
 * 'a', 'e', 'i', 'o' and 'u' are always vowels.  'y' is a vowel only when the
 * preceding letter prev is a consonant (Porter: in "toy" the y is a
 * consonant, in "syzygy" it is a vowel).  The previous code had this test
 * inverted.  Callers pass 'a' as prev for the first letter of a word, which
 * makes a leading 'y' a consonant.
 */
static int vowel(char ch, char prev)
{
    switch (ch) {
    case 'a':
    case 'e':
    case 'i':
    case 'o':
    case 'u':
        return TRUE;
    case 'y':
        return vowel(prev, '?') == FALSE;
    default:
        return FALSE;
    }
}

/* Vowel test for s[i], using 'a' as the virtual predecessor of s[0]. */
static int vowel_at(char *s, int i)
{
    return vowel(s[i], (i > 0) ? s[i - 1] : 'a');
}

//********************************************************************************************
/*
 * Porter's measure m of stem: the number of times a run of vowels is
 * followed by a consonant ([C](VC)^m[V]).
 */
static int measure(char *stem)
{
    int length = (int)strlen(stem);
    int count = 0;
    int i = 0;

    while (i < length) {
        while (i < length && vowel_at(stem, i) == FALSE)   /* skip consonants */
            i++;
        while (i < length && vowel_at(stem, i) == TRUE)    /* skip the vowel run */
            i++;
        if (i < length)                                    /* a consonant closes one VC pair */
            count++;
    }
    return count;
}

//********************************************************************************************
/* Returns TRUE when at least one letter of word is a vowel, FALSE otherwise. */
static int contains_vowel(char *word)
{
    int length = (int)strlen(word);
    int i;

    for (i = 0; i < length; i++) {
        if (vowel_at(word, i) == TRUE)
            return TRUE;
    }
    return FALSE;
}

//********************************************************************************************
/*
 * Returns TRUE when string ends consonant-vowel-consonant and the final
 * consonant is not 'w', 'x' or 'y'; FALSE otherwise (including for strings
 * shorter than three letters).
 *
 * The third-from-last letter is tested with vowel_at(), so for a
 * three-letter word no byte in front of the string is read.
 */
static int cvc(char *string)
{
    int length = (int)strlen(string);
    char last;

    if (length < 3)
        return FALSE;

    last = string[length - 1];
    if (vowel_at(string, length - 1) == FALSE &&
        last != 'w' && last != 'x' && last != 'y' &&
        vowel_at(string, length - 2) == TRUE &&
        vowel_at(string, length - 3) == FALSE)
        return TRUE;
    return FALSE;
}

//********************************************************************************************
/*
 * Tests whether word ends with suffix and is strictly longer than it.
 *
 * On TRUE, the part of word in front of the suffix is copied to stem, which
 * must have room for BIG_KEYWORDSIZE bytes.  On FALSE, stem is left
 * untouched.  A stem that would not fit in BIG_KEYWORDSIZE bytes is reported
 * as FALSE rather than overflowing the caller's buffer.
 */
static int has_suffix(char *word, char *suffix, char *stem)
{
    size_t word_len = strlen(word);
    size_t suffix_len = strlen(suffix);
    size_t stem_len;

    if (word_len <= suffix_len)
        return FALSE;

    stem_len = word_len - suffix_len;
    if (stem_len >= (size_t)BIG_KEYWORDSIZE)
        return FALSE;
    if (strcmp(word + stem_len, suffix) != 0)
        return FALSE;

    memcpy(stem, word, stem_len);
    stem[stem_len] = '\0';
    return TRUE;
}

/* Appends a single 'e' to string; the caller guarantees there is room for it. */
static void append_e(char *string)
{
    size_t len = strlen(string);

    string[len] = 'e';
    string[len + 1] = '\0';
}

/*
 * Applies the first rule of a {suffix, replacement} table (terminated by a
 * NULL suffix) whose suffix matches string and whose stem has measure > 0.
 * A rule that matches but fails the measure test does not stop the search.
 * Every replacement in the tables used here is no longer than the suffix it
 * replaces, so the rewrite happens in place and cannot grow the string.
 */
static void replace_suffix(char *string, char *rules[][2])
{
    char stem[BIG_KEYWORDSIZE];
    int i;

    for (i = 0; rules[i][0] != NULL; i++) {
        if (has_suffix(string, rules[i][0], stem) == TRUE && measure(stem) > 0) {
            strcpy(string + strlen(stem), rules[i][1]);
            return;
        }
    }
}

//********************************************************************************************
/*
 * Step 1: plurals ("sses", "ies", "s"), past participles and gerunds
 * ("eed", "ed", "ing") and a final "y" -> "i" when the stem has a vowel.
 * Empty and one-letter strings are handled without indexing in front of the
 * buffer.
 */
static void step_1(char *string)
{
    char stem[BIG_KEYWORDSIZE];
    size_t len = strlen(string);

    if (len == 0)
        return;

    /* 1a: plurals */
    if (string[len - 1] == 's') {
        if (has_suffix(string, "sses", stem) == TRUE ||
            has_suffix(string, "ies", stem) == TRUE)
            string[len - 2] = '\0';
        else if (len >= 2 && string[len - 2] != 's')
            string[len - 1] = '\0';
    }

    /* 1b: "eed", "ed", "ing" */
    if (has_suffix(string, "eed", stem) == TRUE) {
        if (measure(stem) > 0)
            string[strlen(string) - 1] = '\0';
    } else if ((has_suffix(string, "ed", stem) == TRUE ||
                has_suffix(string, "ing", stem) == TRUE) &&
               contains_vowel(stem) == TRUE) {
        string[strlen(stem)] = '\0';
        len = strlen(string);

        if (has_suffix(string, "at", stem) == TRUE ||
            has_suffix(string, "bl", stem) == TRUE ||
            has_suffix(string, "iz", stem) == TRUE) {
            append_e(string);
        } else if (len >= 2 &&
                   string[len - 1] == string[len - 2] &&
                   vowel(string[len - 1], string[len - 2]) == FALSE &&
                   string[len - 1] != 'l' &&
                   string[len - 1] != 's' &&
                   string[len - 1] != 'z') {
            /* Double consonant (not ll/ss/zz): drop one.  Doubled vowels
             * such as the "ee" of "see" are left alone. */
            string[len - 1] = '\0';
        } else if (measure(string) == 1 && cvc(string) == TRUE) {
            append_e(string);
        }
    }

    /* 1c: y -> i */
    if (has_suffix(string, "y", stem) == TRUE && contains_vowel(stem) == TRUE)
        string[strlen(string) - 1] = 'i';
}

//********************************************************************************************
/* Step 2: maps double suffixes to single ones when the stem has measure > 0. */
static void step_2(char *string)
{
    static char *suffixes[][2] = {
        { "ational", "ate" }, { "tional", "tion" }, { "enci", "ence" },
        { "anci", "ance" },   { "izer", "ize" },    { "iser", "ize" },
        { "abli", "able" },   { "alli", "al" },     { "entli", "ent" },
        { "eli", "e" },       { "ousli", "ous" },   { "ization", "ize" },
        { "isation", "ize" }, { "ation", "ate" },   { "ator", "ate" },
        { "alism", "al" },    { "iveness", "ive" }, { "fulness", "ful" },
        { "ousness", "ous" }, { "aliti", "al" },    { "iviti", "ive" },
        { "biliti", "ble" },  { 0, 0 }
    };

    replace_suffix(string, suffixes);
}

//********************************************************************************************
/* Step 3: shortens or removes further suffixes when the stem has measure > 0. */
static void step_3(char *string)
{
    static char *suffixes[][2] = {
        { "icate", "ic" }, { "ative", "" },   { "alize", "al" },
        { "alise", "al" }, { "iciti", "ic" }, { "ical", "ic" },
        { "ful", "" },     { "ness", "" },    { 0, 0 }
    };

    replace_suffix(string, suffixes);
}

//********************************************************************************************
/*
 * Step 4: removes the first listed suffix whose stem has measure > 1.
 *
 * NOTE(review): Porter's paper removes only "ion" after a stem ending in s
 * or t; this table removes "sion"/"tion" whole.  Left as it was.
 */
static void step_4(char *string)
{
    static char *suffixes[] = {
        "al", "ance", "ence", "er", "ic", "able", "ible", "ant", "ement",
        "ment", "ent", "sion", "tion", "ou", "ism", "ate", "iti", "ous",
        "ive", "ize", "ise", 0
    };
    char stem[BIG_KEYWORDSIZE];
    int i;

    for (i = 0; suffixes[i] != NULL; i++) {
        if (has_suffix(string, suffixes[i], stem) == TRUE && measure(stem) > 1) {
            string[strlen(stem)] = '\0';
            return;
        }
    }
}

//********************************************************************************************
/*
 * Step 5: drops a final 'e' (when measure > 1, or measure == 1 and the stem
 * does not end consonant-vowel-consonant) and reduces a final "ll" to "l"
 * when measure > 1.
 */
static void step_5(char *string)
{
    size_t len = strlen(string);

    if (len > 0 && string[len - 1] == 'e') {
        /* measure(string) equals measure(stem) because the string ends in a vowel */
        int m = measure(string);

        if (m > 1) {
            string[--len] = '\0';
        } else if (m == 1) {
            /* Inspect the stem in place instead of copying it to a buffer. */
            string[len - 1] = '\0';
            if (cvc(string) == TRUE)
                string[len - 1] = 'e';      /* stem ends cvc: keep the e */
            else
                len--;
        }
    }

    if (len >= 2 &&
        string[len - 1] == 'l' &&
        string[len - 2] == 'l' &&
        measure(string) > 1)
        string[len - 1] = '\0';
}

//********************************************************************************************
/*
 * Runs the five suffix-stripping steps on string, in place.
 *
 * Empty strings are returned unchanged.  So are strings of BIG_KEYWORDSIZE
 * or more characters: the working buffers of the steps hold BIG_KEYWORDSIZE
 * bytes, so such words are left unstemmed rather than overflowing them.
 */
static void strip_suffixes(char *string)
{
    size_t len = strlen(string);

    if (len == 0 || len >= (size_t)BIG_KEYWORDSIZE)
        return;

    step_1(string);
    step_2(string);
    step_3(string);
    step_4(string);
    step_5(string);
}

//********************************************************************************************
/*
 * Despite its name, returns 0 when l is a letter ('a'-'z', 'A'-'Z') or a
 * digit ('0'-'9'), and 1 for every other character.
 */
static int isvalid(char l)
{
    if (l >= 'a' && l <= 'z')
        return 0;
    if (l >= 'A' && l <= 'Z')
        return 0;
    if (l >= '0' && l <= '9')
        return 0;
    return 1;
}

//********************************************************************************************
/* Converts the ASCII letters 'A'-'Z' in kwd to lower case, in place. */
void to_lower_case(char *kwd)
{
    char *p;

    for (p = kwd; *p != '\0'; p++) {
        if (*p >= 'A' && *p <= 'Z')
            *p += 'a' - 'A';
    }
}

//********************************************************************************************
/* Removes every character that is not a letter or digit from kwd, in place. */
void clean(char *kwd)
{
    char *in;
    char *out = kwd;

    for (in = kwd; *in != '\0'; in++) {
        if (isvalid(*in) == 0)      /* 0 means "keep" */
            *out++ = *in;
    }
    *out = '\0';
}

//********************************************************************************************
#ifdef PREFIXES
/*
 * Removes the first prefix from the list below that string starts with, in
 * place.  At most one prefix is removed.  A string equal to a prefix becomes
 * the empty string.
 */
void strip_prefixes(char *string)
{
    static char *prefixes[] = {
        "kilo", "micro", "milli", "intra", "ultra",
        "mega", "nano", "pico", "pseudo", 0
    };
    int i;

    for (i = 0; prefixes[i] != NULL; i++) {
        size_t prefix_len = strlen(prefixes[i]);

        if (strncmp(prefixes[i], string, prefix_len) == 0) {
            /* shift the remainder, terminator included, to the front */
            memmove(string, string + prefix_len, strlen(string + prefix_len) + 1);
            return;
        }
    }
}
#endif /* PREFIXES */

//********************************************************************************************
/*
 * Reduces string to its stem, in place: lower-cases it, removes characters
 * that are not letters or digits, strips one prefix (only when PREFIXES is
 * defined), strips suffixes, and finally truncates the result to KEYWORDSIZE
 * characters.
 *
 * The truncation writes to string[KEYWORDSIZE] only when the result really
 * is longer than that, so callers may pass buffers shorter than
 * KEYWORDSIZE + 1 bytes for short words.
 */
void strip_affixes(char *string)
{
    to_lower_case(string);
    clean(string);
#ifdef PREFIXES
    strip_prefixes(string);
#endif /* PREFIXES */
    strip_suffixes(string);
    if (strlen(string) > (size_t)KEYWORDSIZE)
        string[KEYWORDSIZE] = '\0';
}
//********************************************************************************************
int main()
{
ifstream infile;
ofstream outfile;
char character;
char filename[30];
cout << "*********************英文单词stemming处理程序**********************"
<< endl << endl;
cout << "请输入需处理文件的文件名(.txt):";
cin >> filename;
infile.open(filename);
outfile.open("outfile.txt");
if ( !infile )
{
cout << "无法打开指定的文件,请重试."
<< endl;
return 1;
}
infile.get(character);
while ( infile )
{
int count = 0;
char* word = new char[50];
while ( isalnum (character) )
{
word[count] = character;
count++;
infile.get(character);
}
word[count] = EOS;
strip_suffixes (word);
outfile << word << " ";
delete [] word;
infile.get(character);
}
outfile << endl;
cout << "文件处理完毕,处理的结果保存在outfile.txt中." << endl;
return 0;
}
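/*
 * Keyboard shortcuts of the code-viewer page this listing was copied from
 * (not part of the program):
 *   Copy code               Ctrl + C
 *   Search code             Ctrl + F
 *   Full-screen mode        F11
 *   Switch theme            Ctrl + Shift + D
 *   Show shortcuts          ?
 *   Increase font size      Ctrl + =
 *   Decrease font size      Ctrl + -
 */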