📄 stemming(porter edition).cpp

📁 中文切词程序和相关代码
💻 CPP
字号:
//********************************************************************************************
//  此程序是基于老师给我的porter.c程序为基础的
//********************************************************************************************
/* ==== :INFO/STRIP.C ==== */
/*
 * AUTHOR	: Stuart J. Barr
 * DATE:	: c. September 1986, give or take a few months...
 */
/*
 * to save confusion...
 * USAGE	: define KEYWORDSIZE (6 is a good value, in practise)
 *		  define FALSE, TRUE
 *		  if you want to strip prefixes, define PREFIXES
 *		  Write a wee C function to call
 *			strip_affixes(string)
 *		  where string is a char *.
 */
//********************************************************************************************
#include <ctype.h>
#include <stdio.h>
#include <string.h>

/* Standard replacements for the pre-ISO <iostream.h> and <fstream.h> headers. */
#include <fstream>
#include <iostream>

/*
 * The pre-ISO headers declared these names in the global namespace; the
 * using-declarations keep any unqualified use of them compiling.
 */
using std::cin;
using std::cout;
using std::endl;
using std::ifstream;
using std::ofstream;
//********************************************************************************************
#define TRUE   1
#define FALSE  0
/*
 * Tunables for the affix stripper.
 *   EOS             - string terminator.
 *   KEYWORDSIZE     - strip_affixes() cuts its result to at most this many
 *                     characters.
 *   PREFIXES        - when defined, strip_affixes() also removes the prefixes
 *                     listed in strip_prefixes().
 *   BIG_KEYWORDSIZE - size of the scratch buffers that receive a candidate
 *                     stem; has_suffix() refuses stems that would not fit.
 */
#define EOS '\0'
#define KEYWORDSIZE 25
#define PREFIXES 1
#define BIG_KEYWORDSIZE (KEYWORDSIZE+20)
//********************************************************************************************
/* strip_affixes() is the public entry point; everything else is file-local. */
void    strip_affixes (char *string);
static void to_lower_case (char *kwd);
static void clean (char *kwd);
#ifdef PREFIXES
static void strip_prefixes (char *string);
#endif /* PREFIXES */
static int isvalid (char l);
static int measure (const char *stem);
static int has_suffix (const char *word, const char *suffix, char *stem);
static int replace_suffix (char *string, const char *suffix,
			   const char *replacement, int min_measure);
static void step_1 (char *string);
static void step_2 (char *string);
static void step_3 (char *string);
static void step_4 (char *string);
static void step_5 (char *string);
static void strip_suffixes (char *string);
static int cvc (const char *string);
static int cvc_n (const char *string, int length);
static int vowel (char ch, char prev);
static int vowel_at (const char *word, int i);
static int contains_vowel (const char *word);
//********************************************************************************************
/*
 * Reduce `string` to its stem, in place: lower-case it, drop every character
 * that is not a letter or digit, strip a known prefix (when PREFIXES is
 * defined), run the five suffix steps and finally cut the result to at most
 * KEYWORDSIZE characters.  The string never grows.
 */
void strip_affixes (char *string)
{
	to_lower_case(string);
	clean(string);
#ifdef PREFIXES
	strip_prefixes(string);
#endif /* PREFIXES */
	strip_suffixes(string);
	/* Only truncate when the word really is too long: writing at index
	 * KEYWORDSIZE unconditionally would land outside a short buffer. */
	if ( strlen(string) > KEYWORDSIZE )
		string[KEYWORDSIZE] = EOS;
}
//********************************************************************************************
/* Fold the ASCII letters 'A'..'Z' in `kwd` to lower case, in place. */
static void to_lower_case (char *kwd)
{
	for ( ; *kwd != EOS ; kwd++ )
	{
		if ( (*kwd >= 'A') && (*kwd <= 'Z') )
			*kwd += 'a' - 'A';
	}
}
//********************************************************************************************
/* Remove from `kwd`, in place, every character that isvalid() rejects. */
static void clean (char *kwd)
{
	const char *src;
	char *dst = kwd;

	/* Single compaction pass: keep accepted characters, skip the rest. */
	for ( src = kwd ; *src != EOS ; src++ )
	{
		if ( isvalid(*src) == 0 )
			*dst++ = *src;
	}
	*dst = EOS;
}
//********************************************************************************************
/*
 * Classify a character.  Note the inverted sense: returns 0 when `l` is an
 * ASCII letter or digit (keep it) and 1 for anything else (drop it).
 */
static int isvalid (char l)
{
	if ( (l >= 'a') && (l <= 'z') )
		return(0);
	if ( (l >= 'A') && (l <= 'Z') )
		return(0);
	if ( (l >= '0') && (l <= '9') )
		return(0);
	return(1);
}
//********************************************************************************************
#ifdef PREFIXES
/*
 * Remove the first matching prefix from the table below, in place.  At most
 * one prefix is removed.  A word that consists of nothing but the prefix
 * becomes the empty string.
 */
static void strip_prefixes (char *string)
{
	static const char *const prefixes[] = { "kilo", "micro", "milli", "intra", "ultra",
					"mega", "nano", "pico", "pseudo", 0 };
	int i;

	for ( i = 0 ; prefixes[i] != 0 ; i++ )
	{
		size_t prefix_len = strlen(prefixes[i]);

		if ( strncmp(prefixes[i], string, prefix_len) == 0 )
		{
			/* Slide the remainder (terminator included) to the front. */
			memmove(string, string + prefix_len, strlen(string + prefix_len) + 1);
			return;
		}
	}
}
#endif /* PREFIXES */
//********************************************************************************************
/* Run the five suffix-stripping steps over `string`, in place and in order. */
static void strip_suffixes (char *string)
{
	step_1 ( string );
	step_2 ( string );
	step_3 ( string );
	step_4 ( string );
	step_5 ( string );
}
//********************************************************************************************
/*
 * Step 1: plurals ("sses", "ies", "s"), then "eed" / "ed" / "ing", then a
 * final "y" -> "i" when the stem contains a vowel.  Empty input is left
 * alone and a one-character word is never reduced to nothing by the plural
 * rule.
 */
static void step_1 (char *string)
{
	char stem[BIG_KEYWORDSIZE];
	size_t length = strlen(string);

	if ( length == 0 )
		return;

	/* 1a: plurals.  The length check keeps string[length-2] in bounds. */
	if ( (length >= 2) && (string[length-1] == 's') )
	{
		if ( has_suffix(string,"sses",stem) || has_suffix(string,"ies",stem) )
			string[length-2] = EOS;
		else if ( string[length-2] != 's' )
			string[length-1] = EOS;
	}

	/* 1b: "eed" loses its "d" when the stem has measure > 0; otherwise
	 * "ed" / "ing" are removed when the stem contains a vowel. */
	if ( has_suffix(string,"eed",stem) )
	{
		if ( measure(stem) > 0 )
			string[strlen(string)-1] = EOS;
	}
	else if ( ( has_suffix(string,"ed",stem) || has_suffix(string,"ing",stem) )
			&& contains_vowel(stem) )
	{
		string[strlen(stem)] = EOS;
		length = strlen(string);
		if ( has_suffix(string,"at",stem)
			|| has_suffix(string,"bl",stem)
			|| has_suffix(string,"iz",stem) )
		{
			/* Restore the "e": at -> ate, bl -> ble, iz -> ize.  There
			 * is room because at least two characters were just removed. */
			string[length] = 'e';
			string[length+1] = EOS;
		}
		else if ( (length >= 2)
				&& (string[length-1] == string[length-2])
				&& !vowel_at(string, (int)length - 1)
				&& (string[length-1] != 'l')
				&& (string[length-1] != 's')
				&& (string[length-1] != 'z') )
		{
			/* Doubled consonant other than l, s, z: keep one of them.
			 * A doubled vowel ("see" from "seeing") must stay intact. */
			string[length-1] = EOS;
		}
		else if ( (measure(string) == 1) && cvc(string) )
		{
			/* Short consonant-vowel-consonant stem gets its "e" back. */
			string[length] = 'e';
			string[length+1] = EOS;
		}
	}

	/* 1c: trailing "y" becomes "i" when the stem contains a vowel. */
	if ( has_suffix(string,"y",stem) && contains_vowel(stem) )
		string[strlen(string)-1] = 'i';
}
//********************************************************************************************
/*
 * Step 2: map a double suffix onto a single one ("ational" -> "ate", ...)
 * when the stem before it has measure > 0.  The table is scanned in order;
 * the first rule that is applied ends the step.
 */
static void step_2 ( char *string )
{
	static const char *const suffixes[][2] =  { { "ational", "ate" },
						    { "tional",  "tion" },
						    { "enci",    "ence" },
						    { "anci",    "ance" },
						    { "izer",    "ize" },
						    { "iser",    "ize" },
						    { "abli",    "able" },
						    { "alli",    "al" },
						    { "entli",   "ent" },
						    { "eli",     "e" },
						    { "ousli",   "ous" },
						    { "ization", "ize" },
						    { "isation", "ize" },
						    { "ation",   "ate" },
						    { "ator",    "ate" },
						    { "alism",   "al" },
						    { "iveness", "ive" },
						    { "fulness", "ful" },
						    { "ousness", "ous" },
						    { "aliti",   "al" },
						    { "iviti",   "ive" },
						    { "biliti",  "ble" },
						    {  0,        0     } };
	int index;

	for ( index = 0 ; suffixes[index][0] != 0 ; index++ )
	{
		if ( replace_suffix(string, suffixes[index][0], suffixes[index][1], 0) )
			return;
	}
}
//********************************************************************************************
/*
 * Step 3: shorten or remove a further set of suffixes ("icate" -> "ic",
 * "ness" -> "", ...) when the stem before them has measure > 0.
 */
static void step_3 (char *string)
{
	static const char *const suffixes[][2] = { { "icate", "ic" },
						   { "ative", "" },
						   { "alize", "al" },
						   { "alise", "al" },
						   { "iciti", "ic" },
						   { "ical",  "ic" },
						   { "ful",   "" },
						   { "ness",  "" },
						   { 0,       0 } };
	int index;

	for ( index = 0 ; suffixes[index][0] != 0 ; index++ )
	{
		if ( replace_suffix(string, suffixes[index][0], suffixes[index][1], 0) )
			return;
	}
}
//********************************************************************************************
/*
 * Step 4: remove one of the listed suffixes outright when the stem before
 * it has measure > 1.
 */
static void step_4 (char *string)
{
	static const char *const suffixes[] = { "al", "ance", "ence", "er", "ic", "able",
		"ible", "ant", "ement", "ment", "ent", "sion", "tion",
		"ou", "ism", "ate", "iti", "ous", "ive", "ize", "ise", 0 };
	int index;

	for ( index = 0 ; suffixes[index] != 0 ; index++ )
	{
		if ( replace_suffix(string, suffixes[index], "", 1) )
			return;
	}
}
//********************************************************************************************
/*
 * Step 5: drop a final "e" (always when measure > 1; when measure == 1 only
 * if what precedes it is not consonant-vowel-consonant), then reduce a final
 * "ll" to "l" when measure > 1.  Safe on empty and one-character input.
 */
static void step_5 (char *string)
{
	int length = (int)strlen(string);

	if ( (length > 0) && (string[length-1] == 'e') )
	{
		/* measure(string)==measure(stem) if ends in vowel */
		int m = measure(string);

		if ( (m > 1) || ( (m == 1) && !cvc_n(string, length-1) ) )
			string[--length] = EOS;
	}
	if ( (length >= 2)
			&& (string[length-1] == 'l')
			&& (string[length-2] == 'l')
			&& (measure(string) > 1) )
		string[length-1] = EOS;
}
//********************************************************************************************
/*
 * Test whether `word` ends in `suffix` with at least one character in front
 * of it.  On success the part before the suffix is copied to `stem` and 1 is
 * returned; otherwise 0 is returned and `stem` is not written.
 *
 * `stem` must provide BIG_KEYWORDSIZE bytes.  A stem that would not fit is
 * reported as "no match", so over-long words are simply left unstemmed
 * instead of overrunning the buffer.
 */
static int has_suffix (const char *word, const char *suffix, char *stem)
{
	size_t word_len = strlen(word);
	size_t suffix_len = strlen(suffix);
	size_t stem_len;

	if ( word_len <= suffix_len )
		return(0);
	stem_len = word_len - suffix_len;
	if ( stem_len >= (size_t)BIG_KEYWORDSIZE )
		return(0);
	if ( strcmp(word + stem_len, suffix) != 0 )
		return(0);
	memcpy(stem, word, stem_len);
	stem[stem_len] = EOS;
	return(1);
}
//********************************************************************************************
/*
 * Apply one rewrite rule: when `string` ends in `suffix` and the stem before
 * it has a measure greater than `min_measure`, overwrite the suffix with
 * `replacement` and return 1; otherwise leave `string` alone and return 0.
 * `replacement` must not be longer than `suffix` (true for every table in
 * this file), so the string never grows.
 */
static int replace_suffix (char *string, const char *suffix,
			   const char *replacement, int min_measure)
{
	char stem[BIG_KEYWORDSIZE];

	if ( !has_suffix(string, suffix, stem) )
		return(0);
	if ( measure(stem) <= min_measure )
		return(0);
	strcpy(string + strlen(stem), replacement);
	return(1);
}
//********************************************************************************************
/* Return 1 when the whole of `string` ends consonant-vowel-consonant (see cvc_n). */
static int cvc (const char *string)
{
	return cvc_n(string, (int)strlen(string));
}
//********************************************************************************************
/*
 * Return 1 when the first `length` characters of `string` end in
 * consonant-vowel-consonant and the final consonant is not w, x or y;
 * return 0 otherwise, including when fewer than three characters are given.
 * Never reads in front of string[0].
 */
static int cvc_n (const char *string, int length)
{
	char last;

	if ( length < 3 )
		return(0);
	last = string[length-1];
	return( !vowel_at(string, length-1)
		&& (last != 'w')
		&& (last != 'x')
		&& (last != 'y')
		&& vowel_at(string, length-2)
		&& !vowel_at(string, length-3) );
}
//********************************************************************************************
/*
 * Return 1 when `ch` acts as a vowel given the character `prev` in front of
 * it.  a, e, i, o, u always do; 'y' does only when `prev` is not a vowel.
 */
static int vowel (char ch,char prev)
{
	switch ( ch )
	{
		case 'a':
		case 'e':
		case 'i':
		case 'o':
		case 'u':
			return(1);
		case 'y':
			/* 'y' after a consonant is a vowel ("cry"); 'y' after a
			 * vowel is a consonant ("toy"). */
			return( !vowel(prev,'?') );
		default :
			return(0);
	}
}
//********************************************************************************************
/*
 * vowel() for the character at index `i` of `word`.  The first character has
 * no predecessor; 'a' is supplied so that a leading 'y' counts as a consonant.
 */
static int vowel_at (const char *word, int i)
{
	return vowel(word[i], (i > 0) ? word[i-1] : 'a');
}
//********************************************************************************************
/*
 * Return the measure of `stem`: the number of times a run of vowels is
 * followed by a consonant (e.g. 0 for "tree", 1 for "trouble").
 */
static int measure (const char *stem)
{
	int length = (int)strlen(stem);
	int i = 0, count = 0;

	while ( i < length )
	{
		/* Skip consonants up to the next vowel. */
		while ( (i < length) && !vowel_at(stem, i) )
			i++;
		/* Step over that vowel and the rest of its run. */
		for ( i++ ; (i < length) && vowel_at(stem, i) ; i++ )
			;
		/* A consonant closing the vowel run completes one VC pair. */
		if ( i < length )
		{
			count++;
			i++;
		}
	}
	return(count);
}
//********************************************************************************************
/* Return 1 when at least one character of `word` acts as a vowel, else 0. */
static int contains_vowel (const char *word)
{
	int length = (int)strlen(word);
	int i;

	for ( i = 0 ; i < length ; i++ )
	{
		if ( vowel_at(word, i) )
			return(1);
	}
	return(0);
}
//********************************************************************************************
/*
 * Interactive driver: asks for an input file name, splits the file into
 * maximal runs of alphanumeric characters, passes each run through
 * strip_suffixes() and writes the results to "outfile.txt", each followed
 * by a single space.  Every separator character that does not directly
 * follow a word contributes an empty token (i.e. just the space).
 *
 * Returns 0 on success and 1 when the input or output file cannot be opened.
 *
 * NOTE(review): only suffix stripping is applied here, so upper-case letters
 * are not folded before stemming - confirm whether strip_affixes() was the
 * intended call.
 */
int main()
{
	std::ifstream infile;
	std::ofstream outfile;
	char character = '\0';
	char filename[30] = "";
	char word[50];
	const int max_word = (int)sizeof word - 1;

	std::cout << "*********************英文单词stemming处理程序**********************"
		 << std::endl << std::endl;
	std::cout << "请输入需处理文件的文件名(.txt):";
	/* Bound the extraction so a long name cannot overrun filename[]. */
	std::cin.width(sizeof filename);
	std::cin >> filename;

	infile.open(filename);
	if ( !infile )
	{
		std::cout << "无法打开指定的文件,请重试."
			 << std::endl;
		return 1;
	}

	/* Open the output only once the input is known to be readable, so a
	 * mistyped name does not truncate an existing outfile.txt. */
	outfile.open("outfile.txt");
	if ( !outfile )
	{
		std::cout << "无法创建输出文件outfile.txt."
			 << std::endl;
		return 1;
	}

	infile.get(character);
	while ( infile )
	{
		int count = 0;

		/* Test the stream as well as the character: a failed get() leaves
		 * `character` unchanged, which would otherwise spin forever on a
		 * file that ends in a letter or digit. */
		while ( infile && isalnum(static_cast<unsigned char>(character)) )
		{
			/* Characters beyond the buffer capacity are read and dropped. */
			if ( count < max_word )
				word[count++] = character;
			infile.get(character);
		}
		word[count] = '\0';

		/* There is nothing to stem in an empty token. */
		if ( count > 0 )
			strip_suffixes (word);
		outfile << word << " ";
		/* Discard the separator that ended this token and fetch the next
		 * character. */
		infile.get(character);
	}
	outfile << std::endl;
	std::cout << "文件处理完毕,处理的结果保存在outfile.txt中." << std::endl;
	return 0;
}
💿 文件大小 5 K
👤 上传用户 lqlm521
📂 所属分类多国语言处理
📄 代码行数 455 行
💻 语言类型 C++
🏷️ 相关标签

#程序 #代码
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -