📄 srcfileread_10_31.c

📁 对文件进行操作
💻 C
字号:
/*可以处理有大片空白的文档*/
#include "segment.h"
#include <direct.h>
#include <io.h>
#include <string.h>
#include "invert.h"
#include "seng.h"
#include "update.h"

/* Buffer sizes (in chars) for the per-field work buffers of srcfileread(). */
#define MAXTLT 1000 /* longest title */
#define MAXKEYWD 1000 /* maximum number of characters taken by the keywords */
#define MAXABT 2000 /* maximum number of characters taken by the abstract */
#define MAXAUTH 1000 /* maximum number of characters taken by the authors */
#define MAXLEN 50    /* title, keywords or author longer than this are to be treated as an error (original author's note) */

/* Byte ranges of a two-byte Simplified Chinese GB-code character; AuthorPro()
   tests the first byte against the "high" range and the second against the
   "low" range. */
#define GBLWBTMNUM   161  /* smallest value of the low (second) byte */
#define GBLWTOPNUM   254  /* largest value of the low (second) byte */
#define GBHTBTMNUM   176  /* smallest value of the high (first) byte */
#define GBHTTOPNUM   247  /* largest value of the high (first) byte */


/*
 * Finds the next tag at or after sSource[*Pos] and classifies it by name.
 *
 * Parameters:
 *   sSource  NUL-terminated document text.
 *   Pos      in/out index into sSource. On return it is the index of the
 *            tag's closing '>' (not one past it). If no '<' is found, or the
 *            tag is never closed, it is the index of the terminating NUL.
 *
 * Returns:
 *   't' title, 'a' author, 'k' keyword, 'b' abstract, 'p' path, 'x' text,
 *   or '0' for an unknown, over-long, missing or unterminated tag.
 */
char TypeAnly(const char* sSource, int* Pos)
{
	static const struct
	{
		const char *name;
		char code;
	} knownTags[] = {
		{ "title",    't' },
		{ "author",   'a' },
		{ "keyword",  'k' },
		{ "abstract", 'b' },
		{ "path",     'p' },
		{ "text",     'x' },
	};
	char Temp[20];
	size_t len = 0;
	size_t k;
	int tooLong = 0;

	/* Advance to the opening '<', but never past the end of the string. */
	while (sSource[*Pos] != '\0' && sSource[*Pos] != '<')
	{
		(*Pos)++;
	}
	if (sSource[*Pos] == '\0')
	{
		return '0';
	}
	(*Pos)++;

	/* Collect the tag name; characters that do not fit in Temp are dropped
	   and the tag is then reported as unknown instead of overflowing. */
	while (sSource[*Pos] != '\0' && sSource[*Pos] != '>')
	{
		if (len < sizeof Temp - 1)
		{
			Temp[len++] = sSource[*Pos];
		}
		else
		{
			tooLong = 1;
		}
		(*Pos)++;
	}
	Temp[len] = '\0';

	if (tooLong || sSource[*Pos] == '\0')
	{
		return '0';
	}

	for (k = 0; k < sizeof knownTags / sizeof knownTags[0]; k++)
	{
		if (strcmp(Temp, knownTags[k].name) == 0)
		{
			return knownTags[k].code;
		}
	}
	return '0';
}
/*
 * Classifies the character that starts at temp[0].
 *
 * Parameters:
 *   temp  pointer to the character's first byte; temp[1] is read only when
 *         temp[0] is 0xA1 or above (a possible two-byte lead byte).
 *
 * Returns:
 *    0  single byte to copy: ASCII digit, ASCII letter, ',' or '\n'
 *    1  two-byte character to copy: first byte 0xB0..0xF7 with second byte
 *       0xA1..0xFE, or the pairs 0xA3 0xAC and 0xA1 0xA3
 *   -2  two-byte symbol to blank out: first byte 0xA1..0xAF with second
 *       byte 0xA1..0xFE (the pair 0xA3 0xAF falls in this range)
 *   -1  anything else
 *
 * Bytes are compared as unsigned values so the result does not depend on
 * whether plain char is signed on the target platform.
 */
int IsInRange(char *temp)
{
	const unsigned char first = (unsigned char)temp[0];
	unsigned char second;
	int secondInRange;

	/* digit or newline */
	if ((first >= '0' && first <= '9') || first == '\n')
	{
		return 0;
	}
	/* ASCII letter or comma */
	if ((first >= 'a' && first <= 'z') || (first >= 'A' && first <= 'Z') || first == ',')
	{
		return 0;
	}
	/* every remaining accepted class needs a first byte of at least 0xA1 */
	if (first < 0xA1)
	{
		return -1;
	}

	second = (unsigned char)temp[1];
	secondInRange = (second >= 0xA1 && second <= 0xFE);

	/* two-byte character that is kept; the two explicit pairs are tested
	   here so they win over the symbol range below */
	if ((first >= 0xB0 && first <= 0xF7 && secondInRange)
		|| (first == 0xA3 && second == 0xAC)
		|| (first == 0xA1 && second == 0xA3))
	{
		return 1;
	}
	/* two-byte symbol (the original comment mentions brackets) */
	if (first <= 0xAF && secondInRange)
	{
		return -2;
	}
	return -1;
}
/*
 * Copies one field's content from sSource into des, starting at *pos.
 *
 * Each character is classified with IsInRange(): bytes of class 0 and
 * two-byte characters of class 1 are copied, two-byte symbols of class -2
 * become two spaces, anything else becomes one space. '>' is skipped.
 * Copying stops at a '<' that begins a line (or sits at index 0) or at the
 * end of the string.
 *
 * Parameters:
 *   sSource  NUL-terminated source text.
 *   des      destination buffer; receives at most one byte per source byte
 *            plus the terminator, so strlen(sSource) + 1 bytes always suffice.
 *   pos      in/out index into sSource.
 *
 * Returns:
 *   -1  stopped at a line-initial '<': more of the document follows.
 *    0  reached the end of sSource.
 *   -2  *pos is outside the string (des is set to "").
 *   -3  the text at *pos is two consecutive lines holding only the byte pair
 *       0xA1 0xA1, or the 8-byte marker 0xA1 0xBE 0xCE 0xC4 0xD5 0xAA 0xA1
 *       0xBF twice on consecutive lines. *pos is left on that text and des
 *       holds what was copied before it.
 *
 * des is NUL-terminated on every return path.
 */
int stringRead(const char *sSource, char *des, int *pos)
{
	/* Patterns are matched with strncmp, which compares as unsigned char and
	   stops at the terminator, so this neither depends on char signedness nor
	   reads past the end of sSource. */
	static const char blankLines[] = "\xA1\xA1\n\xA1\xA1\n";
	static const char repeatedMarker[] =
		"\xA1\xBE\xCE\xC4\xD5\xAA\xA1\xBF\n"
		"\xA1\xBE\xCE\xC4\xD5\xAA\xA1\xBF";
	int written = 0;
	int max, type;
	char temp[2];

	max = (int)strlen(sSource);
	if (*pos < 0 || *pos > max)
	{
		des[0] = '\0';
		return -2;
	}

	while (*pos < max)
	{
		const char *cur = sSource + *pos;

		/* A '<' ends the field only at the start of a line; index 0 counts as
		   a line start so cur[-1] is never read before the buffer. */
		if (cur[0] == '<' && (*pos == 0 || cur[-1] == '\n'))
		{
			des[written] = '\0';
			return -1;
		}

		if (strncmp(cur, blankLines, sizeof blankLines - 1) == 0
			|| strncmp(cur, repeatedMarker, sizeof repeatedMarker - 1) == 0)
		{
			des[written] = '\0';
			return -3;
		}

		if (cur[0] == '>')
		{
			(*pos)++;
			continue;
		}

		/* cur[1] is at worst the terminator because *pos < max */
		temp[0] = cur[0];
		temp[1] = cur[1];
		type = IsInRange(temp);

		if (type == 0)
		{
			/* single byte, copied as is */
			des[written++] = cur[0];
			(*pos)++;
		}
		else if (type == 1)
		{
			/* two-byte character, copied as is */
			des[written++] = cur[0];
			des[written++] = cur[1];
			(*pos) += 2;
		}
		else if (type == -2)
		{
			/* two-byte symbol, replaced by two spaces to keep the width */
			des[written++] = ' ';
			des[written++] = ' ';
			(*pos) += 2;
		}
		else
		{
			/* any other single byte becomes a space */
			des[written++] = ' ';
			(*pos)++;
		}
	}

	/* end of the document */
	des[written] = '\0';
	return 0;
}
/*处理作者串*/
int	AuthorPro(unsigned char *author)
{
	int i;
	int max;
	unsigned char stemp[3];
	max = strlen(author);
	i = 0;
	while(i < max)
	{
		/*是汉字*/
		if((author[i] <= GBHTTOPNUM) && (author[i] >= GBHTBTMNUM) && (author[i + 1] <= GBLWTOPNUM ) && (author[i + 1] >= GBLWBTMNUM ))
		{
			stemp[0] = author[i];
			stemp[1] = author[i + 1];
			stemp[2] = '\0';
			if(strcmp(stemp, "等") == 0)
			{
				author[i] = '\0';
				return 0;
			}
			i += 2;
			continue;
		}
		/*是英文*/
		else if(((author[i] >= 'a') && (author[i] <= 'z')) || ((author[i] >= 'A') && (author[i] <= 'Z')))
		{
			i++;
			continue;
		}
		/*是其它字符都变成空格*/
		else
		{
			author[i] = ' ';
			i++;
			continue;
		}
	}
	return 0;
}

/*
 * Removes redundant newlines and spaces from source, in place.
 *
 * Scanning left to right:
 *   - two consecutive '\n' are replaced by one '\n', and the character that
 *     follows the pair is dropped as well;
 *   - two consecutive ' ' are replaced by one ' ';
 *   - any other character is kept.
 *
 * The kept '\n' or ' ' is written explicitly; previously only the write
 * index was advanced, which left a stale byte in the output whenever an
 * earlier collapse had already shortened the string.
 *
 * NOTE(review): dropping the character after a "\n\n" pair is inherited
 * behaviour (the read index advances by 3); confirm it is intended.
 *
 * Parameters:
 *   source  NUL-terminated string, modified in place; NULL is passed through.
 *
 * Returns source (the function used to return a null pointer).
 */
char* Ent(char * source)
{
	size_t len, rd, wr;

	if (source == NULL)
	{
		return NULL;
	}

	len = strlen(source);
	rd = 0;
	wr = 0;
	/* wr never overtakes rd, so unread input is never overwritten */
	while (rd < len)
	{
		if ((source[rd] == '\n') && (source[rd + 1] == '\n'))
		{
			source[wr++] = '\n';
			rd += 3;
		}
		else if ((source[rd] == ' ') && (source[rd + 1] == ' '))
		{
			source[wr++] = ' ';
			rd += 2;
		}
		else
		{
			source[wr++] = source[rd++];
		}
	}
	source[wr] = '\0';
	return source;
}
/* Allocates an empty string buffer able to hold srcLen characters plus the
   terminator, but never fewer than floorSize bytes. Returns NULL on failure. */
static char *SrcAllocField(size_t srcLen, size_t floorSize)
{
	size_t size = srcLen + 1;
	char *buf;

	if (size < floorSize)
	{
		size = floorSize;
	}
	buf = (char *)malloc(size);
	if (buf != NULL)
	{
		buf[0] = '\0';
	}
	return buf;
}

/* Appends label, then content, then a newline to sResult. */
static void SrcAppendLine(char *sResult, const char *label, const char *content)
{
	strcat(sResult, label);
	strcat(sResult, content);
	strcat(sResult, "\n");
}

/*
 * Parses a tagged source document and returns its processed fields.
 *
 * The document is scanned for tags (see TypeAnly). For each recognised field
 * one line is appended to the result:
 *   "title@title ...", "author@author ...", "keyword@keyword ...",
 *   "abstract@abstract ...", "text@text ..."
 * Title, abstract and text are passed through Trim() and GetSegmentString();
 * author and keyword are cleaned with AuthorPro(). Content under a "path" or
 * unknown tag is skipped.
 *
 * A field that is too short (fewer than 4 bytes for title/abstract/text,
 * fewer than 2 for author/keyword), or for which stringRead() returns -3,
 * produces the label with no content. For the text field that also ends
 * processing.
 *
 * Parameters:
 *   sSource   NUL-terminated document text.
 *   myResult  passed through to GetSegmentString().
 *   stopList  passed through to GetSegmentString().
 *
 * Returns a malloc'ed string the caller must free, or NULL when sSource is
 * NULL, an allocation fails, or stringRead() returns -2. No memory is leaked
 * on the NULL paths.
 *
 * NOTE(review): the pointer returned by Trim() is not freed here because its
 * ownership is not visible from this file - confirm.
 */
char* srcfileread(const char * sSource, pResult myResult, char **stopList)
{
	char *sResult = NULL;
	char *title = NULL, *keyword = NULL, *text = NULL, *abstract = NULL, *author = NULL;
	char *field, *trimmed, *segmented;
	const char *label;
	char type = '0';
	size_t srcLen, minLen, segSize;
	int pos = 0;
	int error, max;
	int failed = 0;

	if (sSource == NULL)
	{
		return NULL;
	}
	srcLen = strlen(sSource);
	max = (int)srcLen;

	/* stringRead() emits at most one byte per source byte, so every field
	   buffer is sized from the source instead of a fixed constant; the
	   historical sizes are kept as lower bounds (author has always used
	   MAXTLT). */
	sResult = (char *)malloc(sizeof(char) * ((srcLen + 13) * 3));
	title = SrcAllocField(srcLen, MAXTLT);
	keyword = SrcAllocField(srcLen, MAXKEYWD);
	abstract = SrcAllocField(srcLen, MAXABT);
	author = SrcAllocField(srcLen, MAXTLT);
	text = (char *)malloc(sizeof(char) * ((srcLen + 13) * 3));

	if (sResult == NULL || title == NULL || keyword == NULL
		|| abstract == NULL || author == NULL || text == NULL)
	{
		failed = 1;
	}
	else
	{
		sResult[0] = '\0';
		text[0] = '\0';
	}

	/* walk the document field by field */
	while (!failed && pos < max)
	{
		if (sSource[pos] == '<')
		{
			type = TypeAnly(sSource, &pos);
		}

		switch (type)
		{
		case 't': field = title;    label = "title@title ";       minLen = 4; break;
		case 'a': field = author;   label = "author@author ";     minLen = 2; break;
		case 'k': field = keyword;  label = "keyword@keyword ";   minLen = 2; break;
		case 'b': field = abstract; label = "abstract@abstract "; minLen = 4; break;
		case 'x': field = text;     label = "text@text ";         minLen = 4; break;
		default:  field = NULL;     label = NULL;                 minLen = 0; break;
		}

		/* path / unknown tag: skip ahead to the next tag */
		if (field == NULL)
		{
			while ((pos < max) && (sSource[pos] != '<'))
			{
				pos++;
			}
			continue;
		}

		error = stringRead(sSource, field, &pos);
		/* malformed document */
		if (error == -2)
		{
			failed = 1;
			break;
		}

		/* field is not looked at when error == -3 (short-circuit) */
		if (error == -3 || strlen(field) < minLen)
		{
			SrcAppendLine(sResult, label, "");
			/* an empty text field ends processing */
			if (type == 'x')
			{
				break;
			}
			/* stringRead() leaves pos on the offending text when it returns
			   -3; switch to "skip" mode so the next pass advances to the
			   following tag instead of re-reading the same position forever */
			if (error == -3)
			{
				type = '0';
			}
			continue;
		}

		if (type == 'a' || type == 'k')
		{
			/* keywords are cleaned the same way as authors */
			AuthorPro((unsigned char *)field);
			SrcAppendLine(sResult, label, field);
			continue;
		}

		/* title, abstract, text: trim, then segment into words */
		trimmed = Trim(field);
		if (trimmed == NULL)
		{
			continue;
		}
		segSize = (strlen(trimmed) + 14) * 2;
		segmented = (char *)malloc(segSize);
		if (segmented == NULL)
		{
			failed = 1;
			break;
		}
		/* clear the whole buffer, not just sizeof(pointer) bytes */
		memset(segmented, 0, segSize);
		GetSegmentString(trimmed, myResult, stopList, segmented);
		SrcAppendLine(sResult, label, segmented);
		free(segmented);
	}

	free(title);
	free(keyword);
	free(abstract);
	free(author);
	free(text);
	if (failed)
	{
		free(sResult);
		return NULL;
	}
	return sResult;
}
💿 文件大小 3 K
👤 上传用户 whyzhao
📂 所属分类多国语言处理
📄 代码行数 479 行
💻 语言类型 C语言
🏷️ 相关标签

#操作
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -