/* File: srcfileread_10_31.c
 * (Header left over from a web code viewer, including its "font size:" control.) */
/* Can handle documents that contain large blank regions. */
#include "segment.h"
#include <direct.h>
#include <io.h>
#include <string.h>
#include "invert.h"
#include "seng.h"
#include "update.h"
#define MAXTLT 1000 /* maximum length of a title, in bytes */
#define MAXKEYWD 1000 /* maximum number of bytes occupied by the keywords */
#define MAXABT 2000 /* maximum number of bytes occupied by the abstract */
#define MAXAUTH 1000 /* maximum number of bytes occupied by the authors */
#define MAXLEN 50 /* titles, keywords and authors longer than this are meant to be treated as errors (not referenced in the code visible here) */
#define GBLWBTMNUM 161 /* smallest low (second) byte of a Simplified Chinese GB code */
#define GBLWTOPNUM 254 /* largest low (second) byte of a Simplified Chinese GB code */
#define GBHTBTMNUM 176 /* smallest high (first) byte of a Simplified Chinese GB hanzi */
#define GBHTTOPNUM 247 /* largest high (first) byte of a Simplified Chinese GB hanzi */
/*
 * Finds the next "<name>" tag in sSource, starting at *Pos, and classifies it.
 *
 * Parameters:
 *   sSource  NUL-terminated document text.
 *   Pos      in: index at which to start looking for '<';
 *            out: index of the closing '>' of the tag (the '>' itself is NOT
 *            consumed), or the index of the terminating NUL when the text
 *            ends before a complete tag is seen.
 *
 * Returns:
 *   't' title, 'a' author, 'k' keyword, 'b' abstract, 'p' path, 'x' text,
 *   '0' for an unknown, over-long or incomplete tag.
 */
char TypeAnly(const char* sSource, int* Pos)
{
    static const struct {
        const char *name;
        char code;
    } tags[] = {
        { "title",    't' },
        { "author",   'a' },
        { "keyword",  'k' },
        { "abstract", 'b' },
        { "path",     'p' },
        { "text",     'x' }
    };
    char Temp[20];
    size_t len = 0;
    size_t t;

    /* Advance to the opening '<', but never run past the end of the text. */
    while (sSource[*Pos] != '\0' && sSource[*Pos] != '<') {
        (*Pos)++;
    }
    if (sSource[*Pos] == '\0') {
        return '0';
    }
    (*Pos)++; /* step over '<' */

    /*
     * Collect the tag name.  Bytes beyond the capacity of Temp are skipped
     * instead of stored; a name that long cannot equal any known tag, so the
     * truncated copy still classifies as unknown.
     */
    while (sSource[*Pos] != '\0' && sSource[*Pos] != '>') {
        if (len < sizeof Temp - 1) {
            Temp[len++] = sSource[*Pos];
        }
        (*Pos)++;
    }
    Temp[len] = '\0';

    /* The text ended before '>' was found: the tag is incomplete. */
    if (sSource[*Pos] != '>') {
        return '0';
    }

    for (t = 0; t < sizeof tags / sizeof tags[0]; t++) {
        if (strcmp(Temp, tags[t].name) == 0) {
            return tags[t].code;
        }
    }
    return '0';
}
/*
 * Classifies the character that starts at temp[0] (GB2312 text).
 *
 * temp[1] is only read when temp[0] is in 0xA1..0xF7, i.e. when temp[0] can
 * be the first byte of a double-byte GB2312 character.
 *
 * Returns:
 *    0  single byte to copy as is: ASCII digit, ASCII letter, ',' or '\n'
 *    1  double-byte character to keep: a hanzi (first byte 0xB0..0xF7,
 *       second byte 0xA1..0xFE), the full-width comma (A3 AC) or the
 *       ideographic full stop (A1 A3)
 *   -2  any other double-byte character from the symbol rows
 *       (first byte 0xA1..0xAF, second byte 0xA1..0xFE)
 *   -1  anything else (treated as a single byte)
 *
 * The bytes are compared as unsigned values so the result does not depend on
 * whether plain char is signed on the target platform.
 */
int IsInRange(char *temp)
{
    const unsigned char hi = (unsigned char)temp[0];
    unsigned char lo;

    /* digit or newline */
    if ((hi >= '0' && hi <= '9') || hi == '\n') {
        return 0;
    }
    /* ASCII letter or comma */
    if ((hi >= 'a' && hi <= 'z') || (hi >= 'A' && hi <= 'Z') || hi == ',') {
        return 0;
    }
    /* cannot be the first byte of any double-byte form handled below */
    if (hi < 0xA1 || hi > 0xF7) {
        return -1;
    }

    lo = (unsigned char)temp[1];

    /* hanzi */
    if (hi >= 0xB0 && lo >= 0xA1 && lo <= 0xFE) {
        return 1;
    }
    /* full-width comma and ideographic full stop are kept as well */
    if ((hi == 0xA3 && lo == 0xAC) || (hi == 0xA1 && lo == 0xA3)) {
        return 1;
    }
    /* remaining symbol rows (this includes A3 AF, the full-width slash) */
    if (hi <= 0xAF && lo >= 0xA1 && lo <= 0xFE) {
        return -2;
    }
    return -1;
}
/*
 * Copies one field body from sSource into des, filtering characters.
 *
 * Reading starts at *pos and stops at the end of the text or at a '<' that
 * begins a line (preceded by '\n', or located at index 0).  A '<' anywhere
 * else is treated as ordinary content.  While copying:
 *   - '>' is skipped;
 *   - bytes/characters for which IsInRange() returns 0 or 1 are copied;
 *   - double-byte symbols (IsInRange() == -2) become two spaces;
 *   - every other byte becomes one space.
 *
 * Parameters:
 *   sSource  NUL-terminated source text.
 *   des      destination buffer.  At most one byte is written per source
 *            byte consumed, plus the terminator, so strlen(sSource) - *pos + 1
 *            bytes are always sufficient.  The size cannot be checked here.
 *   pos      in/out: current index into sSource.
 *
 * Returns:
 *   -1  stopped at a line-leading '<' (more of the document follows);
 *    0  reached the end of the text;
 *   -2  invalid arguments (*pos outside the text, or a NULL pointer);
 *   -3  hit a "blank" marker: two consecutive lines consisting of one
 *       full-width space (A1 A1) each, or the heading A1BE CEC4 D5AA A1BF
 *       repeated on two consecutive lines.  *pos is left at the start of the
 *       marker, so the caller must advance it before calling again.
 *
 * des is NUL-terminated on every return except a NULL-argument failure.
 */
int stringRead(const char *sSource, char *des, int *pos)
{
    /*
     * GB2312 byte sequences, spelled out as bytes so that matching does not
     * depend on the signedness of char or on the encoding of this file.
     */
    static const char blankLines[] = "\xA1\xA1\n" "\xA1\xA1\n";
    static const char repeatedHeading[] =
        "\xA1\xBE\xCE\xC4\xD5\xAA\xA1\xBF\n"
        "\xA1\xBE\xCE\xC4\xD5\xAA\xA1\xBF";
    char temp[2];
    int i = 0;
    int max;
    int type;

    if (sSource == NULL || des == NULL || pos == NULL)
    {
        return -2;
    }
    max = (int)strlen(sSource);
    if (*pos < 0 || *pos > max)
    {
        des[0] = '\0';
        return -2;
    }

    while (sSource[*pos] != '\0')
    {
        const char *cur = sSource + *pos;

        /* A tag only ends the field when it starts a line. */
        if (*cur == '<' && (*pos == 0 || cur[-1] == '\n'))
        {
            break;
        }
        /* strncmp stops at the terminator, so these never read past the text. */
        if (strncmp(cur, blankLines, sizeof blankLines - 1) == 0
            || strncmp(cur, repeatedHeading, sizeof repeatedHeading - 1) == 0)
        {
            des[i] = '\0';
            return -3;
        }
        if (*cur == '>')
        {
            (*pos)++;
            continue;
        }

        temp[0] = cur[0];
        temp[1] = cur[1]; /* at worst the terminator, since cur[0] != '\0' */
        type = IsInRange(temp);

        if (type == 0)
        {
            /* single byte, copied unchanged */
            des[i++] = cur[0];
            (*pos)++;
        }
        else if (type == 1)
        {
            /* double-byte character, copied unchanged */
            des[i] = cur[0];
            des[i + 1] = cur[1];
            i += 2;
            (*pos) += 2;
        }
        else if (type == -2)
        {
            /* double-byte symbol, blanked out at the same width */
            des[i] = ' ';
            des[i + 1] = ' ';
            i += 2;
            (*pos) += 2;
        }
        else
        {
            /* any other single byte becomes a space */
            des[i++] = ' ';
            (*pos)++;
        }
    }

    des[i] = '\0';
    /* The loop ends only on a line-leading '<' or on the terminator. */
    return (sSource[*pos] == '<') ? -1 : 0;
}
/*
 * Normalizes an author (or keyword) string in place.
 *
 *   - GB2312 hanzi (first byte 0xB0..0xF7, second byte 0xA1..0xFE) are kept.
 *   - ASCII letters are kept.
 *   - Every other byte is replaced by a space.
 *   - The string is cut at the first occurrence of the hanzi with GB2312
 *     code B5 C8 (the "et al." character); nothing after it is kept.
 *
 * The cut-off character is compared by its byte values rather than through a
 * string literal, so the check works regardless of the encoding this source
 * file is saved in.
 *
 * Parameters:
 *   author  NUL-terminated GB2312 string, modified in place; NULL is ignored.
 *
 * Returns: always 0.
 */
int AuthorPro(unsigned char *author)
{
    /* Same byte ranges as the GB hanzi limits used elsewhere in this file. */
    enum {
        HANZI_HI_MIN = 0xB0, HANZI_HI_MAX = 0xF7,
        HANZI_LO_MIN = 0xA1, HANZI_LO_MAX = 0xFE,
        ETAL_HI = 0xB5, ETAL_LO = 0xC8
    };
    size_t i = 0;
    size_t len;

    if (author == NULL)
    {
        return 0;
    }
    len = strlen((const char *)author);

    while (i < len)
    {
        const unsigned char hi = author[i];
        const unsigned char lo = author[i + 1]; /* at worst the terminator */

        if (hi >= HANZI_HI_MIN && hi <= HANZI_HI_MAX
            && lo >= HANZI_LO_MIN && lo <= HANZI_LO_MAX)
        {
            /* hanzi: stop at the "et al." character */
            if (hi == ETAL_HI && lo == ETAL_LO)
            {
                author[i] = '\0';
                return 0;
            }
            i += 2;
        }
        else if ((hi >= 'a' && hi <= 'z') || (hi >= 'A' && hi <= 'Z'))
        {
            /* ASCII letter */
            i++;
        }
        else
        {
            /* everything else becomes a space */
            author[i] = ' ';
            i++;
        }
    }
    return 0;
}
/*
 * Removes redundant line breaks and spaces from source, in place: every run
 * of consecutive '\n' is reduced to a single '\n' and every run of
 * consecutive ' ' is reduced to a single ' '.  All other bytes are kept in
 * their original order.
 *
 * Parameters:
 *   source  NUL-terminated string, modified in place.
 *
 * Returns: source (NULL when source is NULL).
 */
char* Ent(char * source)
{
    size_t in = 0;
    size_t out = 0;

    if (source == NULL)
    {
        return NULL;
    }

    while (source[in] != '\0')
    {
        const char c = source[in];

        source[out++] = c;
        in++;
        if (c == '\n' || c == ' ')
        {
            /* drop the rest of the run of this same character */
            while (source[in] == c)
            {
                in++;
            }
        }
    }
    source[out] = '\0';
    return source;
}
/* Appends one output line "<label><body>\n" to sResult; a NULL body yields an empty field. */
static void srcfileAppendField(char *sResult, const char *label, const char *body)
{
    strcat(sResult, label);
    if (body != NULL)
    {
        strcat(sResult, body);
    }
    strcat(sResult, "\n");
}

/*
 * Trims field, runs the word segmenter on it and appends "<label><words>\n"
 * to sResult.  Returns 0 on success (also when Trim() yields NULL, in which
 * case nothing is appended) and -1 when the scratch buffer cannot be
 * allocated.
 */
static int srcfileAppendSegmented(char *sResult, const char *label, char *field,
                                  pResult myResult, char **stopList)
{
    char *trimmed;
    char *segmented;
    size_t size;

    /* NOTE(review): Trim() is defined elsewhere; whether it allocates its
       result is not visible here, so the result is not freed - confirm. */
    trimmed = Trim(field);
    if (trimmed == NULL)
    {
        return 0;
    }

    size = (strlen(trimmed) + 14) * 2;
    segmented = (char *)malloc(size);
    if (segmented == NULL)
    {
        return -1;
    }
    /* Clear the whole buffer; sizeof on the pointer would clear only a few bytes. */
    memset(segmented, 0, size);

    GetSegmentString(trimmed, myResult, stopList, segmented);
    srcfileAppendField(sResult, label, segmented);
    free(segmented);
    return 0;
}

/*
 * Parses a tagged source document and returns its fields, one per line.
 *
 * The document consists of sections introduced by <title>, <author>,
 * <keyword>, <abstract>, <text> or <path> tags.  For each section one line is
 * appended to the result:
 *   "title@title <segmented words>\n"
 *   "author@author <normalized authors>\n"
 *   "keyword@keyword <normalized keywords>\n"
 *   "abstract@abstract <segmented words>\n"
 *   "text@text <segmented words>\n"
 * Title, abstract and text are word-segmented with GetSegmentString();
 * author and keyword are normalized with AuthorPro().  A section that is too
 * short (fewer than 4 bytes for segmented fields, fewer than 2 for
 * author/keyword) or that stringRead() reports as blank (-3) produces a line
 * holding only the label.  <path> sections and unknown tags are skipped.
 * Parsing stops after a blank or too-short <text> section.
 *
 * Parameters:
 *   sSource   NUL-terminated document text.
 *   myResult  passed through to GetSegmentString().
 *   stopList  passed through to GetSegmentString().
 *
 * Returns:
 *   A malloc'ed string that the caller must free(), or NULL when sSource is
 *   NULL, an allocation fails, or stringRead() reports an error (-2).
 */
char* srcfileread(const char * sSource, pResult myResult, char **stopList)
{
    char *sResult = NULL;
    char *title = NULL, *keyword = NULL, *text = NULL, *abstract = NULL, *author = NULL;
    char *field;
    const char *label;
    char type = '0';
    size_t srcLen;
    int pos = 0;
    int max;
    int error;
    int isText;
    int ok = 0;

    if (sSource == NULL)
    {
        return NULL;
    }
    srcLen = strlen(sSource);
    max = (int)srcLen;

    sResult = (char *) malloc(sizeof(char) * ((srcLen + 13) * 3));
    text = (char *) malloc(sizeof(char) * ((srcLen + 13) * 3));
    /*
     * stringRead() emits at most one byte per source byte, so srcLen + 1
     * bytes always suffice; the former fixed capacities are kept as slack.
     * (author keeps MAXTLT, as in the original allocation.)
     */
    title = (char *) malloc(sizeof(char) * (srcLen + MAXTLT));
    keyword = (char *) malloc(sizeof(char) * (srcLen + MAXKEYWD));
    abstract = (char *) malloc(sizeof(char) * (srcLen + MAXABT));
    author = (char *) malloc(sizeof(char) * (srcLen + MAXTLT));
    if (sResult == NULL || text == NULL || title == NULL
        || keyword == NULL || abstract == NULL || author == NULL)
    {
        goto cleanup;
    }

    sResult[0] = '\0';
    title[0] = '\0';
    keyword[0] = '\0';
    author[0] = '\0';
    abstract[0] = '\0';
    text[0] = '\0';

    /* Walk the document section by section. */
    while (pos < max)
    {
        if (sSource[pos] == '<')
        {
            type = TypeAnly(sSource, &pos);
        }

        switch (type)
        {
        case 't': label = "title@title ";       field = title;    break;
        case 'a': label = "author@author ";     field = author;   break;
        case 'k': label = "keyword@keyword ";   field = keyword;  break;
        case 'b': label = "abstract@abstract "; field = abstract; break;
        case 'x': label = "text@text ";         field = text;     break;
        default:
            /* path or unknown tag: skip ahead to the next tag */
            while ((sSource[pos] != '<') && (sSource[pos] != '\0') && (pos < max))
            {
                pos++;
            }
            continue;
        }
        isText = (type == 'x');

        error = stringRead(sSource, field, &pos);
        if (error == -2)
        {
            /* malformed document */
            goto cleanup;
        }
        if (error == -3)
        {
            /* blank section */
            srcfileAppendField(sResult, label, NULL);
            if (isText)
            {
                break;
            }
            /*
             * stringRead() leaves pos on the blank marker.  Treat the rest of
             * the section as skippable so that pos advances to the next tag;
             * re-reading the same position would loop forever.
             */
            type = '0';
            continue;
        }

        if (type == 'a' || type == 'k')
        {
            if (strlen(field) <= 1)
            {
                srcfileAppendField(sResult, label, NULL);
                continue;
            }
            /* keywords are normalized the same way as authors */
            AuthorPro((unsigned char *)field);
            srcfileAppendField(sResult, label, field);
        }
        else
        {
            if (strlen(field) < 4)
            {
                /* An empty text section is reported under the text label
                   (the abstract label used here before was a copy-paste slip). */
                srcfileAppendField(sResult, label, NULL);
                if (isText)
                {
                    break;
                }
                continue;
            }
            if (srcfileAppendSegmented(sResult, label, field, myResult, stopList) != 0)
            {
                goto cleanup;
            }
        }
    }
    ok = 1;

cleanup:
    free(title);
    free(keyword);
    free(abstract);
    free(author);
    free(text);
    if (!ok)
    {
        free(sResult);
        sResult = NULL;
    }
    return sResult;
}
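/*
 * Keyboard shortcuts of the web code viewer this file was copied from
 * (page residue, not part of the program):
 *   Copy code            Ctrl + C
 *   Search code          Ctrl + F
 *   Full-screen mode     F11
 *   Switch theme         Ctrl + Shift + D
 *   Show shortcuts       ?
 *   Increase font size   Ctrl + =
 *   Decrease font size   Ctrl + -
 */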