📄 dealtxtfile.h
字号:
// Text processing
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
// Source text stream: opened by dealtxtfile() and consumed by getsentence().
FILE *fsourcetxt;
// sentenlen:    byte length of the sentence most recently collected by getsentence()
// filepos:      running position estimate maintained by dealtxtfile()
// dictlen:      number of dictionary entries counted by loaddict()
// punctulen:    byte size of punctu.txt as measured by loaddict()
// nontxtbuflen: bytes getsentence() stored in its non-text output buffer
// filelen:      byte size of the source file as measured by dealtxtfile()
int sentenlen,filepos,dictlen,punctulen,nontxtbuflen,filelen;
// Byte sizes of bjpunctu.txt and bjmark.txt as measured by loaddict().
int bjpunctulen,bjmarklen;
// Result flags of getsentence(): text was seen / a single-byte character was
// seen / a two-byte punctuation mark ended the sentence.
int txtflag,nontxtflag,punctuflag;
// dictbuf:   dictionary words, one per 17-byte row (allocated in loaddict())
// punctubuf: contents of punctu.txt, compared two bytes at a time
// nontxtbuf: NOTE(review): never assigned in the visible code; the functions
//            below use a parameter/local of the same name that shadows it
// punctutmp: two-byte punctuation mark last compared by getsentence(), plus '\0'
char (*dictbuf)[17],(*punctubuf),(*nontxtbuf),punctutmp[3];
// Contents of bjpunctu.txt and bjmark.txt, compared one byte at a time.
char (*bjpunctubuf),(*bjmarkbuf);
void loaddict()
{
FILE *fdict,*fpunctu,*fbjpunctu,*fbjmark;
dictbuf=new char[200000][17];
punctubuf=new char[100];
bjpunctubuf=new char[100];
bjmarkbuf=new char[100];
//把字典读入dictbuf
if((fdict=fopen("dict.txt","r"))==NULL){
printf("cannot open dict!%s\n");
exit(0);
}
dictlen=0;
while(!feof(fdict)){
fscanf(fdict,"%s",dictbuf[dictlen]);
dictlen++;
}
fclose(fdict);
//读入标点库
if((fpunctu=fopen("punctu.txt","r"))==NULL){
printf("cannot open punctu.txt file!%s\n");
exit(0);
}
fseek(fpunctu,0L,SEEK_END);
punctulen=ftell(fpunctu);
fseek(fpunctu,0L,SEEK_SET);
fread(punctubuf,1,punctulen,fpunctu);
fclose(fpunctu);//需要处理的地方
//读入半角标点字符
if((fbjpunctu=fopen("bjpunctu.txt","r"))==NULL){
printf("cannot open bjpunctu.txt file!%s\n");
exit(0);
}
fseek(fbjpunctu,0L,SEEK_END);
bjpunctulen=ftell(fbjpunctu);
fseek(fbjpunctu,0L,SEEK_SET);
fread(bjpunctubuf,1,bjpunctulen,fbjpunctu);
fclose(fbjpunctu);
//读入半角字符
if((fbjmark=fopen("bjmark.txt","r"))==NULL){
printf("cannot open bjmark.txt file!%s\n");
exit(0);
}
fseek(fbjmark,0L,SEEK_END);
bjmarklen=ftell(fbjmark);
fseek(fbjmark,0L,SEEK_SET);
fread(bjmarkbuf,1,bjmarklen,fbjmark);
fclose(fbjmark);
//读入全角标点字符???未做还是放在标点库里面???
}
// Returns the larger of the two integers (num1 when they are equal).
int getmaxnum(int num1,int num2)
{
    return (num2>num1)?num2:num1;
}
// Binary search of the dictionary (dictbuf[0..dictlen-1]) for src.
// The search compares with strcmp, so it is only meaningful when the
// dictionary rows are in ascending strcmp order.
// Returns 1 when src is found. Otherwise returns -(p+1), where p is the index
// at which src would have to be inserted; this is always negative, so it can
// never be mistaken for a match. An empty dictionary yields -1.
int halfsearch(char* src)
{
    int low=0;
    int high=dictlen-1;
    while(low<=high){
        int mid=low+(high-low)/2;   // same midpoint, without the low+high overflow
        int cmp=strcmp(src,dictbuf[mid]);
        if(cmp==0){
            return 1;
        }
        if(cmp>0){
            low=mid+1;
        }
        else{
            high=mid-1;
        }
    }
    // On exit low is the insertion point. Deriving the result from low rather
    // than from the last comparison keeps it defined when the loop body never
    // ran (dictlen<=0).
    return -(low+1);
}
//取句子
void getsentence(char *sentence,char *nontxtbuf)
{
int sentenceflag,i;
int bjpunctuflag,bjmarkflag,bjnumflag;
char wordtemp[3],bjpunctutemp[2],bjmarktemp[2],wordtemp1[3];
txtflag=0;
nontxtflag=0;
punctuflag=0;
bjpunctuflag=0;
bjmarkflag=0;
bjnumflag=0;
nontxtbuflen=0;
sentenceflag=(int)sentence;//计算句子长度sentencelen
for(i=0;i<3;i++){
wordtemp[i]='\0';
}
for(i=0;i<3;i++){
wordtemp1[i]='\0';
}
do{//开始取句子
wordtemp[0]=fgetc(fsourcetxt);
if((int)wordtemp[0]>0){ //可能是半角字符
if((int)wordtemp[0]==10){//是换行回车
wordtemp[1]=fgetc(fsourcetxt);
if((int)wordtemp[1]==13){
wordtemp1[0]=fgetc(fsourcetxt);
if((int)wordtemp1[0]==32){
fseek(fsourcetxt,(long)(-1),SEEK_CUR);
}
}
else{
//sth wrong!???
}
}
else{ //不是换行回车
nontxtflag=1;
wordtemp[1]='\0';
for(i=0;i<bjpunctulen;i++){ //看是否是标点
bjpunctutemp[0]=bjpunctubuf[i];
bjpunctutemp[1]='\0';
if(strcmp(wordtemp,bjpunctutemp)==0){
*nontxtbuf=wordtemp[0];
nontxtbuf++;
nontxtbuflen++;
*nontxtbuf='|';
nontxtbuf++;
nontxtbuflen++;
*nontxtbuf='\0';
bjpunctuflag=1;
break;
}
}
if(bjpunctuflag==1){ //半角标点出现,句子取完
if(txtflag==1){
*sentence='\0';
}
nontxtflag=1;
break;
}
else{ //若不是标点,看是否是字母或数字。
for(i=0;i<bjmarklen;i++){
bjmarktemp[0]=bjmarkbuf[i];
bjmarktemp[1]='\0';
if(strcmp(wordtemp,bjmarktemp)==0){
*nontxtbuf=wordtemp[0];
nontxtbuf++;
nontxtbuflen++;
*nontxtbuf='|';
nontxtbuf++;
nontxtbuflen++;
*nontxtbuf='\0';
bjmarkflag=1;
break;
}
}
}
if(bjmarkflag==1){
nontxtflag=1;
if(txtflag==1){
*sentence='\0';
}
break;
}
else{ //半角标点和半角字符里都没有该字符
// MessageBox("Some DBC case not identified!","Error",MB_ICONERROR);
}
}
}
else{ //可能是汉字的部分
txtflag=1;
wordtemp[1]=fgetc(fsourcetxt);
if((int)wordtemp[1]>=0){
// MessageBox("Error in the file! half word");
}
else{
wordtemp[2]='\0';
//看是不是标点
for(i=0;i<punctulen/2;i++){
punctutmp[0]=punctubuf[i*2];
punctutmp[1]=punctubuf[i*2+1];
punctutmp[2]='\0';
if (strcmp(punctutmp,wordtemp)==0){
punctuflag=1;
break;
}//取到的标点在punctutmp中,以备读入segmenttemp
}
//若不是标点,要看是否是全角数字,字符
}
if (punctuflag==1){
*sentence='\0';
break;
}
else{
*sentence=wordtemp[0];
sentence++;
*sentence=wordtemp[1];
sentence++;
}
}
}while(filepos<filelen);
//找到标点作为处理标志,句子取完
sentenlen=(int)sentence-sentenceflag;
}
// Segments a sentence field by field, choosing for each field a cut that
// uses the fewest dictionary words (shortest path).
// sentence: input bytes, consumed two bytes per character; the length comes
//           from the global sentenlen, not from a terminator.
// segment:  output buffer; receives each word followed by '|', then '\0'.
// Returns the number of bytes written to segment, not counting the '\0'.
// NOTE(review): nothing here bounds the writes to segment — confirm the
// caller's buffer is large enough for the added '|' separators.
int field(char *sentence,char *segment)
{
int (*fieldshortcut),(*ptpdistance)[MAXSENTENCELEN],(*fieldroute)[10];
char word[17];
int maxlen,routenum,matchflag,route,sentencepos,wordlen;
int i,j,k,l,fieldcur,getnum,segmentlen;
int fieldoffset,fieldleft,fieldlen,fieldside;
fieldshortcut=new int[MAXSENTENCELEN]; // shortest path (fewest words) for each character
ptpdistance=new int[MAXSENTENCELEN][MAXSENTENCELEN];// match matrix between each character and the seven characters after it
fieldroute=new int[MAXSENTENCELEN][10]; // route record per character; [][0] holds the number of shortest routes
sentencepos=0;
segmentlen=0;
// Keep carving out fields until the sentence is used up.
while(sentencepos<sentenlen){
fieldoffset=0; // offset relative to the start of the field
fieldleft=0; // remaining length of the field
fieldlen=0; // number of characters in the field
fieldside=0; // number of words the field is cut into
matchflag=0;
// Initialise everything to MAXNUM.
for(i=0;i<MAXSENTENCELEN;i++){
for(j=0;j<10;j++){
fieldroute[i][j]=MAXNUM;
}
}
fieldshortcut[0]=0; // Note: entry 0 is a zero used for the computation, so indices are one larger than elsewhere
for(i=1;i<MAXSENTENCELEN;i++){
fieldshortcut[i]=MAXNUM;
}
for(i=0;i<MAXSENTENCELEN;i++){
for(j=0;j<MAXSENTENCELEN;j++){
ptpdistance[i][j]=MAXNUM;
}
}
// Process one field of the sentence.
do{
// Note: each pass advances by exactly one character.
wordlen=0;
maxlen=0; // longest matched word length
routenum=0;
if((sentenlen-sentencepos)/2>=7){
getnum=7;
} // number of characters to try depends on how many remain
else{
getnum=(sentenlen-sentencepos)/2;
}
// Try words of one to seven characters starting here. Note: indices start at 0.
for(i=0;i<getnum;i++){
word[wordlen*2+0]=*sentence;
sentence++;
word[wordlen*2+1]=*sentence;
sentence++;
word[wordlen*2+2]='\0';
wordlen++;
matchflag=halfsearch(word);// look the word up in the dictionary
if(matchflag==1){
ptpdistance[fieldoffset+wordlen-1][fieldoffset]=1; // match: set the matrix cell [last character][first character] to 1
maxlen=wordlen; // longest word matched starting at the current character
}
matchflag=0;
}// matching for the current character is done
if(maxlen==0){
// NOTE(review): when this happens on the first character of a field,
// fieldoffset becomes -1 and fieldshortcut[fieldlen] below is read at
// index -1. The sentence pointer has also been advanced by getnum*2 bytes
// without the rewind done further down. Confirm the intended handling.
fieldoffset--;
break; // not a text field
}
// Compute the shortest path for the current fieldoffset. Note: this side counts from 1.
// NOTE(review): the sums below add two values that may both be MAXNUM —
// confirm MAXNUM is small enough that this cannot overflow int.
for(i=0;i<=fieldoffset;i++){
// If this path is shorter than the one recorded, replace it.
if((ptpdistance[fieldoffset][i]+fieldshortcut[i])<fieldshortcut[fieldoffset+1]){
fieldshortcut[fieldoffset+1]=ptpdistance[fieldoffset][i]+fieldshortcut[i];
routenum=1; // a new shortest path resets the route count to 1
fieldroute[fieldoffset][routenum]=i; // position that gave the shortest path
}
else if((ptpdistance[fieldoffset][i]+fieldshortcut[i])==fieldshortcut[fieldoffset+1]){
routenum++;
fieldroute[fieldoffset][routenum]=i; // counted from 0
} // more than one shortest path: record this position too
}
fieldroute[fieldoffset][0]=routenum; // slot 0 stores the number of routes
fieldoffset++; // Note: counted from 0
sentencepos+=2;
// The matching loop advanced getnum characters; stepping back getnum-1
// leaves a net advance of one character.
for(i=0;i<(getnum-1)*2;i++){
sentence--;
} // move the pointer to the next character
fieldleft=getmaxnum(maxlen,fieldleft); // update fieldleft
fieldleft--;
}while(fieldleft>0); // the field is complete
for(i=0;i<fieldoffset*2;i++){
sentence--;
} // go back to the start of the field, ready to cut
if(fieldoffset==0){
break;
}
else{// prepare to cut
fieldlen=fieldoffset;
fieldside=fieldshortcut[fieldlen];
routenum=1;
fieldcur=0;
k=0;
route=fieldlen;
fieldroute[fieldlen][k]=route;
k++;
// The cut positions are stored, in reverse order, in fieldroute[fieldlen].
// Open question from the original author: how to handle characters with more than one route.
for(j=1;j<fieldside;j++){
fieldroute[fieldlen][k]=fieldroute[route-1][1];
routenum=routenum*fieldroute[route-1][0];
route=fieldroute[fieldlen][k];
k++;
} // walk back one step at a time; the number of steps is fieldside
for(i=(k-1);i>=0;i--){
for(l=fieldcur;l<(fieldroute[fieldlen][i]*2);l++){
*segment=*sentence;
segment++;
sentence++;
segmentlen++;
}
*segment='|';
segment++;
segmentlen++;
fieldcur=l;
} // emit the words according to the recorded route
if(routenum>1){ // report positions that have more than one route
printf("more route %d\n",sentencepos);
}
} // cutting done
if(maxlen==0){
break;
}
}
*segment='\0'; // terminate the output (original note: "write punctuation")
delete []ptpdistance;
delete []fieldroute;
delete []fieldshortcut;
return(segmentlen);
}
// Forward maximum matching.
// Starting at the front of the sentence, takes a window of up to WORDLEN-2
// bytes and looks for the longest prefix that is a dictionary word, then
// continues after that word. A character for which no prefix matches is
// emitted as a one-character word.
// sentencetemp: input bytes, two bytes per character; the length is taken
//               from the global sentenlen.
// segmentbuf:   output; each word followed by '|', then '\0'.
// Returns the number of bytes written to segmentbuf, not counting the '\0'.
// NOTE(review): sevenword[WORDLEN] is written below, so this relies on
// WORDLEN being at most 16 — confirm against its definition.
int forward(char *sentencetemp,char *segmentbuf)
{
    int sentencepos,segmentpos,templen,wordlen;
    int i,j,matchflag,chunkpos;
    char sevenword[17];
    char *chunkstart;

    sentencepos=0;
    segmentpos=0;
    while(sentencepos<sentenlen){
        chunkstart=sentencetemp;
        chunkpos=sentencepos;
        // Read the next window.
        if((sentenlen-sentencepos)<(WORDLEN-2)){ // fewer bytes left than a full window
            templen=sentenlen-sentencepos;
        }
        else{ // a full window is available
            templen=WORDLEN-2;
        }
        for(i=0;i<templen;i++){
            sevenword[i]=*sentencetemp;
            sentencetemp++;
            sentencepos++;
        }
        for(i=templen;i<WORDLEN;i++){
            sevenword[i]='\0';
        }
        sevenword[WORDLEN]='\0'; // window read

        // Search the dictionary, longest prefix first.
        matchflag=0;
        wordlen=templen+2;
        for(i=templen/2;i>=1;i--){
            sevenword[i*2]='\0';
            sevenword[i*2+1]='\0';
            wordlen-=2;
            matchflag=halfsearch(sevenword);
            if(matchflag==1){
                for(j=0;j<wordlen;j++){
                    *segmentbuf=sevenword[j];
                    segmentbuf++;
                    segmentpos++;
                }
                *segmentbuf='|';
                segmentbuf++;
                segmentpos++;
                break;
            }
            else{
                // give back the character that was cut off the candidate
                sentencetemp--;
                sentencetemp--;
                sentencepos-=2;
            }
        } // matching done

        if(matchflag!=1&&templen>=2){
            // Nothing matched, so the cursor has been rewound to the start of
            // the window and the loop would retry the same bytes forever.
            // Emit the first character on its own and step over it.
            *segmentbuf=chunkstart[0];
            segmentbuf++;
            *segmentbuf=chunkstart[1];
            segmentbuf++;
            *segmentbuf='|';
            segmentbuf++;
            segmentpos+=3;
            sentencetemp=chunkstart+2;
            sentencepos=chunkpos+2;
        }
    }
    // The loop only ends once the whole sentence has been consumed.
    *segmentbuf='\0';
    return(segmentpos);
}
// Backward maximum matching.
// Starting at the end of the sentence, takes a window of up to WORDLEN-2
// bytes and looks for the longest suffix that is a dictionary word, then
// continues in front of that word. A character for which no suffix matches
// is emitted as a one-character word. The words are collected back to front
// and written out in sentence order.
// sentence:    '\0'-terminated input, two bytes per character; the length is
//              taken from the global sentenlen.
// backwardbuf: output; each word followed by '|', then '\0' (empty string
//              when nothing was segmented).
// Returns the number of bytes written to backwardbuf, not counting the '\0'.
int backward(char *sentence,char *backwardbuf)
{
    int sentencepos,segmentpos,segmentlen,templen,segmentwordlen;
    int matchflag,i,k,t,chunkend;
    char sevenword[17],(*segmentbuf)[16],(*sentencetemp);
    char *out;

    segmentbuf=new char[MAXSENTENCELEN][16];
    sentencetemp=new char[MAXSENTENCELEN];
    // Work on a private copy of the sentence.
    strcpy(sentencetemp,sentence);
    sentencepos=sentenlen-1;
    segmentpos=0;
    segmentlen=0;
    while(sentencepos>0){
        chunkend=sentencepos; // index of the last byte of this window
        // Read the window into sevenword[2..], leaving two spare bytes in
        // front so that the first shift below yields the whole window.
        if((sentencepos+1)<(WORDLEN-2)){ // fewer bytes left than a full window
            templen=sentencepos+1;
            for(i=0;i<templen;i++){
                sevenword[i+2]=sentencetemp[sentencepos+1-templen+i];
            }
            sentencepos=sentencepos-templen;
            for(i=0;i<2;i++){
                sevenword[i]='\0';
            }
            for(i=(templen+2);i<=WORDLEN;i++){
                sevenword[i]='\0';
            }
        }
        else{ // a full window is available
            templen=WORDLEN-2;
            for(i=(WORDLEN-templen);i<WORDLEN;i++){
                sevenword[i]=sentencetemp[sentencepos+1-WORDLEN+i];
            }
            sentencepos=sentencepos-templen;
            for(i=0;i<(WORDLEN-templen);i++){
                sevenword[i]='\0';
            }
            sevenword[WORDLEN]='\0';
        } // window read

        // Search the dictionary, longest suffix first: every pass shifts the
        // window left by one character, dropping the leading character.
        matchflag=0;
        for(i=templen/2;i>=1;i--){
            for(t=0;t<templen;t++){
                sevenword[t]=sevenword[t+2];
            }
            sevenword[2*i]='\0';
            sevenword[2*i+1]='\0';
            matchflag=halfsearch(sevenword);
            if(matchflag==1){
                segmentwordlen=strlen(sevenword);
                for(k=0;k<segmentwordlen;k++){
                    segmentbuf[segmentpos][k]=sevenword[k];
                }
                segmentbuf[segmentpos][k]='|';
                segmentbuf[segmentpos][k+1]='\0';
                segmentpos++;
                break;
            }
            else{
                // give back the character that was dropped from the candidate
                sentencepos+=2;
            }
        } // matching done

        if(matchflag!=1){
            // Nothing matched, so the cursor is back where this window began
            // and the loop would retry the same bytes forever. Emit the last
            // character on its own and step in front of it. chunkend is at
            // least 1 here because the loop requires sentencepos>0.
            segmentbuf[segmentpos][0]=sentencetemp[chunkend-1];
            segmentbuf[segmentpos][1]=sentencetemp[chunkend];
            segmentbuf[segmentpos][2]='|';
            segmentbuf[segmentpos][3]='\0';
            segmentpos++;
            sentencepos=chunkend-2;
        }
    }

    // Words were collected back to front; write them out front to back.
    // Starting from an empty string also covers the case where no word was
    // produced, which previously read segmentbuf[-1].
    out=backwardbuf;
    for(i=segmentpos-1;i>=0;i--){
        segmentwordlen=strlen(segmentbuf[i]);
        memcpy(out,segmentbuf[i],segmentwordlen);
        out+=segmentwordlen;
    }
    *out='\0';
    segmentlen=(int)(out-backwardbuf);

    delete []segmentbuf;
    delete []sentencetemp;
    return(segmentlen);
}
// Segments the text file dealfilename and writes the result to segfilename.
// Loads the dictionary, then repeatedly reads one sentence with
// getsentence(), segments it with field(), forward() and backward(), and
// writes the field() result followed by the character that ended the
// sentence. Exits the process when either file cannot be opened.
void dealtxtfile(char *dealfilename,char* segfilename)
{
    FILE *fsegment;
    char (*shortestbuf),(*forwardbuf),(*backwardbuf);
    char (*sentencetemp),(*nontxtbuf);
    int shortestbuflen,forwardbuflen,backwardbuflen;

    shortestbuf=new char[MAXSENTENCELEN];
    forwardbuf=new char[MAXSENTENCELEN];
    backwardbuf=new char[MAXSENTENCELEN];
    sentencetemp=new char[MAXSENTENCELEN];
    nontxtbuf=new char[MAXBUFLEN];
    loaddict();
    if((fsourcetxt=fopen(dealfilename,"r"))==NULL){
        // the "%s" in the message now receives the file name
        printf("can't open source text file!%s\n",dealfilename);
        exit(0);
    }
    if((fsegment=fopen(segfilename,"w"))==NULL){
        printf("Fail to creat the segment.txt file!%s\n",segfilename);
        // Stop here like the other open failures; carrying on would pass a
        // null stream to fwrite() and fclose().
        fclose(fsourcetxt);
        exit(0);
    }
    fseek(fsourcetxt,0L,SEEK_END);
    filelen=ftell(fsourcetxt);
    fseek(fsourcetxt,0L,SEEK_SET);
    filepos=0;
    // Process the text.
    while(!feof(fsourcetxt))
    {
        // Read one sentence.
        getsentence(sentencetemp,nontxtbuf);
        // Take the position from the stream itself. Adding sentenlen+2
        // assumed a two-byte terminator after every sentence, which
        // overcounts for single-byte terminators and ends the loop before
        // the end of the file.
        filepos=(int)ftell(fsourcetxt);
        if(txtflag==1){
            // Shortest-path segmentation.
            shortestbuflen=field(sentencetemp,shortestbuf);
            // Forward maximum matching.
            forwardbuflen=forward(sentencetemp,forwardbuf);
            // Backward maximum matching.
            backwardbuflen=backward(sentencetemp,backwardbuf);
            // Write the segmented text.
            if((strcmp(shortestbuf,forwardbuf)==0)&&(strcmp(forwardbuf,backwardbuf)==0)){
                fwrite(shortestbuf,1,shortestbuflen,fsegment);
            }
            else{
                // The three methods disagree. The original author marked this
                // case as needing more thought; for now the shortest-path
                // result is written here as well.
                fwrite(shortestbuf,1,shortestbuflen,fsegment);
            }
        }
        if(nontxtflag==1){
            fwrite(nontxtbuf,1,nontxtbuflen,fsegment);
        }
        else if(punctuflag==1){
            fwrite(punctutmp,1,2,fsegment);
        }
        // Any other outcome is an unhandled error case.
        if(filepos>=filelen){
            break;
        }
    }
    printf("succeed! c u! \n");
    delete []shortestbuf;
    delete []forwardbuf;
    delete []backwardbuf;
    delete []sentencetemp;
    delete []nontxtbuf;
    fclose(fsourcetxt);
    fclose(fsegment);
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -