// dealsentence.h
// Processes the sentence entered in the edit box.
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
// dictlen1: number of dictionary entries read by loaddict1().
// punctulen1: byte length of punctu.txt as loaded by loaddict1().
// sentencelen: byte length of the sentence most recently extracted by getsentence1().
int dictlen1,punctulen1,sentencelen;
// dictbuf1: dictionary entries, one 17-byte row per entry, searched by halfsearch1().
// punctubuf1: contents of punctu.txt, compared two bytes at a time by getsentence1().
// nontxtbuf1: not assigned by any code visible in this file; getsentence1() takes a
//             parameter of the same name that hides this global.
// punctutemp: the two-byte punctuation candidate last compared by getsentence1(),
//             NUL-terminated; dealsentence() copies it to the output.
char (*dictbuf1)[17],(*punctubuf1),(*nontxtbuf1),punctutemp[3];
// bjpunctubuf1 / bjmarkbuf1: contents of bjpunctu.txt / bjmark.txt, compared one byte
// at a time by getsentence1().
char (*bjpunctubuf1),(*bjmarkbuf1);
// nontxtbuflen1: number of bytes getsentence1() wrote to its nontxtbuf1 argument.
// bjpunctulen1 / bjmarklen1: byte lengths of bjpunctubuf1 / bjmarkbuf1.
int nontxtbuflen1,bjpunctulen1,bjmarklen1;
// Flags reset and set by getsentence1() and read by dealsentence():
// txtflag1 = a double-byte character was read, nontxtflag1 = a single-byte character
// was read, punctuflag1 = the sentence ended at a double-byte punctuation mark.
int txtflag1,nontxtflag1,punctuflag1;
// Reads a whole text file into a newly allocated, NUL-terminated buffer.
// path: file to read.
// len:  receives the number of bytes actually read (the fread() result, which in
//       text mode may be smaller than the size reported by ftell()).
// Returns the buffer; the caller releases it with delete[].
// If the file cannot be opened, prints a message and terminates the program.
static char *readwholefile1(const char *path,int *len)
{
    FILE *fp;
    long size;
    size_t got;
    char *buf;

    if((fp=fopen(path,"r"))==NULL){
        printf("cannot open %s file!\n",path);
        exit(EXIT_FAILURE);
    }
    size=0;
    if(fseek(fp,0L,SEEK_END)==0){
        size=ftell(fp);
    }
    if(size<0){
        size=0; // ftell() failed: treat the file as empty
    }
    fseek(fp,0L,SEEK_SET);
    // Size the buffer from the file instead of assuming it fits in a fixed array.
    buf=new char[size+1];
    got=fread(buf,1,(size_t)size,fp);
    buf[got]='\0';
    fclose(fp);
    *len=(int)got;
    return buf;
}

// Loads the dictionary and the three character tables into the global buffers.
//   dict.txt     -> dictbuf1 / dictlen1 (whitespace-separated entries, at most 16 bytes each)
//   punctu.txt   -> punctubuf1 / punctulen1
//   bjpunctu.txt -> bjpunctubuf1 / bjpunctulen1
//   bjmark.txt   -> bjmarkbuf1 / bjmarklen1
// Buffers from a previous call are released first, so calling this repeatedly
// (dealsentence() does so on every invocation) no longer leaks.
// If any file cannot be opened, a message is printed and the program exits with a
// failure status.
void loaddict1()
{
    const int maxentries=200000; // capacity of dictbuf1
    FILE *fdict;

    // delete[] on a null pointer is a no-op, so this is safe on the first call.
    delete []dictbuf1;
    delete []punctubuf1;
    delete []bjpunctubuf1;
    delete []bjmarkbuf1;
    dictbuf1=NULL;
    punctubuf1=NULL;
    bjpunctubuf1=NULL;
    bjmarkbuf1=NULL;

    // Read the dictionary into dictbuf1.
    if((fdict=fopen("dict.txt","r"))==NULL){
        printf("cannot open dict!\n");
        exit(EXIT_FAILURE);
    }
    dictbuf1=new char[maxentries][17];
    dictlen1=0;
    // Count an entry only when fscanf() actually converted one. Testing feof()
    // before reading added a garbage entry when the file ended in whitespace.
    // "%16s" keeps an over-long token from overflowing the 17-byte row (such a
    // token is split across rows), and the capacity check stops at the last row.
    while(dictlen1<maxentries&&fscanf(fdict,"%16s",dictbuf1[dictlen1])==1){
        dictlen1++;
    }
    fclose(fdict);

    // Read the punctuation table.
    punctubuf1=readwholefile1("punctu.txt",&punctulen1);
    // Read the half-width punctuation characters.
    bjpunctubuf1=readwholefile1("bjpunctu.txt",&bjpunctulen1);
    // Read the half-width (non-punctuation) characters.
    bjmarkbuf1=readwholefile1("bjmark.txt",&bjmarklen1);
    // Full-width characters other than punctuation are not loaded anywhere yet
    // (open question left by the original author).
}
//Return the larger of two integers (num1 when they are equal).
int getmaxnum1(int num1,int num2)
{
    return (num2>num1)?num2:num1;
}
//Binary search of the dictionary (dictbuf1[0..dictlen1-1], compared with strcmp,
//so the entries must be in strcmp order).
// src1: NUL-terminated word to look up.
// Returns 1 when src1 is in the dictionary. Otherwise returns the negative value
// -(k+1), where k is the index at which src1 would have to be inserted.
// An empty dictionary yields -1; previously it read uninitialized locals.
int halfsearch1(char* src1)
{
    int poslow=0;
    int poshigh=dictlen1-1;

    while(poslow<=poshigh){
        int posmid=poslow+(poshigh-poslow)/2;
        int temp=strcmp(src1,dictbuf1[posmid]);
        if(temp==0){
            return 1;
        }
        if(temp>0){
            poslow=posmid+1;
        }
        else{
            poshigh=posmid-1;
        }
    }
    // When the loop ends, poslow is the insertion index. This equals the original
    // -(posmid+2) / -(posmid+1) results without depending on the last comparison.
    return -(poslow+1);
}
//取一个句子,返回句子目前位置
int getsentence1(char *senin, char *sentence,char *nontxtbuf1,int seninpos,int seninlen)
{
int sentenceflag,i;
char wordtemp[3],(*sentemp);
int bjpunctuflag1,bjmarkflag1,bjnumflag1;
char bjpunctutemp[2],bjmarktemp[2];
sentemp=new char[MAXBUFLEN];
strcpy(sentemp,senin);
sentenceflag=(int)sentence;//计算句子长度sentencelen
txtflag1=0;
nontxtflag1=0;
punctuflag1=0;
bjpunctuflag1=0;
bjmarkflag1=0;
bjnumflag1=0;
nontxtbuflen1=0;
sentenceflag=(int)sentence;//计算句子长度sentencelen
for(i=0;i<3;i++){
wordtemp[i]='\0';
}
do{//开始取句子
wordtemp[0]=sentemp[seninpos];
seninpos++;
if((int)wordtemp[0]>0){ //可能是半角字符
nontxtflag1=1;
wordtemp[1]='\0';
for(i=0;i<bjpunctulen1;i++){ //看是否是标点
bjpunctutemp[0]=bjpunctubuf1[i];
bjpunctutemp[1]='\0';
if(strcmp(wordtemp,bjpunctutemp)==0){
*nontxtbuf1=wordtemp[0];
nontxtbuf1++;
nontxtbuflen1++;
*nontxtbuf1='|';
nontxtbuf1++;
nontxtbuflen1++;
*nontxtbuf1='\0';
bjpunctuflag1=1;
break;
}
}
if(bjpunctuflag1==1){ //半角标点出现,句子取完
nontxtflag1=1;
if(txtflag1==1){
*sentence='\0';
}
break;
}
else{ //若不是标点,看是否是字母或数字。
for(i=0;i<bjmarklen1;i++){
bjmarktemp[0]=bjmarkbuf1[i];
bjmarktemp[1]='\0';
if(strcmp(wordtemp,bjmarktemp)==0){
*nontxtbuf1=wordtemp[0];
nontxtbuf1++;
nontxtbuflen1++;
*nontxtbuf1='|';
nontxtbuf1++;
nontxtbuflen1++;
*nontxtbuf1='\0';
bjmarkflag1=1;
break;
}
}
}
if(bjmarkflag1==1){
nontxtflag1=1;
if(txtflag1==1){
*sentence='\0';
}
break;
}
else{ //半角标点和半角字符里都没有该字符
// MessageBox("Some DBC case not identified!","Error",MB_ICONERROR);
}
}
else{ //可能是汉字的部分
txtflag1=1;
wordtemp[1]=sentemp[seninpos];
seninpos++;
if((int)wordtemp[1]>=0){
// MessageBox("Error in the file! half word");
}
else{
wordtemp[2]='\0';
//看是不是标点
for(i=0;i<punctulen1/2;i++){
punctutemp[0]=punctubuf1[i*2];
punctutemp[1]=punctubuf1[i*2+1];
punctutemp[2]='\0';
if (strcmp(punctutemp,wordtemp)==0){
punctuflag1=1;
break;
}//取到的标点在punctutmp中,以备读入segmenttemp
}
//若不是标点,要看是否是全角数字,字符
}
if (punctuflag1==1){
*sentence='\0';
break;
}
else{
*sentence=wordtemp[0];
sentence++;
*sentence=wordtemp[1];
sentence++;
}
}
if(seninpos>=seninlen){
break;
}
}while(seninpos<seninlen);
//句子取完
sentencelen=(int)sentence-sentenceflag;
delete []sentemp;
return(seninpos);
}
//Segment the current sentence field by field, choosing for each field a split
//into the fewest dictionary words.
// sentence: the sentence bytes (two bytes per character); its length is taken
//           from the global sentencelen.
// segment:  receives the words, each followed by '|', then a terminating NUL.
// Returns the number of bytes written to segment, not counting the NUL.
//
// A field is built one character at a time: it keeps growing while some word that
// starts inside it reaches past the current character. For every character the
// words of 1..7 characters starting there are looked up with halfsearch1(), and
// fieldshortcut[] is updated with the smallest number of words that covers the
// field up to that character.
// A character that starts no dictionary word is treated as a one-character word.
// Previously that case indexed fieldshortcut[-1] and fieldroute[-1][..], or
// silently dropped the rest of the sentence.
// NOTE(review): the sums below add MAXNUM to fieldshortcut values that may
// themselves be MAXNUM; this assumes 2*MAXNUM fits in an int - confirm against
// the definition of MAXNUM.
int field1(char *sentence,char *segment)
{
    int (*fieldshortcut),(*ptpdistance)[MAXSENTENCELEN],(*fieldroute)[10];
    char word[17];
    int maxlen,routenum,matchflag,route,sentencepos,wordlen;
    int i,j,k,l,fieldcur,getnum,segmentlen;
    int fieldoffset,fieldleft,fieldlen,fieldside;
    fieldshortcut=new int[MAXSENTENCELEN]; //fewest words covering the first n characters of the field
    ptpdistance=new int[MAXSENTENCELEN][MAXSENTENCELEN]; //[end][start] is 1 when characters start..end form a word
    fieldroute=new int[MAXSENTENCELEN][10]; //per character: [][0] = number of shortest routes, [][1..] = their word starts
    sentencepos=0;
    segmentlen=0;
    //keep cutting fields until the sentence is used up
    while(sentencepos<sentencelen){
        fieldoffset=0; //offset (in characters) from the start of the field
        fieldleft=0; //how many more characters the field still extends
        fieldlen=0; //number of characters in the field
        fieldside=0; //number of words the field is split into
        matchflag=0;
        //initialise everything to MAXNUM
        for(i=0;i<MAXSENTENCELEN;i++){
            for(j=0;j<10;j++){
                fieldroute[i][j]=MAXNUM;
            }
        }
        fieldshortcut[0]=0; //base case; this array is indexed one higher than the others
        for(i=1;i<MAXSENTENCELEN;i++){
            fieldshortcut[i]=MAXNUM;
        }
        for(i=0;i<MAXSENTENCELEN;i++){
            for(j=0;j<MAXSENTENCELEN;j++){
                ptpdistance[i][j]=MAXNUM;
            }
        }
        //process one field of the sentence
        do{
            //note: each pass advances by exactly one character
            wordlen=0;
            maxlen=0; //longest word matched at this character
            routenum=0;
            if((sentencelen-sentencepos)/2>=7){
                getnum=7;
            } //look ahead at most seven characters, fewer near the end
            else{
                getnum=(sentencelen-sentencepos)/2;
            }
            //try words of one to seven characters starting here (indices start at 0)
            for(i=0;i<getnum;i++){
                word[wordlen*2+0]=*sentence;
                sentence++;
                word[wordlen*2+1]=*sentence;
                sentence++;
                word[wordlen*2+2]='\0';
                wordlen++;
                matchflag=halfsearch1(word); //look the word up in the dictionary
                if(matchflag==1){
                    ptpdistance[fieldoffset+wordlen-1][fieldoffset]=1; //record the match
                    maxlen=wordlen; //longest match from this character so far
                }
                matchflag=0;
            } //all lengths tried for this character
            if(getnum==0){
                // Only a lone trailing byte is left, which is not a character. The
                // pointer has not moved, so the field collected so far is cut below
                // (or the outer loop ends when the field is empty).
                break;
            }
            if(maxlen==0){
                // No dictionary word starts here: treat the character as a
                // one-character word so that the field always has a valid split.
                ptpdistance[fieldoffset][fieldoffset]=1;
                maxlen=1;
            }
            //compute the shortest route ending at this character
            for(i=0;i<=fieldoffset;i++){
                //a shorter route replaces the one recorded so far
                if((ptpdistance[fieldoffset][i]+fieldshortcut[i])<fieldshortcut[fieldoffset+1]){
                    fieldshortcut[fieldoffset+1]=ptpdistance[fieldoffset][i]+fieldshortcut[i];
                    routenum=1; //a new best route: the count restarts at one
                    fieldroute[fieldoffset][routenum]=i; //start of the last word on that route
                }
                else if((ptpdistance[fieldoffset][i]+fieldshortcut[i])==fieldshortcut[fieldoffset+1]){
                    routenum++;
                    fieldroute[fieldoffset][routenum]=i; //counted from 0
                } //a tie: remember this start as well
            }
            fieldroute[fieldoffset][0]=routenum; //slot 0 holds the number of routes
            fieldoffset++; //counted from 0
            sentencepos+=2;
            for(i=0;i<(getnum-1)*2;i++){
                sentence--;
            } //move the pointer back to the next character
            fieldleft=getmaxnum1(maxlen,fieldleft); //the field reaches as far as the longest word seen
            fieldleft--;
        }while(fieldleft>0); //the field is complete
        for(i=0;i<fieldoffset*2;i++){
            sentence--;
        } //go back to the start of the field before cutting
        if(fieldoffset==0){
            break;
        }
        else{ //cut the field into words
            fieldlen=fieldoffset;
            fieldside=fieldshortcut[fieldlen];
            routenum=1;
            fieldcur=0;
            k=0;
            route=fieldlen;
            fieldroute[fieldlen][k]=route;
            k++;
            //the word boundaries are collected in fieldroute[fieldlen], last boundary
            //first; when a character has several shortest routes only the first is followed
            for(j=1;j<fieldside;j++){
                fieldroute[fieldlen][k]=fieldroute[route-1][1];
                routenum=routenum*fieldroute[route-1][0];
                route=fieldroute[fieldlen][k];
                k++;
            } //walk back one word at a time, fieldside-1 steps in total
            for(i=(k-1);i>=0;i--){
                for(l=fieldcur;l<(fieldroute[fieldlen][i]*2);l++){
                    *segment=*sentence;
                    segment++;
                    sentence++;
                    segmentlen++;
                }
                *segment='|';
                segment++;
                segmentlen++;
                fieldcur=l;
            } //emit the words according to the recorded boundaries
            if(routenum>1){ //report positions that have more than one route
                printf("more route %d\n",sentencepos);
            }
        } //field cut
    }
    *segment='\0'; //terminate the output
    delete []ptpdistance;
    delete []fieldroute;
    delete []fieldshortcut;
    return(segmentlen);
}
//Segment the current sentence by forward maximum matching.
// sentencetemp: the sentence bytes (two bytes per character); its length is taken
//               from the global sentencelen.
// segmentbuf:   receives the words, each followed by '|', then a terminating NUL.
// Returns the number of bytes written to segmentbuf, not counting the NUL.
//
// At each position the longest candidate (up to (WORDLEN-2)/2 characters) is tried
// first and shortened one character at a time until halfsearch1() finds it.
// A single character that is not in the dictionary is emitted as a word of its
// own; previously the position was rewound completely and the loop never ended.
// A lone trailing byte (odd sentencelen) is ignored.
int forward1(char *sentencetemp,char *segmentbuf)
{
    char sevenword[17]; // holds at most WORDLEN-2 bytes plus the terminator
    int sentencepos=0;
    int segmentpos=0;
    int window,nchars,wordbytes;

    while(sentencelen-sentencepos>=2){
        //read up to seven characters
        window=sentencelen-sentencepos;
        if(window>WORDLEN-2){
            window=WORDLEN-2;
        }
        nchars=window/2;
        memcpy(sevenword,sentencetemp+sentencepos,(size_t)(nchars*2));
        //search the dictionary, longest candidate first
        for(;nchars>1;nchars--){
            sevenword[nchars*2]='\0';
            if(halfsearch1(sevenword)==1){
                break;
            }
        }
        // nchars is 1 here when nothing longer matched: accept the single character
        // whether or not the dictionary contains it, so the scan always advances.
        wordbytes=nchars*2;
        memcpy(segmentbuf,sevenword,(size_t)wordbytes);
        segmentbuf+=wordbytes;
        *segmentbuf='|';
        segmentbuf++;
        segmentpos+=wordbytes+1;
        sentencepos+=wordbytes;
    }
    *segmentbuf='\0';
    return(segmentpos);
}
//处理句子逆向最大匹配
int backward1(char *sentence,char *backwardbuf)
{
int sentencepos,segmentpos,segmentlen,templen,segmentwordlen;
int matchflag,i,k,t;
char sevenword[17],(*segmentbuf)[16],(*sentencetemp);
segmentbuf=new char[MAXSENTENCELEN][16];
sentencetemp=new char[MAXSENTENCELEN];
//处理句子
strcpy(sentencetemp,sentence);
sentencepos=sentencelen-1;
segmentpos=0;
segmentlen=0;
while(sentencepos>0){
//读入一个七字串
if ((sentencepos+1)<(WORDLEN-2)){ //不足七个字
templen=sentencepos+1;
for(i=0;i<templen;i++){
sevenword[i+2]=sentencetemp[sentencepos+1-templen+i];
}
sentencepos=sentencepos-templen;
for(i=0;i<2;i++){
sevenword[i]='\0';
}
for(i=(templen+2);i<=WORDLEN;i++){
sevenword[i]='\0';
}
}
else{ //够七个字
templen=WORDLEN-2;
for(i=(WORDLEN-templen);i<WORDLEN;i++){
sevenword[i]=sentencetemp[sentencepos+1-WORDLEN+i];
}
sentencepos=sentencepos-templen;
for(i=0;i<(WORDLEN-templen);i++){
sevenword[i]='\0';
}
sevenword[WORDLEN]='\0';
}//读入一个七字串
//搜索词典
matchflag=0;
for(i=templen/2;i>=1;i--){
for(t=0;t<templen;t++){
sevenword[t]=sevenword[t+2];
}
sevenword[2*i]='\0';
sevenword[2*i+1]='\0';
matchflag=halfsearch1(sevenword);
if(matchflag==1){
segmentwordlen=strlen(sevenword);
for(k=0;k<segmentwordlen;k++){
segmentbuf[segmentpos][k]=sevenword[k];
}
segmentbuf[segmentpos][k]='|';
segmentbuf[segmentpos][k+1]='\0';
segmentpos++;
break;
}
else{
sentencepos+=2;
}
}//匹配完毕
}
strcpy(backwardbuf,segmentbuf[segmentpos-1]);
for(i=(segmentpos-2);i>=0;i--){
strcat(backwardbuf,segmentbuf[i]);
}
segmentlen=strlen(backwardbuf);
delete []segmentbuf;
delete []sentencetemp;
return(segmentlen);
}
// Segments the text in senin and writes the result to senout.
// senin:  NUL-terminated input text.
// senout: receives the segmented text, always NUL-terminated (also for empty
//         input, which previously left senout untouched). The caller must provide
//         enough room: each word gains a trailing '|'.
// The dictionary files are (re)loaded through loaddict1() on every call. The input
// is consumed one sentence at a time with getsentence1(); for each sentence the
// shortest-path segmentation from field1() is emitted, followed by the single-byte
// character or the double-byte punctuation mark that ended the sentence.
void dealsentence(char *senin,char *senout)
{
    char *shortestbuf=new char[MAXSENTENCELEN];
    char *forwardbuf=new char[MAXSENTENCELEN];
    char *backwardbuf=new char[MAXSENTENCELEN];
    char *sentencetemp=new char[MAXSENTENCELEN];
    char *nontxtbuf=new char[MAXBUFLEN];
    int seninlen=(int)strlen(senin);
    int seninpos=0;
    int senoutlen=0;
    int shortestbuflen;

    loaddict1();
    //process the input
    while(seninpos<seninlen){
        //read one sentence
        seninpos=getsentence1(senin,sentencetemp,nontxtbuf,seninpos,seninlen);
        if(txtflag1==1){
            //shortest-path segmentation of this sentence
            shortestbuflen=field1(sentencetemp,shortestbuf);
            //forward and backward maximum matching of the same sentence
            // NOTE(review): these two results do not influence the output. The
            // original compared all three segmentations but emitted the
            // shortest-path result in both branches; the calls are kept so a
            // disagreement check can be added here.
            forward1(sentencetemp,forwardbuf);
            backward1(sentencetemp,backwardbuf);
            //append the segmented text
            memcpy(senout+senoutlen,shortestbuf,(size_t)shortestbuflen);
            senoutlen+=shortestbuflen;
        }
        if(nontxtflag1==1){
            memcpy(senout+senoutlen,nontxtbuf,(size_t)nontxtbuflen1);
            senoutlen+=nontxtbuflen1;
        }
        // Tested independently of nontxtflag1: getsentence1() also sets nontxtflag1
        // for a single-byte character it does not recognize, and with "else if" the
        // punctuation mark ending such a sentence was dropped. Both flags are only
        // set together in that case, where nontxtbuflen1 is 0.
        if(punctuflag1==1){
            senout[senoutlen]=punctutemp[0];
            senoutlen++;
            senout[senoutlen]=punctutemp[1];
            senoutlen++;
        }
    }
    senout[senoutlen]='\0';
    printf("succeed! c u! \n");
    delete []shortestbuf;
    delete []forwardbuf;
    delete []backwardbuf;
    delete []sentencetemp;
    delete []nontxtbuf;
}
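// (The lines below are residue from the web code viewer, not part of this header.)
// Keyboard shortcuts: copy code Ctrl + C; search code Ctrl + F; full screen F11;
// toggle theme Ctrl + Shift + D; show shortcuts ?; increase font size Ctrl + =;
// decrease font size Ctrl + -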