📄 result.cpp
字号:
}
if(pItem[i].nHandle/256=='n'||pItem[i+1].nHandle/256=='n')//30208='v'8256
{
pItemRet[j].nHandle='n'*256;
}
i+=1;
if(strlen(pItem[i+1].sWord)==2)
{//AAB:洗/洗/脸、蒙蒙亮
if((pItemRet[j].nHandle==30208&&pItem[i+1].nHandle/256=='n')||
(pItemRet[j].nHandle==24832&&pItem[i+1].nHandle/256=='a')
)
{
strcat(pItemRet[j].sWord,pItem[i+1].sWord);
i+=1;
}
}
j+=1;
bProcessed=true;
}
//Rule 4: AAB 洗/洗澡
else if(nLen==2&&strncmp(pItem[i].sWord,pItem[i+1].sWord,2)==0&&strlen(pItem[i+1].sWord)==4&&(pItem[i].nHandle/256=='v'||pItem[i].nHandle==24832))//v,a
{
strcpy(pItemRet[j].sWord,pItem[i].sWord);
strcat(pItemRet[j].sWord,pItem[i+1].sWord);
//24832=='a'*256
pItemRet[j].nHandle=24832;//'a'
if(pItem[i].nHandle/256=='v'||pItem[i+1].nHandle/256=='v')//30208='v'8256
{
pItemRet[j].nHandle=30208;
}
i+=1;
j+=1;
bProcessed=true;
}
else if(pItem[i].nHandle/256=='u'&&pItem[i].nHandle%256)//uj,ud,uv,uz,ul,ug->u
pItem[i].nHandle='u'*256;
else if(nLen==2&&strncmp(pItem[i].sWord,pItem[i+1].sWord,2)==0&&strlen(pItem[i+1].sWord)==4&&strncmp(pItem[i+1].sWord+2,pItem[i+2].sWord,2)==0)
{//AABB 朴朴素素 枝枝叶叶
strcpy(pItemRet[j].sWord,pItem[i].sWord);
strcat(pItemRet[j].sWord,pItem[i+1].sWord);
strcat(pItemRet[j].sWord,pItem[i+2].sWord);
pItemRet[j].nHandle=pItem[i+1].nHandle;
i+=2;
j+=1;
bProcessed=true;
}
else if(pItem[i].nHandle==28275)//PostFix
{
if(m_uPlace.m_dict.IsExist(pItem[i+1].sWord,4))
{
strcpy(pItemRet[j].sWord,pItem[i].sWord);
strcat(pItemRet[j].sWord,pItem[i+1].sWord);
pItemRet[j].nHandle=28275;
i+=1;
j+=1;
bProcessed=true;
}
else if(strlen(pItem[i+1].sWord)==2&&CC_Find("队",pItem[i+1].sWord))
{
strcpy(pItemRet[j].sWord,pItem[i].sWord);
strcat(pItemRet[j].sWord,pItem[i+1].sWord);
pItemRet[j].nHandle=28276;
i+=1;
j+=1;
bProcessed=true;
}
else if(strlen(pItem[i+1].sWord)==2&&CC_Find("语文字杯",pItem[i+1].sWord))
{
strcpy(pItemRet[j].sWord,pItem[i].sWord);
strcat(pItemRet[j].sWord,pItem[i+1].sWord);
pItemRet[j].nHandle=28282;
i+=1;
j+=1;
bProcessed=true;
}
else if(strlen(pItem[i+1].sWord)==2&&CC_Find("裔",pItem[i+1].sWord))
{
strcpy(pItemRet[j].sWord,pItem[i].sWord);
strcat(pItemRet[j].sWord,pItem[i+1].sWord);
pItemRet[j].nHandle=28160;
i+=1;
j+=1;
bProcessed=true;
}
}
else if(pItem[i].nHandle==30208||pItem[i].nHandle==28160)//v
{
if(strlen(pItem[i+1].sWord)==2&&CC_Find("员",pItem[i+1].sWord))
{
strcpy(pItemRet[j].sWord,pItem[i].sWord);
strcat(pItemRet[j].sWord,pItem[i+1].sWord);
pItemRet[j].nHandle=28160;
i+=1;
j+=1;
bProcessed=true;
}
}
else if(pItem[i].nHandle==28280)
{//www/nx ./w sina/nx; EIM/nx -601/m
strcpy(pItemRet[j].sWord,pItem[i].sWord);
pItemRet[j].nHandle=28280;
while(pItem[i+1].nHandle==28280||strstr("..",pItem[i+1].sWord)||(pItem[i+1].nHandle==27904&&IsAllNum((unsigned char *)pItem[i+1].sWord)))
{
strcat(pItemRet[j].sWord,pItem[i+1].sWord);
i+=1;
}
j+=1;
bProcessed=true;
}
if(!bProcessed)
{//If not processed,that's mean: not need to adjust;
//just copy to the final result
strcpy(pItemRet[j].sWord,pItem[i].sWord);
pItemRet[j++].nHandle=pItem[i].nHandle;
}
i++;
}
pItemRet[j].sWord[0]=0;//Set ending
return true;
}
//Paragraph Segment and POS Tagging
//
// Scans sParagraph one character at a time (one or two bytes each), cuts it
// into pieces at separator characters, runs Processing() + Output() on every
// piece and appends each piece's output to sResult.
//
// sParagraph: NUL-terminated input text; it is only read.
// sResult:    output buffer; reset to "" first and then grown with strcat().
//             Nothing in this function bounds the writes, so the caller must
//             supply a buffer large enough for the complete result.
// Returns:    always true.
bool CResult::ParagraphProcessing(char *sParagraph,char *sResult)
{
    const unsigned int nParagraphLen=strlen(sParagraph);
    // 13 spare bytes on top of the paragraph; presumably room for the
    // SENTENCE_BEGIN/SENTENCE_END flags and the terminator - TODO confirm
    // against the definitions of those macros.
    const unsigned int nLen=nParagraphLen+13;
    char *sSentence=new char[nLen];//sentence buffer
    char *sSentenceResult=new char[nLen*3];//output buffer for one sentence
    char sChar[3];//current character: 1 or 2 bytes plus terminator
    unsigned int nPosIndex=0;
    bool bFirstIgnore=true;//forwarded unchanged to every Output() call

    sChar[2]=0;
    sResult[0]=0;//Init the result
    strcpy(sSentence,SENTENCE_BEGIN);//Add a sentence begin flag

    while(nPosIndex<nParagraphLen)
    {//Find a whole sentence which is separated by ! . \n \r
        sChar[0]=sParagraph[nPosIndex];//Get a char
        sChar[1]=0;
        // A byte with the high bit set starts a double-byte character.
        // Test it through unsigned char: plain char may be unsigned, in
        // which case a "<0" comparison is never true and every double-byte
        // character would be split into two single bytes.
        if(static_cast<unsigned char>(sParagraph[nPosIndex])>=0x80)
        {//double byte char
            nPosIndex+=1;
            // For a truncated trailing lead byte this reads the terminating
            // NUL, which is still inside the string.
            sChar[1]=sParagraph[nPosIndex];
        }
        nPosIndex+=1;

        // Separator sets, as recorded in the original comment at this spot:
        //   #define SEPERATOR_C_SENTENCE "。!?:;…"
        //   #define SEPERATOR_C_SUB_SENTENCE "、,()“”‘’"
        //   #define SEPERATOR_E_SENTENCE "!?:;"
        //   #define SEPERATOR_E_SUB_SENTENCE ",()\042'"
        //   #define SEPERATOR_LINK "\n\r "
        if(CC_Find(SEPERATOR_C_SENTENCE,sChar)||CC_Find(SEPERATOR_C_SUB_SENTENCE,sChar)||strstr(SEPERATOR_E_SENTENCE,sChar)||strstr(SEPERATOR_E_SUB_SENTENCE,sChar)||strstr(SEPERATOR_LINK,sChar))
        {//Reached the end of a sentence or sub-sentence
            const bool bIsLink=(strstr(SEPERATOR_LINK,sChar)!=NULL);

            if(!bIsLink)//Link separators are not part of the sentence text
            {
                strcat(sSentence,sChar);
            }
            if(sSentence[0]!=0&&strcmp(sSentence,SENTENCE_BEGIN)!=0)
            {//The buffer holds more than a bare begin flag
                // NOTE(review): the classification above uses CC_Find for the
                // Chinese sets while this check (and the one further down)
                // uses strstr - confirm both agree for double-byte input.
                if(!strstr(SEPERATOR_C_SUB_SENTENCE,sChar)&&!strstr(SEPERATOR_E_SUB_SENTENCE,sChar))
                    strcat(sSentence,SENTENCE_END);//Add sentence ending flag (skipped for sub-sentence separators)
                Processing(sSentence,1);//Process the current sentence
                Output(m_pResult[0],sSentenceResult,bFirstIgnore);//Output to the intermediate result
                strcat(sResult,sSentenceResult);//Store in the result buffer
            }
            if(bIsLink)
            {//Copy the link separator straight into the result and start over
                strcat(sResult,sChar);
                strcpy(sSentence,SENTENCE_BEGIN);//Add a sentence begin flag
            }
            else if(strstr(SEPERATOR_C_SENTENCE,sChar)||strstr(SEPERATOR_E_SENTENCE,sChar))
            {//Full sentence separator: start a new sentence
                strcpy(sSentence,SENTENCE_BEGIN);//Add a sentence begin flag
            }
            else
            {//Sub-sentence separator: the next piece starts with the separator itself
                strcpy(sSentence,sChar);
            }
        }
        else //Other chars are stored in the sentence buffer
        {
            strcat(sSentence,sChar);
        }
    }

    // Flush whatever is left after the last separator.
    if(sSentence[0]!=0&&strcmp(sSentence,SENTENCE_BEGIN)!=0)
    {
        strcat(sSentence,SENTENCE_END);//Add sentence ending flag
        Processing(sSentence,1);//Process the current sentence
        Output(m_pResult[0],sSentenceResult,bFirstIgnore);//Output to the intermediate result
        strcat(sResult,sSentenceResult);//Store in the result buffer
    }

    delete [] sSentence;//free sentence buffer
    delete [] sSentenceResult;//free buffer
    return true;
}
// Processes a text file line by line: every line read from sSourceFile is
// passed to ParagraphProcessing() and the returned text is written to
// sResultFile. When m_nOutputFormat==2 the output is wrapped in an XML
// header and a <result> element.
//
// sSourceFile: path of the file to read (opened with mode "rt").
// sResultFile: path of the file to write (opened with mode "wt").
// Returns:     false when either file cannot be opened, true otherwise.
bool CResult::FileProcessing(char *sSourceFile,char *sResultFile)
{
    const int nParagraphSize=4*1024;//capacity of the line buffer
    const int nResultSize=8*1024;//capacity of the per-line result buffer
    int nLineIndex=1;

    // Open both files before allocating anything, so that a failure path has
    // nothing to free except the file that is already open.
    FILE *fpSource=fopen(sSourceFile,"rt");
    if(fpSource==NULL)
        return false;//Cannot open the source file to read
    FILE *fpResult=fopen(sResultFile,"wt");
    if(fpResult==NULL)
    {
        fclose(fpSource);//do not leak the source handle
        return false;//Cannot open the result file to write
    }

    char *sParagraph=new char[nParagraphSize];
    // NOTE(review): ParagraphProcessing() appends to this buffer without any
    // bound - confirm that 8 KB is enough for the tagged form of a 4 KB line.
    char *sParagraphResult=new char[nResultSize];

    if(m_nOutputFormat==2)//XML format
    {
        // NOTE(review): the version attribute is emitted as " 1.0" with a
        // leading space; left unchanged here so the output stays identical -
        // confirm whether consumers expect "1.0".
        fprintf(fpResult,"<?xml version=\042 1.0\042 encoding=\042gb2312\042?><result>");
    }

    // Loop on the fgets() result itself: it ends on end-of-file AND on a read
    // error, whereas polling feof() would spin forever after a read error.
    while(fgets(sParagraph,nParagraphSize,fpSource)!=NULL)
    {//Get a paragraph
        TRACE("%d\n",nLineIndex++);
        ParagraphProcessing(sParagraph,sParagraphResult);
        fprintf(fpResult,"%s",sParagraphResult);
    }

    delete [] sParagraph;
    delete [] sParagraphResult;
    fclose(fpSource);
    if(m_nOutputFormat==2)//XML format
        fprintf(fpResult,"</result>");
    fclose(fpResult);
    return true;
}
// Translates a POS handle into the tag string of the second tag set (the
// "973" set, going by the function name) and copies it into sPOS973.
//
// A handle packs a tag as firstChar*256+secondChar (secondChar is 0 for
// one-letter tags), e.g. 24832 is 'a'*256.
//
// nHandle: handle to translate.
// sPOS973: output buffer; receives the mapped tag (at most 2 characters plus
//          the terminator), or "@" when BinarySearch() reports -1 for nHandle.
// Returns: always true.
bool CResult::PKU2973POS(int nHandle, char *sPOS973)
{
    const int nTagCount=46;

    // Known handles in ascending order, as handed to BinarySearch().
    int nKnownHandles[nTagCount]={
        24832,24932,24935,24942,// a  ad ag an
        25088,25344,25600,25703,// b  c  d  dg
        25856,26112,26368,26624,// e  f  g  h
        26880,27136,27392,27648,// i  j  k  l
        27904,28160,28263,28274,// m  n  ng nr
        28275,28276,28280,28282,// ns nt nx nz
        28416,28672,28928,29184,// o  p  q  r
        29440,29696,29799,29952,// s  t  tg u
        30052,30055,30058,30060,// ud ug uj ul
        30070,30074,30208,30308,// uv uz v  vd
        30311,30318,30464,30720,// vg vn w  x
        30976,31232             // y  z
    };

    // Output tag for the handle stored at the same index above.
    char sMappedTags[nTagCount][3]={
        "a", "ad","ga","an",
        "f", "c", "d", "d",
        "e", "nd","g", "h",
        "i", "j", "k", "l",
        "m", "n", "gn","nh",
        "ns","ni","ws","nz",
        "o", "p", "q", "r",
        "nl","nt","gt","u",
        "ud","ug","uj","ul",
        "uv","uz","v", "vd",
        "gv","vn","w", "x",
        "u", "a"
    };

    // Pairs kept from a commented-out note in the original; they are not
    // part of the table:
    //   "Bg","gf"   "Rg","gr"   "Mg","gm"   "Yg","u"   "Ug","u"   "Qg","q"

    const int nSlot=BinarySearch(nHandle,nKnownHandles,nTagCount);
    const char *pTag=(nSlot==-1)?"@":sMappedTags[nSlot];
    strcpy(sPOS973,pTag);
    return true;
}
bool CResult::ChineseNameSplit(char *sPersonName, char *sSurname, char *sSurname2, char *sGivenName, CDictionary &personDict)
{
int nSurNameLen=4,nLen=strlen(sPersonName),nFreq,i=0,nCharType,nFreqGiven;
char sTemp[3];
if(nLen<3||nLen>8)//Not a traditional Chinese person name
return false;
while(i<nLen)//No Including non-CHinese char
{
nCharType=charType((unsigned char*)sPersonName+i);
if(nCharType!=CT_CHINESE&&nCharType!=CT_OTHER)
return false;
i+=2;
}
sSurname2[0]=0;//init
strncpy(sSurname,sPersonName,nSurNameLen);
sSurname[nSurNameLen]=0;
if(!personDict.IsExist(sSurname,1))
{
nSurNameLen=2;
sSurname[nSurNameLen]=0;
if(!personDict.IsExist(sSurname,1))
{
nSurNameLen=0;
sSurname[nSurNameLen]=0;
}
}
strcpy(sGivenName,sPersonName+nSurNameLen);
if(nLen>6)
{
strncpy(sTemp,sPersonName+nSurNameLen,2);
sTemp[2]=0;//Get the second possible surname
if(personDict.IsExist(sTemp,1))
{//Hongkong women's name: Surname+surname+given name
strcpy(sSurname2,sTemp);
strcpy(sGivenName,sPersonName+nSurNameLen+2);
}
}
nFreq=personDict.GetFrequency(sSurname,1);
strncpy(sTemp,sGivenName,2);
sTemp[2]=0;
nFreqGiven=personDict.GetFrequency(sTemp,2);
if(nSurNameLen!=4&&((nSurNameLen==0&&nLen>4)||strlen(sGivenName)>4||(GetForeignCharCount(sPersonName)>=3&&nFreq<personDict.GetFrequency("张",1)/40&&nFreqGiven<personDict.GetFrequency("华",2)/20)||(nFreq<10&&GetForeignCharCount(sGivenName)==(nLen-nSurNameLen)/2)))
return false;
if(nLen==4&&m_uPerson.IsGivenName(sPersonName))
{//Single Surname+given name
return false;
}
return true;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -