📄 new_ssps.cpp
字号:
if(flag<0)
{
IWord=(struct aWORDdic*)emalloc(sizeof(aWORDdic));
IWord->Len=strlen((char*)cWord);
IWord->sCIYU=(unsigned char *)emalloc(IWord->Len+1);//note here
strcpy((char*)IWord->sCIYU,(char*)cWord);
IWord->pos=(unsigned char*)emalloc(9);
strcpy((char*)IWord->pos,(char*)cPos);
IWord->ptrNext=Wptr;
if(IPos) IPos->ptrNext=IWord;
else WIndexcom[x][y].WList=IWord;//note for IPos
WIndexcom[x][y].WCount++;
// Num_Ins++;
//#ifdef Debug
printf("%s\n",cWord);
//#endif
break;
}
}//end of while(1);
}
else //首字节表明是ASCII
{
x=cWord[0];
Wptr=CIndexcom[x].WList;
if(!Wptr)
{
IWord=(struct aWORDdic*)emalloc(sizeof(aWORDdic));
IWord->ptrNext=NULL;
IWord->Len=strlen((char*)cWord);
IWord->sCIYU=(unsigned char *)emalloc(IWord->Len+1);//note here
strcpy((char*)IWord->sCIYU,(char*)cWord);
IWord->pos=(unsigned char*)emalloc(9);
strcpy((char*)IWord->pos,(char*)cPos);
CIndexcom[x].WList=IWord;
CIndexcom[x].WCount++;
// Num_Ins++;
//#ifdef Debug
printf("%s\n",cWord);
//#endif
return 0;
}
while(1)
{
flag=strcmp((char*)cWord,(char*)Wptr->sCIYU);
if(!flag)
{
// AlreadyExisted++;
//#ifdef Debug
printf("%s already exists.\n",cWord);
//#endif
break;
}
if(flag>0)
{
IPos=Wptr;
Wptr=Wptr->ptrNext;
if(!Wptr)
{
IWord=(struct aWORDdic*)emalloc(sizeof(aWORDdic));
IWord->ptrNext=NULL;
IWord->Len=strlen((char*)cWord);
IWord->sCIYU=(unsigned char *)emalloc(IWord->Len+1);//note here
strcpy((char*)IWord->sCIYU,(char*)cWord);
IWord->pos=(unsigned char*)emalloc(9);
strcpy((char*)IWord->pos,(char*)cPos);
IPos->ptrNext=IWord;
CIndexcom[x].WCount++;
// Num_Ins++;
//#ifdef Debug
printf("%s\n",cWord);
//#endif
return 0;
}
}
if(flag<0)
{
IWord=(struct aWORDdic*)emalloc(sizeof(aWORDdic));
IWord->Len=strlen((char*)cWord);
IWord->sCIYU=(unsigned char *)emalloc(IWord->Len+1);//note here
strcpy((char*)IWord->sCIYU,(char*)cWord);
IWord->pos=(unsigned char*)emalloc(9);
strcpy((char*)IWord->pos,(char*)cPos);
IWord->ptrNext=Wptr;
if(IPos) IPos->ptrNext=IWord;
else CIndexcom[x].WList=IWord;//note for IPos
CIndexcom[x].WCount++;
// Num_Ins++;
//#ifdef Debug
printf("%s\n",cWord);
//#endif
break;
}
}//end of while(1);
}
return 0;
}
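//-----------------------------------------------------------//
// Function: GetaWord
// Parameters: (not documented here; see the function signature)
// Returns: NULL     - the end of the input has been reached
//          non-NULL - a word was read
//----------------------------------------------------------//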
//unsigned char* CSSPS::GetaWord(unsigned char*BufIn,unsigned int &currentpos,unsigned char*cWord,unsigned char* cPos,char tag1, char tag2) const
//{
// unsigned char *Bufp ;
// if(BufIn == NULL || *(Bufp= BufIn + currentpos) ==0 || currentpos>= strlen((const char*)BufIn))
// return NULL;
//
// *cWord=0;
// unsigned char *Wordp=cWord;
// *cPos = 0;
// unsigned char *Posp = cPos;
//
// int inChinese=0;
// int maxlen = MAXCHWORDLEN;
//
// while(*(Bufp+1) && --maxlen)
// {
// currentpos ++;
// if(*Bufp != tag1)
// {
// if(!inChinese)
// {
// Wordp=cWord;
// inChinese=1;
// }
//
// *Wordp++=*Bufp++;
// }
// else
// {
// Bufp++; // this is for skip an ascii ch!! necessary!!
// if(inChinese)
// {
// //xx++;
// /*if(isalpha(*cWord))
// {
// inChinese = 0;
// continue;
// }*/
// *Wordp=0;
// return cWord;
// }
// }
//
// }//end of while
// if(inChinese)
// {
// *Wordp=0;
// inChinese=0;
// printf("get a word %s 1end \n",cWord);
// currentpos ++;
// return cWord;
// }
// else {
// Bufp = NULL;
// printf("get a word end\n");
// return NULL;
// }
//}
unsigned char* CSSPS::GetaWord(unsigned char*BufIn,unsigned int ¤tpos,unsigned char*cWord,unsigned char* cPos,char tag1,char tag2) const{
unsigned char *Bufp ;
if(BufIn == NULL || *(Bufp= BufIn + currentpos) ==0 || currentpos>= strlen((const char*)BufIn))
return NULL;
*cWord=0;
unsigned char *Wordp=cWord;
*cPos = 0;
unsigned char* Posp = cPos;
unsigned char off1 = 0; // the offset of the tag1 which indicate the end of the word and
// the begin of the pos;
unsigned char off2 = 0; // the offset of the tag2 which indicate the end of the pos and the
// begin of the next word;
int maxlen = MAXCHWORDLEN;
while( *(Bufp + off1) && maxlen--){
if(*(Bufp +off1) != tag1)
off1++;
else break;
}
while( *(Bufp + off2) && maxlen--){
if(*(Bufp+off2) != tag2)
off2++;
else break;
}
//if((currentpos+off2) <= strlen((const char*)BufIn)){
if(off2 < off1){
cout << "format error: the word has no part of speech tag!"<<endl;
return NULL;
}
if((off2-off1) != 9){
cout << "format error: the part of speech tag is not correct!"<<endl;
return NULL;
}
int i;
for(i=0;i<off1;i++){
*Wordp++ = *Bufp++;
}
//Wordp++;
*Wordp = '\0';
Bufp++; // here we skip one char
for(i= off1;i<off2-1;i++){
*Posp++ = *Bufp++;
}
//Posp++;
*Posp = '\0';
Bufp++;
currentpos += (off2+1);
return BufIn;
}
// Tests whether ch is an ASCII code, i.e. whether its high bit (0x80) is
// clear. Returns 1 when it is clear, 0 when it is set.
int CSSPS::isASCII(unsigned char ch) const
{
    const int highBit = ch & 0x80;
    return highBit == 0;
}
// Tests whether ch is a white-space character.
// The classification is delegated to isspace(), and its result is returned
// unchanged: non-zero for white space, 0 otherwise.
int CSSPS::wbspace(unsigned char ch) const
{
    const int classification = isspace(ch);
    return classification;
}
// Tests whether the byte pair (ch1, ch2) falls in the two-byte range this
// class accepts as a Chinese character:
//   ch1 in [0xa1, 0xf7]  (87 lead-byte values)
//   ch2 in [0xa1, 0xfe]  (94 trail-byte values)
// The pair (0xcc, 0xcc) is explicitly rejected even though it lies inside
// that range. Returns 1 for an accepted pair, 0 otherwise.
int CSSPS::isHanzi(unsigned char ch1,unsigned char ch2) const
{
    if(ch1 < 0xa1 || ch1 > 0xf7) return 0;
    if(ch2 < 0xa1 || ch2 > 0xfe) return 0;
    if(ch1 == 0xcc && ch2 == 0xcc) return 0;
    return 1;
}
//-----------------------------------------------------------//
// Purpose: fast word segmentation
// Parameters: (in)  sBuffer   source text; NUL-terminated (its length is
//                             taken with strlen)
//             (out) strResult destination string; written sequentially and
//                             NUL-terminated before returning
//             (in)  tag1      byte written between a word and its
//                             part-of-speech field
//             (in)  tag2      byte written after the part-of-speech field
//----------------------------------------------------------//
// Original author's note: this edition of Segment is correct for multi process safe.
//
// How it works:
//  - A byte with the high bit set starts a two-byte character. The two bytes,
//    each minus 0xa1, index WIndexcom[x][y]; that entry's WList is walked and
//    every entry whose sCIYU matches the text (strncmp over Len bytes) updates
//    the remembered match length (LastLen) and part-of-speech pointer (pos).
//    The walk stops at the end of the list or at the first entry that
//    compares greater than the text, then LastLen bytes are emitted followed
//    by tag1, the pos string, and tag2.
//  - Any other byte is handled as ASCII through CIndexcom[x]. ASCII bytes are
//    always copied one at a time; the tag for a run of ASCII bytes is written
//    when the next high-bit byte is encountered.
//
// NOTE(review): the items below look like defects but the intended output
// format is not visible from here - confirm before changing:
//  - pos starts as NULL and is never reset per character. If no list entry
//    matches, strcpy() reads NULL or the pos of an earlier word.
//  - ascpos holds 6 characters but resultp is advanced by 8 after the strcpy,
//    leaving strcpy's NUL and one unwritten byte inside the output. Possibly
//    the literal originally carried padding spaces.
//  - x and y are not range-checked before indexing WIndexcom (a byte below
//    0xa1 gives a negative index), and sBuffer[pointer+1] may be the
//    terminating NUL.
//  - The "no dictionary list" branch appends D, while every other branch
//    appends tag2.
//  - A run of ASCII bytes at the very end of sBuffer never receives its tag.
//  - count and DPos are assigned but never read in this function.
//  - Nothing bounds the writes to strResult; the caller must size it.
void CSSPS::Segment(unsigned char *sBuffer,unsigned char *strResult,char tag1,char tag2)
{
int pointer=0;
*strResult='\0';
unsigned char *resultp=strResult;
string ascpos = "ASCI 0";
int x;
int y;
int ascii_flag;
int flag = 0;
struct aWORDdic* Wptr;
int count;
struct aWORDdic* DPos=NULL;
unsigned char* pos = NULL;
int lenofsBuffer=strlen((char *)sBuffer);
while(pointer<lenofsBuffer) {
if(sBuffer[pointer] & 128) // the first byte indicates a Chinese word
{
ascii_flag=0;
if(flag==1) // the flag = 1 signals that the end of a string of ascii code
{ *resultp++ = tag1;
strcpy((char*)resultp,ascpos.c_str());
// advance a fixed 8 bytes regardless of ascpos's actual length
resultp += 8;
*resultp++ = tag2;// then append a delimiter
flag=0;
}
// rebase both bytes to 0xa1 to form the two-level index
x=sBuffer[pointer]-0xa1;
y=sBuffer[pointer+1]-0xa1;
Wptr=WIndexcom[x][y].WList;
count=WIndexcom[x][y].WCount;
DPos=NULL;
if(!Wptr) {
// no dictionary list for this character: emit the two bytes as-is
*resultp++=sBuffer[pointer++];
*resultp++=sBuffer[pointer++];// write the segmented word into the result string
*resultp++=D;// then append a delimiter
continue;
}
}//?end of if, the end of getting chinese words
// if the word isn't a wordlist head in the lexicon,
//skip the following steps and continue;
else // the first byte indicates ASCII
{
flag=1;
ascii_flag=1;
x=sBuffer[pointer];
Wptr=CIndexcom[x].WList;
count=CIndexcom[x].WCount;
DPos=NULL;
if(!Wptr) {
*resultp++=sBuffer[pointer++];
continue;
}
}//end of else, the end of get ascii code
// default match length: one two-byte character
unsigned char LastLen=2;
int i;
int iCompareResult;
while(1) {
if(!Wptr)
{
// end of the list reached
if(ascii_flag)
{
*resultp++=sBuffer[pointer++];
break;
}
for(i=0;i<LastLen;i++)
{
*resultp++=sBuffer[pointer++];}// write the segmented word into the result string
*resultp++ = tag1;
strcpy((char*)resultp,(const char*)pos);
resultp += 8;
*resultp++ = tag2;// then append a delimiter
break;
}
iCompareResult=strncmp((char *)(sBuffer+pointer),(const char *)Wptr->sCIYU,Wptr->Len);
if(iCompareResult<0) {
// the text sorts before this entry: stop and emit the match remembered so far
if(ascii_flag)
{
*resultp++=sBuffer[pointer++];
break;
}
for(i=0;i<LastLen;i++){
*resultp++=sBuffer[pointer++];
}
*resultp++ = tag1;
strcpy((char*)resultp,(const char*)pos);
resultp += 8;
*resultp++= tag2;
break;
}
if(iCompareResult==0) {
// this entry is a prefix of the text: remember it and keep walking the list
LastLen=Wptr->Len;
DPos=Wptr;
pos = Wptr->pos;
Wptr=Wptr->ptrNext;
}
if(iCompareResult>0)
{
DPos=Wptr;
Wptr=Wptr->ptrNext;
}
}//end of while(1)
}//end of while(pointer<...)
*resultp=0;
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
//This method segments the input text and stores the results in a vector; each item of the vector is a
//pair<string,int>.
//
////////////////////////////////////////////////////////////////////////////////////////////////////////
void CSSPS::Segment(unsigned char *sBuffer,unsigned char * strResult , vector< pair<string,int> > &wordvec)
{
int pointer=0;
*strResult='\0';
unsigned char *resultp=strResult;
string word;
string ascpos = "ASCI 0";
int x;
int y;
int ascii_flag;
int flag = 0;
struct aWORDdic* Wptr;
int count;
struct aWORDdic* DPos=NULL;
unsigned char* pos = NULL;
int lenofsBuffer=strlen((char *)sBuffer);
while(pointer<lenofsBuffer) {
if(sBuffer[pointer] & 128) //首字节表明是汉字词
{
ascii_flag=0;
if(flag==1) // the flag = 1 signals that the end of a string of ascii code
{ *resultp++ = E;
strcpy((char*)resultp,ascpos.c_str());
resultp += 8;
*resultp++=D;//再加一个分界符
wordvec.push_back(pair<string,int>(word,pointer-word.size()));
word = "";
flag=0;
}
x=sBuffer[pointer]-0xa1;
y=sBuffer[pointer+1]-0xa1;
Wptr=WIndexcom[x][y].WList;
count=WIndexcom[x][y].WCount;
DPos=NULL;
if(!Wptr) {
word += (char)sBuffer[pointer];
*resultp++=sBuffer[pointer++];
word += (char)sBuffer[pointer];
*resultp++=sBuffer[pointer++];//将切出的词写入结果字串
*resultp++=D;//再加一个分界符
word = "";
continue;
}
}//?end of if, the end of getting chinese words
// if the word isn't a wordlist head in the lexicon,
//skip the following steps and continue;
else //首字节表明是ASCII
{
flag=1;
ascii_flag=1;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -