📄 new_ssps.cpp

📁 贝叶斯公式
💻 CPP
📖 第 1 页 / 共 4 页
字号:
		if(flag<0)
		{
			IWord=(struct aWORDdic*)emalloc(sizeof(aWORDdic));
			IWord->Len=strlen((char*)cWord);
			IWord->sCIYU=(unsigned char *)emalloc(IWord->Len+1);//note here
			strcpy((char*)IWord->sCIYU,(char*)cWord);
			IWord->pos=(unsigned char*)emalloc(9);
			strcpy((char*)IWord->pos,(char*)cPos);

			IWord->ptrNext=Wptr;
			if(IPos) IPos->ptrNext=IWord;
			else WIndexcom[x][y].WList=IWord;//note for IPos
			WIndexcom[x][y].WCount++;
			
	//		Num_Ins++;

//#ifdef Debug
				printf("%s\n",cWord);
//#endif
				
			break;
		 }
	   }//end of while(1);
	}
	else //首字节表明是ASCII
	{
	   x=cWord[0];

	   Wptr=CIndexcom[x].WList;

	   if(!Wptr)
	  {	
		IWord=(struct aWORDdic*)emalloc(sizeof(aWORDdic));
		IWord->ptrNext=NULL;
		IWord->Len=strlen((char*)cWord);
		IWord->sCIYU=(unsigned char *)emalloc(IWord->Len+1);//note here
		strcpy((char*)IWord->sCIYU,(char*)cWord);
		IWord->pos=(unsigned char*)emalloc(9);
		strcpy((char*)IWord->pos,(char*)cPos);
		
		CIndexcom[x].WList=IWord;
		CIndexcom[x].WCount++;

	//	Num_Ins++;

//#ifdef Debug
		printf("%s\n",cWord);
//#endif
		return 0;
	  }

	  while(1)
	 {
		flag=strcmp((char*)cWord,(char*)Wptr->sCIYU);
		
		if(!flag)
		{
	//		AlreadyExisted++;
        
//#ifdef Debug
			printf("%s already exists.\n",cWord);
//#endif			

			break;
		}

		if(flag>0) 
		{
			IPos=Wptr;
			Wptr=Wptr->ptrNext;
			
			if(!Wptr)
			{
				IWord=(struct aWORDdic*)emalloc(sizeof(aWORDdic));
				IWord->ptrNext=NULL;
				IWord->Len=strlen((char*)cWord);
				IWord->sCIYU=(unsigned char *)emalloc(IWord->Len+1);//note here
				strcpy((char*)IWord->sCIYU,(char*)cWord);
				IWord->pos=(unsigned char*)emalloc(9);
				strcpy((char*)IWord->pos,(char*)cPos);
		
				IPos->ptrNext=IWord;
				CIndexcom[x].WCount++;

	//			Num_Ins++;

//#ifdef Debug
				printf("%s\n",cWord);
//#endif
				
				return 0;
			}
		}

		if(flag<0)
		{
			IWord=(struct aWORDdic*)emalloc(sizeof(aWORDdic));
			IWord->Len=strlen((char*)cWord);
			IWord->sCIYU=(unsigned char *)emalloc(IWord->Len+1);//note here
			strcpy((char*)IWord->sCIYU,(char*)cWord);
			IWord->pos=(unsigned char*)emalloc(9);
			strcpy((char*)IWord->pos,(char*)cPos);

			IWord->ptrNext=Wptr;
			if(IPos) IPos->ptrNext=IWord;
			else CIndexcom[x].WList=IWord;//note for IPos
			CIndexcom[x].WCount++;
			
	//		Num_Ins++;

//#ifdef Debug
				printf("%s\n",cWord);
//#endif
				
			break;
		 }
	   }//end of while(1);
	}
	return 0;
}

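//-----------------------------------------------------------//
//   Function: GetaWord
//   Parameters: see the active definition further below
//   Returns:	NULL     : end of input (or a malformed record)
//			non-NULL : one record was extracted
//   NOTE: the commented-out definition that follows is not compiled;
//         the active GetaWord is defined right after it.
//----------------------------------------------------------//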
//unsigned char* CSSPS::GetaWord(unsigned char*BufIn,unsigned int &currentpos,unsigned char*cWord,unsigned char* cPos,char tag1, char tag2) const
//{
//	unsigned char *Bufp ;
//	if(BufIn == NULL || *(Bufp= BufIn + currentpos) ==0 || currentpos>= strlen((const char*)BufIn)) 
//		return NULL;
//	
//	*cWord=0;
//	unsigned char *Wordp=cWord;
//	*cPos = 0;
//	unsigned char *Posp = cPos;
//
//    int inChinese=0;
//	int maxlen = MAXCHWORDLEN;
//
//	while(*(Bufp+1) && --maxlen)
//	{
//		currentpos ++;
//		if(*Bufp != tag1)
//		{
//		    if(!inChinese)
//			{
//				Wordp=cWord;
//				inChinese=1;
//			}
//		 
//			*Wordp++=*Bufp++;
//		}
//		else
//		{
//			Bufp++;  // this is for skip an ascii ch!! necessary!!
//			if(inChinese)
//			{
//				//xx++;
//				/*if(isalpha(*cWord))
//				{
//					inChinese = 0;
//					continue;
//				}*/
//				*Wordp=0;
//				return cWord;
//			}
//		}
//	
//	}//end of while
//	if(inChinese)
//	{
//		*Wordp=0;
//		inChinese=0;
//	printf("get a word %s 1end \n",cWord);
//	currentpos ++;
//		return cWord;
//	}
//	else {
//		Bufp = NULL;
//	printf("get a word end\n");
//		return NULL;
//	}
//}
// Extracts one record of the form  <word><tag1><8-byte POS><tag2>  from BufIn,
// starting at byte offset `currentpos`.
//
//   BufIn       NUL-terminated input buffer.
//   currentpos  in/out: offset of the record to read; on success it is advanced
//               past the record's terminating tag2. Left unchanged on failure.
//   cWord       out: receives the word, NUL-terminated (at most MAXCHWORDLEN
//               bytes plus the terminator -- assumes the caller's buffer is at
//               least that large; TODO confirm against callers).
//   cPos        out: receives the 8-byte part-of-speech field, NUL-terminated
//               (needs 9 bytes).
//   tag1        byte separating the word from its POS field.
//   tag2        byte terminating the POS field / record.
//
// Returns BufIn on success; NULL when there is nothing left to read, when an
// argument is NULL, or when the record is malformed (a diagnostic is printed
// to cout in the malformed case).
unsigned char* CSSPS::GetaWord(unsigned char*BufIn,unsigned int &currentpos,unsigned char*cWord,unsigned char* cPos,char tag1,char tag2) const{
	if(BufIn == NULL || cWord == NULL || cPos == NULL)
		return NULL;
	// Validate the offset BEFORE dereferencing: a previous call may have
	// advanced currentpos one past the terminating NUL.
	if(currentpos >= strlen((const char*)BufIn))
		return NULL;

	const unsigned char *record = BufIn + currentpos;
	*cWord = 0;
	*cPos = 0;

	const unsigned int posFieldLen = 8;          // fixed width of the POS field
	const unsigned int wordLimit = MAXCHWORDLEN; // longest word we will scan for

	// off1: offset of tag1, i.e. end of the word / start of the POS field.
	// Plain unsigned ints are used so long inputs cannot wrap the offset.
	unsigned int off1 = 0;
	while(record[off1] && off1 < wordLimit && record[off1] != tag1)
		off1++;

	// off2: offset of the first tag2, i.e. end of the record. A well-formed
	// record has it exactly posFieldLen+1 bytes after tag1, so there is no
	// point scanning further than one byte beyond that.
	unsigned int off2 = 0;
	const unsigned int recordLimit = off1 + posFieldLen + 2;
	while(record[off2] && off2 < recordLimit && record[off2] != tag2)
		off2++;

	if(off2 < off1){
		cout << "format error: the word has no part of speech tag!"<<endl;
		return NULL;
	}
	if((off2-off1) != posFieldLen + 1){
		cout << "format error: the part of speech tag is not correct!"<<endl;
		return NULL;
	}
	// The word scan may have stopped on its length limit rather than on tag1;
	// such a record must not be accepted just because tag2 happens to line up.
	if(record[off1] != tag1){
		cout << "format error: the word has no part of speech tag!"<<endl;
		return NULL;
	}

	memcpy(cWord, record, off1);
	cWord[off1] = '\0';

	// Skip tag1, then take the fixed-width POS field.
	memcpy(cPos, record + off1 + 1, posFieldLen);
	cPos[posFieldLen] = '\0';

	// Step over the whole record including its terminating tag2.
	currentpos += (off2+1);

	return BufIn;

}

// Reports whether ch is a 7-bit ASCII byte: returns 1 when the high bit
// (0x80) is clear, 0 when it is set.
int  CSSPS::isASCII(unsigned char ch) const 
{
   const unsigned char highBit = ch & 0x80;
   return (highBit == 0) ? 1 : 0;
}

// Reports whether ch is a white-space character.
// Forwards to the C library's isspace(): the result is non-zero (not
// necessarily 1) for white space and 0 otherwise.
int CSSPS::wbspace(unsigned char ch) const
{
	const int classification = isspace(ch);
	return classification;
}

// Reports whether the byte pair (ch1, ch2) is treated as a Chinese character.
// Returns 1 when ch1 lies in 0xA1..0xF7 and ch2 lies in 0xA1..0xFE, except
// for the pair 0xCC 0xCC which is explicitly rejected; returns 0 otherwise.
int CSSPS::isHanzi(unsigned char ch1,unsigned char ch2)  const
{
	const bool leadOk  = (ch1 >= 0xa1) && (ch1 <= 0xf7);
	const bool trailOk = (ch2 >= 0xa1) && (ch2 <= 0xfe);
	if (!leadOk || !trailOk)
		return 0;

	// The 0xCC 0xCC pair is never accepted.
	if (ch1 == 0xcc && ch2 == 0xcc)
		return 0;

	return 1;
}

//-----------------------------------------------------------//
//   Function: fast word segmentation with part-of-speech tags
//   Parameters:
//     (in)  sBuffer    NUL-terminated source text
//     (out) strResult  NUL-terminated segmented text; the caller must
//                      provide a buffer large enough for the input plus
//                      the inserted tags (not checked here)
//     (in)  tag1       byte written between a word and its POS field
//     (in)  tag2       byte written after each token
//
//   Output layout:
//     dictionary word      : <word> tag1 <8-byte POS> tag2
//     two-byte char that
//     matches no entry     : <2 bytes> tag2
//     run of ASCII bytes   : <bytes> tag1 "ASCI   0" tag2
//                            (the tag is written when a non-ASCII byte
//                            follows the run)
//----------------------------------------------------------//
// This version keeps all working state in locals.
void CSSPS::Segment(unsigned char *sBuffer,unsigned char *strResult,char tag1,char tag2)
{
	const char asciiPos[] = "ASCI   0";   // POS field used for ASCII runs
	const int posFieldLen = 8;            // every POS field is 8 bytes wide

	unsigned char *resultp = strResult;
	*strResult = '\0';

	int pointer = 0;
	int flag = 0;   // 1 while an ASCII run is open and still needs its POS tag
	const int lenofsBuffer = strlen((char *)sBuffer);

	while(pointer < lenofsBuffer) {
		if(!(sBuffer[pointer] & 128)) {
			// ASCII byte: copied through unchanged. The per-byte CIndexcom
			// list used to be walked here, but its result never influenced
			// the output, so the walk has been dropped.
			flag = 1;
			*resultp++ = sBuffer[pointer++];
			continue;
		}

		// High bit set: lead byte of a two-byte character.
		if(flag == 1) {
			// Close the ASCII run that just ended.
			*resultp++ = tag1;
			strcpy((char*)resultp, asciiPos);
			resultp += posFieldLen;
			*resultp++ = tag2;
			flag = 0;
		}

		if(pointer + 1 >= lenofsBuffer) {
			// Lone lead byte at the very end of the input: emit it on its
			// own instead of reading (and copying) the terminating NUL.
			*resultp++ = sBuffer[pointer++];
			*resultp++ = tag2;
			continue;
		}

		// Index the first-character table. Bytes below 0xA1 would give a
		// negative index, so they are treated as "no dictionary entries".
		// NOTE(review): the upper bounds of WIndexcom are not visible from
		// here -- confirm that 0xFF lead/trail bytes stay in range.
		const int x = sBuffer[pointer] - 0xa1;
		const int y = sBuffer[pointer+1] - 0xa1;
		struct aWORDdic* Wptr = NULL;
		if(x >= 0 && y >= 0)
			Wptr = WIndexcom[x][y].WList;

		// Walk the candidate list (presumably kept in strcmp order by the
		// insert routine) and remember the last entry that is a prefix of
		// the remaining input; stop at the first entry that sorts after it.
		int LastLen = 2;                   // default: a single two-byte char
		const unsigned char* pos = NULL;   // POS of the match, NULL = none
		while(Wptr) {
			const int iCompareResult =
				strncmp((char *)(sBuffer+pointer),(const char *)Wptr->sCIYU,Wptr->Len);
			if(iCompareResult < 0)
				break;
			if(iCompareResult == 0) {
				LastLen = Wptr->Len;
				pos = Wptr->pos;
			}
			Wptr = Wptr->ptrNext;
		}

		// Write the token that was cut out.
		for(int i = 0; i < LastLen; i++)
			*resultp++ = sBuffer[pointer++];

		if(pos) {
			// Only a real dictionary hit carries a POS field; a stale or
			// NULL POS must never be written.
			*resultp++ = tag1;
			strcpy((char*)resultp,(const char*)pos);
			resultp += posFieldLen;
		}
		*resultp++ = tag2;   // token separator
	}

	// NOTE(review): an ASCII run that reaches the end of the input is not
	// followed by tag1/"ASCI   0"/tag2 (unchanged from the original) --
	// confirm whether consumers expect the tag there.
	*resultp = 0;
}  

/////////////////////////////////////////////////////////////////////////////////////////////////////////
//this method segment the input text and the results are stored in vector;each item of the vector is a 
//pair<string,int>
//
////////////////////////////////////////////////////////////////////////////////////////////////////////

void CSSPS::Segment(unsigned char *sBuffer,unsigned char * strResult , vector< pair<string,int> > &wordvec)
{
       int pointer=0;
       *strResult='\0';
       unsigned char *resultp=strResult;
	   string word;
	   string ascpos = "ASCI   0";
       int x;
       int y;
	   int ascii_flag;
	   int flag = 0;
       struct aWORDdic* Wptr;
       int count;
       struct aWORDdic* DPos=NULL;
	   unsigned char* pos = NULL;
       int lenofsBuffer=strlen((char *)sBuffer);
       while(pointer<lenofsBuffer) {
       	     if(sBuffer[pointer] & 128) //首字节表明是汉字词
       	     {
                 ascii_flag=0;
				 if(flag==1)      // the flag = 1 signals that the end of a string of ascii code
				 {   *resultp++ = E;
					 strcpy((char*)resultp,ascpos.c_str());
					 resultp += 8;
					 *resultp++=D;//再加一个分界符
					 wordvec.push_back(pair<string,int>(word,pointer-word.size()));
					 word = "";
					 flag=0;
				 }
				 x=sBuffer[pointer]-0xa1;
	          y=sBuffer[pointer+1]-0xa1;
	          Wptr=WIndexcom[x][y].WList;
	          count=WIndexcom[x][y].WCount;
	          DPos=NULL;
		     if(!Wptr) {
						word += (char)sBuffer[pointer];
                        *resultp++=sBuffer[pointer++];
						word += (char)sBuffer[pointer];
                        *resultp++=sBuffer[pointer++];//将切出的词写入结果字串
						
                        *resultp++=D;//再加一个分界符
						word = "";
                        continue;
                      }
               }//?end of if, the end of getting chinese words

			 // if the word isn't a wordlist head in the lexicon, 
			 //skip the following steps and continue;

               else //首字节表明是ASCII
       	      {
                  flag=1;
				  ascii_flag=1;
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -