⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 new_ssps.cpp

📁 贝叶斯公式
💻 CPP
📖 第 1 页 / 共 4 页
字号:
				   x=sBuffer[pointer];
				  Wptr=CIndexcom[x].WList;
				  count=CIndexcom[x].WCount;
				  DPos=NULL;
				  if(!Wptr) {
								word += (char)sBuffer[pointer];
								*resultp++=sBuffer[pointer++];								
								continue;
							  }
			  }//end of else, the end of get ascii code

               unsigned char LastLen=2;
               int i;
               int iCompareResult;

               while(1) {
                        	if(!Wptr)
							{
								if(ascii_flag)
                                 {  word += (char)sBuffer[pointer];
                                    *resultp++=sBuffer[pointer++];
								     break;
								}
								for(i=0;i<LastLen;i++)
								{  
									word += (char)sBuffer[pointer];
									*resultp++=sBuffer[pointer++];}//将切出的词写入结果字串

							   *resultp++=E;
							    strcpy((char*)resultp,(const char*)pos);
							    resultp += 8;
							   
								*resultp++=D;//再加一个分界符
								wordvec.push_back(pair<string,int>(word,pointer-word.size()));
								word = "";
								break;
							}

	                    iCompareResult=strncmp((char *)(sBuffer+pointer),(const char *)Wptr->sCIYU,Wptr->Len);
                        if(iCompareResult<0) {
							if(ascii_flag)
							{	word += (char)sBuffer[pointer];
                                 *resultp++=sBuffer[pointer++];
								 break;
							}
							for(i=0;i<LastLen;i++){
								  word += (char)sBuffer[pointer];
                                  *resultp++=sBuffer[pointer++];
							}
							  *resultp++=E;
							  strcpy((char*)resultp,(const char*)pos);
							  resultp += 8;
                              *resultp++=D;
							  wordvec.push_back(pair<string,int>(word,pointer-word.size()));
							  word = "";
                              break;
                         }
                         if(iCompareResult==0) {
                              LastLen=Wptr->Len;
                              DPos=Wptr;
							  pos = Wptr->pos;
			                  Wptr=Wptr->ptrNext;
                         }
                         if(iCompareResult>0)
						 {
							 DPos=Wptr;
			                 Wptr=Wptr->ptrNext;
						 }
                    }//end of while(1)
       }//end of while(pointer<...)
       *resultp=0;
}  

/////////////////////////////////////////////////////////////////////////////////////////////////////////
//-----------------------------------------------------------//
//   Purpose: SSPS segmentation entry point.
//   Parameters:
//     (in)  lpszBuffer : NUL-terminated text to be segmented
//                        (NULL is treated as an empty string)
//     (out) targetBuf  : destination buffer; receives the delimiter D
//                        followed by the NUL-terminated output of Segment()
//   Precondition (from the original note): the dictionary has already been
//   opened, i.e. OpenDic has been called.
//   NOTE(review): targetBuf's capacity is owned by the caller and cannot be
//   checked here; it must hold the segmented text plus two bytes.
//----------------------------------------------------------//
void CSSPS::SSPS(char* lpszBuffer,char* targetBuf)
{
	if(!targetBuf) return;

	const char *text = lpszBuffer ? lpszBuffer : "";
	const size_t len = strlen(text);

	// The original used fixed stack arrays (100000 / 150000 bytes) and copied
	// the input without any bound, so long inputs smashed the stack. Size the
	// work buffers from the input instead. The result capacity is never smaller
	// than the original 150000 bytes.
	// NOTE(review): the 6x factor comes from the segmenter code visible in this
	// file (a tagged word is >= 2 bytes and gains 10 bytes of tag/delimiters);
	// confirm against the full Segment() implementation.
	size_t resultCap = 6*len + 16;
	if(resultCap < 150000) resultCap = 150000;

	unsigned char *strSource = (unsigned char *)malloc(len + 1);
	unsigned char *strResult = (unsigned char *)malloc(resultCap);
	if(!strSource || !strResult)
	{
		printf("Ran out of memory (could not allocate enough)!");
		free(strSource);
		free(strResult);
		targetBuf[0] = 0;	// hand back an empty string rather than garbage
		return;
	}

	// Private, unsigned copy of the input (terminator included).
	memcpy(strSource, text, len + 1);
	strResult[0] = 0;

	char tag1 = 15;
	char tag2 = 127;
	Segment(strSource,strResult,tag1,tag2);

	// Output = leading delimiter + segmented text.
	char *tp = targetBuf;
	*tp++ = D;
	strcpy(tp, (const char *)strResult);

	free(strSource);
	free(strResult);
}//end of SSPS

// Returns the size in bytes of the file open on descriptor `handle`,
// or -1 when fstat() fails (e.g. invalid descriptor). The original ignored
// the fstat() result and returned an uninitialized st_size on failure.
long CSSPS::filelength1(int handle)
{
	struct stat buf;
	if(fstat(handle,&buf)!=0)
		return -1L;
	return (long)buf.st_size;
}

//long CSSPS::get_runtime() 
//{
//  clock_t start;
//  start = clock();
//  return((long)((double)start*100.0/(double)CLOCKS_PER_SEC));
//}

// malloc() wrapper: allocates `i` bytes and prints a diagnostic to stdout
// when the allocation fails. The (possibly NULL) pointer is returned either
// way, so callers still have to check it.
void * CSSPS::emalloc(__int32 i)
{
	void *block = malloc(i);

	if (block == NULL)
	{
		printf("Ran out of memory (could not allocate enough)!");
	}

	return block;
}

// Opens the dictionary DicFileName, then reads the word-list file IFileName
// and calls WDel() for every token separated by TAB / CR / LF.
// Returns 0 on success (including an empty or unsized word list) and 1 when
// the word list cannot be opened, is too large, or cannot be buffered.
// NOTE(review): the result of DicOpenDic() is ignored and the dictionary is
// not written back here, exactly as in the original; confirm this is intended.
int CSSPS::OperateDic(const char *IFileName,const char *DicFileName )
{
	DicOpenDic(DicFileName);

	// Open the path directly: the original strcpy'd it into a 256-byte
	// buffer first, which overflowed on long paths.
	FILE* fp = IFileName ? fopen(IFileName,"rb") : NULL;
	if(fp==NULL)
	{
		printf("Raw word file cannot be opened!\n");
		return 1;
	}

	long len=filelength1(fileno(fp));
	if(len<=0)
	{
		fclose(fp);
		return 0;
	}
	if(len>=0x7fffffffL)	// len+1 must fit emalloc's __int32 argument
	{
		fclose(fp);
		return 1;
	}

	// One extra byte for the terminator: strtok() needs a NUL-terminated
	// string, and the original buffer had none.
	unsigned char * Buffer=(unsigned char *)emalloc(len+1);
	if(!Buffer)
	{
		fclose(fp);
		return 1;
	}
	size_t got=fread(Buffer,1,(size_t)len,fp);
	fclose(fp);
	Buffer[got]='\0';	// terminate at what was actually read

	const char *seps1="\t\r\n";
	for(char *token1=strtok((char *)Buffer,seps1);
	    token1!=NULL;
	    token1=strtok(NULL,seps1))
	{
		WDel((unsigned char*)token1);
	}

	free(Buffer);
	return 0;
}

// Look a word up in a dictionary list.
// Binary search by position over the linked list starting at Wptr, assumed to
// hold `count` nodes in ascending order. Each probe compares `buffer` with the
// node's sCIYU over the node's own Len bytes (strncmp), so a node matches when
// it is a prefix of `buffer`.
// Returns the zero-based index of the matching node, or -1 if none matches.
int CSSPS::findword(int count,char *buffer,struct aWORDdic *Wptr)
{
   if(!buffer) return -1;

   int low=0;
   int high=count-1;
   // Node at index `low`. Walking from here instead of from the list head on
   // every probe visits the same `mid` positions with far fewer pointer hops.
   struct aWORDdic *lowNode=Wptr;

   while(low<=high)
   {
      int mid=low+(high-low)/2;

      struct aWORDdic *ptr=lowNode;
      for(int i=low;i<mid && ptr;i++)
         ptr=ptr->ptrNext;

      if(!ptr)
      {
         // The list is shorter than `count` claims: treat the missing node as
         // "past the end" and keep searching the lower half instead of
         // dereferencing NULL as the original did.
         high=mid-1;
         continue;
      }

      int n=strncmp(buffer,(const char *)ptr->sCIYU,ptr->Len);
      if(n<0)
         high=mid-1;
      else if(n>0)
      {
         low=mid+1;
         lowNode=ptr->ptrNext;
      }
      else
         return mid;
   }
   return -1;
 }
// Delete a word from the in-memory dictionary.
// A word whose first byte has the high bit set is looked up in
// WIndexcom[b0-0xA1][b1-0xA1]; any other word is looked up in CIndexcom[b0].
// The bucket list is scanned in order and the scan stops at the first entry
// that compares greater than cWord, so the list is assumed to be sorted
// ascending by strcmp. On a match the node is unlinked, the bucket's WCount
// is decremented and the node (sCIYU, pos, node) is freed.
// Returns -1 for a NULL or malformed word; otherwise 0, whether or not the
// word was found (unchanged from the original).
int CSSPS::WDel(unsigned char *cWord)
{
	if(!cWord) return -1;

	const bool isChinese=(cWord[0] & 128)!=0;	// high bit set => double-byte word
	int x=0,y=0;
	struct aWORDdic* Wptr;

	if(isChinese)
	{
		// Both bytes must be >= 0xA1, otherwise the index below goes negative
		// (this also rejects a lead byte followed directly by the terminator).
		// NOTE(review): the upper bounds of WIndexcom are not visible here;
		// confirm 0xFF-0xA1 is a valid index in both dimensions.
		if(cWord[0]<0xa1 || cWord[1]<0xa1) return -1;
		x=cWord[0]-0xa1;
		y=cWord[1]-0xa1;
		Wptr=WIndexcom[x][y].WList;
	}
	else
	{
		x=cWord[0];
		Wptr=CIndexcom[x].WList;
	}

	struct aWORDdic* DPos=NULL;	// node preceding Wptr, NULL while Wptr is the head

	while(Wptr)
	{
		const int flag=strcmp((char *)cWord,(char*)Wptr->sCIYU);

		if(flag<0) break;	// passed the slot where the word would be

		if(flag==0)
		{
			struct aWORDdic* DWord=Wptr;

			// Unlink: patch either the predecessor or the bucket head.
			if(DPos) DPos->ptrNext=DWord->ptrNext;
			else if(isChinese) WIndexcom[x][y].WList=DWord->ptrNext;
			else CIndexcom[x].WList=DWord->ptrNext;

			if(isChinese) WIndexcom[x][y].WCount--;
			else CIndexcom[x].WCount--;

			free(DWord->sCIYU);
			free(DWord->pos);
			free(DWord);

#ifdef Debug
				printf("%s\n",cWord);
#endif

			return 0;
		}

		DPos=Wptr;
		Wptr=Wptr->ptrNext;
	}

#ifdef Debug
		printf("%s not found.\n",cWord);
#endif

	return 0;
}

//-----------------------------------------------------------//
//   Purpose: maintain the dictionary by applying a word-list file to it.
//   Parameters:
//     flag        : operation selector. As implemented, 0 inserts every word
//                   (WInsert) and 1 deletes every word (WDel).
//                   NOTE(review): the original header said "0del 1add", the
//                   opposite of the code. The code's behavior is kept here;
//                   confirm which one was intended.
//     IFileName   : word-list file name
//     DicFileName : dictionary file name
//     tag1, tag2  : forwarded unchanged to GetaWord
//   Calls: DicOpenDic, DicCloseDic, GetaWord, WInsert, WDel
//   Returns:
//		0 : is ok!
//		1 : can not open the input file (also: it could not be sized or buffered)
//		2 : can not open the .lex file
//		3 : can not open the .ptr file
//		4 : irregular file format
//		5 : error on the file ssps.ptr length
//	       -1 : flag is neither 0 nor 1, or DicFileName is NULL
//----------------------------------------------------------//
int CSSPS::OperateDic( int flag,const char *IFileName,const char *DicFileName ,char tag1,char tag2)
{
	// Reject an unknown flag up front; the original left the mode variable
	// uninitialized in that case.
	bool insertWords;
	switch(flag)
	{
		case 0: insertWords=true; break;
		case 1: insertWords=false; break;
		default: return -1;
	}
	if(!DicFileName) return -1;

	// Open the path directly: the original strcpy'd it into a 256-byte
	// buffer first, which overflowed on long paths.
	FILE* fp = IFileName ? fopen(IFileName,"rb") : NULL;
	if(fp==NULL)
	{
		printf("Raw word file cannot be opened!\n");
		return 1;
	}

	long len=filelength1(fileno(fp));
	unsigned char * Buffer=NULL;
	if(len>=0 && len<0x7fffffffL)	// len+1 must fit emalloc's __int32 argument
		Buffer=(unsigned char *)emalloc(len+1);
	if(!Buffer)
	{
		fclose(fp);
		return 1;
	}
	size_t got=fread(Buffer,1,(size_t)len,fp);
	Buffer[got]='\0';	// terminate at what was actually read
	fclose(fp);

	// Buffer is released on every exit path below (the original leaked it
	// on the early returns).
	switch(DicOpenDic(DicFileName)) {
	case 3: free(Buffer); return 4;
	case 4: free(Buffer); return 5;
	}

	unsigned int currentpos =0;
	unsigned char newword[MAXCHWORDLEN];
	unsigned char postag[10];

	if(insertWords){
		while(GetaWord(Buffer,currentpos,newword,postag,tag1,tag2))
			WInsert(newword,postag);
	}
	else{
		while(GetaWord(Buffer,currentpos,newword,postag,tag1,tag2))
			WDel(newword);
	}

	free(Buffer);

	// Write the modified dictionary back and translate its error codes.
	switch(DicCloseDic(DicFileName)) {
	case 1: return 2;
	case 2: return 3;
	}

	return 0;
}

//-----------------------------------------------------------//
//   功能:	用于修改词典后保存并关闭词典
//   参数: 
//  (入口)DicFileName 词典文件名
//    返回:0 正常 其他错误编码
//----------------------------------------------------------//
int CSSPS::DicCloseDic(const char *DicFileName)
{
		__int32 NWords=0;
		char fn[300];
        //string temp = "test1";
		//打开词典正文文件:
		strcpy(fn,DicFileName);
		changesuffix(fn,"lex");
		if(!(fword=fopen(fn,"wb")))
		{	
			printf("文件 %s 无法create!",fn);
			return 1;
		}

		FILE *fptr;
		if(!(fptr=fopen("word.txt","w")))
		{	
			printf("词典指针文件“%s”无法create!",fn);
			return 2;
		}	
		//打开词典指针文件:
	//	strcpy(strrchr(DicFileName,'.'),".ptr");
		strcpy(fn,DicFileName);
		changesuffix(fn,"ptr");
		if(!(fwptr=fopen(fn,"wb")))
		{	
			printf("词典指针文件“%s”无法create!",fn);
			return 2;
		}	

		printf("Writing dictionary...\n");

		struct aWORDdic* Tptr,*freep;
		long offset;
                
                for(int z=0;z<128;z++)

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -