📄 ssps.cpp

📁 简单处理两个句子中的相似度对比问题,具体用法很简单,在main函数中写入句子即可
💻 CPP
📖 第 1 页 / 共 2 页
字号:
上一页 12
//char* lpszBuffer:待分词文本缓冲
//char* destination buffer
//词典文件已被打开,即OpenDic函数已被调用过 
		  
void SSPS(char* lpszBuffer,char* targetBuf)
{ 
	char *tp=targetBuf;
	 
    	//定义Buffer的指针:
	int Bpointer=0;
	int inChinese=0;//....
	int spacepointer;
	int specialspace=0;

	//定义待分词纯汉字字串和分词结果字串：
    unsigned char strSource[100000],strResult[150000];

    //Pay attention: should use variable length strings instead.Modify SSPS()!!
    		
	unsigned char *sourcep=strSource; //*resultp=strResult;
// syslog(LOG_ALERT,"FAILED IN SSPS");
	//逐字节地处理文本缓冲区：
	while(lpszBuffer[Bpointer])////this paragraph should be rewritten!!
	{
		if(isHanzi(lpszBuffer[Bpointer],lpszBuffer[Bpointer+1]))//如果遇到纯汉字内码
		{    
			inChinese=1;
			specialspace=0; 

			*sourcep++=lpszBuffer[Bpointer++];//拷贝到待分词字串(1字节)
			*sourcep++=lpszBuffer[Bpointer++];//拷贝到待分词字串(2字节)
			
		/*	if(sourcep-strSource>115000)
			{
				printf("Warning: the size of the all Chinese buffer is too small!\n");
				exit(-1);
			}
*/
			continue;
		}
		

		if(wbspace(lpszBuffer[Bpointer]))
		{	
			if(inChinese)
			{	
				if(!specialspace)
				{
					spacepointer=Bpointer;
					specialspace=1;
				}
				Bpointer++;
			}
			else
				*tp++=lpszBuffer[Bpointer++];
			
			continue;
		}
		
		//对纯汉字字串分词
		if(inChinese)
		{
			*sourcep=0;
			Segment(strSource,strResult);

			*tp++=D;
			char *temp=(char *)strResult;
			while(*temp)
				*tp++=*temp++;
				
			sourcep=strSource;
			inChinese=0;
		}
		
		if(specialspace)
		{
			specialspace=0;
			while(spacepointer<Bpointer)
				*tp++=lpszBuffer[spacepointer++];
		}
		
		//process the special characters and ascii code
		//Bpointer++;  ???
		if(isASCII(lpszBuffer[Bpointer]))//遇到了ASCII码
			*tp++=lpszBuffer[Bpointer++];
		else
			{
				*tp++=lpszBuffer[Bpointer++];
				*tp++=lpszBuffer[Bpointer++];
				*tp++=D;
		/*		*tp++=D;
				*tp++=D;
				Bpointer+=2;   */
			}//特殊的汉字如标点，符号等:
	
	}//While(lpszBuffer is at the end)
//syslog(LOG_ALERT,"FAILED IN SSPS");
//for the source string that end with all chinese substring!!!!!
	*sourcep=0;

	Segment(strSource,strResult);

	*tp++=D;
	char *temp=(char *)strResult;
	while(*temp)
		*tp++=*temp++;
	*tp=0;
//syslog(LOG_ALERT,"FAILED IN SSPS end");
	   
}//SSPS子程序结束

//Returns the size in bytes of the file referred to by descriptor `handle`,
//or -1 when fstat() fails (for example on an invalid descriptor).
long filelength1(int handle)
{
	struct stat buf;

	//Without this check a failed fstat() would leave buf uninitialised and
	//the function would return garbage.
	if(fstat(handle,&buf)!=0)
		return -1L;
	return (long)buf.st_size;
}



//Thin wrapper around malloc(): requests i bytes and, when the request fails,
//prints a message to stdout. The result of malloc() is returned unchanged, so
//callers still receive NULL on failure and must check for it.
void *emalloc(__int32 i)
{
	void *block=malloc(i);

	if(block==NULL)
	{
		printf("Ran out of memory (could not allocate enough)!");
	}
	return block;
}

//Opens the dictionary DicFileName (DicOpenDic), then reads the raw word file
//IFileName, splits it into tokens and passes every token to WInsert().
//
//Tokens are separated by tab, CR, LF, space, the digits 0-9 and the ASCII
//letters a-z / A-Z, so those characters never appear inside a token.
//
//Returns 0 on success (including an empty word file) and 1 when the word file
//cannot be opened or the read buffer cannot be allocated.
int OperateDic(const char *IFileName,const char *DicFileName )
{
	const char *seps1=("\t\r\n 0 1 2 3 4 5 6 7 8 9 a b c d e f g h i j k l m n o p q r s t u v w x y z A B C D E F G H I J K L M N O P Q R S T U V W X Y Z ");
	FILE* fp;
	long len;

	DicOpenDic(DicFileName);

	//The file name is used directly; copying it into a fixed 256-byte buffer
	//would overflow on long paths.
	if((fp=fopen(IFileName,"rb"))==NULL)
	{
		printf("Raw word file cannot be opened!\n");
		return 1;
	}

	len=filelength1(fileno(fp));
	if(len<=0)
	{
		fclose(fp);
		return 0;
	}

	//One extra byte for the terminator that strtok() relies on.
	char *Buffer=(char *)emalloc(len+1);
	if(Buffer==NULL)
	{
		fclose(fp);
		return 1;
	}

	//Terminate after the bytes actually read so a short read never exposes
	//uninitialised memory to strtok().
	size_t got=fread(Buffer,1,(size_t)len,fp);
	fclose(fp);
	Buffer[got]=0;

	for(char *token1=strtok(Buffer,seps1);token1!=NULL;token1=strtok(NULL,seps1))
		WInsert((unsigned char*)token1);

	free(Buffer);
	return 0;
}

//Reads the whole of the open file fps into memory and segments it into
//targetBuf with SSPS().
//
//fps       : file opened for reading
//targetBuf : destination buffer; must be large enough for the SSPS() output
//
//If the file size cannot be determined or the read buffer cannot be
//allocated, targetBuf is set to the empty string.
void FiletoBufSeg(FILE *fps,char *targetBuf)
{
	long len=filelength1(fileno(fps));
	char *sourceBuf=(len<0)?NULL:(char *)emalloc(len+1);

	if(sourceBuf==NULL)
	{
		targetBuf[0]=0;
		return;
	}

	//Terminate after the bytes actually read, so a short read never leaves
	//uninitialised bytes in front of the terminator.
	size_t got=fread(sourceBuf,1,(size_t)len,fps);
	sourceBuf[got]=0;

	SSPS(sourceBuf,targetBuf);
	free(sourceBuf);
}//end of FiletoBufSeg

//fpd refers to a successfully opened file for writing
//SourceBuf refers to the buffer to be processed 
void BuftoFileSeg(char *lpszBuffer,FILE* fpd)
{
	
    	//定义Buffer的指针:
	int Bpointer=0;
	int inChinese=0;//....
	int spacepointer;
	int specialspace=0;

	//定义待分词纯汉字字串和分词结果字串：
    unsigned char strSource[120000],strResult[200000];
		//Pay attention: should use variable length strings instead.Modify SSPS()!!
    		
	unsigned char *sourcep=strSource; //*resultp=strResult;
  
	//逐字节地处理文本缓冲区：
	while(lpszBuffer[Bpointer])////this paragraph should be rewritten!!
	{
		if(isHanzi(lpszBuffer[Bpointer],lpszBuffer[Bpointer+1]))//如果遇到纯汉字内码
		{    
			inChinese=1;
			specialspace=0; 

			*sourcep++=lpszBuffer[Bpointer++];//拷贝到待分词字串(1字节)
			*sourcep++=lpszBuffer[Bpointer++];//拷贝到待分词字串(2字节)
			

			continue;
		}
		

		if(wbspace(lpszBuffer[Bpointer]))

		{	
			if(inChinese)
			{	
				if(!specialspace)
				{
					spacepointer=Bpointer;
					specialspace=1;
				}
				Bpointer++;
			}
			else
				fputc(lpszBuffer[Bpointer++],fpd);
			
			continue;
		}
		
		//对纯汉字字串分词
		if(inChinese)
		{
			*sourcep=0;
			Segment(strSource,strResult);

			fputc(D,fpd);

			fputs((const char *)strResult,fpd);
				
			sourcep=strSource;
			inChinese=0;
		}
		
		if(specialspace)
		{
			specialspace=0;
			while(spacepointer<Bpointer)
				fputc(lpszBuffer[spacepointer++],fpd);
		}
		
		//process the special characters and ascii code
		//Bpointer++;  ???
		if(isASCII(lpszBuffer[Bpointer]))//遇到了ASCII码
			fputc(lpszBuffer[Bpointer++],fpd);
		else
			{
				fputc(lpszBuffer[Bpointer++],fpd);
				fputc(lpszBuffer[Bpointer++],fpd);
				fputc(D,fpd);
		/*		
				fputc(D,fpd);
				fputc(D,fpd);
				Bpointer+=2;      */
			}//特殊的汉字如标点，符号等:
	
	}//While(lpszBuffer is at the end)

//for a string end with all Chinese Characters!!!	
	*sourcep=0;
	Segment(strSource,strResult);

	fputc(D,fpd);

	if(strResult) fputs((const char *)strResult,fpd);

}//end of BuftoFileSeg

//Reads the whole of fps into memory and writes its segmented form to fpd
//through BuftoFileSeg().
//
//fps : file successfully opened for reading
//fpd : file successfully opened for writing
//
//Nothing is written when the file size cannot be determined or the read
//buffer cannot be allocated.
void FiletoFileSeg(FILE*fps,FILE*fpd)
{
	long len=filelength1(fileno(fps));
	char *lpszBuffer=(len<0)?NULL:(char *)emalloc(len+1);

	if(lpszBuffer==NULL)
		return;

	//Terminate after the bytes actually read, so a short read never leaves
	//uninitialised bytes in front of the terminator.
	size_t got=fread(lpszBuffer,1,(size_t)len,fps);
	lpszBuffer[got]=0;

	BuftoFileSeg(lpszBuffer,fpd);

	free(lpszBuffer);
}//end of FiletoFileSeg

//Segments the file named sourcefile into the file named desfile.
//
//sourcefile : name of the file to read
//desfile    : name of the file to create or overwrite
//The dictionary must be opened before calling this function.
//
//Failure handling (unchanged from the original):
// - the source cannot be opened: a message is printed and the process exits
//   with status -1;
// - the destination cannot be created: a message is printed and the function
//   returns.
//The source is opened and checked first, so a missing source file no longer
//creates or truncates the destination.
void FNametoFNameSeg(char *sourcefile,char *desfile)
{
	FILE *fps,*fpd;

	fps=fopen(sourcefile,"rb");
	if(!fps)
	{
		printf("Error on opening file : %s\n",sourcefile);
		exit(-1);
	}

	fpd=fopen(desfile,"wb");
	if(!fpd)
	{
		printf("Error on creating target file : %s\n",desfile);
		fclose(fps);
		return;
	}

	FiletoFileSeg(fps,fpd);

	fclose(fps);
	fclose(fpd);
}

//Removes the word cWord from the dictionary index.
//
//The first two bytes of cWord select the bucket WIndexcom[x][y], with
//x = cWord[0]-0xb0 and y = cWord[1]-0xa1. The bucket's list is searched with
//strcmp(); the search stops at the first entry that compares greater than
//cWord, which assumes the list is kept in ascending strcmp() order.
//On a match the entry is unlinked, the bucket's WCount is decremented and the
//entry and its sCIYU string are freed.
//
//Returns -1 when cWord is NULL or its first two bytes would give a negative
//bucket index (this includes the empty string); otherwise 0, whether or not
//the word was found.
//NOTE(review): the upper bounds of WIndexcom are not visible here, so only the
//lower bound of each index is checked.
int WDel(unsigned char *cWord)
{
	if(!cWord) return -1;

	//x is validated before cWord[1] is read, so an empty string never causes
	//a read past its terminator.
	int x=cWord[0]-0xb0;
	if(x<0) return -1;

	int y=cWord[1]-0xa1;
	if(y<0) return -1;

	struct aWORDdic* prev=NULL;
	struct aWORDdic* cur=WIndexcom[x][y].WList;

	for(;cur;prev=cur,cur=cur->ptrNext)
	{
		int flag=strcmp((char *)cWord,(char*)cur->sCIYU);

		//Past the position where the word would be: it is not in the list.
		if(flag<0)
			break;

		//Still before it: keep walking.
		if(flag>0)
			continue;

		//Match: unlink from the predecessor, or from the bucket head.
		if(prev) prev->ptrNext=cur->ptrNext;
		else WIndexcom[x][y].WList=cur->ptrNext;

		WIndexcom[x][y].WCount--;

		free(cur->sCIYU);
		free(cur);

#ifdef Debug
		printf("%s\n",cWord);
#endif

		return 0;
	}

#ifdef Debug
	printf("%s not found.\n",cWord);
#endif

	return 0;
}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -