⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 ssps.cpp

📁 简单处理两个句子中的相似度对比问题,具体用法很简单,在main函数中写入句子即可
💻 CPP
📖 第 1 页 / 共 2 页
字号:

//SSPS分词子程序
//用单扫描算法快速分词
#define __int16 short int
#define __int32 int
#include "function.h"
#include "ssps.h"

char D=' '; // Delimiter used as the segmentation mark: Segment() writes it after every token it cuts out

FILE* fLex; // NOTE(review): not referenced anywhere in the visible part of this file
// One dictionary word, stored as a node of a singly linked list.
// All words in one list share the same first character (see WIndexcom).
struct aWORDdic{ 
	unsigned char * sCIYU;// the word itself, NUL-terminated, heap allocated
	unsigned char Len;// length of the word in bytes (not counting the NUL)
	aWORDdic* ptrNext;// next word in the same list, NULL at the end
};

// One slot of the first-character index: the words that start with a given character.
struct WIndexEntry{
	__int16 WCount;// number of words recorded for this first character
	struct aWORDdic *WList;// head of the word list, NULL when there are none
}WIndexcom[72][94];// global pointer index table: 72 zones of 94 characters each,
                   // addressed as [first byte - 0xb0][second byte - 0xa1]

// Stream handles used while loading the dictionary: the index table is read from the
// pointer file (Ssps.ptr) and the words from the dictionary text file (Ssps.lex).

FILE *fwptr;	   // dictionary pointer (index) file
FILE *fword;   // dictionary text (word) file
/*
**	Load the dictionary into the global index table WIndexcom.
**
**	DicFileName names the dictionary; its suffix is switched to "lex" (word
**	file) and "ptr" (pointer file) with changesuffix().  The pointer file is
**	a sequence of 8-byte records: 2 bytes first character, 2 bytes word
**	count, 4 bytes offset (ignored).  For every record, that many
**	length-prefixed words are read sequentially from the word file and
**	linked, in file order, into WIndexcom[first byte-0xb0][second byte-0xa1].
**	The count is read in the host byte order, exactly as stored in the file.
**
**	Both files are closed (and fwptr/fword reset to NULL) on every return
**	path.  A list already stored in a slot is overwritten, not freed, so
**	call DicCloseDic() before loading a second time.
**
**	RETURN VALUE:
**		0 : OK!
**		1 : CAN NOT OPEN THE *.LEX FILE (or the file name is missing/too long)
**		2 : CAN NOT OPEN THE *.PTR FILE
**		3 : Error of the format of the word file (it ended too early).
**		4 : Error on the size or content of file Ssps.ptr!
**	After 3 or 4 the entries loaded so far stay in the table as
**	well-formed lists.
*/
int DicOpenDic(const char *DicFileName)
{
	char fn[300];
	unsigned long dwFileLength;	// bytes of the pointer file not consumed yet
	int rc=0;

	// Keep a few spare bytes in fn: changesuffix() presumably may append a
	// suffix when the name has none -- TODO confirm against function.h.
	if(!DicFileName || strlen(DicFileName)+5>sizeof(fn))
	{
		printf("文件Ssps.lex无法打开!");
		return 1;
	}

	// open the dictionary word file
	strcpy(fn,DicFileName);
	changesuffix(fn,"lex");
	if(!(fword=fopen(fn,"rb")))
	{
		printf("文件Ssps.lex无法打开!");
		return 1;
	}

	// open the dictionary pointer file
	strcpy(fn,DicFileName);
	changesuffix(fn,"ptr");
	if(!(fwptr=fopen(fn,"rb")))
	{
		printf("词典指针文件“Ssps.ptr”无法打开!");
		fclose(fword);	// do not leak the stream opened above
		fword=NULL;
		return 2;
	}

	dwFileLength=filelength1(fileno(fwptr));

	// Only whole records are consumed; a trailing fragment is reported as
	// error 4 below instead of wrapping the unsigned counter around.
	while(rc==0 && dwFileLength>=8)
	{
		unsigned char sHanzi[2];	// first character of the words in this record
		unsigned __int16 count;	// number of words that follow in the word file
		unsigned __int32 offset;	// read only to skip it
		unsigned int loaded;
		int x,y;
		struct aWORDdic **link;

		if(fread(sHanzi,2,1,fwptr)!=1 ||
		   fread(&count,2,1,fwptr)!=1 ||
		   fread(&offset,4,1,fwptr)!=1)
		{
			printf("Error on the size of file Ssps.ptr!!");
			rc=4;
			break;
		}
		dwFileLength-=8;

		x=sHanzi[0]-0xb0;
		y=sHanzi[1]-0xa1;
		if(x<0 || x>=72 || y<0 || y>=94)
		{
			// a character outside the table would write out of bounds
			printf("Error of the format of the pointer file.\n");
			rc=4;
			break;
		}

		WIndexcom[x][y].WCount=count;
		link=&WIndexcom[x][y].WList;	// always points at the link to fill next
		*link=NULL;

		for(loaded=0;loaded<count;loaded++)
		{
			struct aWORDdic *node;
			// fgetc() must be kept in an int: stored in an unsigned char
			// it can never compare equal to EOF.
			int len=fgetc(fword);

			if(len==EOF)
			{
				printf("Error of the format of the word file.\n");
				rc=3;
				break;
			}

			node=(struct aWORDdic *)emalloc(sizeof(struct aWORDdic));
			node->Len=(unsigned char)len;
			node->ptrNext=NULL;
			node->sCIYU=(unsigned char *)emalloc(len+1);	// +1 for the NUL
			if(len>0 && fread(node->sCIYU,1,len,fword)!=(size_t)len)
			{
				printf("Error of the format of the word file.\n");
				free(node->sCIYU);
				free(node);
				rc=3;
				break;
			}
			node->sCIYU[len]=0;

			// link the node only once it is complete, so the list stays
			// NULL-terminated even when loading stops half way
			*link=node;
			link=&node->ptrNext;
		}
	}

	if(rc==0 && dwFileLength!=0)
	{
		printf("Error on the size of file Ssps.ptr!!");
		rc=4;
	}

	fclose(fwptr);	// close the pointer file
	fclose(fword);	// close the word file
	fwptr=NULL;
	fword=NULL;

	return rc;
}//end of opendic
/*
**	Release every word list held in WIndexcom and reset the table to empty.
**
**	The dictionary files are not touched here: DicOpenDic() closes both
**	streams itself before it returns after loading, so closing fwptr/fword
**	again would be a second fclose() on an already closed stream
**	(undefined behaviour).
**
**	meaning of the return value
**		0 : ok (the function cannot fail)
*/
int DicCloseDic()
{
	for(int x=0;x<72;x++)
		for(int y=0;y<94;y++)
		{
			struct aWORDdic *node;

			if(!WIndexcom[x][y].WCount) continue;	// slot never filled

			node=WIndexcom[x][y].WList;
			while(node)
			{
				struct aWORDdic *next=node->ptrNext;	// fetch before the node is freed

				free(node->sCIYU);
				free(node);
				node=next;
			}
			WIndexcom[x][y].WList = NULL;
			WIndexcom[x][y].WCount = 0;
		}//end of double for;

	return 0;
}
/*
**	Insert a word into the in-memory dictionary.
**
**	cWord is a NUL-terminated word.  Its first two bytes select the slot
**	WIndexcom[byte0-0xb0][byte1-0xa1]; the word is copied into a new node
**	that is placed so the slot's list stays in ascending strcmp() order.
**	A word that is already in the list is left alone.
**
**	meaning of the return value
**		 0 : ok! (the word was inserted, or it was already present)
**		-1 : rejected: cWord is NULL, shorter than 2 bytes, longer than
**		     255 bytes (the Len field is one byte), or its first character
**		     lies outside the 72x94 index table
*/

int WInsert(unsigned char* cWord)
{
	size_t len;
	int x,y;
	struct aWORDdic **link;
	struct aWORDdic *IWord;

	if(!cWord) return -1;

	len=strlen((char*)cWord);
	if(len<2 || len>255) return -1;

	x=cWord[0]-0xb0;
	y=cWord[1]-0xa1;
	if(x<0 || x>=72 || y<0 || y>=94) return -1;	// would index outside WIndexcom

	// Walk the list through the link that leads to each node, so inserting
	// at the head, in the middle and at the tail are all the same operation.
	link=&WIndexcom[x][y].WList;
	while(*link)
	{
		int flag=strcmp((char*)cWord,(char*)(*link)->sCIYU);

		if(!flag)
		{
#ifdef Debug
			printf("%s already exists.\n",cWord);
#endif
			return 0;
		}
		if(flag<0) break;	// first larger word: insert in front of it
		link=&(*link)->ptrNext;
	}

	IWord=(struct aWORDdic*)emalloc(sizeof(struct aWORDdic));
	IWord->Len=(unsigned char)len;
	IWord->sCIYU=(unsigned char *)emalloc(len+1);	// +1 for the NUL
	strcpy((char*)IWord->sCIYU,(char*)cWord);

	IWord->ptrNext=*link;
	*link=IWord;
	WIndexcom[x][y].WCount++;

#ifdef Debug
	printf("%s\n",cWord);
#endif

	return 0;
}
/*
**	Return the next whitespace-delimited word of a text buffer.
**
**	Words are runs of bytes separated by ' ', '\r' or '\n'.  The function
**	keeps its read position in a static pointer: the first call (and the
**	first call after NULL was returned) starts at Buf, every later call
**	continues where the previous one stopped and ignores Buf.
**
**	A word whose first byte is an ASCII letter is skipped when a separator
**	follows it; the last word of the buffer is returned unfiltered.
**	Words longer than 99 bytes are truncated to 99 bytes.
**
**	MEANING OF THE RETURN VALUE
**		NULL : IS THE END (also returned when Buf is NULL or empty);
**		       the next call starts over from its Buf argument
**		SOME WORD : pointer to a static buffer that the next call overwrites
*/
unsigned char* GetaWord(unsigned char*Buf)
{
	static unsigned char cWord[100];  // result buffer, reused by every call
	static unsigned char *Bufp=NULL;  // read position kept between calls
	unsigned char *const WordEnd=cWord+sizeof(cWord)-1;	// keeps room for the NUL
	unsigned char *Wordp=cWord;
	int inChinese=0;	// non-zero while bytes of a word are being collected

	if(Bufp == NULL)
		Bufp = Buf;
	if(Bufp == NULL)
		return NULL;

	// Testing *Bufp first keeps the look-ahead inside the string when the
	// buffer is empty or the previous call consumed it completely.
	while(*Bufp && *(Bufp+1))
	{
		if(*Bufp!='\n'&& *Bufp!='\r'&& *Bufp != ' ')
		{
			if(!inChinese)
			{
				Wordp=cWord;
				inChinese=1;
			}

			if(Wordp<WordEnd)	// bytes beyond the buffer size are dropped
				*Wordp++=*Bufp;
			Bufp++;
		}
		else
		{
			Bufp++;  // step over the separator
			if(inChinese)
			{
				if(isalpha(*cWord))
				{
					// word starts with an ASCII letter: discard it
					inChinese = 0;
					continue;
				}
				*Wordp=0;
				return cWord;
			}
		}
	}//end of while

	// The loop stops one byte early (its look-ahead saw the NUL).  Consume
	// that last byte too, so a buffer without a trailing separator does
	// not lose the final byte of its last word.
	if(*Bufp)
	{
		if(*Bufp!='\n'&& *Bufp!='\r'&& *Bufp != ' ')
		{
			if(!inChinese)
			{
				Wordp=cWord;
				inChinese=1;
			}
			if(Wordp<WordEnd)
				*Wordp++=*Bufp;
		}
		Bufp++;
	}

	if(inChinese)
	{
		*Wordp=0;
		return cWord;
	}

	Bufp = NULL;	// end reached: restart from Buf on the next call
	return NULL;
}

// Tell whether ch is an ASCII code, i.e. its top bit is clear.
// Returns 1 for ASCII, 0 otherwise.
int  isASCII(unsigned char ch)
{
   return ch < 0x80;
}

// Tell whether ch is a white space character.
// The answer is whatever isspace() reports: non-zero for white space, 0 otherwise.
int wbspace(unsigned char ch)
{
	int isWhite = isspace(ch);

	return isWhite;
}

///////////////////////////////////////////////////////////////////////////////////////////////////////
// Segment: cut a text into words by longest dictionary match.
//
// sBuffer   : NUL-terminated input text (two-byte characters, optionally mixed with ASCII).
// strResult : output buffer; receives the input with the delimiter D written after every
//             token.  It needs room for 3 bytes per 2 input bytes plus the NUL.
//
// For each two-byte character the word list WIndexcom[byte0-0xb0][byte1-0xa1] is scanned.
// The list is in ascending order, so the scan stops at the first word that compares greater
// than the text; the longest word that matched the text up to then is emitted.  A character
// with no matching word (or outside the table) is emitted alone as a 2-byte token.
// ASCII bytes, and a dangling lead byte at the very end, are copied through unchanged
// without a delimiter.
//
// The function keeps no state of its own between calls; it only reads the globals
// WIndexcom and D.

void Segment(unsigned char *sBuffer,unsigned char *strResult)
{
	unsigned char *out=strResult;
	size_t total;
	size_t pos=0;

	if(!strResult) return;
	*out='\0';
	if(!sBuffer) return;
	total=strlen((char *)sBuffer);

	while(pos<total)
	{
		size_t left=total-pos;	// input bytes still available
		size_t best=2;	// length of the token to emit, one character by default
		size_t i;
		int x,y;
		struct aWORDdic *Wptr=NULL;

		// Single-byte input cannot be looked up in the table (it would
		// index outside it): pass it through as it is.
		if(!(sBuffer[pos]&0x80) || left<2)
		{
			*out++=sBuffer[pos++];
			continue;
		}

		x=sBuffer[pos]-0xb0;
		y=sBuffer[pos+1]-0xa1;
		if(x>=0 && x<72 && y>=0 && y<94)
			Wptr=WIndexcom[x][y].WList;

		for(;Wptr;Wptr=Wptr->ptrNext)
		{
			int cmp=strncmp((char *)(sBuffer+pos),(const char *)Wptr->sCIYU,Wptr->Len);

			if(cmp<0) break;	// every later word is greater still
			// Remember the longest match; it is kept even when it is the
			// last entry of the list.  Never shrink below one character
			// and never reach past the end of the input.
			if(cmp==0 && Wptr->Len>best && Wptr->Len<=left)
				best=Wptr->Len;
		}

		for(i=0;i<best;i++)
			*out++=sBuffer[pos++];	// write the token to the result string
		*out++=D;	// followed by the delimiter
	}//end of while(pos<...)
	*out='\0';
}  

/////////////////////////////////////////////////////////////////////////////////////////////////////////



//SSPS算法子程序:

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -