📄 ssps.cpp
字号:
//SSPS word segmentation routines
//Fast word segmentation using a single-scan algorithm
#define __int16 short int
#define __int32 int
#include "function.h"
#include "ssps.h"
char D=' '; //Special delimiter character, inserted between Chinese words as the segmentation mark
FILE* fLex;
/* One dictionary word, stored as a node of a singly linked list. */
struct aWORDdic
{
    unsigned char *sCIYU;       /* the word itself */
    unsigned char Len;          /* length of the word */
    struct aWORDdic *ptrNext;   /* following node in the list */
};
/* One cell of the index table: the words filed under one leading hanzi. */
struct WIndexEntry
{
    __int16 WCount;             /* number of words in WList */
    struct aWORDdic *WList;     /* head of the word list */
};

/* Pointer index table (global): 72 zones, 94 hanzi per zone. */
struct WIndexEntry WIndexcom[72][94];
// Read the index table from the pointer file (Ssps.ptr in the current directory) and open
// the dictionary text file (Ssps.lex in the current directory)
FILE *fwptr; //dictionary pointer file
FILE *fword; //dictionary text file
/*
** RETURN VALUE:
** 0 : OK!
** 1 : CAN NOT OPEN THE *.LEX FILE
** 2 : CAN NOT OPEN THE *.PTR FILE
** 3 : Error of the format of the word file.
** 4 : Error on the size of file Ssps.ptr!
*/
int DicOpenDic(const char *DicFileName)
{
__int32 NWords=0;
char fn[300];
//打开词典正文文件:
//strcpy(strrchr(DicFileName,'.'),".lex");
strcpy(fn,DicFileName);
changesuffix(fn,"lex");
if(!(fword=fopen(fn,"rb")))
{
printf("文件Ssps.lex无法打开!");
return 1;
}
//打开词典指针文件:
//strcpy(strrchr(DicFileName,'.'),".ptr");
strcpy(fn,DicFileName);
changesuffix(fn,"ptr");
if(!(fwptr=fopen(fn,"rb")))
{
printf("词典指针文件“Ssps.ptr”无法打开!");
return 2;
}
// printf("Loading current dictionary...\n");
unsigned long dwFileLength=filelength1(fileno(fwptr));//取指针文件长度
unsigned char sHanzi[2];//汉字
unsigned __int16 count;
unsigned __int32 offset;
__int16 x,y;
while(dwFileLength>0)//一直读到文件尾
{
fread(sHanzi,2,1,fwptr);//读汉字(词的首字)
x=sHanzi[0]-0xb0;
y=sHanzi[1]-0xa1;
fread(&count,2,1,fwptr);
WIndexcom[x][y].WCount=count;
NWords+=count;
//read the numer of words
fread(&offset,4,1,fwptr);// skip the 4 bytes offset
dwFileLength -= 8;//读了8个字节
struct aWORDdic* wwwp;
struct aWORDdic* tail=(aWORDdic*)emalloc(sizeof(aWORDdic));
WIndexcom[x][y].WList=tail;
while(count--) //read a list of words from the word file
{
wwwp=(aWORDdic *)emalloc(sizeof(aWORDdic));
if((wwwp->Len=fgetc(fword))==EOF)//读到文件尾或有其它异常
{
printf("Error of the format of the word file.\n");
return 3;
}
wwwp->sCIYU=(unsigned char*)emalloc(wwwp->Len+1);//note here
fread(wwwp->sCIYU,wwwp->Len,1,fword);//接着将词语读入
wwwp->sCIYU[wwwp->Len]=0;//note here
wwwp->ptrNext=NULL;
tail->ptrNext=wwwp;
tail=wwwp;
}//end of read a list of words
struct aWORDdic * temp=WIndexcom[x][y].WList;
WIndexcom[x][y].WList=temp->ptrNext;
free(temp);
}//end of dwFileLength
fclose(fwptr);// close the 指针文件
fclose(fword);// close the words file
if(dwFileLength)
{
printf("Error on the size of file Ssps.ptr!!");
return 4;
}
// printf("Total %d words in dictionary.\n",NWords);
return 0;
}//end of opendic
/*
** meaning of the return value
** 0 : ok
** 1 : can not open the lex file
** 2 : can not open the ptr file
*/
int DicCloseDic()
{
struct aWORDdic* Tptr,*freep;
for(int x=0;x<72;x++)
for(int y=0;y<94;y++)
{
if(!WIndexcom[x][y].WCount) continue;
Tptr=WIndexcom[x][y].WList;
while(Tptr)
{
freep=Tptr;
Tptr=Tptr->ptrNext;
free(freep->sCIYU);
free(freep);
}//end of while
WIndexcom[x][y].WList = NULL;
WIndexcom[x][y].WCount = 0;
}//end of double for;
fclose(fwptr);
fclose(fword);
return 0;
}
/*
** WInsert: add a word to the in-memory dictionary.
**
** cWord is a NUL-terminated word. Its first two bytes select the cell of
** WIndexcom the word is filed under. The list in that cell is kept in
** ascending strcmp order; the word is copied into a new node at its sorted
** position and the cell's WCount is incremented. A word that is already
** present is left alone.
**
** RETURN VALUE:
**  0 : ok (inserted, or already present)
** -1 : cWord is NULL, shorter than 2 bytes, longer than 255 bytes (the
**      Len field is one byte), or its first two bytes fall outside the table
*/
int WInsert(unsigned char* cWord)
{
    if(!cWord) return -1;

    size_t len=strlen((char*)cWord);
    if(len<2 || len>255) return -1;

    int x=cWord[0]-0xb0;
    int y=cWord[1]-0xa1;
    if(x<0 || x>=72 || y<0 || y>=94) return -1;

    /* link always addresses the pointer that would have to point at the new
       node: first the list head, then each node's ptrNext in turn. */
    struct aWORDdic **link=&WIndexcom[x][y].WList;
    while(*link)
    {
        int flag=strcmp((char*)cWord,(char*)(*link)->sCIYU);
        if(!flag)
        {
#ifdef Debug
            printf("%s already exists.\n",cWord);
#endif
            return 0;
        }
        if(flag<0) break;           /* new word sorts before this node */
        link=&(*link)->ptrNext;
    }

    /* NOTE(review): emalloc is assumed never to return NULL - confirm. */
    struct aWORDdic *IWord=(struct aWORDdic*)emalloc(sizeof(struct aWORDdic));
    IWord->Len=(unsigned char)len;
    IWord->sCIYU=(unsigned char *)emalloc(len+1);   /* +1 for the terminator */
    strcpy((char*)IWord->sCIYU,(char*)cWord);
    IWord->ptrNext=*link;
    *link=IWord;
    WIndexcom[x][y].WCount++;
#ifdef Debug
    printf("%s\n",cWord);
#endif
    return 0;
}
/*
** GetaWord: return the next token of a text buffer.
**
** Tokens are runs of bytes separated by ' ', '\n' or '\r'. The scan position
** is kept in a static pointer between calls: Buf is only picked up when no
** scan is in progress (first call, or after a previous call returned NULL).
** A token terminated by a separator whose first byte is an ASCII letter is
** skipped. Tokens longer than 99 bytes are truncated to 99 bytes.
**
** RETURN VALUE:
** NULL : end of the buffer (the next call starts on a new Buf)
** else : pointer to a static buffer holding the token, overwritten by the
**        next call
**
** NOTE(review): the loop stops when the byte AFTER the current one is NUL,
** so the final byte of the buffer is never consumed. This is kept as in the
** original; confirm that callers end their buffers with a separator.
*/
unsigned char* GetaWord(unsigned char*Buf)
{
    static unsigned char cWord[100];
    static unsigned char *Bufp=NULL;    /* scan position, NULL = no scan active */
    unsigned char *Wordp=cWord;
    unsigned char *const Wordend=cWord+sizeof(cWord)-1;  /* slot kept for the NUL */
    int inChinese=0;                    /* nonzero while inside a token */

    if(Bufp == NULL)
        Bufp = Buf;
    if(Bufp == NULL)
        return NULL;

    /* Test *Bufp first so an empty buffer is not read past its terminator. */
    while(*Bufp && *(Bufp+1))
    {
        if(*Bufp!='\n'&& *Bufp!='\r'&& *Bufp != ' ')
        {
            if(!inChinese)
            {
                Wordp=cWord;
                inChinese=1;
            }
            if(Wordp<Wordend)           /* drop bytes that do not fit */
                *Wordp++=*Bufp;
            Bufp++;
        }
        else
        {
            Bufp++;                     /* step over the separator */
            if(inChinese)
            {
                if(isalpha(*cWord))
                {
                    inChinese = 0;      /* token starts with a letter: discard */
                    continue;
                }
                *Wordp=0;
                return cWord;
            }
        }
    }//end of while

    if(inChinese)
    {
        *Wordp=0;
        return cWord;
    }
    Bufp = NULL;
    return NULL;
}
/* Return 1 when ch is an ASCII code (top bit clear), 0 otherwise. */
int isASCII(unsigned char ch)
{
    return (ch & 0x80) == 0;
}
/* Report whether ch is a white-space character: the result of isspace(ch). */
int wbspace(unsigned char ch)
{
    const int verdict = isspace(ch);
    return verdict;
}
///////////////////////////////////////////////////////////////////////////////////////////////////////
/*
** Segment: split sBuffer into words and write them to strResult, each word
** followed by the delimiter D; strResult is NUL-terminated.
**
** At each position the two bytes at the cursor select a cell of WIndexcom,
** and the longest word in that cell's list that matches the text at the
** cursor is emitted. The list is walked in its stored order and the walk
** stops at the first word that compares greater than the text, so it relies
** on the list being in ascending order. When nothing matches, the two-byte
** character is emitted on its own. Bytes that cannot index the table are
** passed through: an ASCII byte, or a final lone byte, as a one-byte token;
** any other pair as a two-byte token.
**
** The function writes only to strResult; WIndexcom and D are read.
** In the worst case (every token one byte long) the output needs
** 2*strlen(sBuffer)+1 bytes; the caller must supply a buffer that large.
** A NULL sBuffer yields an empty result; a NULL strResult is ignored.
*/
void Segment(unsigned char *sBuffer,unsigned char *strResult)
{
    if(!strResult) return;
    *strResult='\0';
    if(!sBuffer) return;

    unsigned char *resultp=strResult;
    int lenofsBuffer=strlen((char *)sBuffer);
    int pointer=0;

    while(pointer<lenofsBuffer)
    {
        int remaining=lenofsBuffer-pointer;
        int wordLen;                    /* bytes to emit for this token */

        if(sBuffer[pointer]<0x80 || remaining<2)
        {
            wordLen=1;                  /* ASCII byte or trailing lone byte */
        }
        else
        {
            wordLen=2;                  /* default: the character by itself */
            int x=sBuffer[pointer]-0xb0;
            int y=sBuffer[pointer+1]-0xa1;
            if(x>=0 && x<72 && y>=0 && y<94)
            {
                for(struct aWORDdic *Wptr=WIndexcom[x][y].WList;Wptr;Wptr=Wptr->ptrNext)
                {
                    int iCompareResult=strncmp((char *)(sBuffer+pointer),(const char *)Wptr->sCIYU,Wptr->Len);
                    if(iCompareResult<0)
                        break;          /* every later word sorts after the text */
                    /* Keep the longest match. Entries shorter than the
                       current match, or longer than the remaining text, are
                       ignored so the cursor always advances and never runs
                       past the end of sBuffer. */
                    if(iCompareResult==0 && Wptr->Len>wordLen && Wptr->Len<=remaining)
                        wordLen=Wptr->Len;
                }
            }
        }

        for(int i=0;i<wordLen;i++)
            *resultp++=sBuffer[pointer++];  /* write the token to the result */
        *resultp++=D;                       /* followed by the delimiter */
    }//end of while(pointer<...)
    *resultp=0;
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
//SSPS算法子程序:
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -