📄 new_ssps.cpp
字号:
x=sBuffer[pointer];
Wptr=CIndexcom[x].WList;
count=CIndexcom[x].WCount;
DPos=NULL;
if(!Wptr) {
word += (char)sBuffer[pointer];
*resultp++=sBuffer[pointer++];
continue;
}
}//end of else, the end of get ascii code
unsigned char LastLen=2;
int i;
int iCompareResult;
while(1) {
if(!Wptr)
{
if(ascii_flag)
{ word += (char)sBuffer[pointer];
*resultp++=sBuffer[pointer++];
break;
}
for(i=0;i<LastLen;i++)
{
word += (char)sBuffer[pointer];
*resultp++=sBuffer[pointer++];}//将切出的词写入结果字串
*resultp++=E;
strcpy((char*)resultp,(const char*)pos);
resultp += 8;
*resultp++=D;//再加一个分界符
wordvec.push_back(pair<string,int>(word,pointer-word.size()));
word = "";
break;
}
iCompareResult=strncmp((char *)(sBuffer+pointer),(const char *)Wptr->sCIYU,Wptr->Len);
if(iCompareResult<0) {
if(ascii_flag)
{ word += (char)sBuffer[pointer];
*resultp++=sBuffer[pointer++];
break;
}
for(i=0;i<LastLen;i++){
word += (char)sBuffer[pointer];
*resultp++=sBuffer[pointer++];
}
*resultp++=E;
strcpy((char*)resultp,(const char*)pos);
resultp += 8;
*resultp++=D;
wordvec.push_back(pair<string,int>(word,pointer-word.size()));
word = "";
break;
}
if(iCompareResult==0) {
LastLen=Wptr->Len;
DPos=Wptr;
pos = Wptr->pos;
Wptr=Wptr->ptrNext;
}
if(iCompareResult>0)
{
DPos=Wptr;
Wptr=Wptr->ptrNext;
}
}//end of while(1)
}//end of while(pointer<...)
*resultp=0;
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
//-----------------------------------------------------------//
// Purpose: SSPS algorithm driver.
// Parameters:
//   (in)  lpszBuffer : NUL-terminated text to be segmented
//   (out) targetBuf  : destination buffer; receives one D delimiter byte,
//                      then the output of Segment(), then a terminating NUL.
//                      Its capacity cannot be checked here; the caller must
//                      make it large enough.
// Input longer than the internal working buffer (99999 bytes) is truncated
// instead of overflowing the stack.
//----------------------------------------------------------//
// The dictionary must already be open, i.e. OpenDic has been called.
void CSSPS::SSPS(char* lpszBuffer,char* targetBuf)
{
	// Working buffers: the text handed to Segment() and the text it produces.
	// NOTE(review): these should eventually become variable-length buffers.
	// Segment() appends tag/delimiter bytes for each word it emits, so
	// strResult can presumably still be exhausted by a long input -- confirm
	// against Segment()'s output format before relying on the 150000 limit.
	unsigned char strSource[100000],strResult[150000];

	// Copy the input, never writing past strSource (one byte is reserved for
	// the terminating NUL).
	// NOTE(review): truncation may cut a multi-byte character in half.
	const int maxSource=(int)sizeof(strSource)-1;
	int copied=0;
	while(copied<maxSource && lpszBuffer[copied])
	{
		strSource[copied]=(unsigned char)lpszBuffer[copied];
		++copied;
	}
	strSource[copied]=0;

	// Tag bytes passed through to Segment().
	char tag1 = 15;
	char tag2 = 127;
	Segment(strSource,strResult,tag1,tag2);

	// Emit: leading delimiter, segmented text, terminating NUL.
	char *tp=targetBuf;
	*tp++=D;
	for(const char *from=(const char *)strResult; *from; ++from)
		*tp++=*from;
	*tp=0;
}//end of SSPS
// Returns the size in bytes of the file open on descriptor `handle`,
// or -1 if fstat() fails (for example when the descriptor is invalid).
long CSSPS::filelength1(int handle)
{
	struct stat buf;
	// Without this check a failed fstat() would leave buf uninitialised and
	// an arbitrary length would be returned.
	if(fstat(handle,&buf)!=0)
		return -1L;
	return buf.st_size;
}
//long CSSPS::get_runtime()
//{
// clock_t start;
// start = clock();
// return((long)((double)start*100.0/(double)CLOCKS_PER_SEC));
//}
// Allocates i bytes with malloc().
// On failure a message is printed to stdout and NULL is returned, so the
// caller still has to check the result.
void * CSSPS::emalloc(__int32 i)
{
	void *block = malloc(i);
	if (block != NULL)
		return block;
	printf("Ran out of memory (could not allocate enough)!");
	return NULL;
}
// Deletes from the in-memory dictionary every word listed in a raw word file.
//   IFileName   : word-list file; words are separated by tab, CR or LF
//   DicFileName : dictionary file name, passed to DicOpenDic()
// Returns 0 on success (including an empty word file) and 1 when the word
// file cannot be opened or loaded into memory.
int CSSPS::OperateDic(const char *IFileName,const char *DicFileName )
{
	// NOTE(review): the result of DicOpenDic() is not examined here, unlike in
	// the five-argument overload -- confirm whether that is intentional.
	DicOpenDic(DicFileName);

	// Open the file name directly; copying it into a fixed 256-byte buffer
	// first could overflow on long paths.
	FILE* fp = IFileName ? fopen(IFileName,"rb") : NULL;
	if(fp==NULL)
	{
		printf("Raw word file cannot be opened!\n");
		return 1;
	}

	long len=filelength1(fileno(fp));
	if(len<=0)
	{
		fclose(fp);
		return 0;
	}

	// One extra byte for the NUL terminator that strtok() requires.
	char *Buffer=(char *)emalloc(len+1);
	if(Buffer==NULL)
	{
		fclose(fp);
		return 1;
	}
	// Terminate after the bytes actually read, so a short read never exposes
	// uninitialised memory to strtok().
	size_t got=fread(Buffer,1,len,fp);
	fclose(fp);
	Buffer[got]='\0';

	const char *seps1="\t\r\n";
	for(char *token1=strtok(Buffer,seps1); token1!=NULL; token1=strtok(NULL,seps1))
		WDel((unsigned char*)token1);

	free(Buffer);
	return 0;
}
// Look a word up in a dictionary chain.
// Performs a binary search over positions 0..count-1 of the linked list that
// starts at Wptr; every probe walks the list again from its head.
// A node matches when strncmp(buffer, node->sCIYU, node->Len) is 0.
// Returns the 0-based position of the matching node, or -1 if none matches.
// NOTE(review): presumably the chain is kept in ascending order and holds at
// least `count` nodes -- neither is checked here.
int CSSPS::findword(int count,char *buffer,struct aWORDdic *Wptr)
{
	int lo=0;
	int hi=count-1;
	while(lo<=hi)
	{
		const int probe=(lo+hi)/2;

		// Advance from the head to the probe position.
		struct aWORDdic *node=Wptr;
		int steps=probe;
		while(steps-- > 0)
			node=node->ptrNext;

		const int cmp=strncmp(buffer,(const char *)node->sCIYU,node->Len);
		if(cmp==0)
			return probe;
		if(cmp<0)
			hi=probe-1;
		else
			lo=probe+1;
	}
	return -1;
}
// Delete one word from the in-memory dictionary.
//   cWord : NUL-terminated word. If the high bit of its first byte is set it
//           is looked up in WIndexcom[first-0xa1][second-0xa1], otherwise in
//           CIndexcom[first].
// Returns -1 if cWord is NULL or if a high-bit word has a first or second
// byte below 0xa1 (which would produce a negative table index); returns 0
// otherwise, whether or not the word was found.
// NOTE(review): the upper bounds of WIndexcom are not visible from here, so
// only the lower bound of the indices is validated -- confirm table size.
int CSSPS::WDel(unsigned char *cWord)
{
	if(!cWord) return -1;

	// Select the chain that would hold the word.
	const bool isChinese=(cWord[0] & 128)!=0;
	int x=0,y=0;
	struct aWORDdic* Wptr;
	if(isChinese)
	{
		// Reject bytes that would index before the start of the table,
		// e.g. a lone high-bit byte followed by the terminating NUL.
		if(cWord[0]<0xa1 || cWord[1]<0xa1) return -1;
		x=cWord[0]-0xa1;
		y=cWord[1]-0xa1;
		Wptr=WIndexcom[x][y].WList;
	}
	else
	{
		x=cWord[0];
		Wptr=CIndexcom[x].WList;
	}

	// Scan the chain; the scan stops as soon as the word compares lower than
	// the current entry. DPos trails one node behind Wptr.
	struct aWORDdic* DPos=NULL;
	while(Wptr)
	{
		const int flag=strcmp((char *)cWord,(char*)Wptr->sCIYU);
		if(flag<=0)
		{
			if(flag<0) Wptr=NULL;	// passed the insertion point: not present
			break;
		}
		DPos=Wptr;
		Wptr=Wptr->ptrNext;
	}

	if(!Wptr)
	{
#ifdef Debug
		printf("%s not found.\n",cWord);
#endif
		return 0;
	}

	// Unlink the node from its predecessor, or from the chain head.
	struct aWORDdic* next=Wptr->ptrNext;
	if(DPos) DPos->ptrNext=next;
	else if(isChinese) WIndexcom[x][y].WList=next;
	else CIndexcom[x].WList=next;

	if(isChinese) WIndexcom[x][y].WCount--;
	else CIndexcom[x].WCount--;

	free(Wptr->sCIYU);
	free(Wptr->pos);
	free(Wptr);
#ifdef Debug
	printf("%s\n",cWord);
#endif
	return 0;
}
//-----------------------------------------------------------//
// Purpose: maintain the dictionary (bulk insert or delete of words).
// Parameters:
//   (in) flag        : operation selector. As implemented: 0 = insert the
//                      listed words, 1 = delete them.
//                      NOTE(review): the earlier comment said "0 del, 1 add",
//                      the opposite of what the code does -- confirm with
//                      callers which one is intended.
//        IFileName   : name of the word-list file
//        DicFileName : name of the dictionary file
//        tag1, tag2  : passed through unchanged to GetaWord()
// Calls: DicOpenDic, DicCloseDic, GetaWord, WInsert, WDel
// Returns:
//  -1 : flag is neither 0 nor 1
//   0 : OK
//   1 : cannot open (or load) the input file
//   2 : cannot open the .lex file
//   3 : cannot open the .ptr file
//   4 : irregular file format
//   5 : error in the length of the ssps.ptr file
//----------------------------------------------------------//
int CSSPS::OperateDic( int flag,const char *IFileName,const char *DicFileName ,char tag1,char tag2)
{
	// Reject unknown flags up front; previously the operation selector was
	// left uninitialised for them.
	bool insertWords;
	switch(flag)
	{
	case 0: insertWords=true; break;
	case 1: insertWords=false; break;
	default: return -1;
	}

	// Open the file name directly; copying it into a fixed 256-byte buffer
	// first could overflow on long paths.
	FILE* fp = IFileName ? fopen(IFileName,"rb") : NULL;
	if(fp==NULL)
	{
		printf("Raw word file cannot be opened!\n");
		return 1;
	}

	long len=filelength1(fileno(fp));
	if(len<0) len=0;	// treat an unknown size as an empty file

	unsigned char * Buffer=(unsigned char *)emalloc(len+1);
	if(Buffer==NULL)
	{
		fclose(fp);
		return 1;
	}
	// Terminate after the bytes actually read.
	size_t got=fread(Buffer,1,len,fp);
	Buffer[got]='\0';
	fclose(fp);

	// Buffer must be released on every exit path from here on.
	switch(DicOpenDic(DicFileName)) {
	case 3: free(Buffer); return 4;
	case 4: free(Buffer); return 5;
	};

	unsigned int currentpos =0;
	unsigned char newword[MAXCHWORDLEN];
	unsigned char postag[10];
	while(GetaWord(Buffer,currentpos,newword,postag,tag1,tag2))
	{
		if(insertWords) WInsert(newword,postag);
		else WDel(newword);
	}

	const int closeResult=DicCloseDic(DicFileName);
	free(Buffer);
	switch(closeResult) {
	case 1:return 2;
	case 2:return 3;
	};
	return 0;
}
//-----------------------------------------------------------//
// 功能: 用于修改词典后保存并关闭词典
// 参数:
// (入口)DicFileName 词典文件名
// 返回:0 正常 其他错误编码
//----------------------------------------------------------//
int CSSPS::DicCloseDic(const char *DicFileName)
{
__int32 NWords=0;
char fn[300];
//string temp = "test1";
//打开词典正文文件:
strcpy(fn,DicFileName);
changesuffix(fn,"lex");
if(!(fword=fopen(fn,"wb")))
{
printf("文件 %s 无法create!",fn);
return 1;
}
FILE *fptr;
if(!(fptr=fopen("word.txt","w")))
{
printf("词典指针文件“%s”无法create!",fn);
return 2;
}
//打开词典指针文件:
// strcpy(strrchr(DicFileName,'.'),".ptr");
strcpy(fn,DicFileName);
changesuffix(fn,"ptr");
if(!(fwptr=fopen(fn,"wb")))
{
printf("词典指针文件“%s”无法create!",fn);
return 2;
}
printf("Writing dictionary...\n");
struct aWORDdic* Tptr,*freep;
long offset;
for(int z=0;z<128;z++)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -