📄 ssps.cpp
字号:
/*
 * SSPS: segment the Chinese text of a NUL-terminated buffer into targetBuf.
 *
 * lpszBuffer - NUL-terminated text to be segmented; it is only read.
 * targetBuf  - destination buffer; receives a NUL-terminated result.  Its
 *              capacity is not passed in and cannot be checked here, so the
 *              caller must make it large enough for the whole output.
 *
 * Precondition (from the original notes): the dictionary file has already
 * been opened, i.e. OpenDic has been called.
 *
 * The input is scanned left to right:
 *   - Two-byte characters accepted by isHanzi() are collected into a scratch
 *     string.  When the run ends the scratch string is handed to Segment()
 *     and D followed by the Segment() result is appended to the output.
 *   - Bytes accepted by wbspace() are copied straight through while no Hanzi
 *     run is open.  Inside a run they are held back: they are dropped if
 *     more Hanzi follow, and copied after the segmented run otherwise.
 *   - Bytes accepted by isASCII() are copied unchanged.
 *   - Everything else is treated as a two-byte symbol (punctuation and the
 *     like): both bytes are copied and followed by D.
 * After the scan Segment() is called once more on whatever is left in the
 * scratch string (possibly nothing) and D plus its result is appended.
 *
 * NOTE(review): whitespace still held back when the input ends is discarded,
 * exactly as in the original - confirm that this is intended.
 * NOTE(review): the scratch buffers are still fixed-size stack arrays; the
 * original already asked for variable-length strings here.
 */
void SSPS(char* lpszBuffer,char* targetBuf)
{
    /* Scratch-buffer sizes, unchanged from the original implementation. */
    enum { SSPS_SOURCE_CAP = 100000, SSPS_RESULT_CAP = 150000 };

    unsigned char strSource[SSPS_SOURCE_CAP];   /* current run of pure Hanzi */
    unsigned char strResult[SSPS_RESULT_CAP];   /* Segment() output for that run */
    unsigned char *sourcep = strSource;         /* write position in strSource */
    char *tp = targetBuf;                       /* write position in targetBuf */
    char *temp;
    int Bpointer = 0;       /* read position in lpszBuffer */
    int inChinese = 0;      /* non-zero while a Hanzi run is being collected */
    int spacepointer = 0;   /* index of the first held-back whitespace byte */
    int specialspace = 0;   /* non-zero while whitespace is being held back */

    /* Process the text buffer byte by byte. */
    while (lpszBuffer[Bpointer])
    {
        /*
         * A NUL trail byte cannot belong to a two-byte character, so the
         * terminator is never consumed as half of a Hanzi.
         */
        if (lpszBuffer[Bpointer + 1] &&
            isHanzi(lpszBuffer[Bpointer], lpszBuffer[Bpointer + 1]))
        {
            /*
             * Keep room for two more bytes plus the terminator.  A run that
             * does not fit is segmented in pieces, each emitted with its own
             * leading D, instead of overrunning strSource.
             */
            if (sourcep - strSource > SSPS_SOURCE_CAP - 3)
            {
                *sourcep = 0;
                Segment(strSource, strResult);
                *tp++ = D;
                for (temp = (char *)strResult; *temp; temp++)
                    *tp++ = *temp;
                sourcep = strSource;
            }
            inChinese = 1;
            specialspace = 0;   /* whitespace between Hanzi is dropped */
            *sourcep++ = lpszBuffer[Bpointer++];    /* lead byte */
            *sourcep++ = lpszBuffer[Bpointer++];    /* trail byte */
            continue;
        }

        if (wbspace(lpszBuffer[Bpointer]))
        {
            if (inChinese)
            {
                /* Remember where the whitespace started; decide later. */
                if (!specialspace)
                {
                    spacepointer = Bpointer;
                    specialspace = 1;
                }
                Bpointer++;
            }
            else
            {
                *tp++ = lpszBuffer[Bpointer++];
            }
            continue;
        }

        /* A non-Hanzi, non-whitespace byte ends the run: segment it. */
        if (inChinese)
        {
            *sourcep = 0;
            Segment(strSource, strResult);
            *tp++ = D;
            for (temp = (char *)strResult; *temp; temp++)
                *tp++ = *temp;
            sourcep = strSource;
            inChinese = 0;
        }

        /* Emit the whitespace that was held back behind the run. */
        if (specialspace)
        {
            specialspace = 0;
            while (spacepointer < Bpointer)
                *tp++ = lpszBuffer[spacepointer++];
        }

        /* Special characters and ASCII. */
        if (isASCII(lpszBuffer[Bpointer]))
        {
            *tp++ = lpszBuffer[Bpointer++];
        }
        else
        {
            /* Special two-byte characters such as punctuation and symbols. */
            *tp++ = lpszBuffer[Bpointer++];
            /*
             * If the input ends in the middle of the character, stop at the
             * terminator instead of copying it and reading past the buffer.
             */
            if (lpszBuffer[Bpointer])
                *tp++ = lpszBuffer[Bpointer++];
            *tp++ = D;
        }
    }

    /* Handles input that ends with a run of Hanzi. */
    *sourcep = 0;
    Segment(strSource, strResult);
    *tp++ = D;
    for (temp = (char *)strResult; *temp; temp++)
        *tp++ = *temp;
    *tp = 0;
}/* end of SSPS */
/*
 * filelength1: size in bytes of the file behind an open descriptor.
 *
 * handle - file descriptor to query.
 *
 * Returns the st_size reported by fstat(), or -1 when fstat() fails (for
 * example on an invalid descriptor).  Callers must check for a negative
 * result before using the value as a buffer size.
 */
long filelength1(int handle)
{
    struct stat info;

    /* On failure the stat structure is not filled in; never read it. */
    if (fstat(handle, &info) != 0)
        return -1L;

    return (long)info.st_size;
}
/*
 * emalloc: malloc() wrapper that reports an allocation failure on stdout.
 *
 * i - number of bytes requested.
 *
 * Returns the block obtained from malloc().  When malloc() fails a message
 * is printed and NULL is returned; the function does not terminate the
 * program, so callers still have to check the result.
 */
void *emalloc(__int32 i)
{
    void *block = (void *) malloc(i);

    if (block != NULL)
        return block;

    printf("Ran out of memory (could not allocate enough)!");
    return NULL;
}
/*
 * OperateDic: insert every word of a raw word file into the dictionary.
 *
 * IFileName   - path of the raw word file; it is read in binary mode.
 * DicFileName - passed to DicOpenDic() before the word file is opened.
 *
 * The whole file is loaded into memory, split with strtok() on the
 * delimiter set below (tab, CR, LF, space, ASCII digits and ASCII letters)
 * and each resulting token is handed to WInsert().
 *
 * Returns 0 on success (an empty file counts as success) and 1 when the
 * word file cannot be opened or the read buffer cannot be allocated.
 */
int OperateDic(const char *IFileName,const char *DicFileName )
{
    const char *seps1 = "\t\r\n 0 1 2 3 4 5 6 7 8 9 a b c d e f g h i j k l m n o p q r s t u v w x y z A B C D E F G H I J K L M N O P Q R S T U V W X Y Z ";
    FILE *fp;
    long len;
    size_t nread;
    unsigned char *Buffer;
    char *token1;

    DicOpenDic(DicFileName);

    /* The path is used directly; copying it into a fixed buffer could overflow. */
    fp = fopen(IFileName, "rb");
    if (fp == NULL)
    {
        printf("Raw word file cannot be opened!\n");
        return 1;
    }

    len = filelength1(fileno(fp));
    if (len <= 0)
    {
        /* Empty file or unknown size: nothing to insert. */
        fclose(fp);
        return 0;
    }

    /* One extra byte for the terminator that strtok() relies on. */
    Buffer = (unsigned char *)emalloc(len + 1);
    if (Buffer == NULL)
    {
        fclose(fp);
        return 1;
    }

    /* Terminate after the bytes actually read, which may be fewer than len. */
    nread = fread(Buffer, 1, (size_t)len, fp);
    fclose(fp);
    Buffer[nread] = 0;

    for (token1 = strtok((char *)Buffer, seps1);
         token1 != NULL;
         token1 = strtok(NULL, seps1))
    {
        WInsert((unsigned char *)token1);
    }

    free(Buffer);
    return 0;
}
/*
 * FiletoBufSeg: read an open file completely and segment it into a buffer.
 *
 * fps       - stream opened for reading; its contents are loaded into a
 *             temporary heap buffer and passed to SSPS().
 * targetBuf - destination buffer handed to SSPS(); its capacity is not
 *             known here, the caller must provide enough room.
 *
 * If the temporary buffer cannot be allocated, targetBuf is set to the
 * empty string and the function returns without calling SSPS().
 */
void FiletoBufSeg(FILE *fps,char *targetBuf)
{
    long len = filelength1(fileno(fps));
    size_t nread;
    char *sourceBuf;

    /* A negative size means it could not be determined; treat as empty. */
    if (len < 0)
        len = 0;

    sourceBuf = (char *)emalloc(len + 1);
    if (sourceBuf == NULL)
    {
        targetBuf[0] = 0;
        return;
    }

    /* Terminate after the bytes actually read, which may be fewer than len. */
    nread = fread(sourceBuf, 1, (size_t)len, fps);
    sourceBuf[nread] = 0;

    SSPS(sourceBuf, targetBuf);
    free(sourceBuf);
}/* end of FiletoBufSeg */
/*
 * BuftoFileSeg: segment the Chinese text of a NUL-terminated buffer and
 * write the result to a file.
 *
 * lpszBuffer - NUL-terminated text to be processed; it is only read.
 * fpd        - stream that has been successfully opened for writing.
 *
 * The input is scanned left to right:
 *   - Two-byte characters accepted by isHanzi() are collected into a scratch
 *     string.  When the run ends the scratch string is handed to Segment()
 *     and D followed by the Segment() result is written to fpd.
 *   - Bytes accepted by wbspace() are written straight through while no
 *     Hanzi run is open.  Inside a run they are held back: they are dropped
 *     if more Hanzi follow, and written after the segmented run otherwise.
 *   - Bytes accepted by isASCII() are written unchanged.
 *   - Everything else is treated as a two-byte symbol (punctuation and the
 *     like): both bytes are written and followed by D.
 * After the scan Segment() is called once more on whatever is left in the
 * scratch string (possibly nothing) and D plus its result is written.
 *
 * NOTE(review): whitespace still held back when the input ends is discarded,
 * exactly as in the original - confirm that this is intended.
 * NOTE(review): the scratch buffers are still fixed-size stack arrays; the
 * original already asked for variable-length strings here.
 */
void BuftoFileSeg(char *lpszBuffer,FILE* fpd)
{
    /* Scratch-buffer sizes, unchanged from the original implementation. */
    enum { B2F_SOURCE_CAP = 120000, B2F_RESULT_CAP = 200000 };

    unsigned char strSource[B2F_SOURCE_CAP];    /* current run of pure Hanzi */
    unsigned char strResult[B2F_RESULT_CAP];    /* Segment() output for that run */
    unsigned char *sourcep = strSource;         /* write position in strSource */
    int Bpointer = 0;       /* read position in lpszBuffer */
    int inChinese = 0;      /* non-zero while a Hanzi run is being collected */
    int spacepointer = 0;   /* index of the first held-back whitespace byte */
    int specialspace = 0;   /* non-zero while whitespace is being held back */

    /* Process the text buffer byte by byte. */
    while (lpszBuffer[Bpointer])
    {
        /*
         * A NUL trail byte cannot belong to a two-byte character, so the
         * terminator is never consumed as half of a Hanzi.
         */
        if (lpszBuffer[Bpointer + 1] &&
            isHanzi(lpszBuffer[Bpointer], lpszBuffer[Bpointer + 1]))
        {
            /*
             * Keep room for two more bytes plus the terminator.  A run that
             * does not fit is segmented in pieces, each written with its own
             * leading D, instead of overrunning strSource.
             */
            if (sourcep - strSource > B2F_SOURCE_CAP - 3)
            {
                *sourcep = 0;
                Segment(strSource, strResult);
                fputc(D, fpd);
                fputs((const char *)strResult, fpd);
                sourcep = strSource;
            }
            inChinese = 1;
            specialspace = 0;   /* whitespace between Hanzi is dropped */
            *sourcep++ = lpszBuffer[Bpointer++];    /* lead byte */
            *sourcep++ = lpszBuffer[Bpointer++];    /* trail byte */
            continue;
        }

        if (wbspace(lpszBuffer[Bpointer]))
        {
            if (inChinese)
            {
                /* Remember where the whitespace started; decide later. */
                if (!specialspace)
                {
                    spacepointer = Bpointer;
                    specialspace = 1;
                }
                Bpointer++;
            }
            else
            {
                fputc(lpszBuffer[Bpointer++], fpd);
            }
            continue;
        }

        /* A non-Hanzi, non-whitespace byte ends the run: segment it. */
        if (inChinese)
        {
            *sourcep = 0;
            Segment(strSource, strResult);
            fputc(D, fpd);
            fputs((const char *)strResult, fpd);
            sourcep = strSource;
            inChinese = 0;
        }

        /* Write the whitespace that was held back behind the run. */
        if (specialspace)
        {
            specialspace = 0;
            while (spacepointer < Bpointer)
                fputc(lpszBuffer[spacepointer++], fpd);
        }

        /* Special characters and ASCII. */
        if (isASCII(lpszBuffer[Bpointer]))
        {
            fputc(lpszBuffer[Bpointer++], fpd);
        }
        else
        {
            /* Special two-byte characters such as punctuation and symbols. */
            fputc(lpszBuffer[Bpointer++], fpd);
            /*
             * If the input ends in the middle of the character, stop at the
             * terminator instead of writing it and reading past the buffer.
             */
            if (lpszBuffer[Bpointer])
                fputc(lpszBuffer[Bpointer++], fpd);
            fputc(D, fpd);
        }
    }

    /* Handles input that ends with a run of Hanzi. */
    *sourcep = 0;
    Segment(strSource, strResult);
    fputc(D, fpd);
    /* strResult is an array, so the original NULL test on it was always true. */
    fputs((const char *)strResult, fpd);
}/* end of BuftoFileSeg */
/*
 * FiletoFileSeg: read an open file completely, segment it and write the
 * result to another open file.
 *
 * fps - stream that has been successfully opened for reading.
 * fpd - stream that has been successfully opened for writing.
 *
 * The input is loaded into a temporary heap buffer and passed to
 * BuftoFileSeg().  If that buffer cannot be allocated nothing is written.
 */
void FiletoFileSeg(FILE*fps,FILE*fpd)
{
    long len = filelength1(fileno(fps));
    size_t nread;
    char *lpszBuffer;

    /* A negative size means it could not be determined; treat as empty. */
    if (len < 0)
        len = 0;

    lpszBuffer = (char *)emalloc(len + 1);
    if (lpszBuffer == NULL)
        return;

    /* Terminate after the bytes actually read, which may be fewer than len. */
    nread = fread(lpszBuffer, 1, (size_t)len, fps);
    lpszBuffer[nread] = 0;

    BuftoFileSeg(lpszBuffer, fpd);
    free(lpszBuffer);
}/* end of FiletoFileSeg */
/*
 * FNametoFNameSeg: segment the file named sourcefile into the file named
 * desfile.
 *
 * sourcefile - name of the source file; opened in binary read mode.
 * desfile    - name of the target file; created or truncated in binary
 *              write mode.
 *
 * The dictionary must be opened before this function is called.
 *
 * Error handling (kept from the original): if the source cannot be opened a
 * message is printed and the process exits with status -1; if the target
 * cannot be created a message is printed and the function returns.
 */
void FNametoFNameSeg(char *sourcefile,char *desfile)
{
    FILE *fps;
    FILE *fpd;

    /*
     * Open the source first: opening the target with "wb" creates or
     * truncates it, which must not happen when there is nothing to read.
     */
    fps = fopen(sourcefile, "rb");
    if (!fps)
    {
        printf("Error on opening file : %s\n",sourcefile);
        exit(-1);
    }

    fpd = fopen(desfile, "wb");
    if (!fpd)
    {
        printf("Error on creating target file : %s\n",desfile);
        fclose(fps);
        return;
    }

    FiletoFileSeg(fps, fpd);

    fclose(fps);
    /* fclose() flushes buffered output, so this is where a write error shows. */
    if (fclose(fpd) != 0)
        printf("Error on writing target file : %s\n",desfile);
}
/*
 * WDel: remove a word from the in-memory dictionary index.
 *
 * cWord - NUL-terminated word.  Its first two bytes select the bucket
 *         WIndexcom[cWord[0]-0xb0][cWord[1]-0xa1].
 *
 * The bucket's list is walked in order, comparing with strcmp().  A matching
 * node is unlinked, its sCIYU string and the node itself are freed and the
 * bucket's WCount is decremented.  The walk stops as soon as an entry
 * compares greater than cWord, which relies on the list being kept in
 * ascending strcmp() order.
 *
 * Returns -1 when cWord is NULL or its first two bytes lie below the index
 * origin (0xb0 / 0xa1); returns 0 otherwise, whether or not the word was
 * found.
 *
 * NOTE(review): the dimensions of WIndexcom are not visible in this file, so
 * only the lower bounds of the indices are validated - add upper-bound
 * checks that match the table's declaration.
 */
int WDel(unsigned char *cWord)
{
    int x,y;
    int flag;
    struct aWORDdic* Wptr;
    struct aWORDdic* DPos=NULL;   /* node in front of Wptr, NULL at the head */

    if(!cWord) return -1;

    /*
     * Reject bytes that would produce a negative index.  The first test
     * short-circuits, so cWord[1] is never read for an empty string.
     */
    if(cWord[0]<0xb0 || cWord[1]<0xa1) return -1;

    x=cWord[0]-0xb0;
    y=cWord[1]-0xa1;

    for(Wptr=WIndexcom[x][y].WList; Wptr; DPos=Wptr, Wptr=Wptr->ptrNext)
    {
        flag=strcmp((char *)cWord,(char*)Wptr->sCIYU);
        if(flag>0)
            continue;   /* the word sorts after this entry: keep walking */

        if(flag==0)
        {
            /* Unlink the node from its predecessor, or from the bucket head. */
            if(DPos) DPos->ptrNext=Wptr->ptrNext;
            else WIndexcom[x][y].WList=Wptr->ptrNext;
            WIndexcom[x][y].WCount--;
            free(Wptr->sCIYU);
            free(Wptr);
#ifdef Debug
            printf("%s\n",cWord);
#endif
            return 0;
        }

        break;          /* flag<0: passed the place where the word would be */
    }

#ifdef Debug
    printf("%s not found.\n",cWord);
#endif
    return 0;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -