📄 中文分词.cpp
字号:
*(strReturn+1)='\0';
if (CheckIfEnglishChar(char1))//字母开头字母结尾
{
intStrLength=0;
while (CheckIfEnglish(*(strSource+intStartPlace+1)))
{
* (strReturn+intStrLength)=LowerCase(*(strSource+intStartPlace));
intStartPlace++;
intStrLength++;
}
if (CheckIfEnglishChar(*(strSource+intStartPlace)))
{
* (strReturn+intStrLength)=LowerCase(*(strSource+intStartPlace));
intStrLength++;
}
*(strReturn+intStrLength)='\0';
if (intStrLength>1)
{
for (i=0;i<intStrLength;i++)
if (CheckIfEnglishChar(*(strReturn+i)))
{
if (intStrLength<20)
{
pWord=FindWord(strReturn);
if (pWord==NULL) AddNewWord(strReturn,1,intStrLength,4.0);//???
else InCreaseWord(pWord);
break;
}
}
}
*intCharLength=intStrLength;return 2;
}
else
{
if (((Direction==1)&&((char1==13)&&(char2==10)))||((Direction==2)&&((char1==10)&&(char2==13))))
{*intCharLength=2;*strReturn=10;return 4;}//回车
else *intCharLength=1;//不是回车
return 2;
}
}
// Scans strSource unit by unit with GetCharFromString and updates the word
// table: known words found at the current position get their count bumped,
// and new candidate words are registered through AddNewWord.
//
// strSource: NUL-terminated text to scan. FirstScanFile passes one
//            accumulated sentence at a time.
//
// GetCharFromString return codes, as noted in FirstScanFile and in the tail
// of GetCharFromString: 0 = end of input, 1 = character, 2 = symbol,
// 4 = line break, -1 = error.
// NOTE(review): the fixed "-4" offsets below presumably assume that two
// consecutive type-1 units occupy 4 bytes (two double-byte characters) --
// confirm against GetCharFromString.
void Document::DevideWord(char *strSource)
{
char strWordTmp[1000],chrTmp[1000];
WordType * pWordStart;
unsigned char char1;
int intReturn,intCharLength,pStart;
int bHasAdd;
pStart=0;
// Prime the loop with the first unit; pStart always points just past the
// unit that was read last.
intReturn=GetCharFromString(strSource,pStart,chrTmp,&intCharLength);
pStart+=intCharLength;
while(intReturn!=0)
{
if (intReturn==1)
{
// A character was read; read the following unit to see whether two
// characters stand next to each other.
intReturn=GetCharFromString(strSource,pStart,chrTmp,&intCharLength);
pStart+=intCharLength;
if (intReturn==1)
{
// Take the 2 bytes located 4 bytes before pStart and use their first
// byte as the index into tmpWordArray to get a chain of known words.
SubString(strSource+pStart-4,2,strWordTmp);
char1=(unsigned char)(*strWordTmp);
pWordStart=tmpWordArray[char1];
bHasAdd=0;
// Walk the chain and stop at the first entry that matches the text
// starting at the same position.
while (pWordStart!=NULL)
{
SubString(strSource+pStart-4,pWordStart->length,strWordTmp);
if (strcmp(strWordTmp,pWordStart->word)==0)
{
// Known word found: count the occurrence and move the cursor to the
// byte right after the matched word.
pWordStart->count++;
pStart=pStart+pWordStart->length-4;
intReturn=GetCharFromString(strSource,pStart,chrTmp,&intCharLength);
pStart+=intCharLength;
if (intReturn==1)
{
// The matched word is followed by another character: register the
// word extended by that character as a new candidate.
strcat(strWordTmp,chrTmp);
if (strlen(strWordTmp)>=4) AddNewWord(strWordTmp,1,pWordStart->length+2,4.0);
bHasAdd=1;
}
break;
}
pWordStart=pWordStart->next;
}
if (bHasAdd!=1)
{
// No extended word was registered: register the 4 bytes that end at the
// current cursor as a candidate.
// NOTE(review): when a match was found above but not extended, pStart
// has already moved, so this window is no longer the original pair --
// confirm this is intended.
SubString(strSource+pStart-4,4,strWordTmp);
if (strlen(strWordTmp)>=4) AddNewWord(strWordTmp,1,4,4.0);
}
}
else if (intReturn==2)
{
// A character followed by a symbol: skip the symbol and read on.
intReturn=GetCharFromString(strSource,pStart,chrTmp,&intCharLength);
pStart+=intCharLength;
}
// Any other code here (0, 4, -1) ends the scan.
else return;
}
else
{
// The current unit is not a character: just read the next unit.
intReturn=GetCharFromString(strSource,pStart,chrTmp,&intCharLength);
pStart+=intCharLength;
}
}
}
// Splits strSource into sentences and hands each one to DevideWord.
//
// strSource: NUL-terminated text to scan.
// strOutput: caller-provided scratch buffer in which the current sentence is
//            accumulated; it must be large enough for the longest sentence.
//
// GetCharFromString return codes: 0 = end of input, 1 = character,
// 2 = symbol, 4 = line break, -1 = error.
//
// Characters (code 1) are appended to strOutput. A symbol or line break
// closes the sentence, which is segmented immediately. A trailing sentence
// that is not closed by a symbol is segmented after the loop; previously it
// was dropped without being processed.
void Document::FirstScanFile(char *strSource,char *strOutput)
{
    char chrTmp[1000];
    int intReturn,intCharLength,pStart;

    *strOutput='\0';
    pStart=0;
    intReturn=GetCharFromString(strSource,pStart,chrTmp,&intCharLength);
    pStart+=intCharLength;
    while (intReturn!=0)
    {
        if ((intReturn==2)||(intReturn==4))
        {
            // Sentence boundary: segment what has been collected so far.
            DevideWord(strOutput);
            *strOutput='\0';
            // Swallow a run of consecutive symbols.
            while (intReturn==2)
            {
                intReturn=GetCharFromString(strSource,pStart,chrTmp,&intCharLength);
                pStart+=intCharLength;
            }
            // The unit that ended the run may already open the next sentence.
            if (intReturn==1) strcat(strOutput,chrTmp);
        }
        else if (intReturn==1) strcat(strOutput,chrTmp);

        if (intReturn!=0)
        {
            intReturn=GetCharFromString(strSource,pStart,chrTmp,&intCharLength);
            pStart+=intCharLength;
        }
    }
    // Flush the last sentence when the text does not end with a symbol or
    // line break. strOutput is left untouched afterwards, as before.
    if (*strOutput!='\0') DevideWord(strOutput);
}
void Document::GetJiaoJiTable()
//取交集,在两个数组里,做完后在数组0里,词频为0
{
int i;
WordType *pWord;
//把一个数组里的东西词频设置为0,加入词表中
char strWordTmp[50];
for (i=0;i<intWordCount[0];i++)
AddNewWord(lastWordArray[0][i].word,0,strlen(lastWordArray[0][i].word),lastWordArray[0][i].weight);
//把另一个数组里的东西进行查询,命中设置词频为1
for (i=0;i<intWordCount[1];i++)
{
ReverseString(lastWordArray[1][i].word,strWordTmp);
pWord=FindWord(strWordTmp);
if (pWord!=NULL) pWord->count=1;
}
ArrangeWordInTable(1,1,1);//按照1过滤
freeTmpWordArray();
for (i=0;i<intWordCount[0];i++)
AddNewWord(lastWordArray[0][i].word,0,strlen(lastWordArray[0][i].word),lastWordArray[0][i].weight);
}
// Word extraction. The source text is scanned twice -- once forward
// (Direction 1) and once byte-reversed (Direction 2) -- and the intersection
// of both results is kept in the word table.
//
// Both working buffers are sized strlen(strSource)+1. If either allocation
// fails the function releases what it got and returns without touching any
// state, instead of writing through a NULL pointer.
void Document::LexicalAquisition() //char * outputFileName
{
    const size_t bufferSize=strlen(strSource)+1;
    char *buffer=(char *)malloc(bufferSize);
    char *buffer_temp=(char *)malloc(bufferSize);
    if ((buffer==NULL)||(buffer_temp==NULL))
    {
        // free(NULL) is a no-op, so both calls are safe.
        free(buffer);
        free(buffer_temp);
        return;
    }

    //IfExtendWordTable=1;
    Direction=1;
    strcpy(buffer,strSource);

    // First pass: forward scan; buffer_temp is the sentence scratch buffer.
    FirstScanFile(buffer,buffer_temp);
    ArrangeWordInTable(3,1,1);
    freeTmpWordArray(); // empty the current tmpWordArray before the next pass

    // Second pass: scan the reversed text; buffer now serves as scratch.
    Direction=2;
    ReverseString(buffer,buffer_temp);
    FirstScanFile(buffer_temp,buffer);
    ArrangeWordInTable(3,1,2);
    freeTmpWordArray();

    // Keep only the words found by both passes.
    GetJiaoJiTable();

    free(buffer);
    free(buffer_temp);
    //if (outputFileName!=NULL) WriteWordToFile(outputFileName,0,1);
    initArrangeTable();
}
// Word segmentation: rescans the source text in forward direction to
// recompute word frequencies and, when outputFileName is not NULL, writes the
// result with WriteWordToFile.
//
// outputFileName: target file name, or NULL to skip writing.
//
// If the scratch buffer cannot be allocated the function returns without
// changing any state, instead of letting FirstScanFile write through NULL.
void Document::WordSegmentation(char * outputFileName)
{
    char *buffer=(char *)malloc(strlen(strSource)+1);
    if (buffer==NULL) return;

    Direction=1;
    IfExtendWordTable=2;
    FirstScanFile(strSource,buffer); // recompute the frequencies
    ArrangeWordInTable(3,1,1);
    if (outputFileName!=NULL) WriteWordToFile(outputFileName,1,1);
    free(buffer);
}
// Copies the first intLength bytes of strSource into strDest and
// NUL-terminates the result.
//
// strSource: NUL-terminated source string.
// intLength: maximum number of bytes to copy; values <= 0 yield "".
// strDest:   destination buffer with room for intLength+1 bytes.
//
// Copying stops at the source terminator, so a source shorter than intLength
// is never read past its end. The resulting C string is the same as before;
// only the out-of-bounds read (and the stray bytes written after the
// terminator) are gone.
void SubString(char *strSource,int intLength,char *strDest)
{
    int copied=0;
    while ((copied<intLength)&&(strSource[copied]!='\0'))
    {
        strDest[copied]=strSource[copied];
        ++copied;
    }
    strDest[copied]='\0';
}
// Returns 1 when char1 is an ASCII letter (a-z, A-Z) or digit (0-9),
// otherwise 0.
int CheckIfEnglishChar(char char1)
{
    const bool isLower=(char1>='a')&&(char1<='z');
    const bool isUpper=(char1>='A')&&(char1<='Z');
    const bool isDigit=(char1>='0')&&(char1<='9');
    return (isLower||isUpper||isDigit) ? 1 : 0;
}
// Returns 1 when char1 may appear inside an English token: a letter or digit
// (see CheckIfEnglishChar) or one of the characters . \ - ! /
// Returns 0 for everything else.
int CheckIfEnglish(char char1)
{
    switch (char1)
    {
        case '.':
        case '\\':
        case '-':
        case '!':
        case '/':
            return 1;
        default:
            break;
    }
    return CheckIfEnglishChar(char1) ? 1 : 0;
}
// Writes the bytes of strSource into strReturn in reverse order and
// NUL-terminates the result. The reversal is byte-wise.
//
// strSource: NUL-terminated input string.
// strReturn: destination buffer with room for strlen(strSource)+1 bytes.
void ReverseString(char *strSource,char * strReturn)
{
    // Locate the terminator of the input.
    char *tail=strSource;
    while (*tail!='\0') ++tail;

    // Walk back towards the start while filling the output front to back.
    char *out=strReturn;
    while (tail!=strSource)
    {
        --tail;
        *out=*tail;
        ++out;
    }
    *out='\0';
}
// Maps an ASCII upper-case letter to its lower-case counterpart; every other
// value is returned unchanged.
char LowerCase(char char1)
{
    const bool isUpper=(char1>='A')&&(char1<='Z');
    return isUpper ? (char)(char1+('a'-'A')) : char1;
}
// Resets the object's working state: re-initialises the arrange table, sets
// IfExtendWordTable and Direction back to 1, and releases and then
// re-creates the temporary word array.
void Document :: InitClassDocument()
{
// Disabled: releasing strSource is not done here.
/*if (strSource!=NULL)
{
free(strSource);
strSource=NULL;
}*/
initArrangeTable();
IfExtendWordTable=1;
Direction=1;
//if (bIfUseDict==0)
// Free the old temporary word array before creating a fresh one.
freeTmpWordArray();
initTmpWordArray();
}
//else ResetWordTable();
/* added by Tan Bin */
// Fills vectorOccur with the words of the result array selected by the
// current Direction (lastWordArray[Direction-1]).
//
// vectorOccur: output vector; it is cleared first.
//
// Each word is copied and passed through filterString; words for which
// filterString returns false are dropped. For the rest, the filtered text
// and the word's count are stored. The output order is the reverse of the
// array order, exactly as before. The array is walked backwards with
// push_back instead of inserting every element at the front, which made the
// loop quadratic.
void Document::outputList(vector<struct Occurrence>& vectorOccur){
    char strWordTmp[50];
    struct Occurrence occur;
    const int count=intWordCount[Direction-1];

    vectorOccur.clear();
    if (count>0) vectorOccur.reserve(count);
    for (int i=count-1;i>=0;i--)
    {
        strcpy(strWordTmp,lastWordArray[Direction-1][i].word);
        if (filterString(strWordTmp)){
            strcpy(occur.sKeyword,strWordTmp);
            occur.nFrequency=lastWordArray[Direction-1][i].count;
            vectorOccur.push_back(occur);
        }
    }
}
// Compacts str in place so that it contains only byte pairs in which both
// bytes are >= 128; all other bytes are removed.
//
// str: NUL-terminated, writable string; it is overwritten with the result.
// Returns true when at least one pair was kept, false when the result is "".
//
// A high byte followed by a low byte is skipped on its own, and a lone high
// byte at the very end of the string is dropped. The end-of-string checks
// compare against '\0' rather than the pointer macro NULL.
bool filterString(char* str){
    int readPos=0;
    int writePos=0;
    while (str[readPos]!='\0'){
        const unsigned char lead=(unsigned char)str[readPos];
        if (lead<128){
            readPos++;
            continue;
        }
        // Lone high byte at the end of the string: nothing to pair it with.
        if (str[readPos+1]=='\0') break;
        const unsigned char trail=(unsigned char)str[readPos+1];
        if (trail<128){
            readPos++;
            continue;
        }
        // Only move the pair when something before it has been removed.
        if (writePos<readPos){
            str[writePos]=(char)lead;
            str[writePos+1]=(char)trail;
        }
        writePos+=2;
        readPos+=2;
    }
    str[writePos]='\0';
    return (writePos!=0);
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -