📄 中文分词.cpp

📁 该源码用c++编写
💻 CPP
📖 第 1 页 / 共 2 页
字号:
上一页 12
	*(strReturn+1)='\0';

	if (CheckIfEnglishChar(char1))//字母开头字母结尾 
	{
		intStrLength=0;
		
		while (CheckIfEnglish(*(strSource+intStartPlace+1)))
		{
			* (strReturn+intStrLength)=LowerCase(*(strSource+intStartPlace));
			intStartPlace++;
			intStrLength++;
		}
		
		if (CheckIfEnglishChar(*(strSource+intStartPlace)))
		{ 
			* (strReturn+intStrLength)=LowerCase(*(strSource+intStartPlace));
			intStrLength++;
		}

    		*(strReturn+intStrLength)='\0';

		if (intStrLength>1) 
		{	
			for (i=0;i<intStrLength;i++)
			if (CheckIfEnglishChar(*(strReturn+i)))
			{	
				if (intStrLength<20) 
				{ 
					pWord=FindWord(strReturn);
				  	if (pWord==NULL) AddNewWord(strReturn,1,intStrLength,4.0);//???
				  	else InCreaseWord(pWord);
				  	break;
				}
			}
		}

		*intCharLength=intStrLength;return 2;
	}
	else 
	{	
		if (((Direction==1)&&((char1==13)&&(char2==10)))||((Direction==2)&&((char1==10)&&(char2==13)))) 
		{*intCharLength=2;*strReturn=10;return 4;}//回车
		else *intCharLength=1;//不是回车
		return 2;
	}
}

// Walks one sentence (strSource) with GetCharFromString and updates the
// temporary word table (tmpWordArray) from adjacent character pairs:
//   * if a word already stored in the table starts at the current pair, its
//     count is bumped and the word extended by the following character is
//     added as a new candidate;
//   * otherwise the 4 bytes ending at the cursor are added as a new candidate.
// GetCharFromString's return code is treated here as: 0 = end of input
// (loop stops), 1 = ordinary character, 2 = symbol; anything else seen after
// a character aborts the scan.
// NOTE(review): every "pStart-4" below presumes that the two characters just
// read occupy 4 bytes in total (2 bytes each, e.g. GBK) — TODO confirm
// against GetCharFromString.
void Document::DevideWord(char *strSource)
{	
	char strWordTmp[1000],chrTmp[1000];	// candidate word / most recently read character
	WordType * pWordStart;			// cursor over one chain of tmpWordArray
	unsigned char char1;			// first byte of the current pair, used as table index
	int intReturn,intCharLength,pStart;	// last return code, bytes consumed, byte offset of the read cursor
	
	int bHasAdd;				// set to 1 once an extended word has been handled for this pair

	// Prime the loop with the first character of the sentence.
	pStart=0;
	intReturn=GetCharFromString(strSource,pStart,chrTmp,&intCharLength);
	pStart+=intCharLength;
	
	while(intReturn!=0)
	{
		if (intReturn==1)
		{	
			// Previous item was a character: read the one that follows it.
			intReturn=GetCharFromString(strSource,pStart,chrTmp,&intCharLength);
			pStart+=intCharLength;	
			
			if (intReturn==1) 
			{	
				// Two characters in a row. Copy the first of them (2 bytes) and use
				// its first byte to select the chain of known words to search.
				SubString(strSource+pStart-4,2,strWordTmp);
				char1=(unsigned char)(*strWordTmp);
				pWordStart=tmpWordArray[char1];
				
				bHasAdd=0;
				while (pWordStart!=NULL)
				{	
					// Compare the text starting at the pair with this known word,
					// using the known word's own length.
					SubString(strSource+pStart-4,pWordStart->length,strWordTmp);
					if (strcmp(strWordTmp,pWordStart->word)==0)
					{	
						// Known word found: count it, move the cursor to just past the
						// word (start of pair + word length), and read what follows.
						pWordStart->count++;
						pStart=pStart+pWordStart->length-4;
						intReturn=GetCharFromString(strSource,pStart,chrTmp,&intCharLength);
						pStart+=intCharLength;

						if (intReturn==1)
						{	
							// Extend the matched word by the following character and
							// register the longer string (length + 2 bytes) as a candidate.
							strcat(strWordTmp,chrTmp);
							if (strlen(strWordTmp)>=4) AddNewWord(strWordTmp,1,pWordStart->length+2,4.0);
							bHasAdd=1;
						}
						// Only the first matching word in the chain is used.
						break;
					}
					pWordStart=pWordStart->next; 
				}
				if (bHasAdd!=1)
				{	
					// No extended word was added: register the 4 bytes ending at the
					// cursor. The strlen check rejects the copy when a terminator was
					// hit before 4 bytes were available.
					// NOTE(review): after a match whose following item was not a
					// character, pStart has already been advanced, so these 4 bytes are
					// not the original pair — confirm this is intended.
					SubString(strSource+pStart-4,4,strWordTmp);
					if (strlen(strWordTmp)>=4) AddNewWord(strWordTmp,1,4,4.0);
				}
			}
			else if (intReturn==2) 
			{	
				// A symbol followed the character: skip it and continue with the next item.
				intReturn=GetCharFromString(strSource,pStart,chrTmp,&intCharLength);
				pStart+=intCharLength;	
			}
			// End of input (0) or any other code after a character: stop scanning.
			else return;
		}
		else
		{	
			// Current item is not an ordinary character: just advance.
			intReturn=GetCharFromString(strSource,pStart,chrTmp,&intCharLength);
			pStart+=intCharLength;	
		}
	}
}



// Splits strSource into sentences and feeds each one to DevideWord.
//
// strSource : NUL-terminated text to scan.
// strOutput : caller-provided scratch buffer in which the current sentence is
//             accumulated; the visible callers allocate strlen(strSource)+1
//             bytes for it. Its contents on return are whatever was last
//             accumulated.
//
// Characters (return code 1) are appended to the sentence buffer; a symbol (2)
// or line break (4) ends the sentence. Because the buffer only ever receives
// code-1 items, DevideWord sees sentences without symbols.
//
// Fix over the previous version: a trailing sentence that is not followed by
// a symbol or line break used to be dropped silently; it is now segmented
// once the end of the input is reached.
void Document::FirstScanFile(char *strSource,char *strOutput)
{
	char chrTmp[1000];		// item most recently produced by GetCharFromString
	int intReturn,intCharLength,pStart;

	*strOutput='\0';
	pStart=0;

	intReturn=GetCharFromString(strSource,pStart,chrTmp,&intCharLength);
	pStart+=intCharLength;

	while(intReturn!=0)
	{
		// Return codes: 0 end of input; 1 character; 2 symbol; -1 error;
		// 4 is additionally handled here as a sentence terminator.
		if ((intReturn==2)||(intReturn==4))
		{
			// A sentence has just ended: segment it immediately, then start a new one.
			DevideWord(strOutput);
			*strOutput='\0';

			// Skip over a run of consecutive symbols.
			while (intReturn==2)
			{
				intReturn=GetCharFromString(strSource,pStart,chrTmp,&intCharLength);
				pStart+=intCharLength;
			}
			// The item that ended the run may already be the first character
			// of the next sentence.
			if (intReturn==1) strcat(strOutput,chrTmp);
		}
		else if (intReturn==1) strcat(strOutput,chrTmp);

		if (intReturn!=0)
		{
			intReturn=GetCharFromString(strSource,pStart,chrTmp,&intCharLength);
			pStart+=intCharLength;
		}
	}

	// Flush the final sentence when the input did not end with a terminator.
	if (*strOutput!='\0') DevideWord(strOutput);
}

void Document::GetJiaoJiTable()
//取交集,在两个数组里,做完后在数组0里,词频为0
{	
	int i;
	WordType *pWord;
	
	//把一个数组里的东西词频设置为0,加入词表中
	char strWordTmp[50];
	for (i=0;i<intWordCount[0];i++)
		AddNewWord(lastWordArray[0][i].word,0,strlen(lastWordArray[0][i].word),lastWordArray[0][i].weight);
	
	//把另一个数组里的东西进行查询,命中设置词频为1
	for (i=0;i<intWordCount[1];i++)
		{	
			ReverseString(lastWordArray[1][i].word,strWordTmp);
			pWord=FindWord(strWordTmp);
			if (pWord!=NULL) pWord->count=1;
		}

	ArrangeWordInTable(1,1,1);//按照1过滤
	freeTmpWordArray();

	for (i=0;i<intWordCount[0];i++)
		AddNewWord(lastWordArray[0][i].word,0,strlen(lastWordArray[0][i].word),lastWordArray[0][i].weight);

}

void Document::LexicalAquisition() //char * outputFileName
// Word extraction. Runs two scans over the document text (strSource): one
// over the text as stored with Direction=1, one over the byte-reversed text
// with Direction=2, then intersects the two results via GetJiaoJiTable.
// (Original note: with a non-null outputFileName the result was written out
// directly, otherwise kept in the word table; that parameter is disabled.)
//
// Robustness: if strSource is null or either work buffer cannot be allocated,
// the function returns without scanning and without modifying any state,
// instead of dereferencing a null pointer.
{
	char * buffer,* buffer_temp;

	if (strSource==NULL) return;

	buffer=(char *)malloc(strlen(strSource)+1);
	buffer_temp=(char *)malloc(strlen(strSource)+1);
	if ((buffer==NULL)||(buffer_temp==NULL))
	{
		// free(NULL) is a no-op, so whichever allocation succeeded is released.
		free(buffer);
		free(buffer_temp);
		return;
	}

	//IfExtendWordTable=1;
	Direction=1;
	strcpy(buffer,strSource);

	// First extraction pass; buffer_temp is only scratch space here.
	FirstScanFile(buffer,buffer_temp);
	ArrangeWordInTable(3,1,1);
	freeTmpWordArray();	// clear the current tmpWordArray before the second pass

	// Second extraction pass over the reversed text; buffer becomes the scratch space.
	Direction=2;
	ReverseString(buffer,buffer_temp);
	FirstScanFile(buffer_temp,buffer);
	ArrangeWordInTable(3,1,2);
	freeTmpWordArray();

	// Intersect the results of both passes.
	GetJiaoJiTable();

	free(buffer);
	free(buffer_temp);
	//if (outputFileName!=NULL) WriteWordToFile(outputFileName,0,1);
	initArrangeTable();
}

void Document::WordSegmentation(char * outputFileName)
// Word segmentation: rescans the document text (strSource) with Direction=1
// and IfExtendWordTable=2 to recompute frequencies, arranges the word table
// and, when outputFileName is non-null, writes the result via WriteWordToFile.
//
// Robustness: Direction and IfExtendWordTable are always set, but if
// strSource is null or the scratch buffer cannot be allocated the function
// returns without scanning or writing, instead of dereferencing a null
// pointer.
{
	char * buffer;

	Direction=1;
	IfExtendWordTable=2;

	if (strSource==NULL) return;

	// Scratch buffer for FirstScanFile's sentence accumulation.
	buffer=(char *)malloc(strlen(strSource)+1);
	if (buffer==NULL) return;

	FirstScanFile(strSource,buffer);		// recompute frequencies
	ArrangeWordInTable(3,1,1);
	if (outputFileName!=NULL) WriteWordToFile(outputFileName,1,1);

	free(buffer);
}


// Copies at most intLength bytes from strSource into strDest and always
// NUL-terminates strDest.
//
// strSource : NUL-terminated source text.
// intLength : maximum number of bytes to copy; zero or negative yields an
//             empty string.
// strDest   : destination buffer; must have room for intLength+1 bytes.
//
// Copying stops early at the source terminator. The previous version kept
// reading past it, i.e. beyond the end of the source buffer. The resulting
// C string is the same as before; only the out-of-bounds read is gone.
void SubString(char *strSource,int intLength,char *strDest)
{
	int i=0;
	while ((i<intLength)&&(*(strSource+i)!='\0'))
	{
		*(strDest+i)=*(strSource+i);
		i++;
	}
	*(strDest+i)='\0';
}

// Returns 1 when char1 is an ASCII letter (a-z, A-Z) or an ASCII digit (0-9),
// otherwise 0. Despite the name, digits count as well.
int CheckIfEnglishChar(char char1)
{
	const bool isLowerLetter = (char1 >= 'a') && (char1 <= 'z');
	const bool isUpperLetter = (char1 >= 'A') && (char1 <= 'Z');
	const bool isDigit = (char1 >= '0') && (char1 <= '9');

	return (isLowerLetter || isUpperLetter || isDigit) ? 1 : 0;
}

// Returns 1 when char1 is accepted by CheckIfEnglishChar or is one of the
// punctuation characters '.', '\', '-', '!' or '/'; otherwise 0.
int CheckIfEnglish(char char1)
{
	if (CheckIfEnglishChar(char1))
	{
		return 1;
	}

	switch (char1)
	{
		case '.':
		case '\\':
		case '-':
		case '!':
		case '/':
			return 1;
		default:
			return 0;
	}
}

// Writes the bytes of the NUL-terminated string strSource into strReturn in
// reverse order and NUL-terminates the result. The reversal is byte-wise, so
// strReturn needs room for the source length plus one byte.
void ReverseString(char *strSource,char * strReturn)
{
	// Locate the terminator of the source.
	char *tail=strSource;
	while (*tail!=0)
	{
		++tail;
	}

	// Walk back from the terminator, emitting bytes front to back.
	char *out=strReturn;
	while (tail!=strSource)
	{
		--tail;
		*out=*tail;
		++out;
	}
	*out='\0';
}

// Maps the ASCII upper-case letters A-Z to a-z; every other value is returned
// unchanged.
char LowerCase(char char1)
{
	const bool isUpper = !((char1 < 'A') || (char1 > 'Z'));
	return isUpper ? static_cast<char>(char1 + ('a' - 'A')) : char1;
}


// Resets the document's working state: re-initialises the arrange table,
// sets IfExtendWordTable and Direction back to 1, and replaces the temporary
// word table with a freshly initialised one.
void Document :: InitClassDocument()
{
	// Disabled code: releasing strSource is intentionally not done here.
	/*if (strSource!=NULL) 
	{   
	      free(strSource);
	      strSource=NULL;
	}*/
	initArrangeTable();
	IfExtendWordTable=1;
	Direction=1;
		// Disabled condition left over from a dictionary-based mode.
		//if (bIfUseDict==0)
	
	// Drop the old temporary word table before building the new one.
	freeTmpWordArray();
	initTmpWordArray();
}
//else ResetWordTable();


/* added by Tan Bin */
// Fills vectorOccur with the entries of lastWordArray[Direction-1] that still
// contain text after filterString has stripped everything except double-byte
// characters.
//
// vectorOccur : cleared first, then receives one Occurrence per surviving
//               word (sKeyword = filtered text, nFrequency = the entry's
//               count). Entries appear in reverse array order, exactly as the
//               previous front-insertion produced.
//
// The array is walked from the back and elements are appended, which gives
// the same final order as inserting each element at begin() but in linear
// instead of quadratic time.
// NOTE(review): strWordTmp holds 50 bytes and occur.sKeyword has an unseen
// size; both strcpy calls assume the stored words fit — TODO confirm.
void Document::outputList(vector<struct Occurrence>& vectorOccur){
	int i;
	int count;
	char strWordTmp[50];
	count=intWordCount[Direction-1];
	struct Occurrence occur;

	vectorOccur.clear();
	for (i=count-1;i>=0;i--)
	{
		// Filter a private copy so the stored word itself is left untouched.
		strcpy(strWordTmp,lastWordArray[Direction-1][i].word);
		if (filterString(strWordTmp)){
			strcpy(occur.sKeyword,strWordTmp);
			occur.nFrequency=lastWordArray[Direction-1][i].count;
			vectorOccur.push_back(occur);
		}
	}
}

// Compacts str in place so that only double-byte characters remain, i.e.
// pairs of consecutive bytes that are both >= 128. Single-byte (ASCII) bytes
// and a lead byte that is not followed by another byte >= 128 are dropped.
//
// str : NUL-terminated, writable string; shortened in place.
// Returns true when at least one double-byte character was kept, false when
// the result is empty.
//
// The terminator tests compare against '\0' rather than NULL: NULL is a null
// pointer constant and may be defined as nullptr, which cannot be compared
// with a char.
bool filterString(char* str){
	unsigned char ch1;
	unsigned char ch2;
	int i=0;	// read position
	int j=0;	// write position (never ahead of i)
	while (str[i]!='\0'){
		ch1=(unsigned char)str[i];
		if (ch1<128){
			// Single-byte character: skip it.
			i++;
			continue;
		}
		// Lead byte at the very end of the string: nothing left to pair it with.
		if (str[i+1]=='\0') break;
		ch2=(unsigned char)str[i+1];
		if (ch2<128){
			// Lead byte without a valid second byte: drop only the lead byte;
			// the following byte is examined on the next iteration.
			i++;
			continue;
		}
		// Keep the pair; copying is only needed once something has been dropped.
		if (j<i){
			str[j]=ch1;
			str[j+1]=ch2;
		}
		j+=2; i+=2;
	}
	str[j]='\0';
	return (j!=0);
}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -