📄 mpwordseg.cpp

📁 中文信息处理方面的一个源码。此为一个词性标注软件
💻 CPP
字号:
#include "stdafx.h"
#include "math.h" // 包含log函数的定义

#include "MPWordSeg.h"
#include "MyDictionary.h"
#include "MyFileApp.h"
#include "NameRecognizer.h"
/*
# define MaxWordLength 8  // 最大词长为8个字节（即4个汉字）
# define Separator "/  "    // 词界标记
# define CorpusSize 200000
*/

extern int MaxWordLength;
extern CString Separator;
extern long CorpusSize;

# if !pDict
extern CMyDictionary pDict;
# endif

// 以下是最大概率法分词程序

// One candidate word considered by the maximum-probability segmenter.
struct Candidate {
	short offset;    // byte offset of the candidate inside the input string
	short length;    // length of the candidate in bytes
	short goodPrev;  // index of the best predecessor candidate (getPrev stores -1 for "none")
	float fee;       // cost of this candidate on its own
	float sumFee;    // accumulated cost of the cheapest path ending at this candidate
};

// Shared candidate table; the design assumes at most 100 candidates per input string.
Candidate Candidates[100];

short getTmpWords(CString &s)
{
	// Collects the candidate words of `s` into the global Candidates table.
	//
	// For every even byte offset, substrings of 2, 4, ... up to MaxWordLength
	// bytes are looked up with pDict.GetFreq(). A 2-byte unit is always kept
	// (a lookup result of -1 is treated as frequency 0); longer substrings are
	// kept only when the lookup result is not -1.
	//
	// Returns the number of candidates stored. The table has a fixed size, so
	// filling stops once it is full; in that case the tail of a very long
	// input has no candidates and the caller will find no complete path.

	// Capacity is derived from the table declaration so the two cannot drift apart.
	const short capacity=(short)(sizeof(Candidates)/sizeof(Candidates[0]));

	short i=0,j,len,n=s.GetLength();
	long freq;
	CString w;
	for(j=0;j<n;j+=2) {
		const short restlen=(short)(n-j); // bytes left from this offset
		for(len=2;len<=MaxWordLength && len<=restlen;len+=2) {
			if(i>=capacity)
				return i; // table full: never write past the end of Candidates

			w=s.Mid(j,len);

			// Original author's note: if the wfreq field is stored as a
			// double-precision number in the database, GetFreq returns 0.
			freq=pDict.GetFreq(w);
			if(freq==-1) {
				if(len>2)
					continue; // multi-character string with no lookup hit: not a candidate
				freq=0;       // single unit is always a candidate
			}

			Candidates[i].offset=j;
			Candidates[i].length=len;
			// Cost is the negative log of the add-one frequency over CorpusSize.
			Candidates[i].fee = (float)(-log((double)(freq+1)/CorpusSize));
			Candidates[i].sumFee=0.0F; // initial value; filled in by getPrev
			i++;
		}
	}
	return i;
}

void getPrev(short i)
{ // 计算每一个候选词的最佳前趋词，以及当前词的最小累计费用
	if(Candidates[i].offset==0) {
		Candidates[i].goodPrev=-1;
		Candidates[i].sumFee=Candidates[i].fee;
		return;
	}
	
	short j,minID=-1;
	
//	for(j=i-1;j>=0;j--) {
//		if(Candidates[j].offset+Candidates[j].length==Candidates[i].offset)
//			break;
//	}
//	for(;Candidates[j].offset+Candidates[j].length==Candidates[i].offset;j--)

	for(j=i-1;j>=0;j--) 
	{ //向左查找所有候选词，得到前驱词集合，从中挑选最佳前趋词
		if(Candidates[j].offset+Candidates[j].length==Candidates[i].offset) {
			if(minID==-1 || Candidates[j].sumFee<=Candidates[minID].sumFee)
				minID=j;
		}
		if(Candidates[i].offset-Candidates[j].offset>=8) // 向左查找候选词最远不超过4个汉字
			break;
	}
	
	Candidates[i].goodPrev=minID;
	Candidates[i].sumFee=Candidates[i].fee+Candidates[minID].sumFee;
	return;
}


CString SegmentHzStrMP(CString s1)
{
	// Maximum-probability segmentation of one string.
	//
	// Builds the candidate table with getTmpWords, computes the cheapest path
	// with getPrev, then walks that path from right to left. Words longer than
	// one 2-byte unit are emitted followed by Separator; runs of consecutive
	// single units are collected and handed to CheckStr (name recognition).
	// Returns the segmented text.

	int len=s1.GetLength();
	short n=getTmpWords(s1); // collect candidate words
	short minID=-1;
	short i;

	//////////////////////////////////
	// Cheapest-path computation

	for(i=0;i<n;i++) {
		getPrev(i); // best left neighbour and accumulated cost of candidate i
		if(Candidates[i].offset+Candidates[i].length==len) { // candidate ends at the end of s1
			if(minID==-1||Candidates[i].sumFee<Candidates[minID].sumFee)
				minID=i; // cheapest path end so far = rightmost word of the result
		}
	}

	// No candidate reaches the end of the string (for example the length is
	// odd, or the candidate table did not cover the whole input). Emit the
	// text as a single unit rather than silently dropping it.
	if(minID==-1) {
		if(len==0)
			return CString("");
		return s1+Separator;
	}

	//////////////////////////////////
	// Output of the segmentation result
	CString s2="";
/*
	// Disabled variant: plain output of the cheapest path, without name recognition.
	for(i=minID;i>=0;i=Candidates[i].goodPrev) // walk the path from right to left
		s2=s1.Mid(Candidates[i].offset,Candidates[i].length)+Separator+s2;
*/
	///////////////////////////////////////////////////
	// Chinese personal-name recognition

	CString tmp=""; // run of consecutive single units, kept in text order
	for(i=minID;i>=0;i=Candidates[i].goodPrev) {

		CString w=s1.Mid(Candidates[i].offset,Candidates[i].length);

		if(w.GetLength()==2) // single unit: prepend it to the pending run
			tmp=w+tmp;
		else {
			if(tmp.GetLength()>0) {
				if(tmp.GetLength()==2)
					s2=tmp+Separator+s2;
				else
					// No Separator is appended here; CheckStr (defined elsewhere)
					// presumably returns the run already delimited — TODO confirm.
					s2=CheckStr(tmp)+s2;
				tmp="";
			}
			s2=w+Separator+s2;
		}
	}

	// Flush a run that reaches the start of the string.
	if(tmp.GetLength()>0) {
		if(tmp.GetLength()==2)
			// NOTE(review): this branch adds an extra blank after Separator,
			// unlike the same case inside the loop — confirm it is intended.
			s2=tmp+Separator+" "+s2;
		else
			s2=CheckStr(tmp)+s2;
		tmp="";
	}
	// End of name recognition
	/////////////////////////////////////////////////////

	return s2;
}

CString SegmentSentenceMP (CString s1)  
{// 最大概率法分词程序：对句子进行分词处理的函数
	CString s2="";
	int i,dd;
	while(!s1.IsEmpty()) {
		unsigned char ch=(unsigned char) s1[0];
		if(ch<128) { // 处理西文字符
			i=1;
			dd=s1.GetLength();
			while(i<dd && ((unsigned char)s1[i]<128) && (s1[i]!=10) && (s1[i]!=13)) // s1[i]不能是换行符或回车符
				i++;
			if ((ch!=32) && (ch!=10) && (ch!=13)) // 如果不是西文空格或换行或回车符
				s2 += s1.Left(i) + Separator;
			else {
				if (ch==10 || ch==13)   // 如果是换行或回车符，将它拷贝给s2输出
					s2+=s1.Left(i);
			}
			s1=s1.Mid(i);
			continue;
		}
		else { 
			if (ch<176) { // 中文标点等非汉字字符
				i=0;
				dd=s1.GetLength();
				while(i<dd && ((unsigned char)s1[i]<176) && ((unsigned char)s1[i]>=161)
							&& (!((unsigned char)s1[i]==161 && ((unsigned char)s1[i+1]>=162 && (unsigned char)s1[i+1]<=168)))
							&& (!((unsigned char)s1[i]==161 && ((unsigned char)s1[i+1]>=171 && (unsigned char)s1[i+1]<=191)))
							&& (!((unsigned char)s1[i]==163 && ((unsigned char)s1[i+1]==172 || (unsigned char)s1[i+1]==161) 
							|| (unsigned char)s1[i+1]==168 || (unsigned char)s1[i+1]==169 || (unsigned char)s1[i+1]==186
							|| (unsigned char)s1[i+1]==187 || (unsigned char)s1[i+1]==191))) // 
					i=i+2; // 假定没有半个汉字
				if (i==0)
					i=i+2;
				if (!(ch==161 && (unsigned char)s1[1]==161)) // 不处理中文空格
					s2+=s1.Left(i) + Separator; // 其他的非汉字双字节字符可能连续输出
				s1=s1.Mid(i);
				continue;
			}
		}
		
		// 以下处理汉字串

		i=2;
		dd=s1.GetLength();
		while(i<dd && (unsigned char)s1[i]>=176) 
//		while(i<dd && (unsigned char)s1[i]>=128 && (unsigned char)s1[i]!=161)
			i+=2;
		
		s2+=SegmentHzStrMP(s1.Left(i));
		s1=s1.Mid(i);
	}
	/////////////////////////////////////////////////////
	// 以下程序段用于将表示时间的单位合并成一个分词单位

	int TmpPos;
	const char * p;
	CString s2_part_1;
	
	if (s2.Find("  年/")>=0) {
		TmpPos=s2.Find("  年/");
		s2_part_1=s2.Mid(0,TmpPos);
		p=(LPCTSTR) s2_part_1;
		p=p+TmpPos-2;
		if (p[0]=='1'||p[0]=='2'||p[0]=='3'||p[0]=='4'||p[0]=='5'||p[0]=='6'||p[0]=='7'||p[0]=='8'||p[0]=='9'||p[0]=='0') {
			s2_part_1=s2_part_1.Mid(0,TmpPos-1);
			s2=s2_part_1+s2.Mid(TmpPos+2);
		}
	}
	
	if (s2.Find("  月/")>=0) {
		TmpPos=s2.Find("  月/");
		s2_part_1=s2.Mid(0,TmpPos);
		p=(LPCTSTR) s2_part_1;
		p=p+TmpPos-2;
		if (p[0]=='1'||p[0]=='2'||p[0]=='3'||p[0]=='4'||p[0]=='5'||p[0]=='6'||p[0]=='7'||p[0]=='8'||p[0]=='9'||p[0]=='0') {
			s2_part_1=s2_part_1.Mid(0,TmpPos-1);
			s2=s2_part_1+s2.Mid(TmpPos+2);
		}
	}

	if (s2.Find("  日/")>=0) {
		TmpPos=s2.Find("  日/");
		s2_part_1=s2.Mid(0,TmpPos);
		p=(LPCTSTR) s2_part_1;
		p=p+TmpPos-2;
		if (p[0]=='1'||p[0]=='2'||p[0]=='3'||p[0]=='4'||p[0]=='5'||p[0]=='6'||p[0]=='7'||p[0]=='8'||p[0]=='9'||p[0]=='0') {
			s2_part_1=s2_part_1.Mid(0,TmpPos-1);
			s2=s2_part_1+s2.Mid(TmpPos+2);
		}
	}
	
	////////    合并时间单位程序段结束
	///////////////////////////////////////////////

	return s2;
}

void SegmentAFileMP (CString FileName)
{
	// Maximum-probability segmentation of a whole text file.
	//
	// Reads FileName line by line, segments each line with SegmentSentenceMP
	// and writes the result to the file whose name is returned by
	// ChangeFileName(FileName,"-seg"). Problems are reported through
	// AfxMessageBox; nothing is returned.

	// A dictionary is required: warn the user and try to open one.
	if (pDict.myDatabaseName.IsEmpty()) {
		AfxMessageBox("您没有打开词库，无法进行分词处理");
		if(pDict.OpenMDB()==FALSE)
			return;
	}

	FILE *src = fopen((const char*) FileName,"rt");
	if(src==NULL) {
		AfxMessageBox("无法打开文件");
		return;
	}

	// From here on FileName holds the output file name.
	FileName=ChangeFileName(FileName,"-seg");
	FILE *dst = fopen((const char*) FileName,"wt");
	if(dst==NULL) {
		AfxMessageBox("无法创建文件");
		fclose(src);
		return;
	}

	CStdioFile inFile(src);
	CStdioFile outFile(dst);

	// NOTE(review): a line longer than the buffer is read in several pieces,
	// which could split a double-byte character — confirm inputs stay shorter.
	const UINT bufSize=2048;
	char buffer[bufSize];

	for(;;) {
		if(!inFile.ReadString(buffer,bufSize))
			break; // end of input
		CString segmented = SegmentSentenceMP(CString(buffer)); // segment one line
		outFile.WriteString(segmented);                        // write it to the target file
	}

	inFile.Close();
	outFile.Close();
}

// 最大概率法分词程序结束
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -