📄 result.cpp

📁 计算所汉语词法分析系统ICTCLAS介绍词是最小的能够独立活动的有意义的语言成分。但汉语是以字为基本的书写单位
💻 CPP
📖 第 1 页 / 共 2 页
字号:
12 下一页
// Result.cpp: implementation of the CResult class.
//
//////////////////////////////////////////////////////////////////////

#include "stdafx.h"
#include "Result.h"
#include <string.h>
#include <stdio.h>
#include <math.h>
#include "..\\Utility\\Utility.h"
//////////////////////////////////////////////////////////////////////
// Construction/Destruction
//////////////////////////////////////////////////////////////////////

CResult::CResult()
{
	//malloc buffer
	m_pResult=new PWORD_RESULT[MAX_SEGMENT_NUM];
	for(int i=0;i<MAX_SEGMENT_NUM;i++)
	{
		m_pResult[i]=new WORD_RESULT[MAX_WORDS];
	}

	m_dictCore.Load("data\\coreDict.dct");
	m_POSTagger.LoadContext("data\\lexical.ctx");
/*
	m_dictCore.Load("data\\Dict.dct");
	m_POSTagger.LoadContext("data\\trainTest.ctx");
*/
/*
	m_dictCore.AddItem("十分",'d'*256,500);
	m_dictCore.AddItem("十分",'m'*256,-500);
	m_dictCore.Save("data\\coreDict.dct");
*/
	
	m_POSTagger.SetTagType();

	m_Person.Configure("data\\Person",TT_PERSON);
	//Set the person recognition configure
	m_Transliteration.Configure("data\\Trans",TT_TRANS);
	//Set the transliteration recognition configure
	m_Place.Configure("data\\Place",TT_PLACE);
	//Set the place recognition configure
	m_nOperateType=2;//0:Only Segment;1: First Tag; 2:Second Type
	m_nOutputFormat=0;//0:PKU criterion;1:973 criterion; 2: XML criterion
}

CResult::~CResult()
{
	//free buffer
	for(int i=0;i<MAX_SEGMENT_NUM;i++)
	{
		delete [] m_pResult[i];
	}
	delete m_pResult;

}

bool CResult::Output(PWORD_RESULT pItem, char *sResult,bool bFirstWordIgnore)
{
	int i=0;
	char sTempBuffer[WORD_MAXLENGTH],sPOS[3];
	sPOS[2]=0;
	sResult[0]=0;
	if(bFirstWordIgnore)//Ignore first valid
		i=1;
	while(pItem[i].sWord[0]!=0&&pItem[i].nHandle!=CT_SENTENCE_END)//Not sentence ending flag
	{
		//Get the POS string
		if(m_nOutputFormat!=0)//Not PKU format
			PKU2973POS(pItem[i].nHandle,sPOS);
		else//PKU format
		{
			sPOS[0]=pItem[i].nHandle/256;
			sPOS[1]=pItem[i].nHandle%256;
		}
		sPOS[m_nOperateType]=0;//Set the sPOS with operate type
		
		if(m_nOutputFormat==0)//PKU format
		{
			sprintf(sTempBuffer,"%s",pItem[i].sWord);
		    strcat(sResult,sTempBuffer);
			if(sPOS[0]!=0)//need POS 
			{
				sprintf(sTempBuffer,"/%s",sPOS);
				strcat(sResult,sTempBuffer);
			}
		    strcat(sResult,"  ");
		}
		else if(m_nOutputFormat==1)//973 format
		{
			sprintf(sTempBuffer,"%s\\",pItem[i].sWord);
		    strcat(sResult,sTempBuffer);
			if(sPOS[0]!=0)//need POS 
			{
				sprintf(sTempBuffer,"[%s]",sPOS);
				strcat(sResult,sTempBuffer);
			}
		}
		else if(m_nOutputFormat==2)//XML format
		{
			if(sPOS[0]!=0)//POS
			{
				sprintf(sTempBuffer,"<any type=\042%s\042>",sPOS);
				strcat(sResult,sTempBuffer);
			}
			sprintf(sTempBuffer,"<src>%s</src>",pItem[i].sWord);
			strcat(sResult,sTempBuffer);
			if(sPOS[0]!=0)
			{
				strcat(sResult,"</any>");
			}
		}
		i++;
	}
	return true;
}

bool CResult::Processing(char *sSentence,unsigned int nCount)
{
	int nIndex;
#if _ICT_DEBUG
	char *sSegment;
	sSegment=new char[MAX_SENTENCE_LEN*2];
#endif
	m_Seg.Segment(sSentence,m_dictCore,nCount);

	m_nResultCount=m_Seg.m_nSegmentCount;
	//Record the number of result
	for(nIndex=0;nIndex<m_Seg.m_nSegmentCount;nIndex++)
	{
			m_POSTagger.POSTagging(m_Seg.m_pWordSeg[nIndex],m_dictCore,m_dictCore);
#if _ICT_DEBUG
			Output(m_Seg.m_pWordSeg[nIndex],sSegment);
			printf("POS Tag%d:%s\n",nIndex+1,sSegment);
#endif
		m_Person.Recognition(m_Seg.m_pWordSeg[nIndex],m_Seg.m_graphOptimum,m_Seg.m_graphSeg,m_dictCore);
		m_Transliteration.Recognition(m_Seg.m_pWordSeg[nIndex],m_Seg.m_graphOptimum,m_Seg.m_graphSeg,m_dictCore);
		m_Place.Recognition(m_Seg.m_pWordSeg[nIndex],m_Seg.m_graphOptimum,m_Seg.m_graphSeg,m_dictCore);
	}		
	//m_Person.Recognition(m_Seg.m_WordSeg[0],m_Seg.m_graphOptimum,m_Seg.m_graphSeg,m_dictCore);
	//Person Recognition
#if _ICT_DEBUG
	printf("After person recognition.\n");
#endif
	m_Seg.OptimumSegmet(nCount);
	for(nIndex=0;nIndex<m_Seg.m_nSegmentCount;nIndex++)
	{
		m_POSTagger.POSTagging(m_Seg.m_pWordSeg[nIndex],m_dictCore,m_dictCore);
#if _ICT_DEBUG
		Output(m_Seg.m_pWordSeg[nIndex],sSegment);
		printf("POS Tag%d:%s\n",nIndex+1,sSegment);
#endif
	}
#if _ICT_DEBUG
	printf("After Sorting.\n");
#endif
	Sort();//Sort the ending 
#if _ICT_DEBUG
	for(nIndex=0;nIndex<m_Seg.m_nSegmentCount;nIndex++)
	{
		Output(m_pResult[nIndex],sSegment);
		printf("POS Tag%d(P=Exp(%f)):%s\n",nIndex+1,m_dResultPossibility[nIndex],sSegment);
	}
	delete [] sSegment;
#endif
	return true;
}
//Sort the segmentation and POS result according its possibility
bool CResult::Sort()
{
	ELEMENT_TYPE dPossibility[MAX_SEGMENT_NUM],dTemp;
	int nIndex[MAX_SEGMENT_NUM],nTemp;//Index

	memset(dPossibility,0,sizeof(dPossibility));
	//Init the possibility
	
	for(int i=0;i<m_Seg.m_nSegmentCount;i++)
	{//Computing the possibility
		dPossibility[i]=ComputePossibility(m_Seg.m_pWordSeg[i]);
		nIndex[i]=i;//Record the index
	}
	
	//Sort with Bubble sort algorithm
	for(i=0;i<m_Seg.m_nSegmentCount;i++)
		for(int j=i+1;j<m_Seg.m_nSegmentCount;j++)
		{
			if(dPossibility[i]<dPossibility[j])
			{//Swap the possition and value
				nTemp=nIndex[i];
				dTemp=dPossibility[i];
				nIndex[i]=nIndex[j];
				dPossibility[i]=dPossibility[j];
				nIndex[j]=nTemp;
				dPossibility[j]=dTemp;
			}
		}
	
	for(i=0;i<m_Seg.m_nSegmentCount;i++)
	{//Adjust the segmentation and POS result and store them in the final result array
	 //Store them according their possibility ascendly
		Adjust(m_Seg.m_pWordSeg[nIndex[i]],m_pResult[i]);	
		m_dResultPossibility[i]=dPossibility[i];	
	}
	return true;
}
//Compute the possibility of current segmentation and POS result
ELEMENT_TYPE CResult::ComputePossibility(PWORD_RESULT pItem)
{
	int i=0;
	ELEMENT_TYPE dResultPossibility=0;
	while(pItem[i].sWord[0]!=0)
	{
		dResultPossibility+=pItem[i].dValue;
		//Compute the possibility of logP(Wi|Ti)
		if(pItem[i+1].sWord[0]!=0)//Not the last one
		{//Compute the possibility of logP(Ti|Ti-1)
			dResultPossibility+=log((double)(m_POSTagger.m_context.GetContextPossibility(0,pItem[i].nHandle,pItem[i+1].nHandle)+1));
			dResultPossibility-=log((double)(m_POSTagger.m_context.GetFrequency(0,pItem[i].nHandle)+1));
		}
		i++;
	}
	return dResultPossibility;
}

//Adjust the result with some rules
bool CResult::Adjust(PWORD_RESULT pItem,PWORD_RESULT pItemRet)
{
	int i=0,j=0;
	unsigned int nLen;
	char sSurName[10],sSurName2[10],sGivenName[10];
	bool bProcessed=false;//Have been processed
	while(pItem[i].sWord[0]!=0)
	{
		nLen=strlen(pItem[i].sWord);
		bProcessed=false;
		
		//Rule1: adjust person name
		if(pItem[i].nHandle==28274&&ChineseNameSplit(pItem[i].sWord,sSurName,sSurName2,sGivenName,m_Person.m_dict)&&strcmp(pItem[i].sWord,"叶利钦")!=0)//'nr'
		{//Divide name into surname and given name
			
			if(sSurName[0])
			{
				strcpy(pItemRet[j].sWord,sSurName);
				pItemRet[j++].nHandle=28274;
			}
			if(sSurName2[0])
			{
				strcpy(pItemRet[j].sWord,sSurName2);
				pItemRet[j++].nHandle=28274;
			}
			if(sGivenName[0])
			{
				strcpy(pItemRet[j].sWord,sGivenName);
				pItemRet[j++].nHandle=28274;
			}
			bProcessed=true;
		}
		//Rule2 for overlap words ABB 一段段、一片片
12 下一页
💿 文件大小 111 K
👤 上传用户 fzjw0803005
📂 所属分类多国语言处理
🏷️ 相关标签

#ICTCLAS #汉语 #分 #计算
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -