📄 span.cpp

📁 计算所汉语词法分析系统ICTCLAS介绍词是最小的能够独立活动的有意义的语言成分。但汉语是以字为基本的书写单位
💻 CPP
📖 第 1 页 / 共 2 页
字号:
12 下一页
/****************************************************************************
 *
 * Copyright (c) 2000, 2001 
 *     Machine Group
 *     Software Research Lab.
 *     Institute of Computing Tech.
 *     Chinese Academy of Sciences
 *     All rights reserved.
 *
 * This file is the confidential and proprietary property of 
 * Institute of Computing Tech. and the possession or use of this file requires 
 * a written license from the author.
 * Filename: Span.cpp
 * Abstract:
 *           implementation of the CSpan class.
 * Author:   Kevin Zhang 
 *          (zhanghp@software.ict.ac.cn)
 * Date:     2002-4-23
 *
 * Notes:    Tagging with Hidden Markov Model
 *                
 ****************************************************************************/

#include "stdafx.h"
#include "Span.h"
#include "..\\Segment\\Segment.h"
#include "..\\Utility\\Utility.h"
#include <math.h>
#include <string.h>
#include <stdio.h>
#include <time.h>
//////////////////////////////////////////////////////////////////////
// Construction/Destruction
//////////////////////////////////////////////////////////////////////
/**
 * Builds an empty span that holds only the virtual "begin" word at index 0.
 *
 * The tagging type is assigned first: the original code tested m_tagType
 * before it had been given a value, so the begin tag was picked from an
 * indeterminate read. The default type is TT_NORMAL, whose begin tag is 0
 * (Reset() uses 100 for the other tagging types).
 */
CSpan::CSpan()
{
	m_tagType=TT_NORMAL;//Default tagging type; must be set before it is consulted

	m_nTags[0][0]=0;//Begin tag for TT_NORMAL
	m_nTags[0][1]=-1;//-1 terminates the candidate tag list of word 0
	m_dFrequency[0][0]=0;
	m_nCurLength=1;//Only the begin word is present
	m_nUnknownIndex=0;
	m_nStartPos=0;
	m_nWordPosition[1]=0;
	m_sWords[0][0]=0;//The begin word is the empty string
}

// Destructor: the body is empty, no explicit cleanup is performed here.
CSpan::~CSpan()
{
}

/**
 * Forward pass over the tag lattice.
 *
 * For every word i (1 .. m_nCurLength-1) and every candidate tag j of that
 * word, finds the candidate tag k of the previous word that minimises
 *     m_dFrequency[i-1][k] - log(P(tag[i-1][k] -> tag[i][j]))
 * and records it in m_nBestPrev[i][j]. The minimal fee is accumulated into
 * m_dFrequency[i][j]. Candidate tag lists are terminated by a negative tag.
 *
 * Fixes relative to the previous version:
 *  - "no candidate chosen yet" is tracked with -1 instead of the test
 *    `nMinCandidate>10`, which also fired once the best index itself was
 *    above 10 and then let every later candidate overwrite the minimum.
 *  - dMinFee is initialised, so a previous word without any candidate tag
 *    no longer adds an indeterminate value; in that case the best previous
 *    index falls back to 0 so that a later back-trace stays in range.
 *
 * @return always true.
 */
bool CSpan::Disamb()
{
	int i,j,k,nMinCandidate;
	double dMinFee,dTmp;
	for(i=1;i<m_nCurLength;i++)//For every word
	{
		for(j=0;m_nTags[i][j]>=0;j++)//For every candidate tag of word i
		{
			nMinCandidate=-1;//No predecessor examined yet
			dMinFee=0;
			for(k=0;m_nTags[i-1][k]>=0;k++)//For every candidate tag of word i-1
			{
				//Transition fee from tag k of the previous word to tag j
				dTmp=-log(m_context.GetContextPossibility(0,m_nTags[i-1][k],m_nTags[i][j]));
				dTmp+=m_dFrequency[i-1][k];//Add the fee accumulated so far
				if(nMinCandidate<0||dTmp<dMinFee)//Get the minimum fee
				{
					nMinCandidate=k;
					dMinFee=dTmp;
				}
			}
			if(nMinCandidate<0)//Previous word had no candidate tag at all
				nMinCandidate=0;
			m_nBestPrev[i][j]=nMinCandidate;//The best previous for j
			m_dFrequency[i][j]=m_dFrequency[i][j]+dMinFee;
		}
	}

	return true;
}

/**
 * Re-initialises the span so that a new run of words can be loaded.
 *
 * @param bContinue  when true, the first tag and fee of the last word of the
 *                   current span become the begin tag and begin fee of the
 *                   new one, and the start position and unknown-word index
 *                   are kept. When false, everything starts from scratch
 *                   with the default begin tag (0 for TT_NORMAL, otherwise
 *                   100).
 * @return always true.
 */
bool CSpan::Reset(bool bContinue)
{
	if(bContinue)
	{//Carry over the state of the last word; read before the length is reset
		const int nLast=m_nCurLength-1;
		m_nTags[0][0]=m_nTags[nLast][0];
		m_dFrequency[0][0]=m_dFrequency[nLast][0];
	}
	else
	{//Fresh start
		m_nTags[0][0]=(m_tagType!=TT_NORMAL)?100:0;//Begin tag
		m_nUnknownIndex=0;
		m_dFrequency[0][0]=0;
		m_nStartPos=0;
	}

	m_nTags[0][1]=-1;//End flag of the begin word's tag list
	m_nCurLength=1;//Only the begin word remains
	m_nWordPosition[1]=m_nStartPos;
	m_sWords[0][0]=0;//The begin word is the empty string
	return true;
}


/**
 * Loads the context data used by the tagger from a file.
 *
 * @param sFilename  path handed unchanged to m_context.Load().
 * @return whatever m_context.Load() reports, converted to bool.
 */
bool CSpan::LoadContext(char *sFilename)
{
	const bool bLoaded=m_context.Load(sFilename);
	return bLoaded;
}
/**
 * Finds unknown-word templates in the best tag sequence.
 *
 * The best tags (terminated by a negative value) are mapped to letters
 * ('a' + tag) and scanned for the template "b[cd]*": a tag 1 followed by any
 * number of tags 2 or 3. For every match the start position of the first
 * word and the position following the last matched word are appended to
 * m_nUnknownWords, and m_nUnknownIndex is advanced.
 *
 * Fixes relative to the previous version:
 *  - the loop index is declared outside the for statement, because it is
 *    needed after the loop (the old form only compiled with pre-standard
 *    for-scope rules);
 *  - the conversion stops before the end of sPOS instead of writing past it;
 *  - characters are no longer compared against NULL; the redundant pointer
 *    checks are dropped (strchr never returns a pointer to the terminator
 *    for a non-zero search character).
 *
 * NOTE(review): m_nUnknownWords is appended to without a capacity check;
 * its size is not visible here - confirm it cannot overflow.
 *
 * @return always true.
 */
bool CSpan::UnknownMatch()
{//Find the template
  char sPOS[MAX_WORDS_PER_SENTENCE]="z";//Index 0 stands for the begin word
  int nStart,nEnd;
  int i;
  for(i=1;i<MAX_WORDS_PER_SENTENCE-1&&m_nBestTag[i]>-1;i++)
	sPOS[i]=static_cast<char>(m_nBestTag[i]+'a');
  sPOS[i]=0;
  char *pFind=strchr(sPOS+1,'b');
  char *pFindEnd;

  while(pFind!=NULL)
  {
    //Extend the match over the following 'c'/'d' tags
    pFindEnd=pFind+1;
    while(*pFindEnd=='c'||*pFindEnd=='d')
       pFindEnd=pFindEnd+1;
    nStart=pFind-sPOS;
    nEnd=pFindEnd-sPOS;
	//Save the unknown word position
    m_nUnknownWords[m_nUnknownIndex][0]=m_nWordPosition[nStart];
	m_nUnknownWords[m_nUnknownIndex++][1]=m_nWordPosition[nEnd];
	pFind=strchr(pFindEnd,'b');//Continue after the current match
  }
  return true;
}

/**
 * Runs Disamb() and then walks the m_nBestPrev links backwards from the last
 * word to fill m_nBestTag with the chosen tag of every word.
 *
 * Words whose text is empty (the virtual ending) keep their previous
 * m_nBestTag entry. The tag list is terminated with -1, placed on the last
 * word itself when that word is empty and one past it otherwise.
 *
 * @return always true.
 */
bool CSpan::GetBestPOS()
{
  Disamb();

  //Back-trace, starting from candidate 0 of the last word
  int nTagIdx=0;
  int nWord=m_nCurLength-1;
  while(nWord>0)
  {
	 if(m_sWords[nWord][0]!=0)
		 m_nBestTag[nWord]=m_nTags[nWord][nTagIdx];//Record the best POS
	 nTagIdx=m_nBestPrev[nWord][nTagIdx];
	 nWord--;
  }

  //Set the end of POS tagging
  const int nLastWord=m_nCurLength-1;
  const int nEnd=(m_sWords[nLastWord][0]==0)?nLastWord:m_nCurLength;
  m_nBestTag[nEnd]=-1;
  return true;
}

// Splits every word whose best tag is 21 or 22 into two words, in place.
//
// The words are scanned from the last one down to index 1. For each hit the
// entries after it (text, best tag, position) are shifted up by one slot,
// m_nCurLength is incremented, and the word is cut by byte offsets:
//   tag 21: the last part is the trailing 4 bytes when the word is longer
//           than 4 bytes and unlistDict.IsExist(part,-1) accepts them,
//           otherwise the trailing 2 bytes; the parts get tags 11 and 1.
//   tag 22: the first part is the leading 4 bytes when the word is longer
//           than 4 bytes and unlistDict.IsExist(part,-1) accepts them,
//           otherwise the leading 2 bytes; the first part gets tag 1, 4 or 3
//           depending on dictionary look-ups, the last part gets tag 12.
// Presumably the 2/4 byte steps correspond to one/two double-byte
// characters - TODO confirm the text encoding.
//
// unlistDict: dictionary queried through IsExist() only.
// Returns: always true.
//
// NOTE(review): sFirstPart/sLastPart are 50-byte buffers filled with
// strcpy/strncpy without a length check - confirm words cannot exceed that.
// NOTE(review): nLenWord is unsigned, so nLenWord-2 wraps for words shorter
// than 2 bytes - confirm such words never carry tag 21/22.
bool CSpan::SplitPersonPOS(CDictionary &unlistDict)
{//Split the word with POS 21 and 22
    int i=m_nCurLength-1,j;
	unsigned int nLenWord,nLenPart;
	char sFirstPart[50],sLastPart[50];
	int nFirstPOS,nLastPOS;
	for(;i>0;i--)
	{
		if(m_nBestTag[i]==21||m_nBestTag[i]==22)
		{//Find the POS which need to split
			for(j=m_nCurLength-1;j>i;j--)
			{//Move the POS and words one slot up to make room at i+1
				strcpy(m_sWords[j+1],m_sWords[j]);
				m_nBestTag[j+1]=m_nBestTag[j];
				m_nWordPosition[j+1]=m_nWordPosition[j];
			}
			m_nCurLength+=1;//The length increment 
	        /*
				CSegment segment;
				segment.Segment(m_sWords[i],unlistDict,1);
			*/
			//Generate new segment words and POS
			if(m_nBestTag[i]==21)
			{//Combination by Previous and first component
				nLenWord=strlen(m_sWords[i]);
				if(nLenWord>4)//Try the trailing 4 bytes first, fall back to the trailing 2
				{
					strcpy(sLastPart,m_sWords[i]+nLenWord-4);
					if(!unlistDict.IsExist(sLastPart,-1))
						strcpy(sLastPart,m_sWords[i]+nLenWord-2);
				}
				else
				{
					strcpy(sLastPart,m_sWords[i]+nLenWord-2);	
				}
				nLenPart=strlen(sLastPart);
				if(nLenPart<nLenWord)
				{//Get first part: everything before the last part
					strncpy(sFirstPart,m_sWords[i],nLenWord-nLenPart);
					sFirstPart[nLenWord-nLenPart]=0;
				}
				else
				{//The last part covers the whole word: cut off the trailing 2 bytes instead
					strncpy(sFirstPart,m_sWords[i],nLenWord-2);
					sFirstPart[nLenWord-2]=0;
					strncpy(sLastPart,m_sWords[i]+nLenWord-2,2);
					sLastPart[2]=0;
				}
				nFirstPOS=11;
				nLastPOS=1;
			}
			else
			{//Combination by Next word and last component
				nLenWord=strlen(m_sWords[i]);
				if(nLenWord>4)//Try the leading 4 bytes first, fall back to the leading 2
				{
					strncpy(sFirstPart,m_sWords[i],4);
					sFirstPart[4]=0;
					if(!unlistDict.IsExist(sFirstPart,-1))
						sFirstPart[2]=0;
				}
				else
				{
					strncpy(sFirstPart,m_sWords[i],2);	
					sFirstPart[2]=0;
				}
				nLenPart=strlen(sFirstPart);
				if(nLenPart<nLenWord)
				{//Get last part: everything after the first part
					strncpy(sLastPart,m_sWords[i]+nLenPart,nLenWord-nLenPart);
					sLastPart[nLenWord-nLenPart]=0;
				}
				else
				{//The first part covers the whole word: cut after the leading 2 bytes instead
					strncpy(sFirstPart,m_sWords[i],2);
					sFirstPart[2]=0;
					strncpy(sLastPart,m_sWords[i]+2,nLenWord-2);
					sLastPart[nLenWord-2]=0;
				}
				if(unlistDict.IsExist(sFirstPart,1)&&m_nBestTag[i-1]==5)
					//Example from the original author: "Xiao Chen says:"
					nFirstPOS=1;
				//NOTE(review): for i==1 the second operand below would index m_sWords[-1]; it is only evaluated when m_sWords[0] is found with handle 1 - confirm that cannot happen
				else if(unlistDict.IsExist(m_sWords[i-1],1)&&!unlistDict.IsExist(m_sWords[i-2],1))
					nFirstPOS=4;
				else
					nFirstPOS=3;
				nLastPOS=12;
			}
			//Store the two parts at i and i+1; the second starts right after the first
            strcpy(m_sWords[i],sFirstPart);
			m_nBestTag[i]=nFirstPOS;
            strcpy(m_sWords[i+1],sLastPart);
			m_nBestTag[i+1]=nLastPOS;
			m_nWordPosition[i+1]=m_nWordPosition[i]+strlen(sFirstPart);
		}
	}
	return true;
}

/**
 * Recognises person names by matching role patterns against the best tag
 * sequence.
 *
 * The best tags (terminated by a negative value) are mapped to letters
 * ('A' + tag) and scanned from left to right. At each position the patterns
 * in sPatterns are tried in table order; the first one that matches and
 * survives the exclusion rules is recorded in m_nUnknownWords (start and end
 * position) together with its log-possibility in m_dWordsPossibility, and
 * the scan continues behind the match.
 *
 * Fixes relative to the previous version:
 *  - the index i is declared outside the for statement, because it is used
 *    after the loop (the old form only compiled with pre-standard for-scope
 *    rules);
 *  - the tag-to-letter conversion stops before the end of sPOS instead of
 *    writing past it.
 * The matching logic itself is unchanged.
 *
 * NOTE(review): sPersonName (100 bytes) is filled with strcat and
 * m_nUnknownWords is appended to without capacity checks - confirm the
 * limits hold for all inputs.
 *
 * @param personDict  dictionary queried for role frequencies.
 * @return always true.
 */
bool CSpan::PersonRecognize(CDictionary &personDict)
{
  char sPOS[MAX_WORDS_PER_SENTENCE]="z",sPersonName[100];
                          //0     1    2    3    4   5   
  char sPatterns[][5]={ "BBCD","BBC","BBE","BBZ","BCD","BEE","BE",
						 "BG",  "BXD","BZ", "CDCD","CD","EE", 
						 "FB", "Y","XD",""};
  //Weight of each pattern, same order as sPatterns
  double dFactor[]={0.0011,0.0011,0.0011,0.0011,0.7614,0.0011,0.2055,
						 0.0160,0.0011,0.0011,0,0.0160,0.0011,
						 0.0160,0.0011,0.0011,0
  };
  //About parameter:
/*
	Given Name: 486     0.0160
	Surname+postfix:484 0.0160
	m_lPerson2Num:6265   0.2055
	m_lPerson3Num: 23184 0.7614
	m_lPerson4Num:32     0.0011
  */
  //The person recognition patterns set
  //BBCD: surname + surname + given-name char 1 + given-name char 2
  //BBE:  surname + surname + single-char given name
  //BBZ:  surname + surname + two-char given name that forms a word
  //BCD:  surname + given-name char 1 + given-name char 2
  //BE:   surname + single-char given name
  //BEE:  surname + single-char name + single-char name (e.g. "Han Leilei")
  //BG:   surname + suffix
  //BXD:  surname + (surname and first given-name char forming a word) + last given-name char
  //BZ:   surname + two-char given name that forms a word
  //B:    surname
  //CD:   given-name char 1 + given-name char 2
  //EE:   single-char name + single-char name
  //FB:   prefix + surname
  //XD:   (surname and first given-name char forming a word) + last given-name char
  //Y:    surname and single-char given name forming a word
  int nPatternLen[]={4,3,3,3,3,3,2,2,3,2,4,2,2,2,1,2,0};//0 terminates the table

  int i;//Needed after the loop: one past the last converted tag
  for(i=1;i<MAX_WORDS_PER_SENTENCE-1&&m_nBestTag[i]>-1;i++)//Convert to string from POS
	sPOS[i]=static_cast<char>(m_nBestTag[i]+'A');
  sPOS[i]=0;
  int j=1,k,nPos;//Find the proper pattern from the first POS
  int nLittleFreqCount;//Counter for the person name role with little frequecy
  bool bMatched=false;   
  while(j<i)
  {
	bMatched=false;   
	for(k=0;!bMatched&&nPatternLen[k]>0;k++)
	{
		//The pattern must match at j and must not be adjacent to a "·" separator
		if(strncmp(sPatterns[k],sPOS+j,nPatternLen[k])==0&&strcmp(m_sWords[j-1],"·")!=0&&strcmp(m_sWords[j+nPatternLen[k]],"·")!=0)
		{//Find the proper pattern k
			if(strcmp(sPatterns[k],"FB")==0&&(sPOS[j+2]=='E'||sPOS[j+2]=='C'||sPOS[j+2]=='G'))
			{//Rule 1 for exclusion: prefix + surname + given name follows, so the (prefix + surname) rule does not apply
				continue;
			}
/*			if((strcmp(sPatterns[k],"BEE")==0||strcmp(sPatterns[k],"EE")==0)&&strcmp(m_sWords[j+nPatternLen[k]-1],m_sWords[j+nPatternLen[k]-2])!=0)
			{//Rule 2 for exclusion: surname + single + single / single + single: if the two E characters differ, the rule does not apply (e.g. "Han Leilei")
				continue;
			}

			if(strcmp(sPatterns[k],"B")==0&&m_nBestTag[j+1]!=12)
			{//Rule 3 for exclusion: if the surname is not followed by a suffix, the rule does not apply (e.g. "Chairman Jiang", "Aunt Liu")
				continue;
			}
*/			//Get the possible name
			nPos=j;//Record the person position in the tag sequence
			sPersonName[0]=0;
			nLittleFreqCount=0;//Record the number of role with little frequency
			while(nPos<j+nPatternLen[k])
			{//Get the possible person name
			 //
				if(m_nBestTag[nPos]<4&&personDict.GetFrequency(m_sWords[nPos],m_nBestTag[nPos])<LITTLE_FREQUENCY)
					nLittleFreqCount++;//The counter increase
				strcat(sPersonName,m_sWords[nPos]);
				nPos+=1;
			}
			if(IsAllForeign(sPersonName)&&personDict.GetFrequency(m_sWords[j],1)<LITTLE_FREQUENCY)
			{//Exclusion foreign name
			 //Rule 2 for exclusion: if every character is a foreign-name character, the (given 1 + given 2) rule does not apply
				j+=nPatternLen[k]-1;
				continue;
			}
			if(strcmp(sPatterns[k],"CDCD")==0)
			{//Rule for exclusion
			 //The (given 1 + given 2 + given 1 + given 2) rule is itself an exclusion rule
			 //(example: a sentence about a soprano with a transliterated four-character name).
 			 //Rule 3 for exclusion: it applies when the name contains foreign-name characters;
			 //otherwise the exclusion does not apply (example: two two-character nicknames in a row).
				if(GetForeignCharCount(sPersonName)>0)
					j+=nPatternLen[k]-1;
				continue;
			}
			if(strcmp(sPatterns[k],"CD")==0&&IsAllForeign(sPersonName))
			{//An all-foreign (given 1 + given 2) candidate is skipped
				j+=nPatternLen[k]-1;
				continue;
			}
			if(nLittleFreqCount==nPatternLen[k]||nLittleFreqCount==3)
			//Examples from the original author: "Mahathir"; a sentence listing several names in a row
			//The all roles appear with two lower frequecy,we will ignore them
				continue;
			//Accept the match: store its span and its log-possibility
			m_nUnknownWords[m_nUnknownIndex][0]=m_nWordPosition[j];
			m_nUnknownWords[m_nUnknownIndex][1]=m_nWordPosition[j+nPatternLen[k]];
			m_dWordsPossibility[m_nUnknownIndex]=log(dFactor[k])+ComputePossibility(j,nPatternLen[k],personDict);
			//Mutiply the factor 
			m_nUnknownIndex+=1;
			j+=nPatternLen[k];
			bMatched=true;
		}
	}
    if(!bMatched)//Not matched, add j by 1
		j+=1;
  }
  return true;
}

int CSpan::GetFrom(PWORD_RESULT pWordItems,int nIndex,CDictionary &dictCore, CDictionary &dictUnknown)
{
	int nCount,aPOS[MAX_POS_PER_WORD],aFreq[MAX_POS_PER_WORD];
	int nFreq=0,j,nRetPos=0,nWordsIndex=0;
	bool bSplit=false;//Need to split in Transliteration recognition 
    int i=1;
	nWordsIndex=i+nIndex-1;
	for(;i<MAX_WORDS_PER_SENTENCE&&pWordItems[nWordsIndex].sWord[0]!=0;i++)
	{
		if(m_tagType==TT_NORMAL||!dictUnknown.IsExist(pWordItems[nWordsIndex].sWord,44))
        {
			strcpy(m_sWords[i],pWordItems[nWordsIndex].sWord);//store current word
   		    m_nWordPosition[i+1]=m_nWordPosition[i]+strlen(m_sWords[i]);
		}
		else
		{
			if(!bSplit)
			{
				strncpy(m_sWords[i],pWordItems[nWordsIndex].sWord,2);//store current word
				m_sWords[i][2]=0;
				bSplit=true;
			}
			else
			{
				unsigned int nLen=strlen(pWordItems[nWordsIndex].sWord+2);
				strncpy(m_sWords[i],pWordItems[nWordsIndex].sWord+2,nLen);//store current word
				m_sWords[i][nLen]=0;
				bSplit=false;
			}
   		    m_nWordPosition[i+1]=m_nWordPosition[i]+strlen(m_sWords[i]);
		}
		//Record the position of current word
		m_nStartPos=m_nWordPosition[i+1];
		//Move the Start POS to the ending
		if(m_tagType!=TT_NORMAL)
		{
			//Get the POSs from the unknown recognition dictionary
			dictUnknown.GetHandle(m_sWords[i],&nCount,aPOS,aFreq);
			for(j=0;j<nCount;j++) 
			{//Get the POS set of sCurWord in the unknown dictionary
				m_nTags[i][j]=aPOS[j];
   				m_dFrequency[i][j]=-log((double)(1+aFreq[j]))+log((double)(m_context.GetFrequency(0,aPOS[j])+1));
			}
			//Get the POS set of sCurWord in the core dictionary
			//We ignore the POS in the core dictionary and recognize them as other (0).
			//We add their frequency to get the possibility as POS 0
			dictCore.GetHandle(m_sWords[i],&nCount,aPOS,aFreq);
			nFreq=0;
			for(int k=0;k<nCount;k++) 
			{
				nFreq+=aFreq[k];
			}
			if(nCount>0)
			{
				m_nTags[i][j]=0;
				//m_dFrequency[i][j]=(double)(1+nFreq)/(double)(m_context.GetFrequency(0,0)+1);
				m_dFrequency[i][j]=-log((double)(1+nFreq))+log((double)(m_context.GetFrequency(0,0)+1));
				j++;
			}
		}
		else//For normal POS tagging
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -