📄 segment.cpp
字号:
//////////////////////////////////////////////////////////////////////
//ICTCLAS简介:计算所汉语词法分析系统ICTCLAS(Institute of Computing Technology, Chinese Lexical Analysis System),
// 功能有:中文分词;词性标注;未登录词识别。
// 分词正确率高达97.58%(973专家评测结果),
// 未登录词识别召回率均高于90%,其中中国人名的识别召回率接近98%;
// 处理速度为31.5Kbytes/s。
//著作权: Copyright (c) 2002-2005 中科院计算所 职务著作权人:张华平 刘群
//遵循协议:自然语言处理开放资源许可证1.0
//Email: zhanghp@software.ict.ac.cn
//Homepage:www.nlp.org.cn;mtgroup.ict.ac.cn
/****************************************************************************
*
* Copyright (c) 2000, 2001
* Machine Group
* Software Research Lab.
* Institute of Computing Tech.
* Chinese Academy of Sciences
* All rights reserved.
*
* This file is the confidential and proprietary property of
* Institute of Computing Tech. and the possession or use of this file requires
* a written license from the author.
* Filename: Segment.cpp
* Abstract:
* implementation of the CSegment class.
* Author: Kevin Zhang
* (zhanghp@software.ict.ac.cn)
* Date: 2002-4-23
*
* Notes: N-Shortest paths Word segmentation
*
****************************************************************************/
#include "stdafx.h"
#include "Segment.h"
#include "..\\Utility\\Dictionary.h"
#include "..\\Utility\\Utility.h"
#include "NShortPath.h"
#include <string.h>
#include <math.h>
//////////////////////////////////////////////////////////////////////
// Construction/Destruction
//////////////////////////////////////////////////////////////////////
// Builds a segmenter: zeroes the bookkeeping members, allocates a table of
// MAX_SEGMENT_NUM result rows with MAX_WORDS WORD_RESULT entries each, and
// calls SetRowFirst() on the optimum graph with its default argument.
CSegment::CSegment()
{
	m_npWordPosMapTable=0;
	m_nWordCount=0;

	// Outer table first, then one row per slot.
	m_pWordSeg=new PWORD_RESULT[MAX_SEGMENT_NUM];
	int nRow=0;
	while(nRow<MAX_SEGMENT_NUM)
	{
		m_pWordSeg[nRow]=new WORD_RESULT[MAX_WORDS];
		++nRow;
	}

	m_graphOptimum.SetRowFirst();
}
// Releases the result buffers held in m_pWordSeg.
// Both the rows and the outer table are allocated with new[], so they must be
// released with delete[]; using scalar delete on them is undefined behaviour.
CSegment::~CSegment()
{
	for(int i=0;i<MAX_SEGMENT_NUM;i++)
	{
		delete [] m_pWordSeg[i];//free one row of WORD_RESULT entries
	}
	delete [] m_pWordSeg;//free the table of row pointers
}
/**
 * Segments one sentence.
 *
 * Builds the word net for sSentence, runs CNShortPath over it, and calls
 * GenerateWord() once for every route the path search reports
 * (m_nSegmentCount of them).
 *
 * @param sSentence    sentence handed to GenerateWordNet()
 * @param dictCore     dictionary handed to GenerateWordNet()
 * @param nResultCount number of paths requested from CNShortPath
 * @return always true
 *
 * NOTE(review): the route table has MAX_SEGMENT_NUM rows of
 * MAX_SENTENCE_LEN/2 ints; this presumably relies on CNShortPath::Output()
 * never reporting more routes or longer routes than that - confirm.
 */
bool CSegment::Segment(char *sSentence,CDictionary &dictCore,int nResultCount)
{
	// Declared at function scope: it is reused by all three loops below.
	// (The original relied on the pre-standard rule that a for-init variable
	// outlives its loop, which does not compile under ISO C++.)
	int i;

	// Scratch table receiving the segmentation routes; rows start zeroed.
	int **nSegRoute=new int*[MAX_SEGMENT_NUM];
	for(i=0;i<MAX_SEGMENT_NUM;i++)
	{
		nSegRoute[i]=new int[MAX_SENTENCE_LEN/2];
		memset(nSegRoute[i],0,MAX_SENTENCE_LEN/2*sizeof(int));
	}

	m_graphSeg.m_segGraph.SetRowFirst(false);
	m_graphOptimum.SetRowFirst(false);
	m_graphSeg.GenerateWordNet(sSentence,dictCore);

	CNShortPath sp(&m_graphSeg.m_segGraph,nResultCount);
	sp.ShortPath();
	sp.Output(nSegRoute,false,&m_nSegmentCount);

	m_graphOptimum.SetEmpty();//Start from an empty optimum graph

	// Generate the words for every reported route.
	for(i=0;i<m_nSegmentCount;i++)
	{
		GenerateWord(nSegRoute,i);
	}

	// Free the scratch table: rows first, then the array of row pointers.
	for(i=0;i<MAX_SEGMENT_NUM;i++)
	{
		delete [] nSegRoute[i];
	}
	delete [] nSegRoute;
	return true;
}
/**
 * Generates the word list for one segmentation route.
 *
 * Walks nSegRoute[nIndex] as a sequence of vertex pairs until a -1 entry or a
 * non-increasing pair is met. For each pair the atoms between the two vertices
 * are concatenated into a word. Consecutive pieces accepted by IsAllNum() /
 * IsAllChineseNum() are merged into one candidate, which is then classified
 * (single symbol, delimiter, time expression, or number) and given a POS
 * handle. Every word is stored in m_pWordSeg[nIndex][k] and added as an edge
 * to m_graphOptimum. The list is terminated by an entry with an empty sWord
 * and nHandle==-1.
 *
 * POS constants used below: 27904=='m'*256, 29696=='t'*256, 30464=='w'*256.
 *
 * @param nSegRoute table of routes; row nIndex is read
 * @param nIndex    route to process; also selects the row of m_pWordSeg written
 * @return always true
 *
 * NOTE(review): the byte-length tests (nLen==4, strlen(sAtom)==2, the "-2"
 * tail offsets) presumably assume a 2-byte-per-character source/input
 * encoding such as GB2312 - confirm before changing the file encoding.
 * NOTE(review): strcat/strcpy into sAtom, sNumCandidate[100], sCurWord[100]
 * and sWord are unbounded, and k is never checked against MAX_WORDS; very
 * long numeric runs or routes could overflow these buffers.
 */
bool CSegment::GenerateWord(int **nSegRoute, int nIndex)
{
	unsigned int i=0,k=0;//i: position in the route; k: index of the word being written
	int j,nStartVertex,nEndVertex,nPOS;
	char sAtom[WORD_MAXLENGTH],sNumCandidate[100];
	//Zero-initialised so that a branch which leaves it untouched on the first
	//iteration hands SetElement() an empty string instead of indeterminate bytes.
	char sCurWord[100]="";
	ELEMENT_TYPE fValue;
	while(nSegRoute[nIndex][i]!=-1&&nSegRoute[nIndex][i+1]!=-1&&nSegRoute[nIndex][i]<nSegRoute[nIndex][i+1])
	{
		nStartVertex=nSegRoute[nIndex][i];
		j=nStartVertex;//Set the start vertex
		nEndVertex=nSegRoute[nIndex][i+1];//Set the end vertex
		nPOS=0;
		m_graphSeg.m_segGraph.GetElement(nStartVertex,nEndVertex,&fValue,&nPOS);
		sAtom[0]=0;
		while(j<nEndVertex)
		{//Build the word from the atoms covered by this route step
			strcat(sAtom,m_graphSeg.m_sAtom[j]);
			j++;
		}
		m_pWordSeg[nIndex][k].sWord[0]=0;//Stays empty unless the merge loop below runs
		strcpy(sNumCandidate,sAtom);
		while(sAtom[0]!=0&&(IsAllNum((unsigned char *)sNumCandidate)||IsAllChineseNum(sNumCandidate)))
		{//Merge consecutive numeric pieces into one number
			//sAtom[0]!=0 stops the loop once the route has no further piece
			strcpy(m_pWordSeg[nIndex][k].sWord,sNumCandidate);
			//Save the accepted candidate in the result segmentation
			i++;//Advance to the next route step
			sAtom[0]=0;
			while(j<nSegRoute[nIndex][i+1])
			{//Build the next piece; on exit sAtom holds the first piece NOT merged
				strcat(sAtom,m_graphSeg.m_sAtom[j]);
				j++;
			}
			strcat(sNumCandidate,sAtom);
		}
		unsigned int nLen=strlen(m_pWordSeg[nIndex][k].sWord);
		if((nLen==4&&CC_Find("第上成±—+∶·./",m_pWordSeg[nIndex][k].sWord))||(nLen==1&&strchr("+-./",m_pWordSeg[nIndex][k].sWord[0])))
		{//Only one word
			strcpy(sCurWord,m_pWordSeg[nIndex][k].sWord);//Record current word
			i--;//Undo the advance made by the merge loop
		}
		else if(m_pWordSeg[nIndex][k].sWord[0]==0)//The merge loop never ran: ordinary word
		{
			strcpy(m_pWordSeg[nIndex][k].sWord,sAtom);
			//Save it in the result segmentation
			strcpy(sCurWord,sAtom);//Record current word
		}
		else
		{//The merged candidate was accepted as numeric
			if(strcmp("--",m_pWordSeg[nIndex][k].sWord)==0||strcmp("—",m_pWordSeg[nIndex][k].sWord)==0||(m_pWordSeg[nIndex][k].sWord[0]=='-'&&m_pWordSeg[nIndex][k].sWord[1]==0))//The delimiter "--"
			{
				nPOS=30464;//'w'*256;Set the POS with 'w'
				i--;//Not num, back to previous word
				//NOTE(review): sCurWord is not updated here, so the graph edge
				//receives the previous word's text (or "" on the first word).
				//Presumably the delimiter itself was intended - confirm.
			}
			else
			{//Adding time suffix
				char sInitChar[3];
				unsigned int nCharIndex=0;//Get first char
				sInitChar[nCharIndex]=m_pWordSeg[nIndex][k].sWord[nCharIndex];
				//High bit set: take a second byte as part of the first character.
				//Tested via unsigned char so the result does not depend on
				//whether plain char is signed on the target.
				if(static_cast<unsigned char>(sInitChar[nCharIndex])>=0x80)
				{
					nCharIndex+=1;
					sInitChar[nCharIndex]=m_pWordSeg[nIndex][k].sWord[nCharIndex];
				}
				nCharIndex+=1;
				sInitChar[nCharIndex]='\0';
				if(k>0&&(abs(m_pWordSeg[nIndex][k-1].nHandle)==27904||abs(m_pWordSeg[nIndex][k-1].nHandle)==29696)&&(strcmp(sInitChar,"—")==0||sInitChar[0]=='-')&&(strlen(m_pWordSeg[nIndex][k].sWord)>nCharIndex))
				{//Range such as "3-4": previous word is 'm' or 't' and this one starts with a dash
					//Split the leading dash from the rest of the word
					strcpy(m_pWordSeg[nIndex][k+1].sWord,m_pWordSeg[nIndex][k].sWord+nCharIndex);
					m_pWordSeg[nIndex][k+1].dValue=m_pWordSeg[nIndex][k].dValue;
					m_pWordSeg[nIndex][k+1].nHandle=27904;//'m'*256
					m_pWordSeg[nIndex][k].sWord[nCharIndex]=0;
					m_pWordSeg[nIndex][k].dValue=0;
					m_pWordSeg[nIndex][k].nHandle=30464;//'w'*256;
					m_graphOptimum.SetElement(nStartVertex,nStartVertex+1,m_pWordSeg[nIndex][k].dValue,m_pWordSeg[nIndex][k].nHandle,m_pWordSeg[nIndex][k].sWord);
					nStartVertex+=1;
					k+=1;//Continue with the part after the dash
				}
				nLen=strlen(m_pWordSeg[nIndex][k].sWord);//Always >=1 here
				if((strlen(sAtom)==2&&CC_Find("月日时分秒",sAtom))||strcmp(sAtom,"月份")==0)
				{//Number followed by a time unit: absorb the unit
					strcat(m_pWordSeg[nIndex][k].sWord,sAtom);
					strcpy(sCurWord,"未##时");
					nPOS=-29696;//-'t'*256;Set the POS with 't'
				}
				else if(strcmp(sAtom,"年")==0)
				{
					if(IsYearTime(m_pWordSeg[nIndex][k].sWord))
					{//Year expression such as "1998" + year suffix
						strcat(m_pWordSeg[nIndex][k].sWord,sAtom);
						strcpy(sCurWord,"未##时");
						nPOS=-29696;//Set the POS with 't'
					}
					else
					{
						strcpy(sCurWord,"未##数");
						nPOS=-27904;//Set the POS with 'm'
						i--;//Can not be a time word
					}
				}
				else
				{
					//Number that already ends with the hour marker.
					//nLen>=2 guard: for a one-byte number the unsigned "-2"
					//offset would wrap and point outside the buffer.
					if(nLen>=2&&strcmp(m_pWordSeg[nIndex][k].sWord+nLen-2,"点")==0)
					{
						strcpy(sCurWord,"未##时");
						nPOS=-29696;//Set the POS with 't'
					}
					else
					{
						//True when the number ends with a separator (2-byte form or ASCII '.' '/').
						//Same nLen>=2 guard as above for the 2-byte tail lookup.
						const bool bTailIsDelimiter=(nLen>=2&&CC_Find("∶·./",m_pWordSeg[nIndex][k].sWord+nLen-2))||m_pWordSeg[nIndex][k].sWord[nLen-1]=='.'||m_pWordSeg[nIndex][k].sWord[nLen-1]=='/';
						if(!bTailIsDelimiter)
						{
							strcpy(sCurWord,"未##数");
							nPOS=-27904;//-'m'*256;Set the POS with 'm'
						}
						else if(nLen>strlen(sInitChar))
						{//Get rid of the trailing separator, example "1."
							if(m_pWordSeg[nIndex][k].sWord[nLen-1]=='.'||m_pWordSeg[nIndex][k].sWord[nLen-1]=='/')
								m_pWordSeg[nIndex][k].sWord[nLen-1]=0;
							else
								m_pWordSeg[nIndex][k].sWord[nLen-2]=0;
							strcpy(sCurWord,"未##数");
							nPOS=-27904;//-'m'*256;Set the POS with 'm'
							i--;//Step back once more so the separator is processed as its own word
						}
					}
					i--;//Not num, back to previous word
				}
			}
			fValue=0;
			nEndVertex=nSegRoute[nIndex][i+1];//The end vertex moves to cover the merged pieces
		}
		m_pWordSeg[nIndex][k].nHandle=nPOS;//Get the POS of current word
		m_pWordSeg[nIndex][k].dValue=fValue;//Value read from the segment graph, or 0 for merged numbers
		m_graphOptimum.SetElement(nStartVertex,nEndVertex,fValue,nPOS,sCurWord);
		//Add the word as an edge of the optimum segmentation graph
		i++;//Skip to next atom
		k++;//Accept next word
	}
	m_pWordSeg[nIndex][k].sWord[0]=0;
	m_pWordSeg[nIndex][k].nHandle=-1;//Set ending
	return true;
}
//DEL bool CSegment::GetSegmentResult(int nIndex,char *sResult)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -