// liblexic.cpp
// (code-viewer page residue: "font size" control label; not part of the original source)
// LibLexic.cpp: implementation of the CLibLexic class.
//
//////////////////////////////////////////////////////////////////////
#include "stdafx.h"
#include "LibLexic.h"
//#include "libman.h"
#include "LexConst.h"
#include <assert.h>
#ifdef _DEBUG
#undef THIS_FILE
static char THIS_FILE[]=__FILE__;
#define new DEBUG_NEW
#endif
//////////////////////////////////////////////////////////////////////
// Construction/Destruction
//////////////////////////////////////////////////////////////////////
// Default constructor: clears the per-tag counters, the tag-to-tag
// transition lattice and the "previous tag" marker.
// NOTE(review): 62 is the bound used for every table cleared here; it is
// presumably the declared array dimension - confirm in LibLexic.h.
CLibLexic::CLibLexic()
{
    int nRow, nCol;

    m_nPreTag = 0;

    // One-dimensional counters, indexed by tag slot.
    for(nRow=0; nRow<62; ++nRow)
    {
        m_aryAllWordTagNum[nRow] = 0;
        m_aryRecWordTagNum[nRow] = 0;
    }

    // Two-dimensional lattice, indexed by [previous tag][current tag].
    for(nRow=0; nRow<62; ++nRow)
    {
        for(nCol=0; nCol<62; ++nCol)
        {
            m_aryTagLattice[nRow][nCol] = 0;
        }
    }
}
// Destructor: the body is intentionally empty; nothing is released here.
CLibLexic::~CLibLexic() {}
// Processes one input line.
// First resets the per-line state (m_strLine, m_vecWordTag, m_nPreTag).
// If IsSentence() accepts str, the line is handed to Normalize().
// Returns true when str was recognised as a sentence, false otherwise.
bool CLibLexic::Analyse(string str)
{
    m_nPreTag = 0;
    m_vecWordTag.erase(m_vecWordTag.begin(),m_vecWordTag.end());
    m_strLine.erase(m_strLine.begin(),m_strLine.end());

    if(!IsSentence(str))
        return false;

    Normalize(str);
    return true;
} // returns 'true' when the input is a sentence
// Searches str for the first sentence-opening tag of the form
//     <s n="123">
// The accepted shape is: '<', optional spaces, 's', at least one space,
// 'n', optional spaces, '=', optional spaces, '"', optional spaces, one or
// more decimal digits, '"', optional spaces, '>'.
//
// On success the prefix of str up to and including the closing '>' is
// stored in m_strLine and true is returned; the tag itself is not stripped
// from str. When no such tag is found, false is returned and m_strLine is
// left untouched.
//
// A character that breaks a partial match is re-examined as a possible new
// start: a '<' arriving in the middle of a failed attempt begins a fresh
// match instead of being swallowed, so input such as  <<s n="1">  is
// recognised.
bool CLibLexic::IsSentence(string str)
{
    const char chSpace = '\040';    // '\040' is the octal escape for a space
    unsigned int state = CLibLexic::STATE_S_0;

    for(string::size_type i=0; i<str.length(); i++)
    {
        const char ch = str[i];
        const bool bDigit = (ch>='0') && (ch<='9');

        // Transition taken when ch does not continue the pattern: restart,
        // treating a '<' as the first character of a new attempt.
        unsigned int next = CLibLexic::STATE_S_0;
        if(ch=='<')
            next = CLibLexic::STATE_S_1;

        switch(state) {
        case CLibLexic::STATE_S_0 :     // looking for '<'
            break;
        case CLibLexic::STATE_S_1 :     // after '<' : spaces, then 's'
            if(ch=='s')
                next = CLibLexic::STATE_S_2;
            else if(ch==chSpace)
                next = CLibLexic::STATE_S_1;
            break;
        case CLibLexic::STATE_S_2 :     // after 's' : a space is mandatory
            if(ch==chSpace)
                next = CLibLexic::STATE_S_3;
            break;
        case CLibLexic::STATE_S_3 :     // more spaces, then 'n'
            if(ch=='n')
                next = CLibLexic::STATE_S_4;
            else if(ch==chSpace)
                next = CLibLexic::STATE_S_3;
            break;
        case CLibLexic::STATE_S_4 :     // after 'n' : spaces, then '='
            if(ch=='=')
                next = CLibLexic::STATE_S_5;
            else if(ch==chSpace)
                next = CLibLexic::STATE_S_4;
            break;
        case CLibLexic::STATE_S_5 :     // after '=' : spaces, then opening '"'
            if(ch=='"')
                next = CLibLexic::STATE_S_6;
            else if(ch==chSpace)
                next = CLibLexic::STATE_S_5;
            break;
        case CLibLexic::STATE_S_6 :     // after '"' : spaces, then first digit
            if(bDigit)
                next = CLibLexic::STATE_S_7;
            else if(ch==chSpace)
                next = CLibLexic::STATE_S_6;
            break;
        case CLibLexic::STATE_S_7 :     // inside the number : digits, then '"'
            if(ch=='"')
                next = CLibLexic::STATE_S_8;
            else if(bDigit)
                next = CLibLexic::STATE_S_7;
            break;
        case CLibLexic::STATE_S_8 :     // after closing '"' : spaces, then '>'
            if(ch=='>') {
                // Complete tag found: remember everything up to the '>'.
                m_strLine.assign(str,0,i+1);
                return true;
            }
            else if(ch==chSpace)
                next = CLibLexic::STATE_S_8;
            break;
        }
        state = next;
    }
    return false;
}
// Walks through str and extracts every "<w TAG>text" / "<c TAG>text" item.
// For each item the tag text (everything between the space after 'w'/'c'
// and the closing '>') and the following text (everything up to the next
// '<') are passed to RecordTagWords().
//
// Phases of the scanner:
//   STATE_S_A  waiting for '<'
//   STATE_S_B  after '<'  : spaces allowed, then 'w' or 'c'
//   STATE_S_C  after 'w'/'c' : a space must follow
//   STATE_S_D  collecting the tag text until '>'
//   STATE_S_E  collecting the word text until the next '<'
// An item still pending when the string ends is recorded as well, provided
// its tag text is non-empty.
void CLibLexic::Normalize(string str)
{
    string strTagText, strWordText;
    unsigned int nPhase = CLibLexic::STATE_S_A;

    for(string::iterator it=str.begin(); it!=str.end(); ++it)
    {
        const char ch = *it;

        if(nPhase==CLibLexic::STATE_S_A) {
            if(ch=='<')
                nPhase = CLibLexic::STATE_S_B;
        }
        else if(nPhase==CLibLexic::STATE_S_B) {
            if(ch=='w' || ch=='c')
                nPhase = CLibLexic::STATE_S_C;
            else if(ch!='\040')
                nPhase = CLibLexic::STATE_S_A;
            // a space keeps us in STATE_S_B
        }
        else if(nPhase==CLibLexic::STATE_S_C) {
            if(ch=='\040')
                nPhase = CLibLexic::STATE_S_D;
            else
                nPhase = CLibLexic::STATE_S_A;
        }
        else if(nPhase==CLibLexic::STATE_S_D) {
            if(ch=='>')
                nPhase = CLibLexic::STATE_S_E;
            else
                // expected: letters, digits or '-' (tag text such as
                // NN1-NNP), possibly spaces
                strTagText.append(1,ch);
        }
        else if(nPhase==CLibLexic::STATE_S_E) {
            if(ch=='<') {
                // The next element starts: flush the finished item.
                RecordTagWords(strTagText,strWordText);
                strTagText.erase();
                strWordText.erase();
                nPhase = CLibLexic::STATE_S_B;
            }
            else
                // every other character belongs to the word text
                strWordText.append(1,ch);
        }
    }

    if(!strTagText.empty()) {
        RecordTagWords(strTagText,strWordText);
    }
}
// Normalises the spaces in strWord in place: leading and trailing spaces
// are removed and every run of spaces between two non-space characters is
// reduced to a single space. Only the space character ('\040') is treated
// as white space.
void CLibLexic::TrimWord(string &strWord)
{
    string strOut;
    bool bGapPending = false;   // spaces seen since the last kept character

    for(string::iterator it=strWord.begin(); it!=strWord.end(); ++it)
    {
        const char ch = *it;
        if(ch=='\040') {
            // Spaces before the first kept character are simply dropped.
            if(!strOut.empty())
                bGapPending = true;
            continue;
        }
        if(bGapPending) {
            strOut.append(1,'\040');
            bGapPending = false;
        }
        strOut.append(1,ch);
    }

    strWord = strOut;
}
// Normalises the spaces in strWord like TrimWord() and, at the same time,
// looks for an entity-like pattern: '&', followed by at least one character
// that is neither a space, '&' nor ';', followed by ';' (for instance the
// &mdash; / &aacute; style entities found in French words).
//
// Returns false as soon as such a pattern is met; strWord is then left
// unchanged. Otherwise returns true and strWord receives the
// space-normalised text.
bool CLibLexic::TrimAndTell(string &strWord)
{
    // Progress of the entity detection.
    enum { ENT_NONE, ENT_AMP, ENT_NAME };

    string strOut;
    int nEntity = ENT_NONE;
    bool bGapPending = false;   // spaces seen since the last kept character

    for(string::iterator it=strWord.begin(); it!=strWord.end(); ++it)
    {
        const char ch = *it;

        if(ch=='\040') {
            // A space interrupts any entity in progress; leading spaces
            // are dropped, later ones collapse into one pending gap.
            bGapPending = !strOut.empty();
            nEntity = ENT_NONE;
            continue;
        }

        if(nEntity==ENT_NAME && ch==';') {
            // "&name;" completed: reject the word.
            return false;
        }

        if(bGapPending) {
            strOut.append(1,'\040');
            bGapPending = false;
        }
        strOut.append(1,ch);

        switch(nEntity) {
        case ENT_NONE:      // ordinary text
            if(ch=='&')
                nEntity = ENT_AMP;
            break;
        case ENT_AMP:       // the previous character was '&'
            if(ch==';')
                nEntity = ENT_NONE;
            else if(ch!='&')
                nEntity = ENT_NAME;
            break;
        case ENT_NAME:      // inside the name following '&'
            if(ch=='&')
                nEntity = ENT_AMP;
            break;
#ifdef _DEBUG
        default:
            cout<< "Just incredible!"<<endl;
#endif
        }
    }

    strWord = strOut;
    return true;
}
// Tells whether str is a "pure number".
// Accepted: optional leading spaces, then characters taken only from
// digits, ',', '.' and space; the first non-space character may also be
// a '+' or '-' sign. An empty or all-space string is accepted as well.
bool CLibLexic::IsPureDigits(string str)
{
    bool bLeadSeen = false;     // true once the first non-space char was read

    for(string::iterator it=str.begin(); it!=str.end(); ++it)
    {
        const char ch = *it;
        if(ch=='\040')
            continue;           // spaces are allowed everywhere

        const bool bNumeric = (ch==',') || (ch=='.')
                           || ( (ch>='0') && (ch<='9') );
        const bool bSign    = (ch=='+') || (ch=='-');

        // A sign is only tolerated as the very first non-space character.
        if( !bNumeric && !(bSign && !bLeadSeen) )
            return false;

        bLeadSeen = true;
    }
    return true;
}
// Splits str at every '/' and stores the pieces, in order, in vecWords.
// vecWords is emptied first; empty pieces (leading, trailing or doubled
// '/') are not stored.
void CLibLexic::PartitionWords(string str,vector<string>& vecWords)
{
    vecWords.erase(vecWords.begin(),vecWords.end());

    string::size_type nStart = 0;
    for(;;)
    {
        const string::size_type nSlash = str.find('/',nStart);
        const string::size_type nStop =
            (nSlash==string::npos) ? str.length() : nSlash;

        if(nStop>nStart) {
            vecWords.push_back(str.substr(nStart,nStop-nStart));
        }
        if(nSlash==string::npos)
            break;
        nStart = nSlash+1;
    }
}
// Records one tagged item.
// strTag is space-normalised and converted to its numeric tag; the
// transition previous-tag -> this-tag is counted in m_aryTagLattice.
// strWords is then split at '/' and every piece is counted in
// m_aryAllWordTagNum. A piece is NOT stored when
//   (1) it contains an entity such as &mdash; or &aacute; (e.g. French
//       words), i.e. TrimAndTell() rejects it,
//   (2) the tag is one of UNC, PUL, PUQ, PUN, PUR,
//   (3) the tag is CRD and the piece consists only of digits, commas,
//       periods (see IsPureDigits()),
//   (4) it is longer than WORDLEN, or empty after trimming,
//   (5) IsMalform()/PatternDD()/FitPattern()/IsVerbose() rule it out.
// Accepted pieces are lower-cased and appended to m_vecWordTag, and
// counted in m_aryRecWordTagNum.
// NOTE(review): the original note also mentions m_vecStrWord and
// m_vecVerbose; neither is touched in this function - presumably handled
// by the helpers, confirm there.
void CLibLexic::RecordTagWords(string strTag, string strWords)
{
    TrimWord(strTag);
    const unsigned int nTag = ReplaceTagStrByInt(strTag);

    // The modulo folds tagUNKNOWN onto slot 0 of the tables.
    const unsigned int nSlot = nTag % CTagConst::tagUNKNOWN;
    m_aryTagLattice[m_nPreTag][nSlot] ++ ;
    m_nPreTag = nSlot;

    const bool bCardinal = (strTag=="CRD");
    const bool bIgnoredTag = (strTag=="UNC")
                          || (strTag=="PUL")
                          || (strTag=="PUN")
                          || (strTag=="PUQ")
                          || (strTag=="PUR");

    vector<string> vecParts;
    PartitionWords(strWords,vecParts);

    vector<string>::iterator itPart;
    for(itPart=vecParts.begin(); itPart!=vecParts.end(); ++itPart)
    {
        // tagUNKNOWN is stored at m_aryAllWordTagNum[0]
        m_aryAllWordTagNum[nSlot] ++;

        if(bIgnoredTag)
            continue;

        string strCandidate = *itPart;
        if(bCardinal && IsPureDigits(strCandidate))
            continue;
        if(!TrimAndTell(strCandidate))
            continue;               // contains &mdash; &aacute; and the like

        // From here on the piece is a so-called 'legal' word.
        string strShape;
        const string* pKeep = 0;    // text to store, if any

        if(IsMalform(strCandidate)) {
            string strDD;
            if( !PatternDD(strCandidate,strDD)
                && FitPattern(strCandidate,strShape)
                && strShape.length()<=WORDLEN )
            {
                pKeep = &strShape;
            }
        }
        else if( !IsVerbose(strCandidate)
                 && strCandidate.length()<=WORDLEN
                 && strCandidate.length()>0 )
        {
            // PartitionWords may deliver a piece made of spaces only,
            // which is empty after trimming - hence the length check.
            pKeep = &strCandidate;
        }

        if(pKeep!=0) {
            // A word that really has to go into the dictionary.
            // NOTE(review): strcpy relies on the WORDLEN checks above;
            // assumes cstrWord holds at least WORDLEN+1 chars - confirm
            // in the SWordTag declaration.
            SWordTag wt;
            wt.nTag = nTag;
            strcpy(wt.cstrWord,pKeep->c_str());
            _strlwr(wt.cstrWord);
            m_vecWordTag.push_back(wt);
            // tagUNKNOWN is stored at m_aryRecWordTagNum[0]
            m_aryRecWordTagNum[nSlot] ++;
        }
    }
}
//unsigned int CLibLexic::ReplaceTagStrByInt(string strTag)
//{
// bool bMatchTag;
// unsigned int nTag;
// int i;
// if(strTag.length()>3){
// strTag.erase(strTag.begin()+3,strTag.end());
// }
// bMatchTag = false;
// for(i=0;i<CTagConst::GetTagsNum(); i++)
// {
// if(strTag.compare(CTagConst::m_strTags[i])==0){
// bMatchTag = true;
// nTag = (unsigned int) i+1;
// // Note: tags are numbered starting from 1, because this is consistent
// // with ClearCell in CLexBTree
// break;
// }
// }
// if(bMatchTag){
// return nTag;
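// [Code-viewer page residue, not part of the original source. The listing of
//  the commented-out function above is cut off at this point.
//  Keyboard shortcuts shown by the viewer: copy code Ctrl + C, search code
//  Ctrl + F, full-screen mode F11, toggle theme Ctrl + Shift + D, show
//  shortcuts ?, increase font size Ctrl + =, decrease font size Ctrl + -]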