📄 liblexic.cpp
字号:
// }
// else {
// return (unsigned int) CTagConst::tagUNKNOWN;
// }
//}
// Maps a tag given as text to its numeric tag id.
// The lookup itself is delegated entirely to CTagConst::GetTagNo; whatever
// that function returns for strTag is returned here as an unsigned int.
unsigned int CLibLexic::ReplaceTagStrByInt(string strTag)
{
    const unsigned int tagNo = CTagConst::GetTagNo(strTag);
    return tagNo;
}
// Reports whether strWord is "malformed".
// A word is malformed as soon as it contains any character other than
// '-', '.', an apostrophe, a space, or an ASCII letter (a-z / A-Z).
// Returns true if such a character is found, false otherwise (an empty
// word is therefore not malformed).
bool CLibLexic::IsMalform(string strWord)
{
    const int total = strWord.length();
    for (int pos = 0; pos < total; pos++) {
        const char ch = strWord[pos];

        const bool isLetter = (ch >= 'a' && ch <= 'z')
                           || (ch >= 'A' && ch <= 'Z');
        // '\040' is the space character.
        const bool isAllowedMark = (ch == '-') || (ch == '.')
                                || (ch == '\'') || (ch == '\040');

        if (!isLetter && !isAllowedMark) {
            return true;    // first offending character decides
        }
    }
    return false;
}
// Reports whether strWord is "verbose" (redundantly repetitive).
// The word is first passed to ToLower so that the checks are
// case-insensitive (ToLower is defined elsewhere; it is assumed to
// lower-case its argument in place).
//
// The word is verbose when any one of these holds:
//   1. the same character occurs three or more times in a row ("sooo");
//   2. a two-character sequence occurs three or more times in a row
//      ("hehehe");
//   3. it contains more than three '.' characters in total.
//
// Returns true if verbose, false otherwise. An empty word is not verbose.
bool CLibLexic::IsVerbose(string strWord)
{
    ToLower(strWord);

    const int len = strWord.length();
    int i;

    // Check 1: run of three or more identical characters.
    int sameRun = 1;
    for (i = 1; i < len; i++) {
        if (strWord[i] == strWord[i - 1]) {
            sameRun++;
            if (sameRun > 2) {
                return true;
            }
        }
        else {
            sameRun = 1;
        }
    }

    // Check 2: a two-character sequence repeated three or more times.
    // Six characters "xyxyxy" are exactly four consecutive positions i
    // with strWord[i] == strWord[i-2]. Counting those matches directly
    // keeps the tracked pair aligned with the text no matter where a
    // mismatch happens, so "abacacac" is detected and "hehxexex" is not.
    int periodMatches = 0;
    for (i = 2; i < len; i++) {
        if (strWord[i] == strWord[i - 2]) {
            periodMatches++;
            if (periodMatches >= 4) {
                return true;
            }
        }
        else {
            periodMatches = 0;
        }
    }

    // Check 3: more than three dots anywhere in the word.
    int dots = 0;
    for (i = 0; i < len; i++) {
        if (strWord[i] == '.') {
            dots++;
            if (dots > 3) {
                return true;
            }
        }
    }

    return false;
}
// Matches strWord against the set of supported word patterns.
// The actual matching is done by FitPatternEx, which writes the pattern
// into strPattern. When the match succeeds, strPattern is additionally
// passed to ToLower (defined elsewhere; presumably lower-cases in place).
// Returns true if the word fits a pattern, false otherwise.
bool CLibLexic::FitPattern(string strWord,string& strPattern)
{
    if (!FitPatternEx(strWord, strPattern)) {
        return false;
    }
    ToLower(strPattern);
    // Recording the pattern in m_vecPattern is currently disabled:
    // m_vecPattern.push_back(strPattern);
    return true;
}
// Splits strWord at every '-' and matches each piece with FitPatternExEx.
// The pattern of each piece is appended to strPattern, with a "-" appended
// after every piece that was terminated by a hyphen. strPattern is NOT
// cleared on entry, so anything already in it is kept as a prefix.
// Matching stops at the first piece that does not fit; in that case
// strPattern is emptied completely and false is returned.
bool CLibLexic::FitPatternEx(string strWord,string& strPattern)
{
    string segment;                 // characters collected since the last '-'
    bool allFit = true;
    const int total = strWord.length();

    for (int pos = 0; pos < total && allFit; pos++) {
        const char ch = strWord[pos];
        if (ch != '-') {
            segment.append(1, ch);
            continue;
        }
        // Hyphen reached: classify the finished segment, then start a new one.
        allFit = FitPatternExEx(segment, strPattern);
        segment.erase();
        strPattern += "-";
    }

    // Classify the trailing segment (the part after the last hyphen).
    if (allFit) {
        allFit = FitPatternExEx(segment, strPattern);
    }

    if (!allFit) {
        strPattern.erase();
    }
    return allFit;
}
// Classifies one hyphen-free piece (strTemp) and appends its pattern to
// strPattern. The recognisers are tried in a fixed order and the first one
// that accepts wins:
//   PatternNumeral, PatternAlphas, PatternAD, PatternDA, PatternIS, PatternLSR
// For PatternAlphas the piece itself is appended; for all others the
// pattern text produced by the recogniser is appended.
// Returns true if some recogniser accepted, false otherwise (strPattern is
// then left as the recognisers left it).
bool CLibLexic::FitPatternExEx(string strTemp,string& strPattern)
{
    string piecePattern;    // scratch output shared by the recognisers

    if (PatternNumeral(strTemp, piecePattern)) {
        strPattern += piecePattern;
        return true;
    }
    if (PatternAlphas(strTemp)) {
        strPattern += strTemp;
        return true;
    }
    if (PatternAD(strTemp, piecePattern)) {
        strPattern += piecePattern;
        return true;
    }
    // PatternDD is intentionally not consulted here (disabled):
    // if (PatternDD(strTemp, piecePattern)) { strPattern += piecePattern; return true; }
    if (PatternDA(strTemp, piecePattern)) {
        strPattern += piecePattern;
        return true;
    }
    if (PatternIS(strTemp, piecePattern)) {
        strPattern += piecePattern;
        return true;
    }
    if (PatternLSR(strTemp, piecePattern)) {
        strPattern += piecePattern;
        return true;
    }
    return false;
}
// Recognisers for text written according to the usual conventions for
// numbers, e.g. 100,323.0234  32.34  23,542,235

// Returns true when every character of strWord is a decimal digit.
// Note: an empty string contains no non-digit and therefore also yields
// true.
bool CLibLexic::NumeralInteger(string strWord)
{
    const int total = strWord.length();
    for (int pos = 0; pos < total; pos++) {
        const char ch = strWord[pos];
        if (ch < '0' || ch > '9') {
            return false;
        }
    }
    return true;
}
// Returns true when strWord is a comma-free decimal number:
//   digits            ("123")
//   digits.digits     ("32.34")
//   .digits           (".32")
// A '.' must be followed by at least one digit, and only one '.' is
// allowed. The empty string, ".", and "1." are rejected.
bool CLibLexic::NumeralFloat(string strWord)
{
    const int total = strWord.length();
    int pos = 0;

    // Optional integer part.
    while (pos < total && strWord[pos] >= '0' && strWord[pos] <= '9') {
        pos++;
    }

    // Nothing left: valid only if at least one digit was consumed.
    if (pos == total) {
        return total > 0;
    }

    // The only thing allowed after the integer part is a decimal point...
    if (strWord[pos] != '.') {
        return false;
    }
    pos++;

    // ...which must be followed by one or more digits and nothing else.
    if (pos == total) {
        return false;
    }
    for (; pos < total; pos++) {
        if (strWord[pos] < '0' || strWord[pos] > '9') {
            return false;
        }
    }
    return true;
}
// Returns true when strWord is a number that may use ',' as a group
// separator in its integer part and may have a fractional part:
//   "23,542,235"   "100,323.0234"   "32.34"   "7"
// Rules enforced by the state machine below:
//   - the text must start with a digit;
//   - every ',' and the '.' must be directly followed by a digit;
//   - after the '.', only digits may follow (no further ',' or '.').
// The size of the comma-separated groups is not checked.
bool CLibLexic::NumeralDigits(string strWord)
{
    // States: 0 = start, 1 = inside integer part, 2 = just after ',',
    //         3 = just after '.', 4 = inside fractional part.
    int state = 0;
    bool accepting = false;     // true when the text could validly end here
    const int total = strWord.length();

    for (int pos = 0; pos < total; pos++) {
        const char ch = strWord[pos];
        const bool isDigit = (ch >= '0' && ch <= '9');

        switch (state) {
        case 0:
        case 2:
            // A digit is mandatory at the start and after a comma.
            if (!isDigit) {
                return false;
            }
            state = 1;
            accepting = true;
            break;

        case 1:
            if (isDigit) {
                accepting = true;
            }
            else if (ch == ',') {
                state = 2;
                accepting = false;
            }
            else if (ch == '.') {
                state = 3;
                accepting = false;
            }
            else {
                return false;
            }
            break;

        case 3:
        case 4:
            // Only digits are allowed once the decimal point has been seen.
            if (!isDigit) {
                return false;
            }
            state = 4;
            accepting = true;
            break;

        default:
            assert(false);
        }
    }
    return accepting;
}
// Classifies a numeric token and writes its pattern tag to strNumPat:
//   {i}  integer without commas
//   {f}  comma-free number that is not a plain integer: a real number or
//        a pure fraction such as .32
//   {d}  integer or real number that uses commas
// Returns false (strNumPat untouched) when strNum is none of these.
//
// The order of the tests matters: a plain digit string is also accepted by
// NumeralFloat and NumeralDigits, and a comma-free real is also accepted
// by NumeralDigits, so the most specific test must come first.
bool CLibLexic::PatternNumeral(string strNum,string& strNumPat)
{
    const char* tag = 0;

    if (NumeralInteger(strNum)) {
        tag = "{i}";
    }
    else if (NumeralFloat(strNum)) {
        tag = "{f}";
    }
    else if (NumeralDigits(strNum)) {
        tag = "{d}";
    }

    if (tag == 0) {
        return false;
    }
    strNumPat = tag;
    return true;
}
// Accepts two shapes:
//   all ASCII letters followed by a single trailing '.'
//   all ASCII letters
// Returns true when every character of str (ignoring one trailing '.', if
// present) is a-z or A-Z. Because nothing is left to violate the rule, the
// empty string and the lone "." also yield true; callers that need at
// least one letter must check for that themselves.
bool CLibLexic::PatternAlphas(string str)
{
    int len = str.length();

    // Only inspect the last character when there is one: indexing
    // str[len-1] on an empty string would read out of bounds.
    if (len > 0 && '.' == str[len - 1]) {
        len = len - 1;
    }

    for (int i = 0; i < len; i++) {
        const char ch = str[i];
        const bool isLetter = (ch >= 'a' && ch <= 'z')
                           || (ch >= 'A' && ch <= 'Z');
        if (!isLetter) {
            return false;
        }
    }
    return true;
}
// Letters-then-digits patterns ("AD" = alpha + digit). Shapes covered:
//   single letter + '.' + {i}                 e.g. V.4   -> V.{i}
//   single letter + {i} + '.' + {i}           e.g. X11.5 -> X{i}.{i}
//   single letter + {i}
//   letters + {i} + ... (and so on)
// Note: once only letters follow, the match simply ends; after a
// "{i} + letters" step the letters/digits alternation may repeat.
//
// On success the pattern is written to strPattern: letters and '.' are
// copied as-is and every run of digits is replaced by "{i}". On failure
// strPattern is left untouched and false is returned.
//
// States: 0 = start, 1 = exactly one leading letter seen,
//         2 = inside a run of letters, 3 = just after '.',
//         4 = inside a digit run that followed letters,
//         5 = inside the digit run that followed '.'.
bool CLibLexic::PatternAD(string strWord,string& strPattern)
{
    string built;               // pattern assembled so far
    int state = 0;
    bool accepting = false;     // true when the word may validly end here
    const int total = strWord.length();

    for (int pos = 0; pos < total; pos++) {
        const char ch = strWord[pos];
        const bool isLetter = (ch >= 'a' && ch <= 'z')
                           || (ch >= 'A' && ch <= 'Z');
        const bool isDigit = (ch >= '0' && ch <= '9');

        switch (state) {
        case 0:
            // The word must open with a letter.
            if (!isLetter) {
                return false;
            }
            built.append(1, ch);
            state = 1;
            accepting = false;
            break;

        case 1:
            if (isLetter) {
                built.append(1, ch);
                state = 2;
                accepting = true;
            }
            else if (isDigit) {
                state = 4;
                accepting = true;
            }
            else if ('.' == ch) {
                // A '.' directly after letters is only allowed after a
                // single leading letter.
                built.append(1, ch);
                state = 3;
                accepting = false;
            }
            else {
                return false;
            }
            break;

        case 2:
            if (isLetter) {
                built.append(1, ch);
                accepting = true;
            }
            else if (isDigit) {
                state = 4;
                accepting = true;
            }
            else {
                return false;
            }
            break;

        case 3:
        case 5:
            // After the '.', only digits are accepted until the end.
            if (!isDigit) {
                return false;
            }
            state = 5;
            accepting = true;
            break;

        case 4:
            if (isLetter) {
                // The digit run is over: emit its placeholder, then the letter.
                built += "{i}";
                built.append(1, ch);
                state = 2;
                accepting = true;
            }
            else if (isDigit) {
                accepting = true;
            }
            else if ('.' == ch) {
                built += "{i}";
                built.append(1, ch);
                state = 3;
                accepting = false;
            }
            else {
                return false;
            }
            break;

        default:
            assert(false);
        }
    }

    if (!accepting) {
        return false;
    }

    switch (state) {
    case 2:
        strPattern = built;
        break;
    case 4:
    case 5:
        // The word ended inside a digit run whose placeholder is still pending.
        strPattern = built + "{i}";
        break;
    default:
        assert(false);
    }
    return true;
}
// {d}%
// {d} + :{d} + .......(省略)
bool CLibLexic::PatternDD(string strWord,string& strPattern)
{
string strTemp,strTempPat,strTempPatEx;
bool bFitted;
int i,len;
len = strWord.length();
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -