📄 standardtokenizer.cpp
字号:
unReadChar();
goto SUCCESSFULLY_EXTRACTED_NUMBER;
}
CONSUME_DIGITS;
if (!DIGIT && !DECIMAL) {
unReadChar();
} else if (!EOS && DECIMAL && _istdigit(rd->Peek())) {
/* We just read the fractional digit group, but it's also followed by
** a decimal symbol and at least one more digit, so this must be a
** HOST rather than a real number. */
return ReadNumber(str.getBuffer(), '.',t);
}
}
SUCCESSFULLY_EXTRACTED_NUMBER:
TCHAR rightmost = RIGHTMOST(str);
/* Don't include a trailing decimal point. */
if (rightmost == '.') {
SHAVE_RIGHTMOST(str);
unReadChar();
rightmost = RIGHTMOST(str);
}
/* If all we have left is a negative sign, it's not a valid number. */
if (rightmost == '-') {
CND_PRECONDITION (str.len == 1, "Number is invalid");
return false;
}
return setToken(t,&str,tokenType);
}
/* Reads a token that starts with the already-consumed alphanumeric
** character `prev`, writing the text directly into t's term buffer.
** If the first word segment is followed by '.', '\'', '@' or '&', the
** special character is appended and the rest of the token is delegated to
** ReadDotted / ReadApostrophe / ReadAt / ReadCompany respectively;
** otherwise the token is emitted as ALPHANUM.
** Returns whatever setToken (or the delegate) returns. */
bool StandardTokenizer::ReadAlphaNum(const TCHAR prev, Token* t) {
  /* Make sure the token can hold the next word, then use a StringBuffer
  ** to read the data straight onto the token's termText. */
  t->growBuffer(LUCENE_MAX_WORD_LEN);
  StringBuffer str(t->_termText,t->bufferLength(),true);

  /* No room left for even the first character: emit what is there. */
  if ( str.len >= LUCENE_MAX_WORD_LEN ) {
    return setToken(t,&str,CL_NS2(analysis,standard)::ALPHANUM);
  }

  str.appendChar(prev);
  TCHAR ch = prev;
  CONSUME_WORD;

  /* Only look at the follow-up character when input remains and the buffer
  ** still has space for one more character.
  ** NOTE(review): when this guard fails because of the length limit, the
  ** character in `ch` is not pushed back here -- confirm that is intended. */
  if (!EOS && str.len < LUCENE_MAX_WORD_LEN-1 ) {
    /* What follows the first alphanum segment? */
    if (ch == '.') {
      str.appendChar('.');
      return ReadDotted(&str, CL_NS2(analysis,standard)::UNKNOWN,t);
    }
    if (ch == '\'') {
      str.appendChar('\'');
      return ReadApostrophe(&str,t);
    }
    if (ch == '@') {
      str.appendChar('@');
      return ReadAt(&str,t);
    }
    if (ch == '&') {
      str.appendChar('&');
      return ReadCompany(&str,t);
    }
    /* Anything else does not belong to this token: push it back so it is
    ** read again later (by linfj), then finish as a plain ALPHANUM. */
    unReadChar();
  }

  return setToken(t,&str,CL_NS2(analysis,standard)::ALPHANUM);
}
/* Continues a token whose buffer already ends in a special character
** (a '.' when called from ReadAlphaNum, an '@' when called via ReadAt),
** consuming further alphanumeric/underscore segments joined by single dots
** or dashes, and then classifies the result.
**
** _str       - buffer holding the token text read so far; extended in place.
** forcedType - type to report unless the token is reclassified below as
**              ALPHANUM (no dots left) or ACRONYM. Passing UNKNOWN asks this
**              function to decide, in which case the fallback is HOST.
** t          - token to fill in.
**
** If the character that ended the scan is '@' (and there is room for it),
** the '@' is appended and the rest is delegated to ReadAt. Returns the
** result of ReadAt or setToken. */
bool StandardTokenizer::ReadDotted(StringBuffer* _str, TokenTypes forcedType, Token* t) {
  const int32_t specialCharPos = rdPos;
  StringBuffer& str=*_str;

  /* A segment of a "dotted" is not allowed to begin with another dot or a dash.
  ** Even though hosts, e-mail addresses, etc., could have a dotted-segment
  ** that begins with a dot or a dash, it's far more common in source text
  ** for a pattern like "abc.--def" to be intended as two tokens. */
  TCHAR ch = rd->Peek();
  if (!(DOT || DASH)) {
    bool prevWasDot;
    bool prevWasDash;
    if (str.len == 0) {
      prevWasDot = false;
      prevWasDash = false;
    } else {
      prevWasDot = RIGHTMOST(str) == '.';
      prevWasDash = RIGHTMOST(str) == '-';
    }
    while (!EOS && str.len < LUCENE_MAX_WORD_LEN-1 ) {
      ch = readChar();
      const bool dot = ch == '.';
      const bool dash = ch == '-';

      if (!(ALNUM || UNDERSCORE || dot || dash)) {
        break;
      }
      /* Multiple dots or dashes in succession end the token.
      ** Consider the following inputs:
      **   "Visit windowsupdate.microsoft.com--update today!"
      **   "In the U.S.A.--yes, even there!" */
      if ((dot || dash) && (prevWasDot || prevWasDash)) {
        /* We're not going to append the character we just read, in any case.
        ** As to the character before it (which is currently RIGHTMOST(str)):
        ** unless RIGHTMOST(str) is a dot, in which case we need to keep it so
        ** the acronym-versus-host detection can work, we want to get rid of it. */
        if (!prevWasDot) {
          SHAVE_RIGHTMOST(str);
        }
        break;
      }

      str.appendChar(ch);

      prevWasDot = dot;
      prevWasDash = dash;
    }
  }

  /* There's a potential StringBuffer.append call in the code above, which
  ** could cause str to reallocate its internal buffer. We must wait to
  ** obtain the optimization-oriented strBuf pointer until after the initial
  ** potentially realloc-triggering operations on str.
  ** Because there can be other such ops much later in this function, strBuf
  ** is guarded within a block to prevent its use during or after the calls
  ** that would potentially invalidate it. */
  { /* Begin block-guard of strBuf */
    TCHAR* strBuf = str.getBuffer();

    bool rightmostIsDot = RIGHTMOST_IS(str, '.');
    if (CONSUMED_NOTHING_OF_VALUE) {
      /* No more alphanums available for this token; shave trailing dot, if any. */
      if (rightmostIsDot) {
        SHAVE_RIGHTMOST(str);
      }
      /* If there are no dots remaining, this is a generic ALPHANUM. */
      if (_tcschr(strBuf, '.') == NULL) {
        forcedType = CL_NS2(analysis,standard)::ALPHANUM;
      }
    } else if (rightmostIsDot) {
      /* Check the token to see if it's an acronym. An acronym must have a
      ** letter in every even slot and a dot in every odd slot, including the
      ** last slot (for example, "U.S.A.").
      **
      ** The ALPHA and DOT tests operate on `ch`, so the scan has to reuse
      ** that variable. `ch` is, however, consulted again after this block to
      ** decide whether an '@' follows, so remember the character that was
      ** actually read from the stream and put it back once the scan is done.
      ** Otherwise a buffer character (e.g. an earlier '@') would be mistaken
      ** for the character that terminated the token. */
      const TCHAR lastRead = ch;
      bool isAcronym = true;
      const int32_t upperCheckLimit = str.len - 1; /* -1 b/c we already checked the last slot. */

      for (int32_t i = 0; i < upperCheckLimit; i++) {
        const bool even = (i % 2 == 0);
        ch = strBuf[i];
        if ( (even && !ALPHA) || (!even && !DOT) ) {
          isAcronym = false;
          break;
        }
      }
      ch = lastRead;

      if (isAcronym) {
        forcedType = CL_NS2(analysis,standard)::ACRONYM;
      } else {
        /* If it's not an acronym, we don't want the trailing dot. */
        SHAVE_RIGHTMOST(str);
        /* If there are no dots remaining, this is a generic ALPHANUM. */
        if (_tcschr(strBuf, '.') == NULL) {
          forcedType = CL_NS2(analysis,standard)::ALPHANUM;
        }
      }
    }
  } /* End block-guard of strBuf */

  if (!EOS) {
    if (ch == '@' && str.len < LUCENE_MAX_WORD_LEN-1) {
      /* The token continues as an e-mail address / company name. */
      str.appendChar('@');
      return ReadAt(&str,t);
    } else {
      /* The terminating character belongs to the next token. */
      unReadChar();
    }
  }

  /* Compare forcedType against UNKNOWN (rather than testing the UNKNOWN
  ** enumerator itself, which is a constant): a token that was neither given
  ** a type by the caller nor reclassified above is reported as a HOST. */
  return setToken(t,&str,
      forcedType != CL_NS2(analysis,standard)::UNKNOWN
          ? forcedType : CL_NS2(analysis,standard)::HOST);
}
/* Continues a token whose buffer already ends in an apostrophe, consuming
** the alphabetic characters that follow it.
** If nothing usable followed the apostrophe, the trailing apostrophe is
** removed and the token is emitted as ALPHANUM; otherwise it is emitted as
** APOSTROPHE. Unless the input is exhausted, the character that ended the
** scan is pushed back. Returns the result of setToken. */
bool StandardTokenizer::ReadApostrophe(StringBuffer* _str, Token* t) {
  const int32_t specialCharPos = rdPos;
  StringBuffer& str = *_str;
  TCHAR ch = 0;

  CONSUME_ALPHAS;

  /* True when, after the apostrophe, no more alphanums were available
  ** within this token. */
  const bool nothingAfterApostrophe =
      RIGHTMOST_IS(str, '\'') || CONSUMED_NOTHING_OF_VALUE;
  if (nothingAfterApostrophe) {
    /* Shave the trailing apostrophe; the token reverts to generic ALPHANUM. */
    SHAVE_RIGHTMOST(str);
  }

  if (!EOS) {
    unReadChar();
  }

  return setToken(t,&str,
      nothingAfterApostrophe ? CL_NS2(analysis,standard)::ALPHANUM
                             : CL_NS2(analysis,standard)::APOSTROPHE);
}
/* Continues a token whose buffer already ends in '@'.
** The remainder is read with ReadDotted, asking for the EMAIL type. If the
** resulting text contains neither a dot nor a digit, the token is re-tagged
** as COMPANY instead.
**
** str - buffer holding the token text read so far; extended in place.
** t   - token to fill in.
** Always returns true. */
bool StandardTokenizer::ReadAt(StringBuffer* str, Token* t) {
  /* ReadDotted fills in t itself; its return value is intentionally not
  ** used because this function reports success unconditionally. */
  ReadDotted(str, CL_NS2(analysis,standard)::EMAIL,t);
  /* JLucene grammar indicates dots/digits not allowed in company name: */
  if (!CONTAINS_ANY((*str), ".0123456789")) {
    setToken(t,str,CL_NS2(analysis,standard)::COMPANY);
  }
  return true;
}
/* Continues a token whose buffer already ends in an ampersand, consuming
** the word characters that follow it.
** When something usable followed the '&', the token is emitted as COMPANY
** (after pushing back the terminating character, unless the input is
** exhausted). Otherwise the trailing '&' is removed and the token is
** emitted as ALPHANUM. Returns the result of setToken. */
bool StandardTokenizer::ReadCompany(StringBuffer* _str, Token* t) {
  const int32_t specialCharPos = rdPos;
  StringBuffer& str = *_str;
  TCHAR ch = 0;

  CONSUME_WORD;

  if (!(CONSUMED_NOTHING_OF_VALUE)) {
    /* A word followed the ampersand: this is a company name. */
    if (!EOS) {
      unReadChar();
    }
    return setToken(t,&str,CL_NS2(analysis,standard)::COMPANY);
  }

  /* After the ampersand, no more alphanums were available within this
  ** token; shave trailing ampersand and revert to ALPHANUM. */
  CND_PRECONDITION(RIGHTMOST_IS(str, '&'),"ReadCompany failed");
  SHAVE_RIGHTMOST(str);
  return setToken(t,&str,CL_NS2(analysis,standard)::ALPHANUM);
}
CL_NS_END2
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -