📄 standardtokenizer.cpp
字号:
#include "CLucene/StdHeader.h"
#include "StandardTokenizer.h"
CL_NS_USE(analysis)
CL_NS_USE(util)
CL_NS_DEF2(analysis,standard)
/* A bunch of shortcut macros, many of which make assumptions about variable
** names. These macros enhance readability, not just convenience!
**
** Each macro is a plain textual expansion, so the names it mentions must be
** in scope wherever it is used:
**   rd - the character stream member (EOS calls rd->Eos())
**   ch - the character under test (every character-class macro reads it) */
/* End-of-stream test, delegated to the stream's Eos(). */
#define EOS (rd->Eos())
/* Character-class tests on ch, built on the _ist* classification routines.
** NOTE(review): SPACE and ALPHA cast ch to TCHAR first, while ALNUM and DIGIT
** pass it through unchanged; next() declares ch as TUCHAR, so the two pairs
** may see different argument types - confirm this is intended. */
#define SPACE (_istspace((TCHAR)ch) != 0)
#define ALPHA (_istalpha((TCHAR)ch) != 0)
#define ALNUM (_istalnum(ch) != 0)
#define DIGIT (_istdigit(ch) != 0)
/* Single-character comparisons against ch. */
#define UNDERSCORE (ch == '_')
#define DASH (ch == '-')
/* A leading '-' is the negative sign; '+' is the positive sign. */
#define NEGATIVE_SIGN_ DASH
#define POSITIVE_SIGN_ (ch == '+')
#define SIGN (NEGATIVE_SIGN_ || POSITIVE_SIGN_)
#define DOT (ch == '.')
/* The decimal separator is the same character as DOT. */
#define DECIMAL DOT
/* Reads characters into ch via readChar() until EOS, appending each one to
** the StringBuffer named str for as long as the supplied expression holds
** for the character just read and str.len is below LUCENE_MAX_WORD_LEN.
**
** Expects ch, str and rd to be in scope at the point of use.
**
** When the loop stops on a break, ch holds the character that ended the
** span: it has been consumed from the stream but not appended to str.
**
** NOTE(review): despite its name, the parameter conditionFails is the
** condition under which consumption CONTINUES; the loop breaks as soon as
** the expression evaluates to false. */
#define _CONSUME_AS_LONG_AS(conditionFails) \
while (!EOS) { \
ch = readChar(); \
if (!(conditionFails) || str.len >= LUCENE_MAX_WORD_LEN) { \
break; \
} \
str.appendChar(ch); \
}
/* Consume a run of letters / a run of digits. */
#define CONSUME_ALPHAS _CONSUME_AS_LONG_AS(ALPHA)
#define CONSUME_DIGITS _CONSUME_AS_LONG_AS(DIGIT)
/* CONSUME_WORD consumes a run made up only of ALNUM or UNDERSCORE characters.
** CONSUME_WORD_OR(otherMatches) does the same, but additionally accepts any
** character for which otherMatches (a condition, possibly compound, written
** in terms of ch) is true, so such characters do not break the span. */
#define CONSUME_WORD _CONSUME_AS_LONG_AS(ALNUM || UNDERSCORE)
#define CONSUME_WORD_OR(otherMatches) _CONSUME_AS_LONG_AS(ALNUM || UNDERSCORE || (otherMatches))
/* It is considered that "nothing of value" has been read if:
** a) The "read head" hasn't moved since specialCharPos was established.
** or
** b) The "read head" has moved by one character, but that character was
** either whitespace or not among the characters found in the body of
** a token (deliberately doesn't include the likes of '@'/'&').
**
** Expects rdPos, specialCharPos and ch to be in scope at the point of use;
** ch is taken to be the single character read since specialCharPos.
** specialCharPos is not declared in this part of the file - it must be
** provided by the code that uses this macro. */
#define CONSUMED_NOTHING_OF_VALUE \
(rdPos == specialCharPos || (rdPos == specialCharPos+1 && ( \
SPACE || !(ALNUM || DOT || DASH || UNDERSCORE) \
)))
/* RIGHTMOST(sb) is the last character currently held in StringBuffer sb;
** RIGHTMOST_IS(sb, c) compares it against c.
** NOTE(review): both index sb.len-1 without checking for an empty buffer,
** so callers must ensure sb.len > 0. */
#define RIGHTMOST(sb) (sb.getBuffer()[sb.len-1])
#define RIGHTMOST_IS(sb, c) (RIGHTMOST(sb) == c)
/* To discard the last character in a StringBuffer, we decrement the buffer's
** length indicator and move the terminator back by one character.
** NOTE(review): as with RIGHTMOST, sb.len is not checked against zero. */
#define SHAVE_RIGHTMOST(sb) (sb.getBuffer()[--sb.len] = '\0')
/* Strips characters from the end of StringBuffer sb for as long as
** charMatchesCondition holds. The condition is written in terms of the local
** variable c, which is set to each candidate character in turn (walking from
** the last character towards the first). Stops at the first character that
** does not match, or when the buffer is empty.
**
** The expansion is a bare brace-enclosed block that declares its own locals
** (sbBuf, i, c), so those names should not be used inside the condition for
** anything else. */
#define REMOVE_TRAILING_CHARS(sb, charMatchesCondition) \
{ \
TCHAR* sbBuf = sb.getBuffer(); \
for (int32_t i = sb.len-1; i >= 0; i--) { \
TCHAR c = sbBuf[i]; \
if (charMatchesCondition) { \
sbBuf[--sb.len] = '\0'; \
} else { \
break; \
} \
} \
}
/* Does StringBuffer sb contain any of the characters in string ofThese?
**
** Works by asking _tcscspn for the length of the leading part of sb's buffer
** that contains none of the characters in ofThese: if that length differs
** from sb.len, one of them occurs somewhere in the buffer.
** ofThese is wrapped in _T(), so it is presumably meant to be passed as a
** plain (unprefixed) string literal. */
#define CONTAINS_ANY(sb, ofThese) \
(_tcscspn(sb.getBuffer(), _T(ofThese)) != static_cast<size_t>(sb.len))
/* Builds a tokenizer that reads from `reader` through a newly allocated
** FastCharStream.
**
** Both position counters start at -1. rdPos is zero-based, so it only reaches
** the first position (0) once readChar() has been called for the first
** time. */
StandardTokenizer::StandardTokenizer(Reader* reader)
    : rd(_CLNEW FastCharStream(reader))
    , rdPos(-1)
    , tokenStart(-1)
{
}
/* Releases the character stream that the constructor allocated. */
StandardTokenizer::~StandardTokenizer()
{
    _CLDELETE(rd);
}
/* Does nothing: the body is empty. The character stream is released in the
** destructor, not here. */
void StandardTokenizer::close()
{
}
/* Fetches the next character from the stream and moves the read position
** forward by one.
**
** rdPos is advanced by exactly 1 per call because positions are counted in
** characters, not necessarily bytes. */
TCHAR StandardTokenizer::readChar()
{
    const TCHAR fetched = rd->GetNext();
    ++rdPos;
    return fetched;
}
/* Undoes the most recent readChar(): hands the character back to the stream
** and moves the read position back by one. */
void StandardTokenizer::unReadChar()
{
    rd->UnGet();
    --rdPos;
}
/* Finalises token *t after its text has been accumulated in *sb.
**
** t         - token to fill in.
** sb        - buffer holding the token text; its length determines the end
**             offset.
** tokenCode - index into tokenImage selecting the token's type string.
**
** The start offset is tokenStart and the end offset is tokenStart plus the
** buffer length. Always returns true, so callers can write
** `return setToken(...)`. */
inline bool StandardTokenizer::setToken(Token* t, StringBuffer* sb, TokenTypes tokenCode)
{
    t->setStartOffset(tokenStart);
    t->setEndOffset(tokenStart + sb->length());
    t->setType(tokenImage[tokenCode]);

    /* Called purely for its side effect: it null terminates the buffer. */
    sb->getBuffer();

    t->resetTermTextLen();
    return true;
}
/*inline Token* StandardTokenizer::createToken(const TCHAR* text, int32_t tokenLength, TokenTypes tokenCode) {
CND_PRECONDITION (tokenLength > 0,"Token length is less than or equal to zero");
return _CLNEW Token(text, tokenStart, tokenStart+tokenLength, tokenImage[tokenCode]);
}*/
/* Extracts the next token from the input into *t.
**
** Characters are read one at a time and dispatched on the character that
** starts a potential token:
**   0 ..................... skipped
**   >= 0xa0 ............... start of a CJK run, handled inline below
**   whitespace ............ skipped
**   letter or '_' ......... handed to ReadAlphaNum()
**   digit, '-' or '.' ..... handed to ReadNumber()
** Any other character is skipped.
**
** Returns true when a token has been stored in *t, and false when the end of
** the stream is reached without producing one. */
bool StandardTokenizer::next(Token* t) {
    while (!EOS) {
        TUCHAR ch ;
        ch = (TUCHAR)readChar();

        if ( ch == 0 ){
            continue;
        /* An earlier variant of the test below (added by Fox_Hawk) also
        ** required a non-zero high byte:
        **   ((TCHAR)ch>>8) && ((TCHAR)ch>=0xa0)
        ** The branch header must stay on its own line: if it is folded into
        ** a line comment, the whole CJK block ends up inside the ch == 0
        ** case after its `continue` and becomes unreachable. */
        } else if ( ch >= 0xa0 ) {
            tokenStart = rdPos;
            /* Starts at 1 because one character (ch) has already been read. */
            int iWordLen = 1 ;
            t->growBuffer(LUCENE_MAX_WORD_LEN);//make sure token can hold the next word
            StringBuffer str(t->_termText,t->bufferLength(),true); //use stringbuffer to read data onto the termText
            str.appendChar(ch);

            /* Note: Chinese words must be separated by spaces in the input,
            ** otherwise they cannot be handled, and a single word may not be
            ** longer than LUCENE_MAX_WORD_LEN. */
            while ( !EOS ){
                TUCHAR nextch = (TUCHAR)readChar();
                if ( nextch >= 0xa0 ) {
                    /* Limits the length of one Chinese word.
                    ** NOTE(review): the character just read is neither
                    ** appended nor unread when this limit is hit. */
                    if ( iWordLen >= LUCENE_MAX_WORD_LEN )
                        break;
                    /* todo: take care of the case where only half of a
                    ** Chinese character is left at the end. */
                    str.appendChar(nextch);
                    iWordLen ++ ;
                }else {
                    /* Not part of the run: give it back for the next pass. */
                    unReadChar();
                    break;
                }
            }// end while !EOS

            /* A run of a single character is not emitted as a token. */
            if ( iWordLen > 1 ) {
                setToken(t,&str, CL_NS2(analysis,standard)::CJK);
                return true;
            }
            continue;
        } else if (SPACE) {
            continue;
        } else if (ALPHA || UNDERSCORE) {
            tokenStart = rdPos;
            return ReadAlphaNum(ch,t);
        } else if (DIGIT || NEGATIVE_SIGN_ || DECIMAL) {
            tokenStart = rdPos;
            /* ReadNumber returns false if it fails to extract a valid number;
            ** in that case, we just continue with the next character. */
            if (ReadNumber(NULL, ch,t))
                return true;
        }
    }
    return false;
}
bool StandardTokenizer::ReadNumber(const TCHAR* previousNumber, const TCHAR prev,Token* t) {
/* previousNumber is only non-NULL if this function already read a complete
** number in a previous recursion, yet has been asked to read additional
** numeric segments. For example, in the HOST "192.168.1.3", "192.168" is
** a complete number, but this function will recurse to read the "1.3",
** generating a single HOST token "192.168.1.3". */
t->growBuffer(LUCENE_MAX_WORD_LEN);//make sure token can hold the next word
StringBuffer str(t->_termText,t->bufferLength(),true); //use stringbuffer to read data onto the termText
TokenTypes tokenType;
bool decExhausted;
if (previousNumber != NULL) {
str.prepend(previousNumber);
tokenType = CL_NS2(analysis,standard)::HOST;
decExhausted = false;
} else {
tokenType = CL_NS2(analysis,standard)::NUM;
decExhausted = (prev == '.');
}
if ( str.len >= LUCENE_MAX_WORD_LEN ){
//todo: if a number is too long, i would say there is no point
//storing it, because its going to be the wrong number anyway?
//what do people think?
return false;
}
str.appendChar(prev);
const bool signExhausted = (prev == '-');
TCHAR ch = prev;
CONSUME_DIGITS;
if (str.len < 2 /* CONSUME_DIGITS didn't find any digits. */
&& (
(signExhausted && !DECIMAL)
|| (decExhausted /* && !DIGIT is implied, since CONSUME_DIGITS stopped on a non-digit. */)
)
)
{
/* We have either:
** a) a negative sign that's not followed by either digit(s) or a decimal
** b) a decimal that's not followed by digit(s)
** so this is not a valid number. */
if (!EOS) {
/* Unread the character that stopped CONSUME_DIGITS: */
unReadChar();
}
return false;
}
/* We just read a group of digits. Is it followed by a decimal symbol,
** implying that there might be another group of digits available? */
if (!EOS) {
if (DECIMAL) {
if ( str.len >= LUCENE_MAX_WORD_LEN )
return false;//todo: read above for rationale
str.appendChar(ch);
} else {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -