📄 standardtokenizer.cpp

📁 汉化CLucene今天
💻 CPP
📖 第 1 页 / 共 2 页
字号:
12 下一页
#include "CLucene/StdHeader.h"
#include "StandardTokenizer.h"

CL_NS_USE(analysis)
CL_NS_USE(util)
CL_NS_DEF2(analysis,standard)

  /* A bunch of shortcut macros, many of which make assumptions about variable
  ** names.  These macros enhance readability, not just convenience!
  **
  ** Every macro below except EOS expands to an expression over a variable
  ** named `ch` (the character under examination); EOS queries a stream
  ** pointer named `rd`.  Those names must be in scope at the point of use. */

  /* True once the character stream `rd` reports Eos(). */
  #define EOS           (rd->Eos())
  /* Character-class tests on `ch`, delegating to the _ist* classifiers.
  ** NOTE(review): SPACE and ALPHA cast `ch` to TCHAR while ALNUM and DIGIT
  ** pass it through uncast -- confirm whether the difference is intentional. */
  #define SPACE         (_istspace((TCHAR)ch) != 0)
  #define ALPHA         (_istalpha((TCHAR)ch) != 0)
  #define ALNUM         (_istalnum(ch) != 0)
  #define DIGIT         (_istdigit(ch) != 0)
  #define UNDERSCORE    (ch == '_')

  /* Sign tests: NEGATIVE_SIGN_ is an alias for DASH; SIGN accepts '-' or '+'. */
  #define DASH          (ch == '-')
  #define NEGATIVE_SIGN_ DASH
  #define POSITIVE_SIGN_ (ch == '+')
  #define SIGN          (NEGATIVE_SIGN_ || POSITIVE_SIGN_)

  /* DECIMAL is an alias for DOT: '.' is the only decimal separator tested. */
  #define DOT             (ch == '.')
  #define DECIMAL         DOT


  /* Reads characters into `ch` via readChar() and appends them to `str` for
  ** as long as the argument expression evaluates to true.  The loop ends when
  ** the stream is exhausted (EOS), when the expression is false for the
  ** character just read, or when str.len has reached LUCENE_MAX_WORD_LEN.
  **
  ** Despite its name, `conditionFails` is the condition under which consuming
  ** CONTINUES; the loop breaks when it is false.
  **
  ** The character that stops the loop has already been read: it is left in
  ** `ch`, is not appended to `str`, and is not pushed back by this macro.
  **
  ** Requires `ch`, `str` and `rd` to be in scope at the point of use. */
  #define _CONSUME_AS_LONG_AS(conditionFails) \
    while (!EOS) { \
      ch = readChar(); \
	  if (!(conditionFails) || str.len >= LUCENE_MAX_WORD_LEN) { \
        break; \
      } \
      str.appendChar(ch); \
    }

  /* Consume a run of alphabetic characters. */
  #define CONSUME_ALPHAS _CONSUME_AS_LONG_AS(ALPHA)

  /* Consume a run of digits. */
  #define CONSUME_DIGITS _CONSUME_AS_LONG_AS(DIGIT)

  /* CONSUME_WORD consumes a run of ALNUM or UNDERSCORE characters.
  ** CONSUME_WORD_OR additionally accepts any character for which otherMatches
  ** (a condition over `ch`, possibly compound) is true, so such a character
  ** does not break the span. */
  #define CONSUME_WORD                  _CONSUME_AS_LONG_AS(ALNUM || UNDERSCORE)
  #define CONSUME_WORD_OR(otherMatches) _CONSUME_AS_LONG_AS(ALNUM || UNDERSCORE || (otherMatches))


  /* It is considered that "nothing of value" has been read if:
  ** a) The "read head" hasn't moved since specialCharPos was established.
  ** or
  ** b) The "read head" has moved by one character, but that character was
  **    either whitespace or not among the characters found in the body of
  **    a token (deliberately doesn't include the likes of '@'/'&').
  **
  ** The expression reads rdPos, a variable named specialCharPos, and the
  ** current character `ch`; all three must be in scope at the point of use. */
  #define CONSUMED_NOTHING_OF_VALUE \
    (rdPos == specialCharPos || (rdPos == specialCharPos+1 && ( \
      SPACE || !(ALNUM || DOT || DASH || UNDERSCORE) \
    )))

  /* RIGHTMOST yields the last character of StringBuffer sb (index sb.len-1);
  ** RIGHTMOST_IS compares that character with c.  Neither macro checks for an
  ** empty buffer, so sb.len must be at least 1 when they are used. */
  #define RIGHTMOST(sb) (sb.getBuffer()[sb.len-1])
  #define RIGHTMOST_IS(sb, c) (RIGHTMOST(sb) == c)
  /* To discard the last character in a StringBuffer, we decrement the buffer's
  ** length indicator and move the terminator back by one character.
  ** As with RIGHTMOST, there is no guard against sb.len being 0. */
  #define SHAVE_RIGHTMOST(sb) (sb.getBuffer()[--sb.len] = '\0')

  /* Strips characters from the end of sb for as long as charMatchesCondition
  ** holds.  The condition is an expression written in terms of a local
  ** variable `c`, which holds the character being examined.  The scan walks
  ** backwards from the last character and stops at the first character that
  ** does not match, or when the buffer has been emptied.  Each stripped
  ** position is overwritten with the terminator and sb.len is decremented. */
  #define REMOVE_TRAILING_CHARS(sb, charMatchesCondition) \
  { \
    TCHAR* sbBuf = sb.getBuffer(); \
    for (int32_t i = sb.len-1; i >= 0; i--) { \
      TCHAR c = sbBuf[i]; \
      if (charMatchesCondition) { \
        sbBuf[--sb.len] = '\0'; \
      } else { \
        break; \
      } \
    } \
  }

  /* Does StringBuffer sb contain any of the characters in string ofThese?
  ** _tcscspn gives the length of the leading run of sb that contains none of
  ** those characters; if that run is shorter than sb.len, one of them occurs.
  ** ofThese is wrapped in _T(), so it is expected to be a plain string
  ** literal rather than a variable. */
  #define CONTAINS_ANY(sb, ofThese) \
  (_tcscspn(sb.getBuffer(), _T(ofThese)) != static_cast<size_t>(sb.len))


  /* Builds a tokenizer that reads from `reader`.  The reader is wrapped in a
  ** newly allocated FastCharStream stored in rd.  Both position counters begin
  ** at -1. */
  StandardTokenizer::StandardTokenizer(Reader* reader)
    : rd(_CLNEW FastCharStream(reader)),
      rdPos(-1),       /* zero-based; each readChar() call increments it, so the first read yields position 0 */
      tokenStart(-1)
  {
  }

  /* Releases the character stream held in rd (allocated by the constructor)
  ** via _CLDELETE. */
  StandardTokenizer::~StandardTokenizer() {
    _CLDELETE(rd);
  }

  /* No-op: the body is empty, so calling close() has no effect in this
  ** implementation. */
  void StandardTokenizer::close(){
  }

  /* Pulls the next character from the stream rd and advances rdPos by one.
  ** Returns the character that was read. */
  TCHAR StandardTokenizer::readChar() {
    const TCHAR fetched = rd->GetNext();
    /* Positions are counted in characters, which need not be single bytes,
    ** so the step is always exactly one. */
    ++rdPos;
    return fetched;
  }

  /* Undoes one readChar(): asks the stream rd to UnGet() and steps rdPos back
  ** by one so that it stays in sync with the stream. */
  void StandardTokenizer::unReadChar() {
    rd->UnGet();
    --rdPos;
  }

  /* Finalises token t after its text has been accumulated in sb.
  **   t         - token to fill in.
  **   sb        - buffer holding the accumulated token text; its length
  **               determines the end offset.
  **   tokenCode - index into tokenImage selecting the token's type string.
  ** The start offset is tokenStart and the end offset is tokenStart plus the
  ** buffer length.  Always returns true. */
  inline bool StandardTokenizer::setToken(Token* t, StringBuffer* sb, TokenTypes tokenCode) {
    t->setType(tokenImage[tokenCode]);

    t->setStartOffset(tokenStart);
    t->setEndOffset(tokenStart + sb->length());

    /* getBuffer() is called only for its side effect of null-terminating the
    ** buffer; this has to happen before the term text length is recomputed. */
    sb->getBuffer();
    t->resetTermTextLen();

    return true;
  }

  /*inline Token* StandardTokenizer::createToken(const TCHAR* text, int32_t tokenLength, TokenTypes tokenCode) {
    CND_PRECONDITION (tokenLength > 0,"Token length is less than or equal to zero");
    return _CLNEW Token(text, tokenStart, tokenStart+tokenLength, tokenImage[tokenCode]);
  }*/

  /* Scans forward from the current read position and fills t with the next
  ** token found.
  **
  ** Characters are dispatched on their class:
  **   - 0 and whitespace are skipped;
  **   - a character >= 0xa0 starts a CJK run, which extends over every
  **     following character that is also >= 0xa0;
  **   - a letter or underscore is handed to ReadAlphaNum;
  **   - a digit, '-' or '.' is handed to ReadNumber;
  **   - anything else is skipped.
  **
  ** Returns true when t has been filled with a token, false when the end of
  ** the stream is reached without producing one. */
  bool StandardTokenizer::next(Token* t) {
    while (!EOS) {
      TUCHAR ch = (TUCHAR)readChar();

      if ( ch == 0 ) {
        continue;
      /* Earlier variant of the CJK test, kept for reference (added by Fox_Hawk):
      **   }else if (((TCHAR)ch>>8)&&((TCHAR)ch>=0xa0)){ */
      } else if ( ch >= 0xa0 ) {
        tokenStart = rdPos;
        int iWordLen = 1;  // starts at 1 because ch has already been read

        t->growBuffer(LUCENE_MAX_WORD_LEN); // make sure token can hold the next word
        StringBuffer str(t->_termText,t->bufferLength(),true); // use stringbuffer to read data onto the termText
        str.appendChar(ch);
        /* Note: Chinese words must be separated by spaces in the input, since
        ** no segmentation is attempted here, and a single word may not exceed
        ** LUCENE_MAX_WORD_LEN characters. */
        while ( !EOS ) {
          TUCHAR nextch = (TUCHAR)readChar();
          if ( nextch >= 0xa0 ) {
            /* Cap the length of a single CJK word.
            ** NOTE(review): the character just read is neither appended nor
            ** pushed back when the cap is hit, so it is dropped -- confirm
            ** that this is intended. */
            if ( iWordLen >= LUCENE_MAX_WORD_LEN )
              break;
            str.appendChar(nextch); // TODO: handle input that ends with only half of a CJK character
            iWordLen++;
          } else {
            /* Not part of the run: push it back for the next dispatch. */
            unReadChar();
            break;
          }
        } // end while !EOS
        /* A run of a single character is not emitted as a token. */
        if ( iWordLen > 1 ) {
          setToken(t,&str, CL_NS2(analysis,standard)::CJK);
          return true;
        }
        continue;
      } else if (SPACE) {
        continue;
      } else if (ALPHA || UNDERSCORE) {
        tokenStart = rdPos;
        return ReadAlphaNum(ch,t);
      } else if (DIGIT || NEGATIVE_SIGN_ || DECIMAL) {
        tokenStart = rdPos;
        /* ReadNumber returns false if it fails to extract a valid number; in
        ** that case, we just continue. */
        if (ReadNumber(NULL, ch,t))
          return true;
      }
    }
    return false;
  }

  bool StandardTokenizer::ReadNumber(const TCHAR* previousNumber, const TCHAR prev,Token* t) {
    /* previousNumber is only non-NULL if this function already read a complete
    ** number in a previous recursion, yet has been asked to read additional
    ** numeric segments.  For example, in the HOST "192.168.1.3", "192.168" is
    ** a complete number, but this function will recurse to read the "1.3",
    ** generating a single HOST token "192.168.1.3". */
    t->growBuffer(LUCENE_MAX_WORD_LEN);//make sure token can hold the next word
    StringBuffer str(t->_termText,t->bufferLength(),true); //use stringbuffer to read data onto the termText
    TokenTypes tokenType;
    bool decExhausted;
    if (previousNumber != NULL) {
      str.prepend(previousNumber);
      tokenType = CL_NS2(analysis,standard)::HOST;
      decExhausted = false;
    } else {
      tokenType = CL_NS2(analysis,standard)::NUM;
      decExhausted = (prev == '.');
    }
	if (  str.len >= LUCENE_MAX_WORD_LEN ){
		//todo: if a number is too long, i would say there is no point
		//storing it, because its going to be the wrong number anyway?
		//what do people think?
		return false; 
	}
    str.appendChar(prev);

    const bool signExhausted = (prev == '-');
    TCHAR ch = prev;

    CONSUME_DIGITS;

    if (str.len < 2 /* CONSUME_DIGITS didn't find any digits. */
        && (
                (signExhausted && !DECIMAL)
             || (decExhausted /* && !DIGIT is implied, since CONSUME_DIGITS stopped on a non-digit. */)
           )
       )
    {
      /* We have either:
      **   a) a negative sign that's not followed by either digit(s) or a decimal
      **   b) a decimal that's not followed by digit(s)
      ** so this is not a valid number. */
      if (!EOS) {
        /* Unread the character that stopped CONSUME_DIGITS: */
        unReadChar();
      }
      return false;
    }

    /* We just read a group of digits.  Is it followed by a decimal symbol,
    ** implying that there might be another group of digits available? */
    if (!EOS) {
      if (DECIMAL) {
		if (  str.len >= LUCENE_MAX_WORD_LEN )
			return false;//todo: read above for rationale
        str.appendChar(ch);
      } else {
12 下一页
💿 文件大小 377 K
👤 上传用户 lemon_zc1949
📂 所属分类其他
🏷️ 相关标签

#CLucene #汉化
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -