📄 standardtokenizer.cpp

📁 汉化CLucene今天
💻 CPP
📖 第 1 页 / 共 2 页
字号:
上一页 12
        unReadChar();
        goto SUCCESSFULLY_EXTRACTED_NUMBER;
      }

      CONSUME_DIGITS;
      if (!DIGIT && !DECIMAL) {
        unReadChar();
      } else if (!EOS && DECIMAL && _istdigit(rd->Peek())) {
        /* We just read the fractional digit group, but it's also followed by
        ** a decimal symbol and at least one more digit, so this must be a
        ** HOST rather than a real number. */
        return ReadNumber(str.getBuffer(), '.',t);
      }
    }

    SUCCESSFULLY_EXTRACTED_NUMBER:
    TCHAR rightmost = RIGHTMOST(str);
    /* Don't include a trailing decimal point. */
    if (rightmost == '.') {
      SHAVE_RIGHTMOST(str);
      unReadChar();
      rightmost = RIGHTMOST(str);
    }
    /* If all we have left is a negative sign, it's not a valid number. */
    if (rightmost == '-') {
      CND_PRECONDITION (str.len == 1, "Number is invalid");
      return false;
    }

	return setToken(t,&str,tokenType);
  }

  /* Reads a token that begins with the character `prev`.
  **
  ** The token's term buffer is grown to LUCENE_MAX_WORD_LEN and wrapped in a
  ** StringBuffer, so text is accumulated directly into the token.  Once the
  ** leading word has been consumed, the character held in `ch` selects a more
  ** specific reader:
  **   '.'  -> ReadDotted (type UNKNOWN, i.e. no forced type)
  **   '\'' -> ReadApostrophe
  **   '@'  -> ReadAt
  **   '&'  -> ReadCompany
  ** In every other case the text gathered so far is emitted as ALPHANUM.
  **
  ** prev: first character of the token (presumably already consumed by the
  **       caller -- confirm against the call site).
  ** t:    token to fill in.
  ** Returns the result of the delegated reader, or of setToken(). */
  bool StandardTokenizer::ReadAlphaNum(const TCHAR prev, Token* t) {
    t->growBuffer(LUCENE_MAX_WORD_LEN);// make sure the token can hold the next word
    StringBuffer str(t->_termText,t->bufferLength(),true); // read data straight onto the token's termText
	if (  str.len < LUCENE_MAX_WORD_LEN ){
		str.appendChar(prev);
		TCHAR ch = prev;

		/* CONSUME_WORD is a macro defined outside this view; it presumably
		** appends word characters to str and leaves the character that
		** stopped the word in ch -- confirm against its definition. */
		CONSUME_WORD;
		if (!EOS && str.len < LUCENE_MAX_WORD_LEN-1 ) { // still have space for 1 more character?
			switch(ch) { /* What follows the first alphanum segment? */
				case '.':
					str.appendChar('.');
					return ReadDotted(&str, CL_NS2(analysis,standard)::UNKNOWN,t);
				case '\'':
					str.appendChar('\'');
					return ReadApostrophe(&str,t);
				case '@':
					str.appendChar('@');
					return ReadAt(&str,t);
				case '&':
					str.appendChar('&');
					return ReadCompany(&str,t);
				default:
						unReadChar();	// not a special character: push it back (added by linfj)

						break;
				/* After the default case control continues to the ALPHANUM
				** setToken() call at the end of this function. */
			}
		}
	}
	return setToken(t,&str,CL_NS2(analysis,standard)::ALPHANUM);
  }

  /* Continues reading a token that may contain dots or dashes (host name,
  ** acronym, the part of an e-mail address after '@', ...) and emits it.
  **
  ** _str:       buffer holding the text read so far; further characters are
  **             appended to it in place.
  ** forcedType: type to report for the token.  It may be replaced by ALPHANUM
  **             when no dot remains in the text, or by ACRONYM when the text
  **             has the letter/dot pattern of an acronym.  UNKNOWN means the
  **             caller has no preference and is reported as HOST.
  ** t:          token to fill in.
  ** Returns the result of setToken(), or of ReadAt() when the dotted segment
  ** is followed by '@'. */
  bool StandardTokenizer::ReadDotted(StringBuffer* _str, TokenTypes forcedType, Token* t) {
    /* Read position on entry; used by the CONSUMED_NOTHING_OF_VALUE macro. */
    const int32_t specialCharPos = rdPos;
    StringBuffer& str=*_str;

    /* A segment of a "dotted" is not allowed to begin with another dot or a dash.
    ** Even though hosts, e-mail addresses, etc., could have a dotted-segment
    ** that begins with a dot or a dash, it's far more common in source text
    ** for a pattern like "abc.--def" to be intended as two tokens. */
    TCHAR ch = rd->Peek();
    if (!(DOT || DASH)) {
      bool prevWasDot;
      bool prevWasDash;
      if (str.len == 0) {
        prevWasDot = false;
        prevWasDash = false;
      } else {
        prevWasDot = RIGHTMOST(str) == '.';
        prevWasDash = RIGHTMOST(str) == '-';
      }
      while (!EOS && str.len < LUCENE_MAX_WORD_LEN-1 ) {
        ch = readChar();
        const bool dot = ch == '.';
        const bool dash = ch == '-';

        if (!(ALNUM || UNDERSCORE || dot || dash)) {
          break;
        }
        /* Multiple dots or dashes in succession end the token.
        ** Consider the following inputs:
        **   "Visit windowsupdate.microsoft.com--update today!"
        **   "In the U.S.A.--yes, even there!"                 */
        if ((dot || dash) && (prevWasDot || prevWasDash)) {
          /* We're not going to append the character we just read, in any case.
          ** As to the character before it (which is currently RIGHTMOST(str)):
          ** unless RIGHTMOST(str) is a dot, in which case we need to keep it so
          ** the acronym-versus-host detection can work, we want to get rid of it. */
          if (!prevWasDot) {
            SHAVE_RIGHTMOST(str);
          }
          break;
        }

        str.appendChar(ch);

        prevWasDot = dot;
        prevWasDash = dash;
      }
    }

    /* There's a potential StringBuffer.append call in the code above, which
    ** could cause str to reallocate its internal buffer.  We must wait to
    ** obtain the optimization-oriented strBuf pointer until after the initial
    ** potentially realloc-triggering operations on str.
    ** Because there can be other such ops much later in this function, strBuf
    ** is guarded within a block to prevent its use during or after the calls
    ** that would potentially invalidate it. */
    { /* Begin block-guard of strBuf */
    TCHAR* strBuf = str.getBuffer();

    bool rightmostIsDot = RIGHTMOST_IS(str, '.');
    if (CONSUMED_NOTHING_OF_VALUE) {
      /* No more alphanums available for this token; shave trailing dot, if any. */
      if (rightmostIsDot) {
        SHAVE_RIGHTMOST(str);
      }
      /* If there are no dots remaining, this is a generic ALPHANUM. */
      if (_tcschr(strBuf, '.') == NULL) {
        forcedType = CL_NS2(analysis,standard)::ALPHANUM;
      }

    /* Check the token to see if it's an acronym.  An acronym must have a
    ** letter in every even slot and a dot in every odd slot, including the
    ** last slot (for example, "U.S.A."). */
    } else if (rightmostIsDot) {
      bool isAcronym = true;
      const int32_t upperCheckLimit = str.len - 1; /* -1 b/c we already checked the last slot. */

      /* NOTE(review): the ALPHA/DOT macros test `ch`, so this loop overwrites
      ** the last character read from the input; the '@' check further down
      ** then sees a buffer character instead.  Left as is -- confirm whether
      ** that is intended. */
      for (int32_t i = 0; i < upperCheckLimit; i++) {
        const bool even = (i % 2 == 0);
        ch = strBuf[i];
        if ( (even && !ALPHA) || (!even && !DOT) ) {
          isAcronym = false;
          break;
        }
      }
      if (isAcronym) {
        forcedType = CL_NS2(analysis,standard)::ACRONYM;
      } else {
        /* If it's not an acronym, we don't want the trailing dot. */
        SHAVE_RIGHTMOST(str);
        /* If there are no dots remaining, this is a generic ALPHANUM. */
        if (_tcschr(strBuf, '.') == NULL) {
          forcedType = CL_NS2(analysis,standard)::ALPHANUM;
        }
      }
    }
    } /* End block-guard of strBuf */

    if (!EOS) {
      if (ch == '@' && str.len < LUCENE_MAX_WORD_LEN-1) {
        /* The dotted text was the local part of an e-mail address (or a
        ** company name); hand over to ReadAt for the remainder. */
        str.appendChar('@');
        return ReadAt(&str,t);
      } else {
        unReadChar();
      }
    }

    /* A caller that passes UNKNOWN has no specific type in mind, so the dotted
    ** token defaults to HOST.  The comparison must be made against forcedType:
    ** testing the UNKNOWN enumerator by itself yields a constant condition and
    ** leaves one arm of the conditional dead. */
    return setToken(t,&str,
        (forcedType != CL_NS2(analysis,standard)::UNKNOWN)
            ? forcedType : CL_NS2(analysis,standard)::HOST);
  }

  /* Continues a token after an apostrophe has been appended to *_str.
  **
  ** Further characters are read with the CONSUME_ALPHAS macro.  If the text
  ** still ends in the apostrophe, or nothing of value was consumed, the
  ** rightmost character is shaved off and the token is emitted as ALPHANUM;
  ** otherwise it is emitted as APOSTROPHE.
  **
  ** _str: buffer holding the text read so far, ending in the apostrophe.
  ** t:    token to fill in.
  ** Returns the result of setToken(). */
  bool StandardTokenizer::ReadApostrophe(StringBuffer* _str, Token* t) {
    StringBuffer& str=*_str;

    TokenTypes tokenType = CL_NS2(analysis,standard)::APOSTROPHE;
    /* Read position on entry; used by the CONSUMED_NOTHING_OF_VALUE macro. */
    const int32_t specialCharPos = rdPos;
    TCHAR ch=0;

    /* CONSUME_ALPHAS is a macro defined outside this view; it presumably
    ** appends alphabetic characters to str and leaves the character that
    ** stopped it in ch -- confirm against its definition. */
    CONSUME_ALPHAS;
    if (RIGHTMOST_IS(str, '\'') || CONSUMED_NOTHING_OF_VALUE) {
      /* After the apostrophe, no more alphanums were available within this
      ** token; shave trailing apostrophe and revert to generic ALPHANUM. */
      SHAVE_RIGHTMOST(str);
      tokenType = CL_NS2(analysis,standard)::ALPHANUM;
    }
    /* Push back the character that ended the token, unless input is exhausted. */
    if (!EOS) {
      unReadChar();
    }

	return setToken(t,&str,tokenType);
  }

  /* Continues a token after an '@' has been appended to *str.
  **
  ** The remainder is read by ReadDotted with EMAIL as the forced type.  If
  ** the resulting text contains neither a dot nor a digit, the token is
  ** re-emitted as COMPANY instead.
  **
  ** str: buffer holding the text read so far, ending in '@'.
  ** t:   token to fill in.
  ** Returns the result of the last reader/setToken() call that populated the
  ** token, instead of an unconditional true. */
  bool StandardTokenizer::ReadAt(StringBuffer* str, Token* t) {
    bool ret = ReadDotted(str, CL_NS2(analysis,standard)::EMAIL,t);
    /* JLucene grammar indicates dots/digits not allowed in company name: */
    if (!CONTAINS_ANY((*str), ".0123456789")) {
      /* The COMPANY re-typing supersedes ReadDotted's outcome, so its
      ** result is the one reported to the caller. */
      ret = setToken(t,str,CL_NS2(analysis,standard)::COMPANY);
    }
    return ret;
  }

  /* Continues a token after an '&' has been appended to *_str.
  **
  ** Further characters are read with the CONSUME_WORD macro.  If nothing of
  ** value followed the ampersand, the ampersand is shaved off and the token
  ** is emitted as ALPHANUM; otherwise it is emitted as COMPANY.
  **
  ** _str: buffer holding the text read so far, ending in '&'.
  ** t:    token to fill in.
  ** Returns the result of setToken(). */
  bool StandardTokenizer::ReadCompany(StringBuffer* _str, Token* t) {
    StringBuffer& str = *_str;
    /* Read position on entry; used by the CONSUMED_NOTHING_OF_VALUE macro. */
    const int32_t specialCharPos = rdPos;
    TCHAR ch=0;

    /* CONSUME_WORD is a macro defined outside this view; it presumably
    ** appends word characters to str and leaves the character that stopped
    ** the word in ch -- confirm against its definition. */
    CONSUME_WORD;
    if (CONSUMED_NOTHING_OF_VALUE) {
      /* After the ampersand, no more alphanums were available within this
      ** token; shave trailing ampersand and revert to ALPHANUM. */
      CND_PRECONDITION(RIGHTMOST_IS(str, '&'),"ReadCompany failed");
      SHAVE_RIGHTMOST(str);

      /* NOTE(review): unlike ReadApostrophe, this early return does not call
      ** unReadChar(); confirm that dropping the character which ended the
      ** token is intended here. */

	  return setToken(t,&str,CL_NS2(analysis,standard)::ALPHANUM);
    }
    /* Push back the character that ended the token, unless input is exhausted. */
    if (!EOS) {
      unReadChar();
    }

	return setToken(t,&str,CL_NS2(analysis,standard)::COMPANY);
  }

CL_NS_END2
上一页 12
💿 文件大小 377 K
👤 上传用户 lemon_zc1949
📂 所属分类其他
🏷️ 相关标签

#CLucene #汉化
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -