⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 snowball.cpp

📁 lucene in java大家一定听说过了
💻 CPP
字号:
#include "CLucene/StdHeader.h"
#include "SnowballAnalyzer.h"
#include "SnowballFilter.h"

CL_NS_DEF2(analysis,snowball)

  /**
   * Creates an analyzer for the named stemmer language that performs no
   * stop-word filtering.
   *
   * @param language name of the stemming language; the value produced by
   *                 STRDUP_TtoT(language) is kept and released in the destructor
   */
  SnowballAnalyzer::SnowballAnalyzer(const TCHAR* language) {
    // A NULL stop set makes tokenStream() leave out the StopFilter stage.
    this->stopSet = NULL;
    this->language = STRDUP_TtoT(language);
  }

  /**
   * Releases the stop-word set, when one was created, and the stored
   * language name.
   */
  SnowballAnalyzer::~SnowballAnalyzer() {
    if (stopSet != NULL) {
      _CLDELETE(stopSet);
    }
    _CLDELETE_CARRAY(language);
  }

  /**
   * Creates an analyzer for the named stemmer language that also filters
   * out the supplied stop words.
   *
   * @param language  name of the stemming language
   * @param stopWords words passed to StopFilter::fillStopTable to populate
   *                  the newly allocated stop set
   */
  SnowballAnalyzer::SnowballAnalyzer(const TCHAR* language, TCHAR** stopWords) {
    this->language = STRDUP_TtoT(language);

    // The set is owned by this analyzer and released in the destructor.
    this->stopSet = _CLNEW CL_NS(util)::CLSetList<TCHAR*>;
    StopFilter::fillStopTable(this->stopSet, stopWords);
  }

  /**
   * Builds the analysis chain for a field: a StandardTokenizer over the
   * reader, wrapped in turn by a StandardFilter, a LowerCaseFilter, a
   * StopFilter (only when this analyzer has a stop set) and finally a
   * SnowballFilter for this analyzer's language.
   *
   * @param fieldName not referenced by this implementation
   * @param reader    character source handed to the StandardTokenizer
   * @return the outermost stream of the chain (the SnowballFilter)
   */
  TokenStream* SnowballAnalyzer::tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader) {
    TokenStream* chain = _CLNEW CL_NS(analysis)::standard::StandardTokenizer(reader);
    chain = _CLNEW CL_NS(analysis)::standard::StandardFilter(chain, true);
    chain = _CLNEW CL_NS(analysis)::LowerCaseFilter(chain, true);

    // Stop-word removal is optional: only analyzers built with stop words have a set.
    if (stopSet != NULL) {
      chain = _CLNEW CL_NS(analysis)::StopFilter(chain, true, stopSet);
    }

    // Stemming is always the last stage.
    return _CLNEW SnowballFilter(chain, language, true);
  }
  
  
  
  
  
  
  
    /** Construct the named stemming filter.
     *
     * The language name is lower-cased and converted to a narrow string before
     * it is handed to sb_stemmer_new. Names longer than the internal buffer
     * (49 characters plus terminator) are truncated.
     *
     * @param in the input tokens to stem
     * @param language the name of a stemmer
     * @param deleteTS forwarded, together with in, to the TokenFilter base constructor
     * @throws CL_ERR_IllegalArgument when sb_stemmer_new returns NULL for the name
     */
	SnowballFilter::SnowballFilter(TokenStream* in, const TCHAR* language, bool deleteTS):
		TokenFilter(in,deleteTS)
	{
		const int LANG_BUF_LEN = 50;
		TCHAR tlang[LANG_BUF_LEN];
		char lang[LANG_BUF_LEN];

		// The copy is bounded by the buffer size; a name that fills the buffer
		// may be left without a terminator, so add one before _tcslwr scans it.
		STRCPY_TtoT(tlang,language,LANG_BUF_LEN);
		tlang[LANG_BUF_LEN-1] = 0;
		_tcslwr(tlang);

		// Same guard for the narrow copy that is passed to the stemmer factory.
		STRCPY_TtoA(lang,tlang,LANG_BUF_LEN);
		lang[LANG_BUF_LEN-1] = 0;
		stemmer = sb_stemmer_new(lang, NULL); //use utf8 encoding

		if ( stemmer == NULL ){
			_CLTHROWA(CL_ERR_IllegalArgument, "language not available for stemming\n"); //todo: richer error
		}
    }

	/** Hands the stemmer obtained in the constructor back to sb_stemmer_delete. */
	SnowballFilter::~SnowballFilter()
	{
		sb_stemmer_delete(this->stemmer);
	}

  /**
   * Returns the next input Token, after being stemmed.
   *
   * The term text is copied into a fixed work buffer of LUCENE_MAX_WORD_LEN
   * elements, stemmed, and written back to the token with its original
   * offsets and type. Text that does not fit a work buffer together with its
   * terminator is truncated instead of being written past the buffer end.
   *
   * @param token token filled in by the input stream and then updated with
   *              the stemmed term text
   * @return false when the input stream has no further token, true otherwise
   * @throws CL_ERR_Runtime when sb_stemmer_stem returns NULL
   */
  bool SnowballFilter::next(Token* token){
    if (!input->next(token))
      return false;

	// Most characters a work buffer can hold next to its terminator.
	const int maxChars = LUCENE_MAX_WORD_LEN - 1;

	unsigned char uctext[LUCENE_MAX_WORD_LEN];
	TCHAR tchartext[LUCENE_MAX_WORD_LEN];

#ifdef _UCS2
	char utf8text[LUCENE_MAX_WORD_LEN];

	// Wide-character build: convert the term to UTF-8 for the stemmer.
	size_t convertedLen = lucene_wcstoutf8(utf8text,token->termText(),LUCENE_MAX_WORD_LEN);
	// Clamp so the terminator below always lands inside uctext.
	// NOTE(review): a clamp may cut a multi-byte UTF-8 sequence — confirm this is acceptable.
	int len = convertedLen > static_cast<size_t>(maxChars) ? maxChars : static_cast<int>(convertedLen);
	memcpy(uctext,utf8text,len);
	uctext[len]='\0';
#else
	const char* tmp = token->termText();
	int len = token->termTextLength();
	// Clamp so an over-long term cannot overrun uctext.
	if ( len > maxChars )
		len = maxChars;
	for (int i=0;i<len;i++)
		uctext[i]=tmp[i];
	uctext[len]='\0';
#endif

    const sb_symbol* stemmed = sb_stemmer_stem(stemmer, uctext, len);
	if ( stemmed == NULL )
		_CLTHROWA(CL_ERR_Runtime,"Out of memory");

	int stemmedLen=sb_stemmer_length(stemmer);
	// The stem is copied back into fixed buffers; keep room for the terminator.
	if ( stemmedLen > maxChars )
		stemmedLen = maxChars;

#ifdef _UCS2
	memcpy(utf8text,stemmed,stemmedLen);
	utf8text[stemmedLen]=0;
	lucene_utf8towcs(tchartext,utf8text,LUCENE_MAX_WORD_LEN);
#else
	for (int i=0;i<stemmedLen;i++)
		tchartext[i]=stemmed[i];
	tchartext[stemmedLen]=0;
#endif
	token->set(tchartext,token->startOffset(), token->endOffset(), token->type());
	return true;
  }


CL_NS_END2

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -