// snowball.cpp
#include "CLucene/StdHeader.h"
#include "SnowballAnalyzer.h"
#include "SnowballFilter.h"
CL_NS_DEF2(analysis,snowball)
/** Creates an analyzer for the named stemming language without any
 *  stop-word filtering.
 *
 *  @param language name of the stemming language; duplicated with
 *                  STRDUP_TtoT and released again in the destructor
 */
SnowballAnalyzer::SnowballAnalyzer(const TCHAR* language) {
	// A NULL stop set makes tokenStream() skip its StopFilter stage.
	this->stopSet = NULL;
	this->language = STRDUP_TtoT(language);
}
/** Releases the stop-word set (when one was created) and the stored
 *  language name.
 */
SnowballAnalyzer::~SnowballAnalyzer(){
	if ( stopSet != NULL ) {
		_CLDELETE(stopSet);
	}
	_CLDELETE_CARRAY(language);
}
/** Creates an analyzer for the named stemming language that also filters
 *  the given stop words.
 *
 *  @param language  name of the stemming language; duplicated with
 *                   STRDUP_TtoT and released again in the destructor
 *  @param stopWords stop words handed to StopFilter::fillStopTable to
 *                   populate the analyzer's stop set
 */
SnowballAnalyzer::SnowballAnalyzer(const TCHAR* language, TCHAR** stopWords) {
	// Build the stop set first, then remember the language name.
	this->stopSet = _CLNEW CL_NS(util)::CLSetList<TCHAR*>;
	StopFilter::fillStopTable(this->stopSet, stopWords);

	this->language = STRDUP_TtoT(language);
}
/** Builds the analysis chain for one field: a StandardTokenizer over the
 *  reader, wrapped in a StandardFilter, a LowerCaseFilter, a StopFilter
 *  (only when a stop set was configured) and finally a SnowballFilter for
 *  the analyzer's language.
 *
 *  Every filter is constructed with `true` as the argument that follows its
 *  input stream (for SnowballFilter this is its deleteTS flag).
 *
 *  @param fieldName not used by this implementation
 *  @param reader    source of the text to tokenize
 *  @return the outermost stream of the chain (the SnowballFilter)
 */
TokenStream* SnowballAnalyzer::tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader) {
	TokenStream* tokenizer = _CLNEW CL_NS(analysis)::standard::StandardTokenizer(reader);
	TokenStream* normalized = _CLNEW CL_NS(analysis)::standard::StandardFilter(tokenizer, true);
	TokenStream* chain = _CLNEW CL_NS(analysis)::LowerCaseFilter(normalized, true);

	// Stop-word removal is optional; it is only inserted when a set exists.
	if (stopSet != NULL) {
		chain = _CLNEW CL_NS(analysis)::StopFilter(chain, true, stopSet);
	}

	return _CLNEW SnowballFilter(chain, language, true);
}
/** Construct the named stemming filter.
 *
 * The language name is lower-cased and converted to a narrow string before
 * it is handed to sb_stemmer_new. Names that do not fit the fixed-size
 * working buffers are truncated to LANG_BUF_LEN - 1 characters.
 *
 * @param in       the input tokens to stem
 * @param language the name of a stemmer
 * @param deleteTS forwarded, together with in, to the TokenFilter base
 * @throws CL_ERR_IllegalArgument if sb_stemmer_new returns NULL for the
 *         requested language
 */
SnowballFilter::SnowballFilter(TokenStream* in, const TCHAR* language, bool deleteTS):
	TokenFilter(in,deleteTS)
{
	enum { LANG_BUF_LEN = 50 };
	TCHAR tlang[LANG_BUF_LEN];
	char lang[LANG_BUF_LEN];

	// The bounded copies below are given the full buffer length, so a name
	// of LANG_BUF_LEN or more characters may leave the buffer without a
	// terminator. Terminate explicitly so _tcslwr and sb_stemmer_new never
	// read past the end of the arrays.
	STRCPY_TtoT(tlang,language,LANG_BUF_LEN);
	tlang[LANG_BUF_LEN - 1] = 0;
	_tcslwr(tlang);

	STRCPY_TtoA(lang,tlang,LANG_BUF_LEN);
	lang[LANG_BUF_LEN - 1] = 0;

	stemmer = sb_stemmer_new(lang, NULL); //use utf8 encoding

	if ( stemmer == NULL ){
		_CLTHROWA(CL_ERR_IllegalArgument, "language not available for stemming\n"); //todo: richer error
	}
}
/** Releases the stemmer obtained from sb_stemmer_new in the constructor. */
SnowballFilter::~SnowballFilter(){
	sb_stemmer_delete(this->stemmer);
}
/** Returns the next input Token, after being stemmed.
 *
 * The term text is copied into a byte buffer (converted to UTF-8 first in
 * _UCS2 builds), passed through the stemmer, and the stem is written back
 * into the token with its original offsets and type.
 *
 * The working buffers hold LUCENE_MAX_WORD_LEN elements. If the term or its
 * stem (plus terminator) does not fit, the token is returned unstemmed
 * instead of writing past the end of a buffer.
 *
 * @param token receives the next token of the input stream
 * @return false when the input stream is exhausted, true otherwise
 * @throws CL_ERR_Runtime if sb_stemmer_stem returns NULL
 */
bool SnowballFilter::next(Token* token){
	if (!input->next(token))
		return false;

	unsigned char uctext[LUCENE_MAX_WORD_LEN];
	TCHAR tchartext[LUCENE_MAX_WORD_LEN];

#ifdef _UCS2
	char utf8text[LUCENE_MAX_WORD_LEN];

	size_t len = lucene_wcstoutf8(utf8text,token->termText(),LUCENE_MAX_WORD_LEN);
	// Need room for len bytes plus the terminator written below.
	if ( len >= static_cast<size_t>(LUCENE_MAX_WORD_LEN) )
		return true;
	memcpy(uctext,utf8text,len);
	uctext[len]='\0';
#else
	const char* tmp = token->termText();
	int len = token->termTextLength();
	// The loop copies len+1 bytes (text plus terminator) into uctext.
	if ( len < 0 || len >= static_cast<int>(LUCENE_MAX_WORD_LEN) )
		return true;
	for (int i=0;i<len+1;i++)
		uctext[i]=tmp[i];
#endif

	const sb_symbol* stemmed = sb_stemmer_stem(stemmer, uctext, len);
	if ( stemmed == NULL )
		_CLTHROWA(CL_ERR_Runtime,"Out of memory");

	int stemmedLen=sb_stemmer_length(stemmer);
	// The stem is copied with a trailing terminator, so it must be strictly
	// shorter than the destination buffers; otherwise keep the original term.
	if ( stemmedLen < 0 || stemmedLen >= static_cast<int>(LUCENE_MAX_WORD_LEN) )
		return true;

#ifdef _UCS2
	memcpy(utf8text,stemmed,stemmedLen);
	utf8text[stemmedLen]=0;
	lucene_utf8towcs(tchartext,utf8text,LUCENE_MAX_WORD_LEN);
#else
	for (int i=0;i<stemmedLen+1;i++)
		tchartext[i]=stemmed[i];
#endif
	token->set(tchartext,token->startOffset(), token->endOffset(), token->type());
	return true;
}
CL_NS_END2
// (Web code-viewer keyboard-shortcut help text removed; it was not part of the source file.)