📄 analyzers.h

📁 clucene是c++版的全文检索引擎,完全移植于lucene,采用 stl 编写.
💻 H
字号:
/*------------------------------------------------------------------------------
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
* 
* Distributable under the terms of either the Apache License (Version 2.0) or 
* the GNU Lesser General Public License, as specified in the COPYING file.
------------------------------------------------------------------------------*/
#ifndef _lucene_analysis_Analyzers_
#define _lucene_analysis_Analyzers_

#if defined(_LUCENE_PRAGMA_ONCE)
# pragma once
#endif

#include "CLucene/util/Reader.h"
#include "AnalysisHeader.h"
#include "CLucene/util/Misc.h"

CL_NS_DEF(analysis)
    /**
     * Abstract tokenizer driven by a per-character predicate.
     *
     * Tokens are maximal runs of adjacent characters accepted by
     * isTokenChar(); rejected characters act as token boundaries and are
     * never part of a token. Every accepted character is passed through
     * normalize() before it is added to the token.
     */
    class CharTokenizer : public Tokenizer {
    private:
        // Internal scanning state maintained by the out-of-line implementation.
        // NOTE(review): the precise role of each counter is not visible from
        // this header -- confirm against the implementation file.
        int32_t offset;
        int32_t bufferIndex;
        int32_t dataLen;

        // Fixed-size character storage: LUCENE_MAX_WORD_LEN characters plus one
        // extra slot (presumably for a terminator -- TODO confirm).
        TCHAR buffer[LUCENE_MAX_WORD_LEN + 1];

        // Read-only pointer to character data; presumably the chunk most
        // recently obtained from the reader -- TODO confirm.
        const TCHAR* ioBuffer;

    protected:
        /**
         * Predicate deciding token membership.
         *
         * @param c character under inspection
         * @return true iff c should be included in a token; characters for
         *         which this is false delimit tokens.
         */
        virtual bool isTokenChar(const TCHAR c) const = 0;

        /**
         * Hook applied to each token character before it is stored.
         * The default implementation is documented as leaving the character
         * unchanged; subclasses may override it to, e.g., lower-case tokens.
         *
         * @param c character accepted by isTokenChar()
         * @return the character to add to the token
         */
        virtual TCHAR normalize(const TCHAR c) const;

    public:
        /**
         * @param in reader that supplies the characters to tokenize
         */
        CharTokenizer(CL_NS(util)::Reader* in);

        virtual ~CharTokenizer() {}

        /**
         * Advances to the next token in the stream.
         *
         * @param token caller-supplied Token object to receive the result
         * @return bool status; presumably false once the end of the stream is
         *         reached (implementation not visible here -- TODO confirm).
         */
        bool next(Token* token);
    };
	
	/**
	 * CharTokenizer specialisation whose token predicate is meant to accept
	 * letters only, so text is split at every non-letter character.
	 */
	class LetterTokenizer : public CharTokenizer {
	public:
		/**
		 * Constructs a new LetterTokenizer.
		 *
		 * @param in reader forwarded unchanged to the CharTokenizer base
		 */
		LetterTokenizer(CL_NS(util)::Reader* in)
			: CharTokenizer(in)
		{
		}

		~LetterTokenizer() {}

	protected:
		/**
		 * Token predicate. Documented as collecting only characters that
		 * satisfy the letter test (Java's Character#isLetter); the definition
		 * lives outside this header.
		 */
		bool isTokenChar(const TCHAR c) const;
	};
	
	
	
	
	
	/**
	 * LowerCaseTokenizer performs the work of LetterTokenizer and
	 * LowerCaseFilter in a single pass: it divides text at non-letters and
	 * converts the resulting tokens to lower case. It is functionally
	 * equivalent to chaining those two classes, but doing both tasks at once
	 * is faster, hence this (redundant) implementation.
	 *
	 * Note: this does a decent job for most European languages, but a poor job
	 * for some Asian languages, where words are not separated by spaces.
	 */
	class LowerCaseTokenizer : public LetterTokenizer {
	public:
		/**
		 * Constructs a new LowerCaseTokenizer.
		 *
		 * @param in reader forwarded unchanged to the LetterTokenizer base
		 */
		LowerCaseTokenizer(CL_NS(util)::Reader* in)
			: LetterTokenizer(in)
		{
		}

		~LowerCaseTokenizer() {}

	protected:
		/**
		 * Per-character normalisation hook. Per the class description this is
		 * where each token character is converted to lower case; the
		 * definition lives outside this header.
		 *
		 * @param chr token character to normalise
		 * @return the normalised character
		 */
		TCHAR normalize(const TCHAR chr) const;
	};
	
	

	/**
	 * CharTokenizer specialisation that is meant to split text at whitespace:
	 * every run of non-whitespace characters becomes one token.
	 */
	class WhitespaceTokenizer : public CharTokenizer {
	public:
		/**
		 * Constructs a new WhitespaceTokenizer.
		 *
		 * @param in reader forwarded unchanged to the CharTokenizer base
		 */
		WhitespaceTokenizer(CL_NS(util)::Reader* in)
			: CharTokenizer(in)
		{
		}

		~WhitespaceTokenizer() {}

	protected:
		/**
		 * Token predicate. Documented as collecting only characters that do
		 * NOT satisfy the whitespace test (Java's Character#isWhitespace); the
		 * definition lives outside this header.
		 */
		bool isTokenChar(const TCHAR c) const;
	};
	
	
	/**
	 * An Analyzer that uses WhitespaceTokenizer.
	 */
	class WhitespaceAnalyzer : public Analyzer {
	public:
		/**
		 * Creates the token stream for a field.
		 *
		 * @param fieldName name of the field being analysed
		 * @param reader    source of the field's text
		 * @return the token stream; per the class description it is built on
		 *         WhitespaceTokenizer (definition outside this header).
		 */
		TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);

		~WhitespaceAnalyzer() {}
	};


    /**
     * Analyzer whose token stream is produced by the out-of-line
     * tokenStream() definition.
     * NOTE(review): presumably built on LowerCaseTokenizer, as in Java
     * Lucene's SimpleAnalyzer -- confirm against the implementation file.
     */
    class SimpleAnalyzer : public Analyzer {
    public:
        /**
         * Creates the token stream for a field.
         *
         * @param fieldName name of the field being analysed
         * @param reader    source of the field's text
         */
        TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);

        ~SimpleAnalyzer() {}
    };
	
	
	
   /**
    * Token filter that normalizes token text to lower case.
    *
    * @version $Id: Analyzers.h 2238 2006-06-27 10:52:25Z ustramooner $
    */
	class LowerCaseFilter : public TokenFilter {
	public:
		/**
		 * @param in                upstream token stream to filter
		 * @param deleteTokenStream flag forwarded unchanged to TokenFilter;
		 *                          presumably controls whether the filter
		 *                          deletes `in` -- TODO confirm in TokenFilter
		 */
		LowerCaseFilter(TokenStream* in, bool deleteTokenStream)
			: TokenFilter(in, deleteTokenStream)
		{
		}

		~LowerCaseFilter() {}

		/**
		 * Advances to the next token; per the class description its text is
		 * lower-cased. Defined outside this header.
		 *
		 * @param token caller-supplied Token object to receive the result
		 */
		bool next(Token* token);
	};



    /**
     * Token filter that removes stop words from a token stream.
     */
	class StopFilter : public TokenFilter {
	private:
		// Set of stop words, held by pointer.
		// bvk: a plain (non-hash) list proved faster here; the stop table is
		// unlikely to hold enough entries to make hashing worthwhile.
		CL_NS(util)::CLSetList<const TCHAR*>* table;

	public:
		/**
		 * Constructs a filter which removes from the input TokenStream the
		 * words named in the given array.
		 *
		 * @param in                upstream token stream to filter
		 * @param deleteTokenStream flag passed on to TokenFilter
		 * @param stopWords         array of stop-word strings
		 */
		StopFilter(TokenStream* in, bool deleteTokenStream, const TCHAR** stopWords);

		/**
		 * Constructs a filter which removes from the input TokenStream the
		 * words contained in an existing stop table. The pointer is stored
		 * as-is; no copy of the table is made here.
		 *
		 * @param in                upstream token stream to filter
		 * @param deleteTokenStream flag passed on to TokenFilter
		 * @param stopTable         set of stop words to filter against
		 */
		StopFilter(TokenStream* in, bool deleteTokenStream,
		           CL_NS(util)::CLSetList<const TCHAR*>* stopTable)
			: TokenFilter(in, deleteTokenStream)
			, table(stopTable)
		{
		}

		// NOTE(review): the destructor does not release `table`. If the
		// stopWords-array constructor allocates its own table (its definition
		// is not visible from this header), that allocation would leak --
		// confirm against the implementation file.
		~StopFilter() {}

		/**
		 * Fills a stop table from an array of stop words, producing a table
		 * suitable for passing to the StopFilter constructor. This lets the
		 * table be built once and cached when an Analyzer is constructed.
		 * Note: the stopWords list must be a static list because the strings
		 * are not copied.
		 *
		 * @swig stopWords
		 */
		static void fillStopTable(CL_NS(util)::CLSetList<const TCHAR*>* stopTable,
		                          const TCHAR** stopWords);

		/**
		 * Returns the next input Token whose termText() is not a stop word.
		 *
		 * @swig token byref
		 */
		bool next(Token* token);
	};
	
	
	
	
    /**
     * Analyzer that tokenizes on letters, lower-cases and removes stop words.
     * NOTE(review): the original notes describe the pipeline both as
     * "LetterTokenizer with LowerCaseFilter and StopFilter" and as
     * "LowerCaseTokenizer with StopFilter"; check the implementation file for
     * the one actually used.
     */
    class StopAnalyzer : public Analyzer {
    private:
        // Stop-word set owned by value by this analyzer.
        CL_NS(util)::CLSetList<const TCHAR*> stopTable;

    public:
        /** Builds an analyzer which removes words in ENGLISH_STOP_WORDS. */
        StopAnalyzer();

        /**
         * Builds an analyzer which removes words in the provided array.
         *
         * @param stopWords array of stop-word strings
         */
        StopAnalyzer(const TCHAR** stopWords);

        ~StopAnalyzer();

        /**
         * Creates the token stream for a field; documented as filtering a
         * LowerCaseTokenizer with a StopFilter.
         *
         * @param fieldName name of the field being analysed
         * @param reader    source of the field's text
         */
        TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);

        /**
         * An array containing some common English words that are usually not
         * useful for searching.
         */
        static const TCHAR* ENGLISH_STOP_WORDS[];
    };
    
    
    
    /**
     * Analyzer for scenarios where different fields require different
     * analysis techniques. Use addAnalyzer() to register a non-default
     * analyzer on a per-field-name basis.
     *
     * <p>Example usage (Java-style sketch carried over from Lucene; adapt the
     * syntax and string literals for C++):
     *
     * <pre>
     *   PerFieldAnalyzerWrapper aWrapper =
     *      new PerFieldAnalyzerWrapper(new StandardAnalyzer());
     *   aWrapper.addAnalyzer("firstname", new KeywordAnalyzer());
     *   aWrapper.addAnalyzer("lastname", new KeywordAnalyzer());
     * </pre>
     *
     * <p>In this example, StandardAnalyzer is used for all fields except
     * "firstname" and "lastname", for which KeywordAnalyzer is used.
     *
     * <p>A PerFieldAnalyzerWrapper can be used like any other analyzer, for
     * both indexing and query parsing.
     *
     * NOTE(review): who deletes the analyzers handed to this class depends on
     * the out-of-line destructor and on the map's deletor policies; it cannot
     * be determined from this header alone.
     */
    class PerFieldAnalyzerWrapper : public Analyzer {
    private:
        // Analyzer used for fields without a specific registration.
        Analyzer* defaultAnalyzer;

        // Field name -> analyzer registrations.
        CL_NS(util)::CLHashMap<
            const TCHAR*,
            Analyzer*,
            CL_NS(util)::Compare::TChar,
            CL_NS(util)::Equals::TChar,
            CL_NS(util)::Deletor::tcArray,
            CL_NS(util)::Deletor::Void<Analyzer> > analyzerMap;

    public:
        /**
         * Constructs with default analyzer.
         *
         * @param defaultAnalyzer Any fields not specifically
         * defined to use a different analyzer will use the one provided here.
         */
        PerFieldAnalyzerWrapper(Analyzer* defaultAnalyzer);

        ~PerFieldAnalyzerWrapper();

        /**
         * Defines an analyzer to use for the specified field.
         *
         * @param fieldName field name requiring a non-default analyzer
         * @param analyzer non-default analyzer to use for field
         */
        void addAnalyzer(const TCHAR* fieldName, Analyzer* analyzer);

        /**
         * Creates the token stream for a field; per the class description the
         * analyzer registered for fieldName is used, falling back to the
         * default analyzer.
         *
         * @param fieldName name of the field being analysed
         * @param reader    source of the field's text
         */
        TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
    };


/**
 * Token filter that replaces accented characters of the ISO Latin 1
 * character set (ISO-8859-1) with their unaccented equivalents. The case of
 * the characters is not altered.
 * <p>
 * For instance, '&agrave;' (a with grave accent) is replaced by 'a'.
 */
class ISOLatin1AccentFilter : public TokenFilter {
public:
	/**
	 * @param input    upstream token stream to filter
	 * @param deleteTs flag forwarded unchanged to TokenFilter
	 */
	ISOLatin1AccentFilter(TokenStream* input, bool deleteTs)
		: TokenFilter(input, deleteTs)
	{
	}

	/**
	 * Advances to the next token, replacing accented characters in its text
	 * by their unaccented equivalents. Defined outside this header.
	 *
	 * @param token caller-supplied Token object to receive the result
	 */
	bool next(Token* token);
};

CL_NS_END
#endif
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -