📄 strsearch.hpp
字号:
/* * =========================================================================== * PRODUCTION $Log: strsearch.hpp,v $ * PRODUCTION Revision 1000.2 2004/06/01 19:39:02 gouriano * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.19 * PRODUCTION * =========================================================================== */#ifndef UTIL___STRSEARCH__HPP#define UTIL___STRSEARCH__HPP/* $Id: strsearch.hpp,v 1000.2 2004/06/01 19:39:02 gouriano Exp $* ===========================================================================** PUBLIC DOMAIN NOTICE* National Center for Biotechnology Information** This software/database is a "United States Government Work" under the* terms of the United States Copyright Act. It was written as part of* the author's official duties as a United States Government employee and* thus cannot be copyrighted. This software/database is freely available* to the public for use. The National Library of Medicine and the U.S.* Government have not placed any restriction on its use or reproduction.** Although all reasonable efforts have been taken to ensure the accuracy* and reliability of the software and data, the NLM and the U.S.* Government do not and cannot warrant the performance or results that* may be obtained by using this software or data. The NLM and the U.S.* Government disclaim all warranties, express or implied, including* warranties of performance, merchantability or fitness for any particular* purpose.** Please cite the author in any work or product based on this material.** ===========================================================================** Author: Mati Shomrat* Anatoliy Kuznetsov** File Description:* String search utilities.**//// @file strsearch.hpp/// String search utilities.////// Currently there are two search utilities:/// 1. An implementation of the Boyer-Moore algorithm./// 2. A finite state automata.#include <corelib/ncbistd.hpp>#include <corelib/ncbistr.hpp>#include <string>#include <vector>/** @addtogroup StringSearch * * @{ */BEGIN_NCBI_SCOPE//============================================================================//// CBoyerMooreMatcher ////============================================================================///// This implemetation uses the Boyer-Moore alg. in order to search a single/// pattern over varying texts.class NCBI_XUTIL_EXPORT CBoyerMooreMatcher {public: enum EWordMatch { eSubstrMatch = 0, ePrefixMatch = (1 << 0), eSuffixMatch = (1 << 1), eWholeWordMatch = (ePrefixMatch | eSuffixMatch) };public: /// Initialize a matcher with the pattern to be matched. /// /// @param pattern /// Pattern to be matched /// @param case_sensitive /// should the search be case sensitive (false by default). /// @param whole_word /// a match is found ony if the pattern was found to /// be between delimiting characters (whitespaces) CBoyerMooreMatcher(const string& pattern, NStr::ECase case_sensitive = NStr::eNocase, unsigned int whole_word = eSubstrMatch); /// Initialize a matcher with the pattern to be matched. /// /// @param pattern /// Pattern to be matched /// @param word_delimeters /// a match is found ony if the pattern was found to /// be between delimiting characters like whitespaces /// and punctuation marks /// @param case_sensitive /// should the search be case sensitive (false by default). /// @param invert_delimiters /// when TRUE means all characters NOT belonging to /// word_delimeters are to be used CBoyerMooreMatcher(const string& pattern, const string& word_delimeters, NStr::ECase case_sensitive = NStr::eNocase, bool invert_delimiters = false); /// Set word delimiting characters /// /// @param word_delimeters /// string of characters used as word delimiters /// @param invert_delimiters /// when TRUE means all characters NOT belonging to /// word_delimeters are to be used void SetWordDelimiters(const string& word_delimeters, bool invert_delimiters = false); /// Add new word delimiters /// /// @param word_delimeters /// string of characters used as word delimiters /// void AddDelimiters(const string& word_delimeters); /// Add new word delimiter charracter /// /// @param word_delimeters /// string of characters used as word delimiters /// void AddDelimiters(char ch); /// Init delimiters most common for the English language, /// (whitespaces, punctuations, etc) void InitCommonDelimiters(); /// Set word matching mode /// /// @param whole_word /// word matching mode. Can be OR combination of EWordMatch values void SetWordMatching(unsigned int whole_word = eWholeWordMatch) { m_WholeWord = whole_word; } /// Search for the pattern over text starting at position pos. /// /// @param text /// Text to search in. /// @param pos /// Position shift in text /// @return /// the position at which the pattern was found, -1 otherwise. size_t Search(const string& text, size_t pos = 0) const { return Search(text.c_str(), pos, text.length()); } /// Search for the pattern over text starting at position pos. /// /// @param text /// Text memory taken as NON ASCIIZ string. /// @param pos /// String offset position from the start /// @param text_len /// Length of text /// @return /// the position at which the pattern was found, -1 otherwise. SIZE_TYPE Search(const char* text, SIZE_TYPE pos, SIZE_TYPE text_len) const; private: // Constants static const int sm_AlphabetSize; // Member Functions: /// Check if the pattern at position pos in the text lies on a /// whole word boundry. bool IsWholeWord(const char* text, SIZE_TYPE pos, SIZE_TYPE text_len) const; void x_InitPattern(void);private: string m_Pattern; SIZE_TYPE m_PatLen; NStr::ECase m_CaseSensitive; unsigned int m_WholeWord; vector<size_t> m_LastOccurrence; vector<unsigned char> m_WordDelimiters; };//============================================================================//// Finite State Automata ////============================================================================//template <typename MatchType>class CTextFsm{public: // Constants (done as an enum to avoid link errors on Darwin) enum ESpecialStates { eFailState = -1 }; // Constructors and Destructors: CTextFsm(bool case_sensitive = false); ~CTextFsm(void); // Add a word to the Fsm. Store a match for later retreival. void AddWord(const string& word, const MatchType& match); // Prime the FSM. // After finishing adding all the words to the FSM it needs to be // primed to enable usage. bool IsPrimed(void) const { return m_Primed; } void Prime(void); // Retreive the FSM's initial state. int GetInitialState(void) const { return 0; } // Get the next state, based on the currect state and a transition // character. int GetNextState(int state, char letter) const; // Are there any matches stored in state? bool IsMatchFound(int state) const; // Retrieve the stored matches in state. const vector<MatchType>& GetMatches(int state) const; private: // Internal representation of states. class CState { public: // Ad-hoc definitions typedef map<char, int> TMapCharInt; // Constructors and Destructors CState(void) : m_OnFailure(0) {} ~CState(void) {}; // Add a transition to the table. void AddTransition(char letter, int to) { m_Transitions[letter] = to; } // Retreive the transition state, give the transition character. int GetNextState(char letter) const { TMapCharInt::const_iterator it = m_Transitions.find(letter); return it != m_Transitions.end() ? it->second : eFailState; } // Are there any matches stored in this state? bool IsMatchFound(void) const { return !m_Matches.empty(); } // Retreive the stored matches vector<MatchType>& GetMatches(void) { return m_Matches; } const vector<MatchType>& GetMatches(void) const { return m_Matches; } // Store a match. void AddMatch(const MatchType& match) { m_Matches.push_back(match); } const TMapCharInt& GetTransitions(void) const { return m_Transitions; }; // Getter and Setter for failure transition. void SetOnFailure(int state) { m_OnFailure = state; } int GetOnFailure(void) const { return m_OnFailure; } private: // Member Variables: TMapCharInt m_Transitions; // Transition table vector<MatchType> m_Matches; // Stored matches int m_OnFailure; // Transition state in failure. }; // end of class CState // Private Methods: CState *GetState(int state); int GetNextState(const CState& from, char letter) const; // Compute the transition to be performed on failure. void ComputeFail(void); void FindFail(int state, int new_state, char ch); void QueueAdd(vector<int>& in_queue, int qbeg, int val); // Member Variables bool m_Primed;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -