📄 regexpr2.cpp
字号:
//+---------------------------------------------------------------------------
//
// Copyright ( C ) Microsoft, 1994 - 2002.
//
// File: regexpr2.cpp
//
// Contents: implementation for rpattern methods, definitions for all the
// subexpression types used to perform the matching, the
// charset class definition .
//
// Classes: too many to list here
//
// Functions:
//
// Author: Eric Niebler ( ericne@microsoft.com )
//
// History: 12-11-1998 ericne Created
// 01-05-2001 ericne Removed dependency on VC's choice
// of STL iterator types.
// 08-15-2001 ericne Removed regexpr class, moved match
// state to match_results container.
// 09-17-2001 nathann Add DEBUG_HEAP_SUPPORT
// 11-16-2001 ericne Add stack-conservative algorithm
//
//----------------------------------------------------------------------------
#ifdef _MSC_VER
// unlimited inline expansion ( compile with /Ob1 or /Ob2 )
# pragma inline_recursion( on )
# pragma inline_depth( 255 )
// warning C4127: conditional expression is constant
// warning C4355: 'this' : used in base member initializer list
// warning C4702: unreachable code
// warning C4710: function 'blah' not inlined
// warning C4786: identifier was truncated to '255' characters in the debug information
# pragma warning( push )
# pragma warning( disable : 4127 4355 4702 4710 4786 )
#endif
#include <limits>
#include <cctype>
#include <cwchar>
#include <memory>
#include <cwctype>
#include <malloc.h>
#include <algorithm>
#ifdef __MWERKS__
# include <alloca.h>
#endif
// If the implementation file has been included in the header, then we
// need to mark some functions as inline to prevent them from being multiply
// defined. But if the implementation file is not included in the header,
// we can't mark them as inline, otherwise the linker won't find them.
#ifdef REGEXPR_H
# define REGEXPR_H_INLINE inline
#else
# define REGEXPR_H_INLINE
# include "regexpr2.h"
#endif
#ifdef REGEX_TO_INCLUDE
# include REGEX_TO_INCLUDE
#endif
// $PORT$
// _alloca is not standard
#ifndef alloca
# define alloca _alloca
#endif
namespace regex
{
namespace detail
{
inline wctype_t REGEX_CDECL regex_wctype( char const * sz )
{
using namespace std;
return wctype( sz );
}
namespace
{
#ifdef __GLIBC__
struct regex_ctype_t
{
int m_ctype;
wctype_t m_wctype;
};
#define REGEX_DECL_CTYPE(desc) \
inline regex_ctype_t const & wct_ ## desc() \
{ \
static regex_ctype_t const s_wct = { _IS ## desc, regex_wctype(#desc) };\
return s_wct; \
}
REGEX_DECL_CTYPE(alnum)
REGEX_DECL_CTYPE(alpha)
REGEX_DECL_CTYPE(blank)
REGEX_DECL_CTYPE(cntrl)
REGEX_DECL_CTYPE(digit)
REGEX_DECL_CTYPE(graph)
REGEX_DECL_CTYPE(lower)
REGEX_DECL_CTYPE(print)
REGEX_DECL_CTYPE(punct)
REGEX_DECL_CTYPE(space)
REGEX_DECL_CTYPE(upper)
REGEX_DECL_CTYPE(xdigit)
regex_ctype_t const wct_zero = { 0, 0 };
inline regex_ctype_t & operator |= ( regex_ctype_t & lhs, regex_ctype_t const & rhs )
{
lhs.m_ctype |= rhs.m_ctype;
lhs.m_wctype |= rhs.m_wctype;
return lhs;
}
inline regex_ctype_t operator | ( regex_ctype_t lhs, regex_ctype_t const & rhs )
{
return lhs |= rhs;
}
inline int REGEX_CDECL regex_isctype( int ch, regex_ctype_t const & desc )
{
return __isctype( ch, desc.m_ctype );
}
inline int REGEX_CDECL regex_iswctype( wint_t wc, regex_ctype_t desc )
{
using namespace std;
return iswctype( wc, desc.m_wctype );
}
inline bool operator == ( regex_ctype_t const & lhs, regex_ctype_t const & rhs )
{
return lhs.m_ctype == rhs.m_ctype && lhs.m_wctype == rhs.m_wctype;
}
inline bool operator != ( regex_ctype_t const & lhs, regex_ctype_t const & rhs )
{
return lhs.m_ctype != rhs.m_ctype || lhs.m_wctype != rhs.m_wctype;
}
#else
typedef wctype_t regex_ctype_t;
#define REGEX_DECL_CTYPE(desc) \
inline regex_ctype_t const wct_ ## desc() \
{ \
static regex_ctype_t const s_wct = regex_wctype(#desc); \
return s_wct; \
}
REGEX_DECL_CTYPE(alnum)
REGEX_DECL_CTYPE(alpha)
REGEX_DECL_CTYPE(cntrl)
REGEX_DECL_CTYPE(digit)
REGEX_DECL_CTYPE(graph)
REGEX_DECL_CTYPE(lower)
REGEX_DECL_CTYPE(print)
REGEX_DECL_CTYPE(punct)
REGEX_DECL_CTYPE(space)
REGEX_DECL_CTYPE(upper)
REGEX_DECL_CTYPE(xdigit)
regex_ctype_t const wct_zero = 0;
#if defined(_MSC_VER) & ( _MSC_VER==1200 | defined(_CPPLIB_VER) )
inline regex_ctype_t const wct_blank() { return _BLANK; } // work around for bug in VC++
inline int REGEX_CDECL regex_isctype( int ch, regex_ctype_t desc )
{
return _isctype( ch, static_cast<int>( desc ) );
}
#else
REGEX_DECL_CTYPE(blank)
inline int REGEX_CDECL regex_isctype( int ch, regex_ctype_t desc )
{
using namespace std;
return iswctype( btowc( ch ), desc );
}
#endif
inline int REGEX_CDECL regex_iswctype( wint_t wc, regex_ctype_t desc )
{
using namespace std;
return iswctype( wc, desc );
}
#endif
} // unnamed namespace
template< typename CStringsT, typename IterT >
bool _do_match_iterative( sub_expr_base<IterT> const * expr, match_param<IterT> & param, IterT icur, CStringsT );
// NathanN:
// By defining the symbol REGEX_DEBUG_HEAP the allocator object
// no longer sub allocates memory. This enables heap checking tools like
// AppVerifier & PageHeap to find errors like buffer overruns
#if !defined( REGEX_DEBUG_HEAP ) & REGEX_DEBUG
# define REGEX_DEBUG_HEAP 1
#else
# define REGEX_DEBUG_HEAP 0
#endif
REGEXPR_H_INLINE size_t DEFAULT_BLOCK_SIZE()
{
#if REGEX_DEBUG_HEAP
// put each allocation in its own mem_block
return 1;
#else
// put multiple allocation in each mem_block
return 352;
#endif
}
template< typename IBeginT, typename IEndT >
inline size_t parse_int( IBeginT & ibegin, IEndT iend, size_t const max_ = size_t( -1 ) )
{
typedef typename std::iterator_traits<IEndT>::value_type char_type;
size_t retval = 0;
while( iend != ibegin && REGEX_CHAR(char_type,'0') <= *ibegin && REGEX_CHAR(char_type,'9') >= *ibegin && max_ > retval )
{
retval *= 10;
retval += static_cast<size_t>( *ibegin - REGEX_CHAR(char_type,'0') );
++ibegin;
}
if( max_ < retval )
{
retval /= 10;
--ibegin;
}
return retval;
}
// --------------------------------------------------------------------------
//
// Class: boyer_moore
//
// Description: fast sub-string search algorithm
//
// Members: m_begin - iter to first char in pattern sequence
// m_last - iter to last char in pattern sequence
// m_len - length of the pattern sequence
// m_off - array of offsets, indexed by ASCII char values
//
// History: 6/8/2003 - ericne - Created
//
// --------------------------------------------------------------------------
template< typename IterT >
class boyer_moore
{
typedef typename std::iterator_traits<IterT>::value_type char_type;
typedef typename std::char_traits<char_type> traits_type;
enum { OFFSET_SIZE = UCHAR_MAX + 1 };
IterT m_begin;
IterT m_last;
char_type const* m_low_last;
unsigned char m_len;
unsigned char m_off[ OFFSET_SIZE ];
static unsigned char hash_char( char ch ) { return static_cast<unsigned char>( ch ); }
static unsigned char hash_char( signed char ch ) { return static_cast<unsigned char>( ch ); }
static unsigned char hash_char( unsigned char ch ) { return ch; }
static unsigned char hash_char( wchar_t ch ) { return static_cast<unsigned char>( ch % OFFSET_SIZE ); }
template< typename CharT >
static unsigned char REGEX_VC6(REGEX_CDECL) hash_char( CharT ch REGEX_VC6(...) )
{
return static_cast<unsigned char>( std::char_traits<CharT>::to_int_type( ch ) % OFFSET_SIZE );
}
// case-sensitive Boyer-Moore search
template< typename OtherT >
OtherT find_with_case( OtherT begin, OtherT end ) const
{
typedef typename std::iterator_traits<OtherT>::difference_type diff_type;
diff_type const endpos = std::distance( begin, end );
diff_type offset = m_len;
for( diff_type curpos = offset; curpos < endpos; curpos += offset )
{
std::advance( begin, offset );
IterT pat_tmp = m_last;
OtherT str_tmp = begin;
for( ; traits_type::eq( *str_tmp, *pat_tmp );
--pat_tmp, --str_tmp )
{
if( pat_tmp == m_begin )
{
return str_tmp;
}
}
offset = m_off[ hash_char( *begin ) ];
}
return end;
}
// case-insensitive Boyer-Moore search
template< typename OtherT >
OtherT find_without_case( OtherT begin, OtherT end ) const
{
typedef typename std::iterator_traits<OtherT>::difference_type diff_type;
diff_type const endpos = std::distance( begin, end );
diff_type offset = m_len;
for( diff_type curpos = offset; curpos < endpos; curpos += offset )
{
std::advance( begin, offset );
IterT pat_tmp = m_last;
char_type const* low_tmp = m_low_last;
OtherT str_tmp = begin;
for( ; traits_type::eq( *str_tmp, *pat_tmp ) || traits_type::eq( *str_tmp, *low_tmp );
--pat_tmp, --str_tmp, --low_tmp )
{
if( pat_tmp == m_begin )
{
return str_tmp;
}
}
offset = m_off[ hash_char( *begin ) ];
}
return end;
}
public:
// initialize the Boyer-Moore search data structure, using the
// search sub-sequence to prime the pump.
boyer_moore( IterT begin, IterT end, char_type const* lower = 0 )
: m_begin( begin )
, m_last( begin )
, m_low_last( lower )
{
typedef typename std::iterator_traits<IterT>::difference_type diff_type;
diff_type diff = std::distance( begin, end );
m_len = static_cast<unsigned char>( regex_min<diff_type>( diff, UCHAR_MAX ) );
std::fill_n( m_off, ARRAYSIZE( m_off ), m_len );
--m_len;
for( unsigned char offset = m_len; offset; --offset, ++m_last )
{
m_off[ hash_char( *m_last ) ] = offset;
}
if( m_low_last )
{
for( unsigned char offset = m_len; offset; --offset, ++m_low_last )
{
unsigned char hash = hash_char( *m_low_last );
m_off[ hash ] = regex_min( m_off[ hash ], offset );
}
}
}
template< typename OtherT >
OtherT find( OtherT begin, OtherT end ) const
{
if( m_low_last )
{
return find_without_case( begin, end );
}
else
{
return find_with_case( begin, end );
}
}
static void * operator new( size_t size, regex_arena & arena )
{
return arena.allocate( size );
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -