📄 regexpr2.cpp
字号:
};
// Equality for regex_allocators of possibly different element types:
// the result is an address comparison of the two m_arena members.
template< typename T, typename U >
inline bool operator==( regex_allocator<T> const & rhs, regex_allocator<U> const & lhs )
{
    return &lhs.m_arena == &rhs.m_arena;
}
// Inequality for regex_allocators of possibly different element types:
// true when the addresses of the two m_arena members differ.
template< typename T, typename U >
inline bool operator!=( regex_allocator<T> const & rhs, regex_allocator<U> const & lhs )
{
    return !( &rhs.m_arena == &lhs.m_arena );
}
// Use the regex allocator by default because it makes pattern compilation
// and clean-up go much faster. If you define REGEX_NO_ALLOCATOR, though,
// you can save some code bloat.
// BUGBUG This is actually implementation-dependent. regex_allocator is
// not a 100% compliant STL allocator. It does not have a default c'tor, and
// all regex_allocator<T> instances do not necessarily compare equal for
// any type T. To be truly portable, I would need to write my own containers
// that do not make any of these assumptions. Sigh. Alternatively, I could
// just give up passing regex_allocators to STL containers by compiling with
// REGEX_NO_ALLOCATOR defined, but this makes compiling patterns slow. :-(
// REGEX_ALLOCATOR names the allocator template to use: std::allocator when
// REGEX_NO_ALLOCATOR is defined, regex_allocator in every other case.
#ifdef REGEX_NO_ALLOCATOR
# define REGEX_ALLOCATOR std::allocator
#else
# define REGEX_ALLOCATOR regex_allocator
#endif
// MAKE_ALLOCATOR(type,arena) expands to an expression that yields an
// allocator for `type`; `arena` is used when the regex allocator is selected.
// The condition uses logical && (not bitwise &), so _MSC_VER is only
// compared against 1300 when it is actually defined.
#if defined(_MSC_VER) && _MSC_VER < 1300
// Compilers reporting _MSC_VER below 1300 get plain macros that construct
// the allocator directly instead of going through allocator_factory.
# ifndef REGEX_NO_ALLOCATOR
#  define MAKE_ALLOCATOR(type,arena) regex_allocator<type>(arena)
# else
#  define MAKE_ALLOCATOR(type,arena) std::allocator<type>()
# endif
#else
// Define an allocator factory that can create
// an allocator from a regex_arena.
//
// Primary template: for an arbitrary allocator AL the arena argument is
// ignored and a default-constructed AL rebound to T is returned.
template< typename AL >
struct allocator_factory
{
    template< typename T >
    static typename AL::template rebind<T>::other create( regex_arena & )
    {
        typedef typename AL::template rebind<T>::other other;
        return other();
    }
};

// Specialization for regex_allocator<char>: the allocator for T is
// constructed from the arena that was passed in.
template<>
struct allocator_factory< regex_allocator<char> >
{
    template< typename T >
    static regex_allocator<T> create( regex_arena & arena )
    {
        return regex_allocator<T>( arena );
    }
};
#define MAKE_ALLOCATOR(type,arena) allocator_factory<REGEX_ALLOCATOR<char> >::template create<type>( arena )
#endif
// Bit set holding one bit for every unsigned char value (0..UCHAR_MAX),
// used to speed up character set matching. std::bitset is not used because
// its range-checking slows it down.
// Note: the division and modulus by CBELEM are optimized by the compiler
// into bit-shift operations.
class ascii_bitvector
{
    typedef unsigned int elem_type;

    enum
    {
        CBELEM = CHAR_BIT * sizeof( elem_type ), // count of bits per element
        CELEMS = ( UCHAR_MAX+1 ) / CBELEM        // number of elements in array
    };

    elem_type m_rg[ CELEMS ];

    // Wrapper around a reference to a bit vector whose complement is wanted.
    // It lets expressions such as  bv1 |= ~bv2;  be evaluated word by word
    // without creating a temporary bit vector.
    struct not_ascii_bitvector
    {
        ascii_bitvector const & m_ref;

        not_ascii_bitvector( ascii_bitvector const & ref )
            : m_ref( ref )
        {
        }

    private:
        // declared only; the wrapper is not assignable
        not_ascii_bitvector & operator=( not_ascii_bitvector const & );
    };

public:
    // A new bit vector has every bit cleared.
    ascii_bitvector()
    {
        zero();
    }

    // Clear every bit.
    void zero()
    {
        memset( m_rg, 0, sizeof( m_rg ) );
    }

    // Turn on the bit that belongs to ch.
    void set( unsigned char ch )
    {
        elem_type const mask = static_cast<elem_type>( 1U ) << ( ch % CBELEM );
        m_rg[ ch / CBELEM ] |= mask;
    }

    // Report whether the bit that belongs to ch is on.
    bool operator[]( unsigned char ch ) const
    {
        elem_type const mask = static_cast<elem_type>( 1U ) << ( ch % CBELEM );
        return ( m_rg[ ch / CBELEM ] & mask ) != 0;
    }

    // Produce the lazy "complement of *this" wrapper; no bits are copied.
    not_ascii_bitvector const operator~() const
    {
        return not_ascii_bitvector( *this );
    }

    // *this = *this OR that
    ascii_bitvector & operator|=( ascii_bitvector const & that )
    {
        for( unsigned int word = 0; word != CELEMS; ++word )
        {
            m_rg[ word ] |= that.m_rg[ word ];
        }
        return *this;
    }

    // *this = *this OR (complement of the wrapped vector)
    ascii_bitvector & operator|=( not_ascii_bitvector const & that )
    {
        ascii_bitvector const & source = that.m_ref;
        for( unsigned int word = 0; word != CELEMS; ++word )
        {
            m_rg[ word ] |= ~source.m_rg[ word ];
        }
        return *this;
    }

    // *this = that
    ascii_bitvector & operator=( ascii_bitvector const & that )
    {
        for( unsigned int word = 0; word != CELEMS; ++word )
        {
            m_rg[ word ] = that.m_rg[ word ];
        }
        return *this;
    }

    // *this = complement of the wrapped vector
    ascii_bitvector & operator=( not_ascii_bitvector const & that )
    {
        ascii_bitvector const & source = that.m_ref;
        for( unsigned int word = 0; word != CELEMS; ++word )
        {
            m_rg[ word ] = ~source.m_rg[ word ];
        }
        return *this;
    }
};
// An inclusive range of wide characters: first is the low bound, second the
// high bound (a single character ch is stored as the pair (ch, ch)).
typedef std::pair<wchar_t, wchar_t> range_type;

// Determines if one range is less than another: rg1 is "less" only when it
// ends before rg2 begins, so overlapping ranges compare as equivalent.
// Used in binary search of the range vector.
//
// The argument/result typedefs are spelled out here instead of being
// inherited from std::binary_function, which was deprecated in C++11 and
// removed in C++17; adaptors that look for these names keep working.
struct range_less
{
    typedef range_type first_argument_type;
    typedef range_type second_argument_type;
    typedef bool       result_type;

    // Returns true when rg1 lies entirely below rg2.
    bool operator()( range_type const & rg1, range_type const & rg2 ) const
    {
        return rg1.second < rg2.first;
    }
};
// Forward declaration: charset_t stores pointers to charset objects
// (m_nestedcharsets) and takes a charset by reference in operator|=.
struct charset;
template< typename A1, typename A2, typename A3 >
struct charset_t
{
bool m_fcompliment;
bool m_fskip_extended_check;
ascii_bitvector m_ascii_bitvector;
wctype_t m_posixcharson;
std::vector<range_type, A1> m_range_vector;
std::list<wctype_t, A2> m_posixcharsoff;
std::list<charset const *, A3> m_nestedcharsets;
// Construct an empty, non-complemented character set. Each allocator is
// handed to the container in the matching position: a1 to the range vector,
// a2 to the posix "off" list and a3 to the nested charset list.
charset_t( A1 const & a1 = A1(), A2 const & a2 = A2(), A3 const & a3 = A3() )
    : m_fcompliment( false )
    , m_fskip_extended_check( false )
    , m_ascii_bitvector()
    , m_posixcharson( 0 )
    , m_range_vector( a1 )
    , m_posixcharsoff( a2 )
    , m_nestedcharsets( a3 )
{
}
// This class is used as a base class, so the destructor
// regrettably has to be virtual.
virtual ~charset_t() {}
// Return the character set to the values the constructor establishes:
// both flags false, no posix "on" classes, no ASCII bits set and all three
// containers emptied.
void clear()
{
    // scalar state
    m_fcompliment          = false;
    m_fskip_extended_check = false;
    m_posixcharson         = 0;

    // bit vector and containers
    m_ascii_bitvector.zero();
    m_range_vector.clear();
    m_posixcharsoff.clear();
    m_nestedcharsets.clear();
}
// Append rg to the range vector. When the vector is full its capacity is
// first raised by five slots, to prevent excessive reallocations.
void add_range( range_type rg )
{
    bool const is_full = ( m_range_vector.size() == m_range_vector.capacity() );
    if( is_full )
    {
        m_range_vector.reserve( m_range_vector.size() + 5 );
    }
    m_range_vector.push_back( rg );
}
// merge one charset into another
// Adds everything matched by `that` to this set and returns *this.
// Three cases are handled:
//  - `that` is complemented and uses only the bit vector and ranges:
//    its complement is merged directly;
//  - `that` is complemented and uses posix classes or nested sets:
//    a pointer to it is remembered in m_nestedcharsets;
//  - `that` is not complemented: every component is unioned/appended.
charset_t & operator|=( charset const & that )
{
if( that.m_fcompliment )
{
// If no posix-style character sets are used, then we can merge this
// nested character set directly into the enclosing character set.
if( 0 == that.m_posixcharson &&
that.m_posixcharsoff.empty() &&
that.m_nestedcharsets.empty() )
{
// OR in the complement of that's bit vector (no temporary is created)
m_ascii_bitvector |= ~ that.m_ascii_bitvector;
// append the inverse of that.m_range_vector to this->m_range_vector
// chlow holds the high bound of the range visited last; it starts at
// UCHAR_MAX so that the first gap begins at UCHAR_MAX+1.
// NOTE(review): this walk appears to assume that.m_range_vector is
// sorted and non-overlapping (the state optimize() produces) - confirm.
wchar_t chlow = UCHAR_MAX;
// NOTE(review): VCI is the iterator of a std::vector with the default
// allocator, while m_range_vector is declared with allocator A1;
// presumably the two iterator types are compatible for charset - confirm.
typedef std::vector<range_type>::const_iterator VCI;
for( VCI prg = that.m_range_vector.begin(); prg != that.m_range_vector.end(); ++prg )
{
// a range that starts exactly at UCHAR_MAX+1 has no gap in front of it
if( UCHAR_MAX + 1 != prg->first )
add_range( range_type( wchar_t( chlow+1 ), wchar_t( prg->first-1 ) ) );
chlow = prg->second;
}
// everything above the last range, up to WCHAR_MAX, is part of the inverse
if( WCHAR_MAX != chlow )
add_range( range_type( wchar_t( chlow+1 ), WCHAR_MAX ) );
}
else
{
// There is no simple way to merge this nested character
// set into the enclosing character set, so we must save
// a pointer to the nested character set in a list.
// NOTE(review): only the address is stored, so `that` presumably has to
// stay alive for as long as this set is used - confirm with callers.
m_nestedcharsets.push_back( & that );
}
}
else
{
// Plain union: OR the bit vectors and the posix "on" mask, and append
// that's ranges, posix "off" classes and nested charset pointers.
m_ascii_bitvector |= that.m_ascii_bitvector;
m_range_vector.insert( m_range_vector.end(),
that.m_range_vector.begin(),
that.m_range_vector.end() );
m_posixcharson |= that.m_posixcharson;
std::copy( that.m_posixcharsoff.begin(),
that.m_posixcharsoff.end() ,
std::back_inserter( m_posixcharsoff ) );
std::copy( that.m_nestedcharsets.begin(),
that.m_nestedcharsets.end(),
std::back_inserter( m_nestedcharsets ) );
}
return *this;
}
// Narrow-character overload (selected by the type of the first argument).
// Sets the bit for ch; when fnocase is true the bits of both the lower-case
// and the upper-case form of ch are set instead.
void set_bit( char ch, bool const fnocase )
{
    if( !fnocase )
    {
        m_ascii_bitvector.set( static_cast<unsigned char>( ch ) );
        return;
    }

    unsigned char const lower = static_cast<unsigned char>( regex_tolower( ch ) );
    m_ascii_bitvector.set( lower );

    unsigned char const upper = static_cast<unsigned char>( regex_toupper( ch ) );
    m_ascii_bitvector.set( upper );
}
// Wide-character overload (selected by the type of the first argument).
// Characters above UCHAR_MAX are recorded as a one-character range; all
// others are forwarded to the narrow-character overload.
void set_bit( wchar_t ch, bool const fnocase )
{
    if( ch > UCHAR_MAX )
    {
        add_range( range_type( ch, ch ) );
        return;
    }
    set_bit( static_cast<char>( ch ), fnocase );
}
// Narrow-character overload (selected by the type of the first argument).
// Sets the bits of every character from ch1 through ch2, both compared as
// unsigned char. With fnocase the upper-case and lower-case forms of each
// character are set instead of the character itself.
// Throws bad_regexpr when ch1 is greater than ch2.
void set_bit_range( char ch1, char ch2, bool const fnocase )
{
    unsigned int const first = static_cast<unsigned char>( ch1 );
    unsigned int const last  = static_cast<unsigned char>( ch2 );

    if( first > last )
        throw bad_regexpr( "invalid range specified in character set" );

    // The counter is an unsigned int so that incrementing past a
    // last value of UCHAR_MAX does not wrap around.
    for( unsigned int code = first; code <= last; ++code )
    {
        if( fnocase )
        {
            char const as_char = static_cast<char>( code );
            m_ascii_bitvector.set( static_cast<unsigned char>( regex_toupper( as_char ) ) );
            m_ascii_bitvector.set( static_cast<unsigned char>( regex_tolower( as_char ) ) );
        }
        else
        {
            m_ascii_bitvector.set( static_cast<unsigned char>( code ) );
        }
    }
}
// Wide-character overload (selected by the type of the first argument).
// The part of [ch1, ch2] that lies within 0..UCHAR_MAX is delegated to the
// narrow-character overload (together with fnocase); the part above
// UCHAR_MAX is stored as a range.
// Throws bad_regexpr when ch1 is greater than ch2.
void set_bit_range( wchar_t ch1, wchar_t ch2, bool const fnocase )
{
    if( ch1 > ch2 )
        throw bad_regexpr( "invalid range specified in character set" );

    if( UCHAR_MAX >= ch1 )
    {
        // clip the high bound to the last bit-vector character
        wchar_t const clipped_high = (std::min)( static_cast<wchar_t>( UCHAR_MAX ), ch2 );
        set_bit_range( static_cast<char>( ch1 ), static_cast<char>( clipped_high ), fnocase );
    }

    if( UCHAR_MAX < ch2 )
    {
        // clip the low bound to the first character above the bit vector
        wchar_t const clipped_low = (std::max)( static_cast<wchar_t>( UCHAR_MAX+1 ), ch1 );
        add_range( range_type( clipped_low, ch2 ) );
    }
}
// Prepare the character set for matching (wide-character flavour):
//  1. sort the range vector and merge ranges that overlap or touch,
//  2. fold the posix "on" classes into the ASCII bit vector for the
//     characters 0..UCHAR_MAX,
//  3. cache whether the posix "off" list and the nested charset list are
//     both empty in m_fskip_extended_check.
void optimize( type2type<wchar_t> )
{
    // this sorts on range_type.first ( uses operator<() for pair templates )
    std::sort( m_range_vector.begin(), m_range_vector.end() );

    // This merges ranges that overlap or are directly adjacent
    for( size_t index = 1; index < m_range_vector.size(); )
    {
        range_type &       prev = m_range_vector[ index-1 ];
        range_type const & cur  = m_range_vector[ index ];

        // Equivalent to "cur.first <= prev.second + 1", but nothing is added
        // to prev.second: that sum overflows when prev.second is WCHAR_MAX
        // (a bound operator|= can insert). cur.first - 1 is only evaluated
        // when cur.first > prev.second, so it cannot go below the minimum.
        if( cur.first <= prev.second || cur.first - 1 == prev.second )
        {
            prev.second = (std::max)( prev.second, cur.second );
            // prev stays valid: only elements at and after index move
            m_range_vector.erase( m_range_vector.begin() + index );
        }
        else
            ++index;
    }

    // For the ASCII range, merge the m_posixcharson info
    // into the ascii_bitvector
    if( m_posixcharson )
    {
        // BUGBUG this is kind of expensive. Think of a better way.
        for( unsigned int i=0; i<=UCHAR_MAX; ++i )
            if( _isctype( i, m_posixcharson ) )
                m_ascii_bitvector.set( static_cast<unsigned char>( i ) );
    }

    // m_fskip_extended_check is a cache which tells us whether we
    // need to check the m_posixcharsoff and m_nestedcharsets vectors,
    // which would only be used in nested user-defined character sets
    m_fskip_extended_check = m_posixcharsoff.empty() && m_nestedcharsets.empty();
}
// Narrow-character flavour of optimize(): performs the wide-character
// optimization and afterwards discards the posix "on" mask.
void optimize( type2type<char> )
{
    optimize( type2type<wchar_t>() );

    // The call above merged the posixcharson info into the ASCII bit
    // vector, so _isctype never needs to be called again.
    m_posixcharson = 0;
}
#define DECLARE_EXTENDED_CHECK(FUN,CHAR,CTYPE,PMF)\
bool FUN( CHAR ch ) const\
{\
if( m_fskip_extended_check )\
{\
assert( m_posixcharsoff.empty() && m_nestedcharsets.empty() );\
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -