📄 regexpr2.cpp
字号:
// Merge another charset into this one (set union) and return *this.
//  - `that` not complemented: its ASCII bitvector, ranges, posix "on" mask,
//    posix "off" list and nested-charset list are all added to this set.
//  - `that` complemented and free of posix/nested data: the inverse of its
//    bitvector and of its ranges is added directly.
//  - `that` complemented otherwise: only a pointer to `that` is stored.
//    NOTE(review): `that` presumably has to outlive this object -- confirm.
basic_charset & operator|=( other_type const & that )
{
    if( ! that.m_fcompliment )
    {
        // Straight union of every component.
        m_ascii_bitvector |= that.m_ascii_bitvector;
        std::copy( that.m_ranges.begin(), that.m_ranges.end(),
                   std::front_inserter( m_ranges ) );
        m_posixcharson |= that.m_posixcharson;
        std::copy( that.m_posixcharsoff.begin(), that.m_posixcharsoff.end(),
                   std::front_inserter( m_posixcharsoff ) );
        std::copy( that.m_nestedcharsets.begin(), that.m_nestedcharsets.end(),
                   std::front_inserter( m_nestedcharsets ) );
        return *this;
    }

    // A complemented set can only be folded in directly when it uses no
    // posix-style classes and holds no nested sets of its own.
    bool const fmergeable = wct_zero == that.m_posixcharson &&
                            that.m_posixcharsoff.empty() &&
                            that.m_nestedcharsets.empty();
    if( ! fmergeable )
    {
        // No simple way to merge: remember the nested set by address.
        m_nestedcharsets.push_front( &that );
        return *this;
    }

    m_ascii_bitvector |= ~ that.m_ascii_bitvector;

    // Add the gaps between that.m_ranges, i.e. the inverse of its ranges over
    // ( UCHAR_MAX, WCHAR_MAX ].
    // NOTE(review): this appears to assume that.m_ranges is sorted and
    // non-overlapping (the state optimize() leaves it in) -- confirm.
    typedef typename other_ranges_type::const_iterator iter_type;
    wchar_t chlast = UCHAR_MAX; // last character already accounted for
    for( iter_type it = that.m_ranges.begin(); it != that.m_ranges.end(); ++it )
    {
        if( UCHAR_MAX + 1 != it->first )
        {
            m_ranges.push_front( range_type( wchar_t( chlast+1 ), wchar_t( it->first-1 ) ) );
        }
        chlast = it->second;
    }
    // Trailing gap, unless the last range already reached WCHAR_MAX.
    if( WCHAR_MAX != chlast )
    {
        m_ranges.push_front( range_type( wchar_t( chlast+1 ), WCHAR_MAX ) );
    }
    return *this;
}
// Narrow-character overload (selected by the type of the first argument).
// Sets the bit for `ch` in m_ascii_bitvector; when `fnocase` is true the bits
// for both regex_tolower( ch ) and regex_toupper( ch ) are set instead.
void set_bit( char ch, bool const fnocase )
{
    if( ! fnocase )
    {
        m_ascii_bitvector.set( static_cast<unsigned char>( ch ) );
        return;
    }

    // Case-insensitive: record both case variants of the character.
    unsigned char const uchlower = static_cast<unsigned char>( regex_tolower( ch ) );
    unsigned char const uchupper = static_cast<unsigned char>( regex_toupper( ch ) );
    m_ascii_bitvector.set( uchlower );
    m_ascii_bitvector.set( uchupper );
}
// Wide-character overload (selected by the type of the first argument).
// Characters above UCHAR_MAX are stored as a one-element range in m_ranges
// (fnocase is not consulted for them here); everything else is forwarded to
// the narrow overload.
void set_bit( wchar_t ch, bool const fnocase )
{
    if( UCHAR_MAX < ch )
    {
        m_ranges.push_front( range_type( ch, ch ) );
        return;
    }
    set_bit( static_cast<char>( ch ), fnocase );
}
// Narrow-character overload (selected by the types of the first two
// arguments). Sets the bits for every character in [ch1, ch2], compared as
// unsigned char. With `fnocase`, the upper- and lower-case form of each
// character is set instead of the character itself.
// Throws bad_regexpr if ch1 is greater than ch2.
void set_bit_range( char ch1, char ch2, bool const fnocase )
{
    // The bounds and the counter are unsigned int so that the loop still
    // terminates when ch2 is UCHAR_MAX.
    unsigned int const ifirst = static_cast<unsigned char>( ch1 );
    unsigned int const ilast = static_cast<unsigned char>( ch2 );

    if( ifirst > ilast )
        throw bad_regexpr( "invalid range specified in character set" );

    for( unsigned int icode = ifirst; icode <= ilast; ++icode )
    {
        if( fnocase )
        {
            char const chcur = static_cast<char>( icode );
            m_ascii_bitvector.set( static_cast<unsigned char>( regex_toupper( chcur ) ) );
            m_ascii_bitvector.set( static_cast<unsigned char>( regex_tolower( chcur ) ) );
        }
        else
        {
            m_ascii_bitvector.set( static_cast<unsigned char>( icode ) );
        }
    }
}
// Wide-character overload (selected by the types of the first two arguments).
// The part of [ch1, ch2] that lies at or below UCHAR_MAX goes through the
// narrow overload into the bitvector; the part above UCHAR_MAX is stored as a
// single range in m_ranges.
// Throws bad_regexpr if ch1 is greater than ch2.
void set_bit_range( wchar_t ch1, wchar_t ch2, bool const fnocase )
{
    if( ch1 > ch2 )
        throw bad_regexpr( "invalid range specified in character set" );

    bool const fhas_narrow_part = ( UCHAR_MAX >= ch1 );
    bool const fhas_wide_part = ( UCHAR_MAX < ch2 );

    if( fhas_narrow_part )
    {
        // Clamp the upper bound to the last bitvector slot.
        wchar_t const chnarrow_last = regex_min<wchar_t>( UCHAR_MAX, ch2 );
        set_bit_range( static_cast<char>( ch1 ), static_cast<char>( chnarrow_last ), fnocase );
    }

    if( fhas_wide_part )
    {
        // Clamp the lower bound to the first character past the bitvector.
        wchar_t const chwide_first = regex_max( static_cast<wchar_t>( UCHAR_MAX + 1 ), ch1 );
        m_ranges.push_front( range_type( chwide_first, ch2 ) );
    }
}
// Prepare this charset for matching wide characters:
//  1. sort m_ranges and coalesce ranges that overlap or touch,
//  2. set the bit in m_ascii_bitvector for every value in 0..UCHAR_MAX that
//     regex_isctype() reports as belonging to m_posixcharson,
//  3. cache in m_fskip_extended_check whether both m_posixcharsoff and
//     m_nestedcharsets are empty.
void optimize( type2type<wchar_t> )
{
    if( m_ranges.begin() != m_ranges.end() )
    {
        // Sorts on the first element of each range (pair operator<).
        m_ranges.sort();

        // Merge ranges that overlap or are directly adjacent.
        typename ranges_type::iterator icur=m_ranges.begin(), iprev=icur++;
        while( icur != m_ranges.end() )
        {
            // Equivalent to "icur->first <= iprev->second + 1", but written
            // without the "+ 1": iprev->second may be WCHAR_MAX (operator|=
            // stores such ranges), and incrementing that overflows where
            // wchar_t is as wide as int. The "- 1" is only evaluated when
            // icur->first is greater than iprev->second, so it cannot go
            // below the minimum value.
            if( icur->first <= iprev->second || icur->first - 1 == iprev->second )
            {
                iprev->second = regex_max( iprev->second, icur->second );
                // NOTE(review): two-argument erase is not a std::list
                // signature; presumably the container's own
                // "erase this node, given its predecessor" -- left as is.
                icur = m_ranges.erase( icur, iprev );
            }
            else
            {
                iprev=icur++;
            }
        }
    }

    // For the ASCII range, merge the m_posixcharson info into the
    // ascii bitvector.
    if( wct_zero != m_posixcharson )
    {
        // BUGBUG this is kind of expensive. Think of a better way.
        for( unsigned int i=0; i<=UCHAR_MAX; ++i )
        {
            if( regex_isctype( i, m_posixcharson ) )
                m_ascii_bitvector.set( static_cast<unsigned char>( i ) );
        }
    }

    // m_fskip_extended_check is a cache which tells us whether we need to
    // check the m_posixcharsoff and m_nestedcharsets lists at match time.
    m_fskip_extended_check = m_posixcharsoff.empty() && m_nestedcharsets.empty();
}
// Prepare this charset for matching narrow characters. Runs the wide-character
// optimization first, then clears m_posixcharson.
void optimize( type2type<char> )
{
    this->optimize( type2type<wchar_t>() );

    // The wide optimize() has already folded the posix "on" classes into the
    // ASCII bitvector, so regex_isctype never needs to be consulted again.
    m_posixcharson = wct_zero;
}
// Slow-path membership test: returns true if `ch` satisfies some entry of
// m_posixcharsoff (via posixcharsoff_pred) or of m_nestedcharsets (via
// in_charset_pred). Returns false immediately when the cached flag
// m_fskip_extended_check says both lists are empty.
template< bool CaseT, typename CharT >
bool extended_check( CharT ch REGEX_VC6(COMMA bool2type<CaseT>) ) const
{
    // The cached flag must agree with the actual state of the two lists.
    REGEX_ASSERT( m_fskip_extended_check == ( m_posixcharsoff.empty() && m_nestedcharsets.empty() ) );

    if( ! m_fskip_extended_check )
    {
        if( m_posixcharsoff.end() !=
            std::find_if( m_posixcharsoff.begin(), m_posixcharsoff.end(),
                          posixcharsoff_pred<CharT>( ch ) ) )
        {
            return true;
        }

        return m_nestedcharsets.end() !=
               std::find_if( m_nestedcharsets.begin(), m_nestedcharsets.end(),
                             in_charset_pred<CharT, CaseT>( ch ) );
    }
    return false;
}
// true_t overload: looks `ch` up in m_ranges exactly as given.
// Uses a binary search with range_less, so m_ranges must already be sorted
// (optimize() sorts it). An empty m_ranges yields false.
inline bool in_ranges( wchar_t ch, true_t ) const
{
    typedef typename ranges_type::const_iterator iter_type;
    iter_type ifirst = m_ranges.begin();
    iter_type ilast = m_ranges.end();

    if( ifirst != ilast )
    {
        return std::binary_search( ifirst, ilast, range_type( ch, ch ), range_less() );
    }
    return false;
}
// false_t overload: looks up the upper-case form of `ch` in m_ranges and, if
// that misses and the lower-case form is a different character, the
// lower-case form as well. Uses a binary search with range_less, so m_ranges
// must already be sorted (optimize() sorts it). An empty m_ranges yields false.
inline bool in_ranges( wchar_t ch, false_t ) const
{
    typedef typename ranges_type::const_iterator iter_type;
    iter_type ifirst = m_ranges.begin();
    iter_type ilast = m_ranges.end();

    if( ifirst == ilast )
        return false;

    wchar_t const chup = regex_toupper( ch );
    bool ffound = std::binary_search( ifirst, ilast, range_type( chup, chup ), range_less() );
    if( ! ffound )
    {
        wchar_t const chlo = regex_tolower( ch );
        // Skip the second search when both case forms are the same character.
        ffound = ( chup != chlo ) &&
                 std::binary_search( ifirst, ilast, range_type( chlo, chlo ), range_less() );
    }
    return ffound;
}
// Narrow-character overload (selected by the parameter type).
// A character is matched by the ASCII bitvector or, failing that, by
// extended_check(); the result is inverted when m_fcompliment is set.
template< bool CaseT >
bool in( char ch REGEX_VC6(COMMA bool2type<CaseT>) ) const
{
    // m_posixcharson must already be cleared here; optimize( type2type<char> )
    // does that. If this fires, optimize() was not called on this charset.
    REGEX_ASSERT( wct_zero == m_posixcharson );

    bool const fmatched =
        ( m_ascii_bitvector[ static_cast<unsigned char>( ch ) ] )
        || ( extended_check REGEX_NVC6(<CaseT>) ( ch REGEX_VC6(COMMA bool2type<CaseT>()) ) );

    return m_fcompliment != fmatched;
}
// Wide-character overload (selected by the parameter type).
// Characters up to UCHAR_MAX are looked up in the ASCII bitvector; larger ones
// are looked up in m_ranges (in_ranges) and, if m_posixcharson is non-zero,
// tested with regex_iswctype. If none of that matches, extended_check() is
// consulted. The result is inverted when m_fcompliment is set.
template< bool CaseT >
bool in( wchar_t ch REGEX_VC6(COMMA bool2type<CaseT>) ) const
{
    bool fmatched;
    if( UCHAR_MAX >= ch )
    {
        fmatched = m_ascii_bitvector[ static_cast<unsigned char>( ch ) ];
    }
    else
    {
        fmatched = ( in_ranges( ch, bool2type<CaseT>() ) )
                || ( wct_zero != m_posixcharson && regex_iswctype( ch, m_posixcharson ) );
    }

    if( ! fmatched )
    {
        fmatched = extended_check REGEX_NVC6(<CaseT>) ( ch REGEX_VC6(COMMA bool2type<CaseT>()) );
    }

    return m_fcompliment != fmatched;
}
private:
// Copy assignment and copy construction are declared under the private access
// specifier just above and are not defined in this part of the file --
// presumably to make the type non-copyable.
basic_charset & operator=( basic_charset const & that );
basic_charset( basic_charset const & that );
};
// A basic_charset that uses the standard allocator. Per the original note,
// these are the heap-allocated sets: either the built-in character sets or
// user-defined ones. Copy construction and copy assignment are declared
// private and left undefined.
struct charset : basic_charset<std::allocator<char> >
{
    charset() {}

private:
    charset & operator=( charset const & );
    charset( charset const & );
};
// Destroys a charset with `delete`. Defined here because charset is a complete
// type from this point on, which is what deleting one requires; the original
// note says free_charset() is used from syntax2.h.
REGEXPR_H_INLINE void free_charset( charset const * pcharset )
{
    delete pcharset;
}
// Character sets written between square brackets in a pattern. Per the
// original note they live in a regex_arena to speed up pattern compilation
// and rpattern clean-up. Copy construction and copy assignment are declared
// private and left undefined.
struct custom_charset : basic_charset<regex_arena>
{
    custom_charset( regex_arena & arena )
        : basic_charset<regex_arena>( arena )
    {
    }

    // Storage for the object itself comes from the arena.
    static void * operator new( size_t size, regex_arena & arena )
    {
        return arena.allocate( size );
    }

    // Both forms of operator delete are deliberately empty: nothing is
    // released per object.
    static void operator delete( void *, regex_arena & )
    {
    }
    static void operator delete( void * )
    {
    }

private:
    custom_charset & operator=( custom_charset const & );
    custom_charset( custom_charset const & );
};
template< typename CharT >
class intrinsic_charsets
{
struct intrinsic_charset : public charset
{
intrinsic_charset( bool fcompliment, regex_ctype_t desc, char const * sz )
{
reset( fcompliment, desc, sz );
}
void reset( bool fcompliment, regex_ctype_t desc, char const * sz )
{
clear();
m_fcompliment = fcompliment;
m_posixcharson = desc;
for( ; *sz; ++sz )
m_ascii_bitvector.set( static_cast<unsigned char>( *sz ) );
optimize( type2type<CharT>() );
}
private:
intrinsic_charset( intrinsic_charset const & );
intrinsic_charset & operator=( intrinsic_charset const & );
};
static intrinsic_charset & _get_word_charset()
{
static intrinsic_charset s_word_charset( false, wct_alpha()|wct_digit(), "_" );
return s_word_charset;
}
static intrinsic_charset & _get_digit_charset()
{
static intrinsic_charset s_digit_charset( false, wct_digit(), "" );
return s_digit_charset;
}
static intrinsic_charset & _get_space_charset()
{
static intrinsic_charset s_space_charset( false, wct_space(), "" );
return s_space_charset;
}
static intrinsic_charset & _get_not_word_charset()
{
static intrinsic_charset s_not_word_charset( true, wct_alpha()|wct_digit(), "_" );
return s_not_word_charset;
}
static intrinsic_charset & _get_not_digit_charset()
{
static intrinsic_charset s_not_digit_charset( true, wct_digit(), "" );
return s_not_digit_charset;
}
static intrinsic_charset & _get_not_space_charset()
{
static intrinsic_charset s_not_space_charset( true, wct_space(), "" );
return s_not_space_charset;
}
public:
static charset const & get_word_charset()
{
return _get_word_charset();
}
static charset const & get_digit_charset()
{
return _get_digit_charset();
}
static charset const & get_space_charset()
{
return _get_space_charset();
}
static charset const & get_not_word_charset()
{
return _get_not_word_charset();
}
static charset const & get_not_digit_charset()
{
return _get_not_digit_charset();
}
static charset const & get_not_space_charset()
{
return _get_not_space_charset();
}
static void reset()
{
_get_word_charset().reset( false, wct_alpha()|wct_digit(), "_" );
_get_digit_charset().reset( false, wct_digit(), "" );
_get_space_charset().reset( false, wct_space(), "" );
_get_not_word_charset().reset( true, wct_alpha()|wct_digit(), "_" );
_get_not_digit_charset().reset( true, wct_digit(), "" );
_get_not_space_charset().reset( true, wct_space(), "" );
}
};
//
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -