📄 regexpr2.cpp

📁 代理服务器原代码
💻 CPP
📖 第 1 页 / 共 5 页
字号:
};

// Equality for regex_allocators of possibly different value types: the result
// is true exactly when the addresses of the two operands' m_arena compare equal.
// (The operands are now named for the side they actually appear on.)
template< typename T, typename U >
inline bool operator==( regex_allocator<T> const & lhs, regex_allocator<U> const & rhs )
{
    return &lhs.m_arena == &rhs.m_arena;
}

// Inequality for regex_allocators of possibly different value types: the result
// is true exactly when the addresses of the two operands' m_arena differ.
// (The operands are now named for the side they actually appear on.)
template< typename T, typename U >
inline bool operator!=( regex_allocator<T> const & lhs, regex_allocator<U> const & rhs )
{
    return &lhs.m_arena != &rhs.m_arena;
}

// Use the regex allocator by default because it makes pattern compilation
// and clean-up go much faster. If you define REGEX_NO_ALLOCATOR, though,
// you can save some code bloat.

// BUGBUG This is actually implementation-dependent. regex_allocator is
// not a 100% compliant STL allocator. It does not have a default c'tor, and
// all regex_allocator<T> instances do not necessarily compare equal for
// any type T. To be truly portable, I would need to write my own containers
// that do not make any of these assumptions.  Sigh.  Alternatively, I could
// just give up passing regex_allocators to STL containers by compiling with
// REGEX_NO_ALLOCATOR defined, but this makes compiling patterns slow.  :-(

// REGEX_ALLOCATOR names the allocator template used by the library:
// std::allocator when REGEX_NO_ALLOCATOR is defined, regex_allocator otherwise.
#ifdef REGEX_NO_ALLOCATOR
# define REGEX_ALLOCATOR std::allocator
#else
# define REGEX_ALLOCATOR regex_allocator
#endif

// MAKE_ALLOCATOR(type,arena) yields an allocator for `type`. Two
// implementations are provided, selected by compiler version.
//
// The condition uses logical && (the original bitwise & only produced the
// intended result because < binds tighter than &).
#if defined(_MSC_VER) && _MSC_VER < 1300

// Compilers reporting _MSC_VER below 1300 get a plain macro form
// (presumably because they cannot handle the member-template factory
// used in the other branch -- confirm).
# ifndef REGEX_NO_ALLOCATOR
#  define MAKE_ALLOCATOR(type,arena) regex_allocator<type>(arena)
# else
#  define MAKE_ALLOCATOR(type,arena) std::allocator<type>()
# endif

#else
// Define an allocator factory that can create
// an allocator from a regex_arena.
//
// Primary template: for an arbitrary allocator AL the arena argument is
// ignored and a default-constructed AL rebound to T is returned.
template< typename AL >
struct allocator_factory
{
    template< typename T >
    static typename AL::template rebind<T>::other create( regex_arena & )
    {
        typedef typename AL::template rebind<T>::other other;
        return other();
    }
};
// Specialization for regex_allocator: the returned allocator is
// constructed from the supplied arena.
template<>
struct allocator_factory< regex_allocator<char> >
{
    template< typename T >
    static regex_allocator<T> create( regex_arena & arena )
    {
        return regex_allocator<T>( arena );
    }
};
// REGEX_ALLOCATOR<char> only selects which factory is used; the allocator
// that is actually created is for `type`.
#define MAKE_ALLOCATOR(type,arena) allocator_factory<REGEX_ALLOCATOR<char> >::template create<type>( arena )
#endif

// This class is used to speed up character set matching by providing
// a bitset that spans the ASCII range. std::bitset is not used because
// the range-checking slows it down.
// Note: The division and modulus operations are optimized by the compiler
// into bit-shift operations.
class ascii_bitvector
{
    typedef unsigned int elem_type;

    enum { CBELEM = CHAR_BIT * sizeof( elem_type ), // bits stored in one array element
           CELEMS = ( UCHAR_MAX+1 ) / CBELEM };       // elements needed to cover 0..UCHAR_MAX
    elem_type m_rg[ CELEMS ];

    // Proxy produced by operator~. It only remembers which bit vector is to be
    // read inverted, so that "bv1 |= ~bv2" and "bv1 = ~bv2" can invert
    // element by element without building a temporary ascii_bitvector.
    struct not_ascii_bitvector
    {
        ascii_bitvector const & m_ref;
        not_ascii_bitvector( ascii_bitvector const & ref )
            : m_ref( ref ) {}
    private:
        // declared but not defined: the proxy is not assignable
        not_ascii_bitvector & operator=( not_ascii_bitvector const & );
    };

    // Position in m_rg of the element that holds the bit for ch.
    static unsigned int slot_of( unsigned char ch )
        { return ch / CBELEM; }

    // Single-bit mask selecting ch's bit inside its element.
    static elem_type mask_of( unsigned char ch )
        { return ( elem_type )1U << ( ch % CBELEM ); }

public:
    // A new bit vector has every bit cleared.
    ascii_bitvector()
        { zero(); }

    // Clears every bit.
    void zero()
        { memset( m_rg, 0, sizeof( m_rg ) ); }

    // Turns on the bit for ch.
    void set( unsigned char ch )
        { m_rg[ slot_of( ch ) ] |= mask_of( ch ); }

    // Reports whether the bit for ch is on.
    bool operator[]( unsigned char ch ) const
        { return ( m_rg[ slot_of( ch ) ] & mask_of( ch ) ) != 0; }

    // Returns a proxy standing for the bitwise inverse of this vector.
    not_ascii_bitvector const operator~() const
        { return not_ascii_bitvector( *this ); }

    // this = this OR that
    ascii_bitvector & operator|=( ascii_bitvector const & that )
    {
        elem_type const * src = that.m_rg;
        for( elem_type * dst = m_rg; dst != m_rg + CELEMS; ++dst, ++src )
            *dst |= *src;
        return *this;
    }

    // this = this OR (NOT the vector referenced by the proxy)
    ascii_bitvector & operator|=( not_ascii_bitvector const & that )
    {
        elem_type const * src = that.m_ref.m_rg;
        for( elem_type * dst = m_rg; dst != m_rg + CELEMS; ++dst, ++src )
            *dst |= ~*src;
        return *this;
    }

    // this = that
    ascii_bitvector & operator=( ascii_bitvector const & that )
    {
        elem_type const * src = that.m_rg;
        for( elem_type * dst = m_rg; dst != m_rg + CELEMS; ++dst, ++src )
            *dst = *src;
        return *this;
    }

    // this = NOT the vector referenced by the proxy
    ascii_bitvector & operator=( not_ascii_bitvector const & that )
    {
        elem_type const * src = that.m_ref.m_rg;
        for( elem_type * dst = m_rg; dst != m_rg + CELEMS; ++dst, ++src )
            *dst = ~*src;
        return *this;
    }
};

// An inclusive range of wide characters, stored as (first, second).
typedef std::pair<wchar_t, wchar_t> range_type;

// Determines if one range is less than another: rg1 is "less" when it ends
// strictly before rg2 begins. Used in binary search of the range vector.
//
// std::binary_function (deprecated in C++11, removed in C++17) is no longer
// used as a base; the typedefs it supplied are declared here directly so
// that function adaptors relying on them keep working.
struct range_less
{
    typedef range_type first_argument_type;
    typedef range_type second_argument_type;
    typedef bool       result_type;

    inline bool operator()( range_type const & rg1, range_type const & rg2 ) const
    {
        return rg1.second < rg2.first;
    }
};

struct charset;

template< typename A1, typename A2, typename A3 >
struct charset_t
{
    bool                            m_fcompliment;
    bool                            m_fskip_extended_check;
    ascii_bitvector                 m_ascii_bitvector;
    wctype_t                        m_posixcharson;
    std::vector<range_type, A1>     m_range_vector;
    std::list<wctype_t, A2>         m_posixcharsoff;
    std::list<charset const *, A3>  m_nestedcharsets;
    
    // Creates an empty charset: both flags false, no posix "on" bits, and
    // three empty containers. Each allocator argument is passed to the
    // container it belongs to (a1 -> m_range_vector, a2 -> m_posixcharsoff,
    // a3 -> m_nestedcharsets).
    charset_t( A1 const & a1 = A1(), A2 const & a2 = A2(), A3 const & a3 = A3() )
        : m_fcompliment( false )
        , m_fskip_extended_check( false )
        , m_posixcharson( 0 )
        , m_range_vector( a1 )
        , m_posixcharsoff( a2 )
        , m_nestedcharsets( a3 )
    {
        // m_ascii_bitvector is default-constructed, which clears all of its bits.
    }

    // We'll be inheriting from this, so a virtual d'tor is regrettably necessary.
    virtual ~charset_t() {} // nothing to do explicitly; the members destroy themselves

    // Returns the charset to the state the constructor leaves it in:
    // containers emptied, bit vector zeroed, flags and posix bits reset.
    void clear()
    {
        // containers
        m_nestedcharsets.clear();
        m_posixcharsoff.clear();
        m_range_vector.clear();

        // bit set and scalar state
        m_ascii_bitvector.zero();
        m_posixcharson         = 0;
        m_fskip_extended_check = false;
        m_fcompliment          = false;
    }

    // Appends rg to m_range_vector. Whenever the vector is exactly full,
    // room for five more entries is reserved first, so capacity grows in
    // steps of five rather than however push_back would grow it.
    void add_range( range_type rg )
    {
        typedef typename std::vector<range_type, A1>::size_type size_type;

        size_type const cused = m_range_vector.size();
        if( cused == m_range_vector.capacity() )
            m_range_vector.reserve( cused + 5 );

        m_range_vector.push_back( rg );
    }

    // merge one charset into another
    // Adds everything matched by `that` to this charset and returns *this.
    // Three cases:
    //   1. that is not complemented: its bit vector, ranges, posix "on" bits,
    //      posix "off" list and nested-charset list are all appended to ours.
    //   2. that is complemented and uses no posix classes or nested charsets:
    //      the inverse of its bit vector and the inverse of its ranges are
    //      added to ours.
    //   3. that is complemented and does use them: only its address is
    //      stored in m_nestedcharsets (so `that` presumably has to outlive
    //      this charset -- confirm with callers).
    charset_t & operator|=( charset const & that )
    {
        if( ! that.m_fcompliment )
        {
            // Case 1: plain union, component by component.
            m_ascii_bitvector |= that.m_ascii_bitvector;
            m_range_vector.insert( m_range_vector.end(),
                                   that.m_range_vector.begin(),
                                   that.m_range_vector.end() );

            m_posixcharson |= that.m_posixcharson;
            std::copy( that.m_posixcharsoff.begin(),
                       that.m_posixcharsoff.end(),
                       std::back_inserter( m_posixcharsoff ) );

            std::copy( that.m_nestedcharsets.begin(),
                       that.m_nestedcharsets.end(),
                       std::back_inserter( m_nestedcharsets ) );
            return *this;
        }

        bool const fextended = 0 != that.m_posixcharson      ||
                               ! that.m_posixcharsoff.empty() ||
                               ! that.m_nestedcharsets.empty();
        if( fextended )
        {
            // Case 3: there is no simple way to fold the complemented set
            // in, so remember a pointer to it instead.
            m_nestedcharsets.push_back( & that );
            return *this;
        }

        // Case 2: merge the inverse directly.
        m_ascii_bitvector |= ~ that.m_ascii_bitvector;

        // Append the gaps between that's ranges. chprev is the last character
        // already excluded; it starts at UCHAR_MAX because everything up to
        // there is covered by the bit vector. NOTE(review): this walk appears
        // to assume that.m_range_vector is sorted and non-overlapping -- confirm.
        typedef std::vector<range_type>::const_iterator VCI;
        wchar_t chprev = UCHAR_MAX;
        VCI prg = that.m_range_vector.begin();
        while( prg != that.m_range_vector.end() )
        {
            // a range starting right at UCHAR_MAX+1 leaves no gap in front of it
            if( UCHAR_MAX + 1 != prg->first )
                add_range( range_type( wchar_t( chprev+1 ), wchar_t( prg->first-1 ) ) );
            chprev = prg->second;
            ++prg;
        }

        // whatever lies above the last range, up to WCHAR_MAX
        if( WCHAR_MAX != chprev )
            add_range( range_type( wchar_t( chprev+1 ), WCHAR_MAX ) );

        return *this;
    }

    // Note overloading based on second parameter
    // Narrow-character form: turns on the bit for ch in the ASCII bit
    // vector. When fnocase is true, the bits for regex_tolower(ch) and
    // regex_toupper(ch) are turned on instead.
    void set_bit( char ch, bool const fnocase )
    {
        if( ! fnocase )
        {
            m_ascii_bitvector.set( static_cast<unsigned char>( ch ) );
            return;
        }

        char const chlower = regex_tolower( ch );
        m_ascii_bitvector.set( static_cast<unsigned char>( chlower ) );

        char const chupper = regex_toupper( ch );
        m_ascii_bitvector.set( static_cast<unsigned char>( chupper ) );
    }

    // Note overloading based on second parameter
    // Wide-character form: a character above UCHAR_MAX is recorded as a
    // one-character range (fnocase is not consulted for it); anything else
    // is forwarded to the narrow-character overload.
    void set_bit( wchar_t ch, bool const fnocase )
    {
        if( ch > UCHAR_MAX )
        {
            add_range( range_type( ch, ch ) );
        }
        else
        {
            set_bit( static_cast<char>( ch ), fnocase );
        }
    }

    // Note overloading based on second parameter
    // Narrow-character form: turns on the bits for every character from ch1
    // through ch2 inclusive, both compared as unsigned char. With fnocase,
    // the upper- and lower-case forms of each character are set instead.
    // Throws bad_regexpr when ch1 is greater than ch2.
    void set_bit_range( char ch1, char ch2, bool const fnocase )
    {
        unsigned int const first = static_cast<unsigned char>( ch1 );
        unsigned int const last  = static_cast<unsigned char>( ch2 );

        if( first > last )
            throw bad_regexpr( "invalid range specified in character set" );

        // The counter is an unsigned int so that incrementing past a last
        // value of UCHAR_MAX ends the loop instead of wrapping around.
        for( unsigned int code = first; code <= last; ++code )
        {
            if( fnocase )
            {
                char const ch = static_cast<char>( code );
                m_ascii_bitvector.set( static_cast<unsigned char>( regex_toupper( ch ) ) );
                m_ascii_bitvector.set( static_cast<unsigned char>( regex_tolower( ch ) ) );
            }
            else
            {
                m_ascii_bitvector.set( static_cast<unsigned char>( code ) );
            }
        }
    }

    // Note overloading based on second parameter
    // Wide-character form: splits [ch1, ch2] at UCHAR_MAX. The part at or
    // below UCHAR_MAX goes to the narrow-character overload (bit vector);
    // the part above it is stored as a range. Throws bad_regexpr when ch1
    // is greater than ch2.
    void set_bit_range( wchar_t ch1, wchar_t ch2, bool const fnocase )
    {
        if( ch2 < ch1 )
            throw bad_regexpr( "invalid range specified in character set" );

        if( ch1 <= UCHAR_MAX )
        {
            // clip the upper end to what the bit vector can hold
            wchar_t const chlast = (std::min)( static_cast<wchar_t>( UCHAR_MAX ), ch2 );
            set_bit_range( static_cast<char>( ch1 ), static_cast<char>( chlast ), fnocase );
        }

        if( ch2 > UCHAR_MAX )
        {
            // clip the lower end to the first character beyond the bit vector
            wchar_t const chfirst = (std::max)( static_cast<wchar_t>( UCHAR_MAX+1 ), ch1 );
            add_range( range_type( chfirst, ch2 ) );
        }
    }

    // Normalizes the charset after it has been built:
    //   - sorts m_range_vector and merges ranges that overlap or touch,
    //   - folds the m_posixcharson classes into the ASCII bit vector for
    //     the characters 0..UCHAR_MAX,
    //   - caches in m_fskip_extended_check whether m_posixcharsoff and
    //     m_nestedcharsets are both empty.
    void optimize( type2type<wchar_t> )
    {
        // this sorts on range_type.first ( uses operator<() for pair templates )
        std::sort( m_range_vector.begin(), m_range_vector.end() );

        // Merge ranges that overlap or are adjacent, in a single pass.
        // `last` is the index of the most recently completed merged range;
        // surviving ranges are compacted toward the front and the unused tail
        // is erased once at the end (erasing inside the loop was quadratic).
        if( ! m_range_vector.empty() )
        {
            size_t last = 0;
            for( size_t index = 1; index < m_range_vector.size(); ++index )
            {
                range_type const cur  = m_range_vector[ index ];
                range_type &     prev = m_range_vector[ last ];

                // A range already reaching WCHAR_MAX absorbs everything after
                // it. Testing that first keeps "prev.second + 1" from
                // overflowing where wchar_t is a signed int-width type.
                if( WCHAR_MAX == prev.second || cur.first <= prev.second + 1 )
                    prev.second = (std::max)( prev.second, cur.second );
                else
                    m_range_vector[ ++last ] = cur;
            }
            m_range_vector.erase( m_range_vector.begin() + ( last + 1 ),
                                  m_range_vector.end() );
        }

        // For the ASCII range, merge the m_posixcharson info
        // into the ascii_bitvector
        if( m_posixcharson )
        {
            // BUGBUG this is kind of expensive. Think of a better way.
            for( unsigned int i=0; i<=UCHAR_MAX; ++i )
                if( _isctype( i, m_posixcharson ) )
                    m_ascii_bitvector.set( static_cast<unsigned char>( i ) );
        }

        // m_fskip_extended_check is a cache which tells us whether we
        // need to check the m_posixcharsoff and m_nestedcharsets vectors,
        // which would only be used in nested user-defined character sets
        m_fskip_extended_check = m_posixcharsoff.empty() && m_nestedcharsets.empty();
    }

    // Narrow-character variant: performs the wide-character optimization
    // and then clears m_posixcharson.
    void optimize( type2type<char> )
    {
        this->optimize( type2type<wchar_t>() );

        // The wide-character pass has already merged the posixcharson classes
        // into the ASCII bit vector, so _isctype never needs to be called
        // again for this charset.
        m_posixcharson = 0;
    }

#define DECLARE_EXTENDED_CHECK(FUN,CHAR,CTYPE,PMF)\
    bool FUN( CHAR ch ) const\
    {\
        if( m_fskip_extended_check )\
        {\
            assert( m_posixcharsoff.empty() && m_nestedcharsets.empty() );\
💿 文件大小 180 K
👤 上传用户 weizik
📂 所属分类源码/资料
🏷️ 相关标签

#代理服务器 #代码
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -