📄 qregexp.cpp
字号:
case '+': case '.': case '\\': case '^': case '{': case '|': case '}': rx += QChar( '\\' ); rx += c; break; case '[': rx += c; if ( wc[i] == QChar('^') ) rx += wc[i++]; if ( i < wclen ) { if ( rx[i] == ']' ) rx += wc[i++]; while ( i < wclen && wc[i] != QChar(']') ) { if ( wc[i] == '\\' ) rx += QChar( '\\' ); rx += wc[i++]; } } break; default: rx += c; } } return rx;}#endif/* The class QRegExpEngine encapsulates a modified nondeterministic finite automaton (NFA).*/class QRegExpEngine : public QShared{public:#ifndef QT_NO_REGEXP_CCLASS /* The class CharClass represents a set of characters, such as can be found in regular expressions (e.g., [a-z] denotes the set {a, b, ..., z}). */ class CharClass { public: CharClass(); CharClass( const CharClass& cc ) { operator=( cc ); } CharClass& operator=( const CharClass& cc ); void clear(); bool negative() const { return n; } void setNegative( bool negative ); void addCategories( int cats ); void addRange( ushort from, ushort to ); void addSingleton( ushort ch ) { addRange( ch, ch ); } bool in( QChar ch ) const;#ifndef QT_NO_REGEXP_OPTIM const QMemArray<int>& firstOccurrence() const { return occ1; }#endif#if defined(QT_DEBUG) void dump() const;#endif private: /* The struct Range represents a range of characters (e.g., [0-9] denotes range 48 to 57). */ struct Range { ushort from; // 48 ushort to; // 57 }; int c; // character classes QMemArray<Range> r; // character ranges bool n; // negative?#ifndef QT_NO_REGEXP_OPTIM QMemArray<int> occ1; // first-occurrence array#endif };#else struct CharClass { int dummy;#ifndef QT_NO_REGEXP_OPTIM CharClass() { occ1.fill( 0, NumBadChars ); } const QMemArray<int>& firstOccurrence() const { return occ1; } QMemArray<int> occ1;#endif };#endif QRegExpEngine( bool caseSensitive ) { setup( caseSensitive ); } QRegExpEngine( const QString& rx, bool caseSensitive );#ifndef QT_NO_REGEXP_OPTIM ~QRegExpEngine();#endif bool isValid() const { return valid; } bool caseSensitive() const { return cs; } const QString& errorString() const { return yyError; } int numCaptures() const { return officialncap; } void match( const QString& str, int pos, bool minimal, bool oneTest, int caretIndex, QMemArray<int>& captured ); int partialMatchLength() const { return mmOneTestMatchedLen; } int createState( QChar ch ); int createState( const CharClass& cc );#ifndef QT_NO_REGEXP_BACKREF int createState( int bref );#endif void addCatTransitions( const QMemArray<int>& from, const QMemArray<int>& to );#ifndef QT_NO_REGEXP_CAPTURE void addPlusTransitions( const QMemArray<int>& from, const QMemArray<int>& to, int atom );#endif#ifndef QT_NO_REGEXP_ANCHOR_ALT int anchorAlternation( int a, int b ); int anchorConcatenation( int a, int b );#else int anchorAlternation( int a, int b ) { return a & b; } int anchorConcatenation( int a, int b ) { return a | b; }#endif void addAnchors( int from, int to, int a );#ifndef QT_NO_REGEXP_OPTIM void heuristicallyChooseHeuristic();#endif#if defined(QT_DEBUG) void dump() const;#endifprivate: enum { CharClassBit = 0x10000, BackRefBit = 0x20000 }; /* The struct State represents one state in a modified NFA. The input characters matched are stored in the state instead of on the transitions, something possible for an automaton constructed from a regular expression. */ struct State {#ifndef QT_NO_REGEXP_CAPTURE int atom; // which atom does this state belong to?#endif int match; // what does it match? (see CharClassBit and BackRefBit) QMemArray<int> outs; // out-transitions QMap<int, int> *reenter; // atoms reentered when transiting out QMap<int, int> *anchors; // anchors met when transiting out#ifndef QT_NO_REGEXP_CAPTURE State( int a, int m ) : atom( a ), match( m ), reenter( 0 ), anchors( 0 ) { }#else State( int m ) : match( m ), reenter( 0 ), anchors( 0 ) { }#endif ~State() { delete reenter; delete anchors; } };#ifndef QT_NO_REGEXP_LOOKAHEAD /* The struct Lookahead represents a lookahead a la Perl (e.g., (?=foo) and (?!bar)). */ struct Lookahead { QRegExpEngine *eng; // NFA representing the embedded regular expression bool neg; // negative lookahead? Lookahead( QRegExpEngine *eng0, bool neg0 ) : eng( eng0 ), neg( neg0 ) { } ~Lookahead() { delete eng; } };#endif#ifndef QT_NO_REGEXP_CAPTURE /* The struct Atom represents one node in the hierarchy of regular expression atoms. */ struct Atom { int parent; // index of parent in array of atoms int capture; // index of capture, from 1 to ncap };#endif#ifndef QT_NO_REGEXP_ANCHOR_ALT /* The struct AnchorAlternation represents a pair of anchors with OR semantics. */ struct AnchorAlternation { int a; // this anchor... int b; // ...or this one };#endif enum { InitialState = 0, FinalState = 1 }; void setup( bool caseSensitive ); int setupState( int match ); /* Let's hope that 13 lookaheads and 14 back-references are enough. */ enum { MaxLookaheads = 13, MaxBackRefs = 14 }; enum { Anchor_Dollar = 0x00000001, Anchor_Caret = 0x00000002, Anchor_Word = 0x00000004, Anchor_NonWord = 0x00000008, Anchor_FirstLookahead = 0x00000010, Anchor_BackRef1Empty = Anchor_FirstLookahead << MaxLookaheads, Anchor_BackRef0Empty = Anchor_BackRef1Empty >> 1, Anchor_Alternation = Anchor_BackRef1Empty << MaxBackRefs, Anchor_LookaheadMask = ( Anchor_FirstLookahead - 1 ) ^ ( (Anchor_FirstLookahead << MaxLookaheads) - 1 ) };#ifndef QT_NO_REGEXP_CAPTURE int startAtom( bool capture ); void finishAtom( int atom ) { cf = f[atom].parent; }#endif#ifndef QT_NO_REGEXP_LOOKAHEAD int addLookahead( QRegExpEngine *eng, bool negative );#endif#ifndef QT_NO_REGEXP_CAPTURE bool isBetterCapture( const int *begin1, const int *end1, const int *begin2, const int *end2 );#endif bool testAnchor( int i, int a, const int *capBegin );#ifndef QT_NO_REGEXP_OPTIM bool goodStringMatch(); bool badCharMatch();#else bool bruteMatch();#endif bool matchHere(); QPtrVector<State> s; // array of states int ns; // number of states#ifndef QT_NO_REGEXP_CAPTURE QMemArray<Atom> f; // atom hierarchy int nf; // number of atoms int cf; // current atom#endif int officialncap; // number of captures, seen from the outside int ncap; // number of captures, seen from the inside#ifndef QT_NO_REGEXP_CCLASS QPtrVector<CharClass> cl; // array of character classes#endif#ifndef QT_NO_REGEXP_LOOKAHEAD QPtrVector<Lookahead> ahead; // array of lookaheads#endif#ifndef QT_NO_REGEXP_ANCHOR_ALT QMemArray<AnchorAlternation> aa; // array of (a, b) pairs of anchors#endif#ifndef QT_NO_REGEXP_OPTIM bool caretAnchored; // does the regexp start with ^? bool trivial; // is the good-string all that needs to match?#endif bool valid; // is the regular expression valid? bool cs; // case sensitive?#ifndef QT_NO_REGEXP_BACKREF int nbrefs; // number of back-references#endif#ifndef QT_NO_REGEXP_OPTIM bool useGoodStringHeuristic; // use goodStringMatch? otherwise badCharMatch int goodEarlyStart; // the index where goodStr can first occur in a match int goodLateStart; // the index where goodStr can last occur in a match QString goodStr; // the string that any match has to contain int minl; // the minimum length of a match QMemArray<int> occ1; // first-occurrence array#endif /* The class Box is an abstraction for a regular expression fragment. It can also be seen as one node in the syntax tree of a regular expression with synthetized attributes. Its interface is ugly for performance reasons. */ class Box { public: Box( QRegExpEngine *engine ); Box( const Box& b ) { operator=( b ); } Box& operator=( const Box& b ); void clear() { operator=( Box(eng) ); } void set( QChar ch ); void set( const CharClass& cc );#ifndef QT_NO_REGEXP_BACKREF void set( int bref );#endif void cat( const Box& b ); void orx( const Box& b ); void plus( int atom ); void opt(); void catAnchor( int a );#ifndef QT_NO_REGEXP_OPTIM void setupHeuristics();#endif#if defined(QT_DEBUG) void dump() const;#endif private: void addAnchorsToEngine( const Box& to ) const; QRegExpEngine *eng; // the automaton under construction QMemArray<int> ls; // the left states (firstpos) QMemArray<int> rs; // the right states (lastpos) QMap<int, int> lanchors; // the left anchors QMap<int, int> ranchors; // the right anchors int skipanchors; // the anchors to match if the box is skipped#ifndef QT_NO_REGEXP_OPTIM int earlyStart; // the index where str can first occur int lateStart; // the index where str can last occur QString str; // a string that has to occur in any match QString leftStr; // a string occurring at the left of this box QString rightStr; // a string occurring at the right of this box int maxl; // the maximum length of this box (possibly InftyLen)#endif int minl; // the minimum length of this box#ifndef QT_NO_REGEXP_OPTIM QMemArray<int> occ1; // first-occurrence array#endif }; friend class Box; /* This is the lexical analyzer for regular expressions. */ enum { Tok_Eos, Tok_Dollar, Tok_LeftParen, Tok_MagicLeftParen, Tok_PosLookahead, Tok_NegLookahead, Tok_RightParen, Tok_CharClass, Tok_Caret, Tok_Quantifier, Tok_Bar, Tok_Word, Tok_NonWord, Tok_Char = 0x10000, Tok_BackRef = 0x20000 }; int getChar(); int getEscape();#ifndef QT_NO_REGEXP_INTERVAL int getRep( int def );#endif#ifndef QT_NO_REGEXP_LOOKAHEAD void skipChars( int n );#endif void error( const char *msg ); void startTokenizer( const QChar *rx, int len ); int getToken(); const QChar *yyIn; // a pointer to the input regular expression pattern int yyPos0; // the position of yyTok in the input pattern int yyPos; // the position of the next character to read int yyLen; // the length of yyIn int yyCh; // the last character read CharClass *yyCharClass; // attribute for Tok_CharClass tokens int yyMinRep; // attribute for Tok_Quantifier int yyMaxRep; // ditto QString yyError; // syntax error or overflow during parsing? /* This is the syntactic analyzer for regular expressions. */ int parse( const QChar *rx, int len ); void parseAtom( Box *box ); void parseFactor( Box *box ); void parseTerm( Box *box ); void parseExpression( Box *box ); int yyTok; // the last token read bool yyMayCapture; // set this to FALSE to disable capturing /* This is the engine state during matching. */ const QString *mmStr; // a pointer to the input QString const QChar *mmIn; // a pointer to the input string data int mmPos; // the current position in the string int mmCaretPos; int mmLen; // the length of the input string bool mmMinimal; // minimal matching? QMemArray<int> mmBigArray; // big QMemArray<int> array int *mmInNextStack; // is state is mmNextStack? int *mmCurStack; // stack of current states
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -