📄 qregexp.cpp
字号:
private:
    /*
      Flag bits that are OR-ed into the integer handed to setupState().
      createState(const QRegExpCharClass &) passes CharClassBit | index.
    */
    enum { CharClassBit = 0x10000, BackRefBit = 0x20000 };
    enum { InitialState = 0, FinalState = 1 };

    void setup();
    int setupState(int match);

    /*
      Let's hope that 13 lookaheads and 14 back-references are
      enough.
    */
    enum { MaxLookaheads = 13, MaxBackRefs = 14 };

    /*
      Anchor bit flags. With the constants above:
        Anchor_BackRef1Empty  == 0x00020000
        Anchor_BackRef0Empty  == 0x00010000
        Anchor_Alternation    == 0x80000000
        Anchor_LookaheadMask  == 0x0001FFF0, i.e. the MaxLookaheads bits
                                 starting at Anchor_FirstLookahead
    */
    enum { Anchor_Dollar = 0x00000001, Anchor_Caret = 0x00000002, Anchor_Word = 0x00000004,
           Anchor_NonWord = 0x00000008, Anchor_FirstLookahead = 0x00000010,
           Anchor_BackRef1Empty = Anchor_FirstLookahead << MaxLookaheads,
           Anchor_BackRef0Empty = Anchor_BackRef1Empty >> 1,
           Anchor_Alternation = Anchor_BackRef1Empty << MaxBackRefs,

           Anchor_LookaheadMask = (Anchor_FirstLookahead - 1) ^
                   ((Anchor_FirstLookahead << MaxLookaheads) - 1) };

#ifndef QT_NO_REGEXP_CAPTURE
    int startAtom(bool officialCapture);
    void finishAtom(int atom, bool needCapture);
#endif

#ifndef QT_NO_REGEXP_LOOKAHEAD
    int addLookahead(QRegExpEngine *eng, bool negative);
#endif

    /*
      Search strategies used by QRegExpMatchState::match(): with the
      optimizer compiled in, one of the two heuristic matchers is
      chosen; otherwise bruteMatch() is the only one available.
    */
#ifndef QT_NO_REGEXP_OPTIM
    bool goodStringMatch(QRegExpMatchState &matchState) const;
    bool badCharMatch(QRegExpMatchState &matchState) const;
#else
    bool bruteMatch(QRegExpMatchState &matchState) const;
#endif

    QVector<QRegExpAutomatonState> s; // array of states
#ifndef QT_NO_REGEXP_CAPTURE
    QVector<QRegExpAtom> f; // atom hierarchy
    int nf; // number of atoms
    int cf; // current atom
    QVector<int> captureForOfficialCapture; // indexed by official capture number; yields the internal capture index
#endif
    int officialncap; // number of captures, seen from the outside
    int ncap; // number of captures, seen from the inside
#ifndef QT_NO_REGEXP_CCLASS
    QVector<QRegExpCharClass> cl; // array of character classes
#endif
#ifndef QT_NO_REGEXP_LOOKAHEAD
    QVector<QRegExpLookahead *> ahead; // array of lookaheads (deleted in the destructor)
#endif
#ifndef QT_NO_REGEXP_ANCHOR_ALT
    QVector<QRegExpAnchorAlternation> aa; // array of (a, b) pairs of anchors
#endif
#ifndef QT_NO_REGEXP_OPTIM
    bool caretAnchored; // does the regexp start with ^?
    bool trivial; // is the good-string all that needs to match?
#endif
    bool valid; // is the regular expression valid?
    Qt::CaseSensitivity cs; // case sensitive?
bool greedyQuantifiers; // RegExp2? (set from key.patternSyntax == QRegExp::RegExp2)
#ifndef QT_NO_REGEXP_BACKREF
    int nbrefs; // number of back-references
#endif

#ifndef QT_NO_REGEXP_OPTIM
    bool useGoodStringHeuristic; // use goodStringMatch? otherwise badCharMatch

    int goodEarlyStart; // the index where goodStr can first occur in a match
    int goodLateStart; // the index where goodStr can last occur in a match
    QString goodStr; // the string that any match has to contain

    int minl; // the minimum length of a match
    QVector<int> occ1; // first-occurrence array
#endif

    /*
      The class Box is an abstraction for a regular expression
      fragment. It can also be seen as one node in the syntax tree of
      a regular expression with synthesized attributes.

      Its interface is ugly for performance reasons.
    */
    class Box
    {
    public:
        Box(QRegExpEngine *engine);
        // Copying is implemented in terms of assignment.
        Box(const Box &b) { operator=(b); }

        Box &operator=(const Box &b);

        // Resets this box by assigning a freshly constructed Box bound to
        // the same engine.
        void clear() { operator=(Box(eng)); }
        void set(QChar ch);
        void set(const QRegExpCharClass &cc);
#ifndef QT_NO_REGEXP_BACKREF
        void set(int bref);
#endif
        void cat(const Box &b);
        void orx(const Box &b);
        void plus(int atom);
        void opt();
        void catAnchor(int a);
#ifndef QT_NO_REGEXP_OPTIM
        void setupHeuristics();
#endif

#if defined(QT_DEBUG)
        void dump() const;
#endif

    private:
        void addAnchorsToEngine(const Box &to) const;

        QRegExpEngine *eng; // the automaton under construction
        QVector<int> ls; // the left states (firstpos)
        QVector<int> rs; // the right states (lastpos)
        QMap<int, int> lanchors; // the left anchors
        QMap<int, int> ranchors; // the right anchors
        int skipanchors; // the anchors to match if the box is skipped

#ifndef QT_NO_REGEXP_OPTIM
        int earlyStart; // the index where str can first occur
        int lateStart; // the index where str can last occur
        QString str; // a string that has to occur in any match
        QString leftStr; // a string occurring at the left of this box
        QString rightStr; // a string occurring at the right of this box
        int maxl; // the maximum length of this box (possibly InftyLen)
#endif

        int minl; // the minimum length of this box
#ifndef QT_NO_REGEXP_OPTIM
        QVector<int> occ1; // first-occurrence array
#endif
    };

    friend class Box;

    /*
      This is the lexical analyzer for regular expressions.

      Plain token codes are small integers; Tok_Char and Tok_BackRef
      are the flag values 0x10000 and 0x20000.
    */
    enum { Tok_Eos, Tok_Dollar, Tok_LeftParen, Tok_MagicLeftParen, Tok_PosLookahead,
           Tok_NegLookahead, Tok_RightParen, Tok_CharClass, Tok_Caret, Tok_Quantifier, Tok_Bar,
           Tok_Word, Tok_NonWord, Tok_Char = 0x10000, Tok_BackRef = 0x20000 };
    int getChar();
    int getEscape();
#ifndef QT_NO_REGEXP_INTERVAL
    int getRep(int def);
#endif
#ifndef QT_NO_REGEXP_LOOKAHEAD
    void skipChars(int n);
#endif
    void error(const char *msg);
    void startTokenizer(const QChar *rx, int len);
    int getToken();

    const QChar *yyIn; // a pointer to the input regular expression pattern
    int yyPos0; // the position of yyTok in the input pattern
    int yyPos; // the position of the next character to read
    int yyLen; // the length of yyIn
    int yyCh; // the last character read
    QRegExpCharClass *yyCharClass; // attribute for Tok_CharClass tokens
    int yyMinRep; // attribute for Tok_Quantifier
    int yyMaxRep; // ditto
    QString yyError; // syntax error or overflow during parsing?

    /*
      This is the syntactic analyzer for regular expressions.

      The constructor treats the pattern as valid only when parse()
      returns exactly the pattern length.
    */
    int parse(const QChar *rx, int len);
    void parseAtom(Box *box);
    void parseFactor(Box *box);
    void parseTerm(Box *box);
    void parseExpression(Box *box);

    int yyTok; // the last token read
    bool yyMayCapture; // set this to false to disable capturing

    friend struct QRegExpMatchState;
};

#ifndef QT_NO_REGEXP_LOOKAHEAD
/*
  The struct QRegExpLookahead represents a lookahead a la Perl (e.g.,
  (?=foo) and (?!bar)).
*/
struct QRegExpLookahead
{
    QRegExpEngine *eng; // NFA representing the embedded regular expression
    bool neg; // negative lookahead?
inline QRegExpLookahead(QRegExpEngine *eng0, bool neg0) : eng(eng0), neg(neg0) { } inline ~QRegExpLookahead() { delete eng; }};#endifQRegExpEngine::QRegExpEngine(const QRegExpEngineKey &key) : cs(key.cs), greedyQuantifiers(key.patternSyntax == QRegExp::RegExp2){ setup(); QString rx; switch (key.patternSyntax) { case QRegExp::Wildcard:#ifndef QT_NO_REGEXP_WILDCARD rx = wc2rx(key.pattern);#endif break; case QRegExp::FixedString: rx = QRegExp::escape(key.pattern); break; default: rx = key.pattern; } valid = (parse(rx.unicode(), rx.length()) == rx.length()); if (!valid) {#ifndef QT_NO_REGEXP_OPTIM trivial = false;#endif error(RXERR_LEFTDELIM); }}QRegExpEngine::~QRegExpEngine(){#ifndef QT_NO_REGEXP_LOOKAHEAD qDeleteAll(ahead);#endif}void QRegExpMatchState::prepareForMatch(QRegExpEngine *eng){ /* We use one QVector<int> for all the big data used a lot in matchHere() and friends. */ int ns = eng->s.size(); // number of states int ncap = eng->ncap;#ifndef QT_NO_REGEXP_OPTIM slideTabSize = qMax(eng->minl + 1, 16);#else slideTabSize = 0;#endif bigArray.resize((3 + 4 * ncap) * ns + 4 * ncap + slideTabSize); inNextStack = bigArray.data(); memset(inNextStack, -1, ns * sizeof(int)); curStack = inNextStack + ns; nextStack = inNextStack + 2 * ns; curCapBegin = inNextStack + 3 * ns; nextCapBegin = curCapBegin + ncap * ns; curCapEnd = curCapBegin + 2 * ncap * ns; nextCapEnd = curCapBegin + 3 * ncap * ns; tempCapBegin = curCapBegin + 4 * ncap * ns; tempCapEnd = tempCapBegin + ncap; capBegin = tempCapBegin + 2 * ncap; capEnd = tempCapBegin + 3 * ncap; slideTab = tempCapBegin + 4 * ncap; this->eng = eng;}/* Tries to match in str and returns an array of (begin, length) pairs for captured text. 
If there is no match, all pairs are (-1, -1).*/void QRegExpMatchState::match(const QString &str0, int pos0, bool minimal0, bool oneTest, int caretIndex){ bool matched = false; QChar char_null;#ifndef QT_NO_REGEXP_OPTIM if (eng->trivial && !oneTest) { pos = str0.indexOf(eng->goodStr, pos0, eng->cs); matchLen = eng->goodStr.length(); matched = (pos != -1); } else#endif { str = &str0; in = str0.unicode(); if (in == 0) in = &char_null; pos = pos0; caretPos = caretIndex; len = str0.length(); minimal = minimal0; matchLen = 0; oneTestMatchedLen = 0; if (eng->valid && pos >= 0 && pos <= len) {#ifndef QT_NO_REGEXP_OPTIM if (oneTest) { matched = matchHere(); } else { if (pos <= len - eng->minl) { if (eng->caretAnchored) { matched = matchHere(); } else if (eng->useGoodStringHeuristic) { matched = eng->goodStringMatch(*this); } else { matched = eng->badCharMatch(*this); } } }#else matched = oneTest ? matchHere() : eng->bruteMatch(*this);#endif } } int numCaptures = eng->numCaptures(); int capturedSize = 2 + 2 * numCaptures; captured.resize(capturedSize); if (matched) { int *c = captured.data(); *c++ = pos; *c++ = matchLen;#ifndef QT_NO_REGEXP_CAPTURE for (int i = 0; i < numCaptures; ++i) { int j = eng->captureForOfficialCapture.at(i); int len = capEnd[j] - capBegin[j]; *c++ = (len > 0) ? pos + capBegin[j] : 0; *c++ = len; }#endif } else { // we rely on 2's complement here memset(captured.data(), -1, capturedSize * sizeof(int)); }}/* The three following functions add one state to the automaton and return the number of the state.*/int QRegExpEngine::createState(QChar ch){ return setupState(ch.unicode());}int QRegExpEngine::createState(const QRegExpCharClass &cc){#ifndef QT_NO_REGEXP_CCLASS int n = cl.size(); cl += QRegExpCharClass(cc); return setupState(CharClassBit | n);#else Q_UNUSED(cc); return setupState(CharClassBit);#endif}#ifndef QT_NO_REGEXP_BACKREFint QRegExpEngine::createState(int bref){ if (bref > nbrefs) { nbrefs = bref; if (nbrefs > MaxBackRefs) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -