📄 qregexp.cpp
字号:
#ifndef QT_NO_REGEXP_BACKREF void set(int bref);#endif void cat(const Box &b); void orx(const Box &b); void plus(int atom); void opt(); void catAnchor(int a);#ifndef QT_NO_REGEXP_OPTIM void setupHeuristics();#endif#if defined(QT_DEBUG) void dump() const;#endif private: void addAnchorsToEngine(const Box &to) const; QRegExpEngine *eng; // the automaton under construction QVector<int> ls; // the left states (firstpos) QVector<int> rs; // the right states (lastpos) QMap<int, int> lanchors; // the left anchors QMap<int, int> ranchors; // the right anchors int skipanchors; // the anchors to match if the box is skipped#ifndef QT_NO_REGEXP_OPTIM int earlyStart; // the index where str can first occur int lateStart; // the index where str can last occur QString str; // a string that has to occur in any match QString leftStr; // a string occurring at the left of this box QString rightStr; // a string occurring at the right of this box int maxl; // the maximum length of this box (possibly InftyLen)#endif int minl; // the minimum length of this box#ifndef QT_NO_REGEXP_OPTIM QVector<int> occ1; // first-occurrence array#endif }; friend class Box; /* This is the lexical analyzer for regular expressions. */ enum { Tok_Eos, Tok_Dollar, Tok_LeftParen, Tok_MagicLeftParen, Tok_PosLookahead, Tok_NegLookahead, Tok_RightParen, Tok_CharClass, Tok_Caret, Tok_Quantifier, Tok_Bar, Tok_Word, Tok_NonWord, Tok_Char = 0x10000, Tok_BackRef = 0x20000 }; int getChar(); int getEscape();#ifndef QT_NO_REGEXP_INTERVAL int getRep(int def);#endif#ifndef QT_NO_REGEXP_LOOKAHEAD void skipChars(int n);#endif void error(const char *msg); void startTokenizer(const QChar *rx, int len); int getToken(); const QChar *yyIn; // a pointer to the input regular expression pattern int yyPos0; // the position of yyTok in the input pattern int yyPos; // the position of the next character to read int yyLen; // the length of yyIn int yyCh; // the last character read CharClass *yyCharClass; // attribute for Tok_CharClass tokens int yyMinRep; // attribute for Tok_Quantifier int yyMaxRep; // ditto QString yyError; // syntax error or overflow during parsing? /* This is the syntactic analyzer for regular expressions. */ int parse(const QChar *rx, int len); void parseAtom(Box *box); void parseFactor(Box *box); void parseTerm(Box *box); void parseExpression(Box *box); int yyTok; // the last token read bool yyMayCapture; // set this to false to disable capturing /* This is the engine state during matching. */ const QString *mmStr; // a pointer to the input QString const QChar *mmIn; // a pointer to the input string data int mmPos; // the current position in the string int mmCaretPos; int mmLen; // the length of the input string bool mmMinimal; // minimal matching? QVector<int> mmBigArray; // big QVector<int> array int *mmInNextStack; // is state is mmNextStack? int *mmCurStack; // stack of current states int *mmNextStack; // stack of next states int *mmCurCapBegin; // start of current states' captures int *mmNextCapBegin; // start of next states' captures int *mmCurCapEnd; // end of current states' captures int *mmNextCapEnd; // end of next states' captures int *mmTempCapBegin; // start of temporary captures int *mmTempCapEnd; // end of temporary captures int *mmCapBegin; // start of captures for a next state int *mmCapEnd; // end of captures for a next state int *mmSlideTab; // bump-along slide table for bad-character heuristic int mmSlideTabSize; // size of slide table#ifndef QT_NO_REGEXP_BACKREF QList<QVector<int> > mmSleeping; // list of back-reference sleepers#endif int mmMatchLen; // length of match int mmOneTestMatchedLen; // length of partial match};QRegExpEngine::QRegExpEngine(const QString &rx, Qt::CaseSensitivity cs){ setup(cs); valid = (parse(rx.unicode(), rx.length()) == rx.length()); if (!valid) {#ifndef QT_NO_REGEXP_OPTIM trivial = false;#endif error(RXERR_LEFTDELIM); }}QRegExpEngine::~QRegExpEngine(){ while (!s.isEmpty()) delete s.takeFirst();#ifndef QT_NO_REGEXP_CCLASS while (!cl.isEmpty()) delete cl.takeFirst();#endif#ifndef QT_NO_REGEXP_LOOKAHEAD while (!ahead.isEmpty()) delete ahead.takeFirst();#endif}/* Tries to match in str and returns an array of (begin, length) pairs for captured text. If there is no match, all pairs are (-1, -1).*/void QRegExpEngine::match(const QString &str, int pos, bool minimal, bool oneTest, int caretIndex, QVector<int> &captured){ bool matched = false; QChar char_null;#ifndef QT_NO_REGEXP_OPTIM if (trivial && !oneTest) { mmPos = str.indexOf(goodStr, pos, cs); mmMatchLen = goodStr.length(); matched = (mmPos != -1); } else#endif { mmStr = &str; mmIn = str.unicode(); if (mmIn == 0) mmIn = &char_null; mmPos = pos; mmCaretPos = caretIndex; mmLen = str.length(); mmMinimal = minimal; mmMatchLen = 0; mmOneTestMatchedLen = 0; if (valid && mmPos >= 0 && mmPos <= mmLen) {#ifndef QT_NO_REGEXP_OPTIM if (oneTest) { matched = matchHere(); } else { if (mmPos <= mmLen - minl) { if (caretAnchored) { matched = matchHere(); } else if (useGoodStringHeuristic) { matched = goodStringMatch(); } else { matched = badCharMatch(); } } }#else matched = oneTest ? matchHere() : bruteMatch();#endif } } int capturedSize = 2 + 2 * officialncap; captured.resize(capturedSize); if (matched) { int *c = captured.data(); c[0] = mmPos; c[1] = mmMatchLen; for (int j = 0; j < officialncap; j++) { int len = mmCapEnd[j] - mmCapBegin[j]; c[2 + 2 * j] = len > 0 ? mmPos + mmCapBegin[j] : 0; c[2 + 2 * j + 1] = len; } } else { // we rely on 2's complement here memset(captured.data(), -1, capturedSize * sizeof(int)); }}/* The three following functions add one state to the automaton and return the number of the state.*/int QRegExpEngine::createState(QChar ch){ return setupState(ch.unicode());}int QRegExpEngine::createState(const CharClass &cc){#ifndef QT_NO_REGEXP_CCLASS int n = cl.size(); cl += new CharClass(cc); return setupState(CharClassBit | n);#else Q_UNUSED(cc); return setupState(CharClassBit);#endif}#ifndef QT_NO_REGEXP_BACKREFint QRegExpEngine::createState(int bref){ if (bref > nbrefs) { nbrefs = bref; if (nbrefs > MaxBackRefs) { error(RXERR_LIMIT); return 0; } } return setupState(BackRefBit | bref);}#endif/* The two following functions add a transition between all pairs of states (i, j) where i is fond in from, and j is found in to. Cat-transitions are distinguished from plus-transitions for capturing.*/void QRegExpEngine::addCatTransitions(const QVector<int> &from, const QVector<int> &to){ for (int i = 0; i < from.size(); i++) { State *st = s.at(from.at(i)); mergeInto(&st->outs, to); }}#ifndef QT_NO_REGEXP_CAPTUREvoid QRegExpEngine::addPlusTransitions(const QVector<int> &from, const QVector<int> &to, int atom){ for (int i = 0; i < from.size(); i++) { State *st = s.at(from.at(i)); QVector<int> oldOuts = st->outs; mergeInto(&st->outs, to); if (f.at(atom).capture >= 0) { if (st->reenter == 0) st->reenter = new QMap<int, int>; for (int j = 0; j < to.size(); j++) { if (!st->reenter->contains(to[j]) && qBinaryFind(oldOuts.begin(), oldOuts.end(), to[j]) == oldOuts.end()) st->reenter->insert(to[j], atom); } } }}#endif#ifndef QT_NO_REGEXP_ANCHOR_ALT/* Returns an anchor that means a OR b.*/int QRegExpEngine::anchorAlternation(int a, int b){ if (((a & b) == a || (a & b) == b) && ((a | b) & Anchor_Alternation) == 0) return a & b; int n = aa.size();#ifndef QT_NO_REGEXP_OPTIM if (n > 0 && aa.at(n - 1).a == a && aa.at(n - 1).b == b) return Anchor_Alternation | (n - 1);#endif aa.resize(n + 1); aa[n].a = a; aa[n].b = b; return Anchor_Alternation | n;}/* Returns an anchor that means a AND b.*/int QRegExpEngine::anchorConcatenation(int a, int b){ if (((a | b) & Anchor_Alternation) == 0) return a | b; if ((b & Anchor_Alternation) != 0) qSwap(a, b); int aprime = anchorConcatenation(aa.at(a ^ Anchor_Alternation).a, b); int bprime = anchorConcatenation(aa.at(a ^ Anchor_Alternation).b, b); return anchorAlternation(aprime, bprime);}#endif/* Adds anchor a on a transition caracterised by its from state and its to state.*/void QRegExpEngine::addAnchors(int from, int to, int a){ State *st = s.at(from); if (st->anchors == 0) st->anchors = new QMap<int, int>; if (st->anchors->contains(to)) a = anchorAlternation((*st->anchors)[to], a); st->anchors->insert(to, a);}#ifndef QT_NO_REGEXP_OPTIM/* This function chooses between the good-string and the bad-character heuristics. It computes two scores and chooses the heuristic with the highest score. Here are some common-sense constraints on the scores that should be respected if the formulas are ever modified: (1) If goodStr is empty, the good-string heuristic scores 0. (2) If the regular expression is trivial, the good-string heuristic should be used. (3) If the search is case insensitive, the good-string heuristic should be used, unless it scores 0. (Case insensitivity turns all entries of occ1 to 0.) (4) If (goodLateStart - goodEarlyStart) is big, the good-string heuristic should score less.*/void QRegExpEngine::heuristicallyChooseHeuristic(){ if (minl == 0) { useGoodStringHeuristic = false; } else if (trivial) { useGoodStringHeuristic = true; } else { /* Magic formula: The good string has to constitute a good proportion of the minimum-length string, and appear at a more-or-less known index. */ int goodStringScore = (64 * goodStr.length() / minl) - (goodLateStart - goodEarlyStart); /* Less magic formula: We pick some characters at random, and check whether they are good or bad. */ int badCharScore = 0; int step = qMax(1, NumBadChars / 32); for (int i = 1; i < NumBadChars; i += step) { if (occ1[i] == NoOccurrence) badCharScore += minl; else badCharScore += occ1[i]; } badCharScore /= minl; useGoodStringHeuristic = (goodStringScore > badCharScore); }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -