📄 qregexp.cpp
字号:
int *mmNextStack; // stack of next states int *mmCurCapBegin; // start of current states' captures int *mmNextCapBegin; // start of next states' captures int *mmCurCapEnd; // end of current states' captures int *mmNextCapEnd; // end of next states' captures int *mmTempCapBegin; // start of temporary captures int *mmTempCapEnd; // end of temporary captures int *mmCapBegin; // start of captures for a next state int *mmCapEnd; // end of captures for a next state int *mmSlideTab; // bump-along slide table for bad-character heuristic int mmSlideTabSize; // size of slide table#ifndef QT_NO_REGEXP_BACKREF QIntDict<int> mmSleeping; // dictionary of back-reference sleepers#endif int mmMatchLen; // length of match int mmOneTestMatchedLen; // length of partial match};QRegExpEngine::QRegExpEngine( const QString& rx, bool caseSensitive )#ifndef QT_NO_REGEXP_BACKREF : mmSleeping( 101 )#endif{ setup( caseSensitive ); valid = ( parse(rx.unicode(), rx.length()) == (int) rx.length() ); if ( !valid ) {#ifndef QT_NO_REGEXP_OPTIM trivial = FALSE;#endif error( RXERR_LEFTDELIM ); }}#ifndef QT_NO_REGEXP_OPTIMQRegExpEngine::~QRegExpEngine(){}#endif/* Tries to match in str and returns an array of (begin, length) pairs for captured text. If there is no match, all pairs are (-1, -1).*/void QRegExpEngine::match( const QString& str, int pos, bool minimal, bool oneTest, int caretIndex, QMemArray<int>& captured ){ bool matched = FALSE;#ifndef QT_NO_REGEXP_OPTIM if ( trivial && !oneTest ) { mmPos = str.find( goodStr, pos, cs ); mmMatchLen = goodStr.length(); matched = ( mmPos != -1 ); } else#endif { mmStr = &str; mmIn = str.unicode(); if ( mmIn == 0 ) mmIn = &QChar::null; mmPos = pos; mmCaretPos = caretIndex; mmLen = str.length(); mmMinimal = minimal; mmMatchLen = 0; mmOneTestMatchedLen = 0; if ( valid && mmPos >= 0 && mmPos <= mmLen ) {#ifndef QT_NO_REGEXP_OPTIM if ( oneTest ) { matched = matchHere(); } else { if ( mmPos <= mmLen - minl ) { if ( caretAnchored ) { matched = matchHere(); } else if ( useGoodStringHeuristic ) { matched = goodStringMatch(); } else { matched = badCharMatch(); } } }#else matched = oneTest ? matchHere() : bruteMatch();#endif } } int capturedSize = 2 + 2 * officialncap; captured.detach(); captured.resize( capturedSize ); if ( matched ) { captured[0] = mmPos; captured[1] = mmMatchLen; for ( int j = 0; j < officialncap; j++ ) { int len = mmCapEnd[j] - mmCapBegin[j]; captured[2 + 2 * j] = len > 0 ? mmPos + mmCapBegin[j] : 0; captured[2 + 2 * j + 1] = len; } } else { // we rely on 2's complement here memset( captured.data(), -1, capturedSize * sizeof(int) ); }}/* The three following functions add one state to the automaton and return the number of the state.*/int QRegExpEngine::createState( QChar ch ){ return setupState( ch.unicode() );}int QRegExpEngine::createState( const CharClass& cc ){#ifndef QT_NO_REGEXP_CCLASS int n = cl.size(); cl.resize( n + 1 ); cl.insert( n, new CharClass(cc) ); return setupState( CharClassBit | n );#else Q_UNUSED( cc ); return setupState( CharClassBit );#endif}#ifndef QT_NO_REGEXP_BACKREFint QRegExpEngine::createState( int bref ){ if ( bref > nbrefs ) { nbrefs = bref; if ( nbrefs > MaxBackRefs ) { error( RXERR_LIMIT ); return 0; } } return setupState( BackRefBit | bref );}#endif/* The two following functions add a transition between all pairs of states (i, j) where i is fond in from, and j is found in to. Cat-transitions are distinguished from plus-transitions for capturing.*/void QRegExpEngine::addCatTransitions( const QMemArray<int>& from, const QMemArray<int>& to ){ for ( int i = 0; i < (int) from.size(); i++ ) { State *st = s[from[i]]; mergeInto( &st->outs, to ); }}#ifndef QT_NO_REGEXP_CAPTUREvoid QRegExpEngine::addPlusTransitions( const QMemArray<int>& from, const QMemArray<int>& to, int atom ){ for ( int i = 0; i < (int) from.size(); i++ ) { State *st = s[from[i]]; QMemArray<int> oldOuts = st->outs.copy(); mergeInto( &st->outs, to ); if ( f[atom].capture >= 0 ) { if ( st->reenter == 0 ) st->reenter = new QMap<int, int>; for ( int j = 0; j < (int) to.size(); j++ ) { if ( !st->reenter->contains(to[j]) && oldOuts.bsearch(to[j]) < 0 ) st->reenter->insert( to[j], atom ); } } }}#endif#ifndef QT_NO_REGEXP_ANCHOR_ALT/* Returns an anchor that means a OR b.*/int QRegExpEngine::anchorAlternation( int a, int b ){ if ( ((a & b) == a || (a & b) == b) && ((a | b) & Anchor_Alternation) == 0 ) return a & b; int n = aa.size();#ifndef QT_NO_REGEXP_OPTIM if ( n > 0 && aa[n - 1].a == a && aa[n - 1].b == b ) return Anchor_Alternation | ( n - 1 );#endif aa.resize( n + 1 ); aa[n].a = a; aa[n].b = b; return Anchor_Alternation | n;}/* Returns an anchor that means a AND b.*/int QRegExpEngine::anchorConcatenation( int a, int b ){ if ( ((a | b) & Anchor_Alternation) == 0 ) return a | b; if ( (b & Anchor_Alternation) != 0 ) qSwap( a, b ); int aprime = anchorConcatenation( aa[a ^ Anchor_Alternation].a, b ); int bprime = anchorConcatenation( aa[a ^ Anchor_Alternation].b, b ); return anchorAlternation( aprime, bprime );}#endif/* Adds anchor a on a transition caracterised by its from state and its to state.*/void QRegExpEngine::addAnchors( int from, int to, int a ){ State *st = s[from]; if ( st->anchors == 0 ) st->anchors = new QMap<int, int>; if ( st->anchors->contains(to) ) a = anchorAlternation( (*st->anchors)[to], a ); st->anchors->insert( to, a );}#ifndef QT_NO_REGEXP_OPTIM/* This function chooses between the good-string and the bad-character heuristics. It computes two scores and chooses the heuristic with the highest score. Here are some common-sense constraints on the scores that should be respected if the formulas are ever modified: (1) If goodStr is empty, the good-string heuristic scores 0. (2) If the regular expression is trivial, the good-string heuristic should be used. (3) If the search is case insensitive, the good-string heuristic should be used, unless it scores 0. (Case insensitivity turns all entries of occ1 to 0.) (4) If (goodLateStart - goodEarlyStart) is big, the good-string heuristic should score less.*/void QRegExpEngine::heuristicallyChooseHeuristic(){ if ( minl == 0 ) { useGoodStringHeuristic = FALSE; } else if ( trivial ) { useGoodStringHeuristic = TRUE; } else { /* Magic formula: The good string has to constitute a good proportion of the minimum-length string, and appear at a more-or-less known index. */ int goodStringScore = ( 64 * goodStr.length() / minl ) - ( goodLateStart - goodEarlyStart ); /* Less magic formula: We pick some characters at random, and check whether they are good or bad. */ int badCharScore = 0; int step = QMAX( 1, NumBadChars / 32 ); for ( int i = 1; i < NumBadChars; i += step ) { if ( occ1[i] == NoOccurrence ) badCharScore += minl; else badCharScore += occ1[i]; } badCharScore /= minl; useGoodStringHeuristic = ( goodStringScore > badCharScore ); }}#endif#if defined(QT_DEBUG)void QRegExpEngine::dump() const{ int i, j; qDebug( "Case %ssensitive engine", cs ? "" : "in" ); qDebug( " States" ); for ( i = 0; i < ns; i++ ) { qDebug( " %d%s", i, i == InitialState ? " (initial)" : i == FinalState ? " (final)" : "" );#ifndef QT_NO_REGEXP_CAPTURE qDebug( " in atom %d", s[i]->atom );#endif int m = s[i]->match; if ( (m & CharClassBit) != 0 ) { qDebug( " match character class %d", m ^ CharClassBit );#ifndef QT_NO_REGEXP_CCLASS cl[m ^ CharClassBit]->dump();#else qDebug( " negative character class" );#endif } else if ( (m & BackRefBit) != 0 ) { qDebug( " match back-reference %d", m ^ BackRefBit ); } else if ( m >= 0x20 && m <= 0x7e ) { qDebug( " match 0x%.4x (%c)", m, m ); } else { qDebug( " match 0x%.4x", m ); } for ( j = 0; j < (int) s[i]->outs.size(); j++ ) { int next = s[i]->outs[j]; qDebug( " -> %d", next ); if ( s[i]->reenter != 0 && s[i]->reenter->contains(next) ) qDebug( " [reenter %d]", (*s[i]->reenter)[next] ); if ( s[i]->anchors != 0 && at(*s[i]->anchors, next) != 0 ) qDebug( " [anchors 0x%.8x]", (*s[i]->anchors)[next] ); } }#ifndef QT_NO_REGEXP_CAPTURE if ( nf > 0 ) { qDebug( " Atom Parent Capture" ); for ( i = 0; i < nf; i++ ) qDebug( " %6d %6d %6d", i, f[i].parent, f[i].capture ); }#endif#ifndef QT_NO_REGEXP_ANCHOR_ALT for ( i = 0; i < (int) aa.size(); i++ ) qDebug( " Anchor alternation 0x%.8x: 0x%.8x 0x%.9x", i, aa[i].a, aa[i].b );#endif}#endifvoid QRegExpEngine::setup( bool caseSensitive ){ s.setAutoDelete( TRUE ); s.resize( 32 ); ns = 0;#ifndef QT_NO_REGEXP_CAPTURE f.resize( 32 ); nf = 0; cf = -1;#endif officialncap = 0; ncap = 0;#ifndef QT_NO_REGEXP_CCLASS cl.setAutoDelete( TRUE );#endif#ifndef QT_NO_REGEXP_LOOKAHEAD ahead.setAutoDelete( TRUE );#endif#ifndef QT_NO_REGEXP_OPTIM caretAnchored = TRUE; trivial = TRUE;#endif valid = FALSE; cs = caseSensitive;#ifndef QT_NO_REGEXP_BACKREF nbrefs = 0;#endif#ifndef QT_NO_REGEXP_OPTIM useGoodStringHeuristic = TRUE; minl = 0; occ1.fill( 0, NumBadChars );#endif}int QRegExpEngine::setupState( int match ){ if ( (ns & (ns + 1)) == 0 && ns + 1 >= (int) s.size() ) s.resize( (ns + 1) << 1 );#ifndef QT_NO_REGEXP_CAPTURE s.insert( ns, new State(cf, match) );#else s.insert( ns, new State(match) );#endif return ns++;}#ifndef QT_NO_REGEXP_CAPTURE/* Functions startAtom() and finishAtom() should be called to delimit atoms. When a state is created, it is assigned to the current atom. The information is later used for capturing.*/int QRegExpEngine::startAtom( bool capture ){ if ( (nf & (nf + 1)) == 0 && nf + 1 >= (int) f.size() ) f.resize( (nf + 1) << 1 ); f[nf].parent = cf; cf = nf++; f[cf].capture = capture ? ncap++ : -1; return cf;}#endif#ifndef QT_NO_REGEXP_LOOKAHEAD/* Creates a lookahead anchor.*/int QRegExpEngine::addLookahead( QRegExpEngine *eng, bool negative ){ int n = ahead.size(); if ( n == MaxLookaheads ) { error( RXERR_LIMIT ); return 0; } ahead.resize( n + 1 ); ahead.insert( n, new Lookahead(eng, negative) ); return Anchor_FirstLookahead << n;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -