📄 pcre_compile.cpp

📁 linux下开源浏览器WebKit的源码,市面上的很多商用浏览器都是移植自WebKit
💻 CPP
📖 第 1 页 / 共 5 页
字号:
     find one.          When we hit a repeat whose minimum is zero, we may have to adjust these values     to take the zero repeat into account. This is implemented by setting them to     zeroFirstByte and zeroReqByte when such a repeat is encountered. The individual     item types that can be repeated set these backoff variables appropriately. */        int firstByte = REQ_UNSET;    int reqByte = REQ_UNSET;    int zeroReqByte = REQ_UNSET;    int zeroFirstByte = REQ_UNSET;        /* The variable reqCaseOpt contains either the REQ_IGNORE_CASE value or zero,     according to the current setting of the ignores-case flag. REQ_IGNORE_CASE is a bit     value > 255. It is added into the firstByte or reqByte variables to record the     case status of the value. This is used only for ASCII characters. */        int reqCaseOpt = (options & IgnoreCaseOption) ? REQ_IGNORE_CASE : 0;        /* Switch on next character until the end of the branch */        for (;; ptr++) {        bool negateClass;        bool shouldFlipNegation; /* If a negative special such as \S is used, we should negate the whole class to properly support Unicode. */        int classCharCount;        int classLastChar;        int skipBytes;        int subReqByte;        int subFirstByte;        int mcLength;        unsigned char mcbuffer[8];                /* Next byte in the pattern */                c = ptr < patternEnd ? *ptr : 0;                /* Fill in length of a previous callout, except when the next thing is         a quantifier. */                bool isQuantifier = c == '*' || c == '+' || c == '?' || (c == '{' && isCountedRepeat(ptr + 1, patternEnd));                switch (c) {            /* The branch terminates at end of string, |, or ). 
*/                            case 0:                if (ptr < patternEnd)                    goto NORMAL_CHAR;                // End of string; fall through            case '|':            case ')':                *firstbyteptr = firstByte;                *reqbyteptr = reqByte;                *codePtr = code;                *ptrPtr = ptr;                return true;                            /* Handle single-character metacharacters. In multiline mode, ^ disables             the setting of any following char as a first character. */            case '^':                if (options & MatchAcrossMultipleLinesOption) {                    if (firstByte == REQ_UNSET)                        firstByte = REQ_NONE;                    *code++ = OP_BOL;                } else                    *code++ = OP_CIRC;                previous = NULL;                break;            case '$':                previous = NULL;                if (options & MatchAcrossMultipleLinesOption)                  *code++ = OP_EOL;                else                  *code++ = OP_DOLL;                break;            /* There can never be a first char if '.' is first, whatever happens about             repeats. The value of reqByte doesn't change either. */            case '.':                if (firstByte == REQ_UNSET)                    firstByte = REQ_NONE;                zeroFirstByte = firstByte;                zeroReqByte = reqByte;                previous = code;                *code++ = OP_NOT_NEWLINE;                break;                            /* Character classes. If the included characters are all < 256, we build a             32-byte bitmap of the permitted characters, except in the special case             where there is only one such character. For negated classes, we build the             map as usual, then invert it at the end. However, we use a different opcode             so that data characters > 255 can be handled correctly.                          
If the class contains characters outside the 0-255 range, a different             opcode is compiled. It may optionally have a bit map for characters < 256,             but those above are explicitly listed afterwards. A flag byte tells             whether the bitmap is present, and whether this is a negated class or not.             */                            case '[': {                previous = code;                shouldFlipNegation = false;                                /* PCRE supports POSIX class stuff inside a class. Perl gives an error if                 they are encountered at the top level, so we'll do that too. */                                /* If the first character is '^', set the negation flag and skip it. */                if (ptr + 1 >= patternEnd) {                    *errorCodePtr = ERR6;                    return false;                }                if (ptr[1] == '^') {                    negateClass = true;                    ++ptr;                } else                    negateClass = false;                                /* Keep a count of chars with values < 256 so that we can optimize the case                 of just a single character (as long as it's < 256). For higher valued UTF-8                 characters, we don't yet do any optimization. */                                classCharCount = 0;                classLastChar = -1;                                class_utf8 = false;                       /* No chars >= 256 */                class_utf8data = code + LINK_SIZE + 34;   /* For UTF-8 items */                                /* Initialize the 32-char bit map to all zeros. We have to build the                 map in a temporary bit of store, in case the class contains only 1                 character (< 256), because in that case the compiled code doesn't use the                 bit map. 
*/                                memset(classbits, 0, 32 * sizeof(unsigned char));                                /* Process characters until ] is reached. The first pass                 through the regex checked the overall syntax, so we don't need to be very                 strict here. At the start of the loop, c contains the first byte of the                 character. */                while ((++ptr < patternEnd) && (c = *ptr) != ']') {                    /* Backslash may introduce a single character, or it may introduce one                     of the specials, which just set a flag. Escaped items are checked for                     validity in the pre-compiling pass. The sequence \b is a special case.                     Inside a class (and only there) it is treated as backspace. Elsewhere                     it marks a word boundary. Other escapes have preset maps ready to                     or into the one we are building. We assume they have more than one                     character in them, so set classCharCount bigger than one. 
*/                                        if (c == '\\') {                        c = checkEscape(&ptr, patternEnd, errorCodePtr, cd.numCapturingBrackets, true);                        if (c < 0) {                            classCharCount += 2;     /* Greater than 1 is what matters */                            switch (-c) {                                case ESC_d:                                    for (c = 0; c < 32; c++)                                        classbits[c] |= classBitmapForChar(c + cbit_digit);                                    continue;                                                                    case ESC_D:                                    shouldFlipNegation = true;                                    for (c = 0; c < 32; c++)                                        classbits[c] |= ~classBitmapForChar(c + cbit_digit);                                    continue;                                                                    case ESC_w:                                    for (c = 0; c < 32; c++)                                        classbits[c] |= classBitmapForChar(c + cbit_word);                                    continue;                                                                    case ESC_W:                                    shouldFlipNegation = true;                                    for (c = 0; c < 32; c++)                                        classbits[c] |= ~classBitmapForChar(c + cbit_word);                                    continue;                                                                    case ESC_s:                                    for (c = 0; c < 32; c++)                                         classbits[c] |= classBitmapForChar(c + cbit_space);                                    continue;                                                                    case ESC_S:                                    shouldFlipNegation = true;                                    for (c = 0; c < 32; c++)        
                                 classbits[c] |= ~classBitmapForChar(c + cbit_space);                                    continue;                                                                        /* Unrecognized escapes are faulted if PCRE is running in its                                     strict mode. By default, for compatibility with Perl, they are                                     treated as literals. */                                                                    default:                                    c = *ptr;              /* The final character */                                    classCharCount -= 2;  /* Undo the default count from above */                            }                        }                                                /* Fall through if we have a single character (c >= 0). This may be                         > 256 in UTF-8 mode. */                                            }   /* End of backslash handling */                                        /* A single character may be followed by '-' to form a range. However,                     Perl does not permit ']' to be the end of the range. A '-' character                     here is treated as a literal. */                                        if ((ptr + 2 < patternEnd) && ptr[1] == '-' && ptr[2] != ']') {                        ptr += 2;                                                int d = *ptr;                                                /* The second part of a range can be a single-character escape, but                         not any of the other escapes. Perl 5.6 treats a hyphen as a literal                         in such circumstances. 
*/                                                if (d == '\\') {                            const UChar* oldptr = ptr;                            d = checkEscape(&ptr, patternEnd, errorCodePtr, cd.numCapturingBrackets, true);                                                        /* \X is literal X; any other special means the '-' was literal */                            if (d < 0) {                                ptr = oldptr - 2;                                goto LONE_SINGLE_CHARACTER;  /* A few lines below */                            }                        }                                                /* The check that the two values are in the correct order happens in                         the pre-pass. Optimize one-character ranges */                                                if (d == c)                            goto LONE_SINGLE_CHARACTER;  /* A few lines below */                                                /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless                         matching, we have to use an XCLASS with extra data items. Caseless                         matching for characters > 127 is available only if UCP support is                         available. */                                                if ((d > 255 || ((options & IgnoreCaseOption) && d > 127))) {                            class_utf8 = true;                                                        /* With UCP support, we can find the other case equivalents of                             the relevant characters. There may be several ranges. Optimize how                             they fit with the basic range. 
*/                                                        if (options & IgnoreCaseOption) {                                int occ, ocd;                                int cc = c;                                int origd = d;                                while (getOthercaseRange(&cc, origd, &occ, &ocd)) {                                    if (occ >= c && ocd <= d)                                        continue;  /* Skip embedded ranges */                                                                        if (occ < c  && ocd >= c - 1)        /* Extend the basic range */                                    {                                  /* if there is overlap,   */                                        c = occ;                           /* noting that if occ < c */                                        continue;                          /* we can't have ocd > d  */                                    }                                  /* because a subrange is  */                                    if (ocd > d && occ <= d + 1)         /* always shorter than    */                                    {                                  /* the basic range.       
*/                                        d = ocd;                                        continue;                                    }                                                                        if (occ == ocd)                                        *class_utf8data++ = XCL_SINGLE;                                    else {                                        *class_utf8data++ = XCL_RANGE;                                        class_utf8data += encodeUTF8(occ, class_utf8data);                                    }                                    class_utf8data += encodeUTF8(ocd, class_utf8data);                                }                            }                                                        /* Now record the original range, possibly modified for UCP caseless                             overlapping ranges. */                                                        *class_utf8data++ = XCL_RANGE;                            class_utf8data += encodeUTF8(c, class_utf8data);                            class_utf8data += encodeUTF8(d, class_utf8data);                                                        /* With UCP support, we are done. Without UCP support, there is no                             caseless matching for UTF-8 characters > 127; we can use the bit map                             for the smaller ones. */                                                        continue;    /* With next character in the class */                        }                                                /* We use the bit map for all cases when not in UTF-8 mode; else                         ranges that lie entirely within 0-127 when there is UCP support; else                         for partial ranges without UCP support. */                                                for (; c <= d; c++) {                            classbits[c/8] |= (1 << (c&7));
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -