📄 pcre_compile.cpp

📁 Google浏览器V8内核代码
💻 CPP
📖 第 1 页 / 共 5 页
字号:
                    }                                        /* Handle a lone single character - we can get here for a normal                     non-escape char, or after \ that introduces a single character or for an                     apparent range that isn't. */                                    LONE_SINGLE_CHARACTER:                                        /* Handle a character that cannot go in the bit map */                                        if ((c > 255 || ((options & IgnoreCaseOption) && c > 127))) {                        class_utf8 = true;                        *class_utf8data++ = XCL_SINGLE;                        class_utf8data += encodeUTF8(c, class_utf8data);                                                if (options & IgnoreCaseOption) {                            int othercase;                            if ((othercase = kjs_pcre_ucp_othercase(c)) >= 0) {                                *class_utf8data++ = XCL_SINGLE;                                class_utf8data += encodeUTF8(othercase, class_utf8data);                            }                        }                    } else {                        /* Handle a single-byte character */                        classbits[c/8] |= (1 << (c&7));                        if (options & IgnoreCaseOption) {                            c = flipCase(c);                            classbits[c/8] |= (1 << (c&7));                        }                        class_charcount++;                        class_lastchar = c;                    }                }                                /* If class_charcount is 1, we saw precisely one character whose value is                 less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we                 can optimize the negative case only if there were no characters >= 128                 because OP_NOT and the related opcodes like OP_NOTSTAR operate on                 single-bytes only. This is an historical hangover. 
Maybe one day we can                 tidy these opcodes to handle multi-byte characters.                                  The optimization throws away the bit map. We turn the item into a                 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note                 that OP_NOT does not support multibyte characters. In the positive case, it                 can cause firstbyte to be set. Otherwise, there can be no first char if                 this item is first, whatever repeat count may follow. In the case of                 reqbyte, save the previous value for reinstating. */                                if (class_charcount == 1 && (!class_utf8 && (!negate_class || class_lastchar < 128))) {                    zeroreqbyte = reqbyte;                                        /* The OP_NOT opcode works on one-byte characters only. */                                        if (negate_class) {                        if (firstbyte == REQ_UNSET)                            firstbyte = REQ_NONE;                        zerofirstbyte = firstbyte;                        *code++ = OP_NOT;                        *code++ = class_lastchar;                        break;                    }                                        /* For a single, positive character, get the value into c, and                     then we can handle this with the normal one-character code. */                                        c = class_lastchar;                    goto NORMAL_CHAR;                }       /* End of 1-char optimization */                                /* The general case - not the one-char optimization. If this is the first                 thing in the branch, there can be no first char setting, whatever the                 repeat count. Any reqbyte setting must remain unchanged after any kind of                 repeat. 
*/                                if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;                zerofirstbyte = firstbyte;                zeroreqbyte = reqbyte;                                /* If there are characters with values > 255, we have to compile an                 extended class, with its own opcode. If there are no characters < 256,                 we can omit the bitmap. */                                if (class_utf8 && !should_flip_negation) {                    *class_utf8data++ = XCL_END;    /* Marks the end of extra data */                    *code++ = OP_XCLASS;                    code += LINK_SIZE;                    *code = negate_class? XCL_NOT : 0;                                        /* If the map is required, install it, and move on to the end of                     the extra data */                                        if (class_charcount > 0) {                        *code++ |= XCL_MAP;                        memcpy(code, classbits, 32);                        code = class_utf8data;                    }                                        /* If the map is not required, slide down the extra data. */                                        else {                        int len = class_utf8data - (code + 33);                        memmove(code + 1, code + 33, len);                        code += len + 1;                    }                                        /* Now fill in the complete length of the item */                                        putLinkValue(previous + 1, code - previous);                    break;   /* End of class handling */                }                                /* If there are no characters > 255, negate the 32-byte map if necessary,                 and copy it into the code vector. If this is the first thing in the branch,                 there can be no first char setting, whatever the repeat count. Any reqbyte                 setting must remain unchanged after any kind of repeat. 
*/                                *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;                if (negate_class)                    for (c = 0; c < 32; c++)                        code[c] = ~classbits[c];                else                    memcpy(code, classbits, 32);                code += 32;                break;            }                            /* Various kinds of repeat; '{' is not necessarily a quantifier, but this             has been tested above. */            case '{':                if (!is_quantifier)                    goto NORMAL_CHAR;                ptr = readRepeatCounts(ptr + 1, &repeat_min, &repeat_max, errorcodeptr);                if (*errorcodeptr)                    goto FAILED;                goto REPEAT;                            case '*':                repeat_min = 0;                repeat_max = -1;                goto REPEAT;                            case '+':                repeat_min = 1;                repeat_max = -1;                goto REPEAT;                            case '?':                repeat_min = 0;                repeat_max = 1;                            REPEAT:                if (!previous) {                    *errorcodeptr = ERR9;                    goto FAILED;                }                                if (repeat_min == 0) {                    firstbyte = zerofirstbyte;    /* Adjust for zero repeat */                    reqbyte = zeroreqbyte;        /* Ditto */                }                                /* Remember whether this is a variable length repeat */                                reqvary = (repeat_min == repeat_max) ? 0 : REQ_VARY;                                op_type = 0;                    /* Default single-char op codes */                                /* Save start of previous item, in case we have to move it up to make space                 for an inserted OP_ONCE for the additional '+' extension. 
*/                /* FIXME: Probably don't need this because we don't use OP_ONCE. */                                tempcode = previous;                                /* If the next character is '+', we have a possessive quantifier. This                 implies greediness, whatever the setting of the PCRE_UNGREEDY option.                 If the next character is '?' this is a minimizing repeat, by default,                 but if PCRE_UNGREEDY is set, it works the other way round. We change the                 repeat type to the non-default. */                                if (safelyCheckNextChar(ptr, patternEnd, '?')) {                    repeat_type = 1;                    ptr++;                } else                    repeat_type = 0;                                /* If previous was a character match, abolish the item and generate a                 repeat item instead. If a char item has a minimum of more than one, ensure                 that it is set in reqbyte - it might not be if a sequence such as x{3} is                 the first thing in a branch because the x will have gone into firstbyte                 instead.  */                                if (*previous == OP_CHAR || *previous == OP_CHAR_IGNORING_CASE) {                    /* Deal with UTF-8 characters that take up more than one byte. It's                     easier to write this out separately than try to macrify it. Use c to                     hold the length of the character in bytes, plus 0x80 to flag that it's a                     length rather than a small character. 
*/                                        if (code[-1] & 0x80) {                        unsigned char *lastchar = code - 1;                        while((*lastchar & 0xc0) == 0x80)                            lastchar--;                        c = code - lastchar;            /* Length of UTF-8 character */                        memcpy(utf8_char, lastchar, c); /* Save the char */                        c |= 0x80;                      /* Flag c as a length */                    }                    else {                        c = code[-1];                        if (repeat_min > 1)                            reqbyte = c | req_caseopt | cd.req_varyopt;                    }                                        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */                }                                else if (*previous == OP_ASCII_CHAR || *previous == OP_ASCII_LETTER_IGNORING_CASE) {                    c = previous[1];                    if (repeat_min > 1)                        reqbyte = c | req_caseopt | cd.req_varyopt;                    goto OUTPUT_SINGLE_REPEAT;                }                                /* If previous was a single negated character ([^a] or similar), we use                 one of the special opcodes, replacing it. The code is shared with single-                 character repeats by setting op_type to add a suitable offset into                 repeat_type. OP_NOT is currently used only for single-byte chars. */                                else if (*previous == OP_NOT) {                    op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */                    c = previous[1];                    goto OUTPUT_SINGLE_REPEAT;                }                                /* If previous was a character type match (\d or similar), abolish it and                 create a suitable repeat item. 
The code is shared with single-character                 repeats by setting op_type to add a suitable offset into repeat_type. */                                else if (*previous <= OP_NOT_NEWLINE) {                    op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */                    c = *previous;                                    OUTPUT_SINGLE_REPEAT:                    int prop_type = -1;                    int prop_value = -1;                                        unsigned char* oldcode = code;                    code = previous;                  /* Usually overwrite previous item */                                        /* If the maximum is zero then the minimum must also be zero; Perl allows                     this case, so we do too - by simply omitting the item altogether. */                                        if (repeat_max == 0)                        goto END_REPEAT;                                        /* Combine the op_type with the repeat_type */                                        repeat_type += op_type;                                        /* A minimum of zero is handled either as the special case * or ?, or as                     an UPTO, with the maximum given. */                                        if (repeat_min == 0) {                        if (repeat_max == -1)                            *code++ = OP_STAR + repeat_type;                        else if (repeat_max == 1)                            *code++ = OP_QUERY + repeat_type;                        else {                            *code++ = OP_UPTO + repeat_type;                            put2ByteValueAndAdvance(code, repeat_max);                        }                    }                                        /* A repeat minimum of 1 is optimized into some special cases. If the                     maximum is unlimited, we use OP_PLUS. 
Otherwise, the original item is                     left in place and, if the maximum is greater than 1, we use OP_UPTO with                     one less than the maximum. */                                        else if (repeat_min == 1) {                        if (repeat_max == -1)                            *code++ = OP_PLUS + repeat_type;                        else {                            code = oldcode;                 /* leave previous item in place */                            if (repeat_max == 1)                                goto END_REPEAT;                            *code++ = OP_UPTO + repeat_type;                            put2ByteValueAndAdvance(code, repeat_max - 1);                        }                    }                                        /* The case {n,n} is just an EXACT, while the general case {n,m} is                     handled as an EXACT followed by an UPTO. */                                        else {                        *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -