📄 pcre_compile.cpp
字号:
find one. When we hit a repeat whose minimum is zero, we may have to adjust these values to take the zero repeat into account. This is implemented by setting them to zeroFirstByte and zeroReqByte when such a repeat is encountered. The individual item types that can be repeated set these backoff variables appropriately. */ int firstByte = REQ_UNSET; int reqByte = REQ_UNSET; int zeroReqByte = REQ_UNSET; int zeroFirstByte = REQ_UNSET; /* The variable reqCaseOpt contains either the REQ_IGNORE_CASE value or zero, according to the current setting of the ignores-case flag. REQ_IGNORE_CASE is a bit value > 255. It is added into the firstByte or reqByte variables to record the case status of the value. This is used only for ASCII characters. */ int reqCaseOpt = (options & IgnoreCaseOption) ? REQ_IGNORE_CASE : 0; /* Switch on next character until the end of the branch */ for (;; ptr++) { bool negateClass; bool shouldFlipNegation; /* If a negative special such as \S is used, we should negate the whole class to properly support Unicode. */ int classCharCount; int classLastChar; int skipBytes; int subReqByte; int subFirstByte; int mcLength; unsigned char mcbuffer[8]; /* Next byte in the pattern */ c = ptr < patternEnd ? *ptr : 0; /* Fill in length of a previous callout, except when the next thing is a quantifier. */ bool isQuantifier = c == '*' || c == '+' || c == '?' || (c == '{' && isCountedRepeat(ptr + 1, patternEnd)); switch (c) { /* The branch terminates at end of string, |, or ). */ case 0: if (ptr < patternEnd) goto NORMAL_CHAR; // End of string; fall through case '|': case ')': *firstbyteptr = firstByte; *reqbyteptr = reqByte; *codePtr = code; *ptrPtr = ptr; return true; /* Handle single-character metacharacters. In multiline mode, ^ disables the setting of any following char as a first character. */ case '^': if (options & MatchAcrossMultipleLinesOption) { if (firstByte == REQ_UNSET) firstByte = REQ_NONE; *code++ = OP_BOL; } else *code++ = OP_CIRC; previous = NULL; break; case '$': previous = NULL; if (options & MatchAcrossMultipleLinesOption) *code++ = OP_EOL; else *code++ = OP_DOLL; break; /* There can never be a first char if '.' is first, whatever happens about repeats. The value of reqByte doesn't change either. */ case '.': if (firstByte == REQ_UNSET) firstByte = REQ_NONE; zeroFirstByte = firstByte; zeroReqByte = reqByte; previous = code; *code++ = OP_NOT_NEWLINE; break; /* Character classes. If the included characters are all < 256, we build a 32-byte bitmap of the permitted characters, except in the special case where there is only one such character. For negated classes, we build the map as usual, then invert it at the end. However, we use a different opcode so that data characters > 255 can be handled correctly. If the class contains characters outside the 0-255 range, a different opcode is compiled. It may optionally have a bit map for characters < 256, but those above are are explicitly listed afterwards. A flag byte tells whether the bitmap is present, and whether this is a negated class or not. */ case '[': { previous = code; shouldFlipNegation = false; /* PCRE supports POSIX class stuff inside a class. Perl gives an error if they are encountered at the top level, so we'll do that too. */ /* If the first character is '^', set the negation flag and skip it. */ if (ptr + 1 >= patternEnd) { *errorCodePtr = ERR6; return false; } if (ptr[1] == '^') { negateClass = true; ++ptr; } else negateClass = false; /* Keep a count of chars with values < 256 so that we can optimize the case of just a single character (as long as it's < 256). For higher valued UTF-8 characters, we don't yet do any optimization. */ classCharCount = 0; classLastChar = -1; class_utf8 = false; /* No chars >= 256 */ class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */ /* Initialize the 32-char bit map to all zeros. We have to build the map in a temporary bit of store, in case the class contains only 1 character (< 256), because in that case the compiled code doesn't use the bit map. */ memset(classbits, 0, 32 * sizeof(unsigned char)); /* Process characters until ] is reached. The first pass through the regex checked the overall syntax, so we don't need to be very strict here. At the start of the loop, c contains the first byte of the character. */ while ((++ptr < patternEnd) && (c = *ptr) != ']') { /* Backslash may introduce a single character, or it may introduce one of the specials, which just set a flag. Escaped items are checked for validity in the pre-compiling pass. The sequence \b is a special case. Inside a class (and only there) it is treated as backspace. Elsewhere it marks a word boundary. Other escapes have preset maps ready to or into the one we are building. We assume they have more than one character in them, so set classCharCount bigger than one. */ if (c == '\\') { c = checkEscape(&ptr, patternEnd, errorCodePtr, cd.numCapturingBrackets, true); if (c < 0) { classCharCount += 2; /* Greater than 1 is what matters */ switch (-c) { case ESC_d: for (c = 0; c < 32; c++) classbits[c] |= classBitmapForChar(c + cbit_digit); continue; case ESC_D: shouldFlipNegation = true; for (c = 0; c < 32; c++) classbits[c] |= ~classBitmapForChar(c + cbit_digit); continue; case ESC_w: for (c = 0; c < 32; c++) classbits[c] |= classBitmapForChar(c + cbit_word); continue; case ESC_W: shouldFlipNegation = true; for (c = 0; c < 32; c++) classbits[c] |= ~classBitmapForChar(c + cbit_word); continue; case ESC_s: for (c = 0; c < 32; c++) classbits[c] |= classBitmapForChar(c + cbit_space); continue; case ESC_S: shouldFlipNegation = true; for (c = 0; c < 32; c++) classbits[c] |= ~classBitmapForChar(c + cbit_space); continue; /* Unrecognized escapes are faulted if PCRE is running in its strict mode. By default, for compatibility with Perl, they are treated as literals. */ default: c = *ptr; /* The final character */ classCharCount -= 2; /* Undo the default count from above */ } } /* Fall through if we have a single character (c >= 0). This may be > 256 in UTF-8 mode. */ } /* End of backslash handling */ /* A single character may be followed by '-' to form a range. However, Perl does not permit ']' to be the end of the range. A '-' character here is treated as a literal. */ if ((ptr + 2 < patternEnd) && ptr[1] == '-' && ptr[2] != ']') { ptr += 2; int d = *ptr; /* The second part of a range can be a single-character escape, but not any of the other escapes. Perl 5.6 treats a hyphen as a literal in such circumstances. */ if (d == '\\') { const UChar* oldptr = ptr; d = checkEscape(&ptr, patternEnd, errorCodePtr, cd.numCapturingBrackets, true); /* \X is literal X; any other special means the '-' was literal */ if (d < 0) { ptr = oldptr - 2; goto LONE_SINGLE_CHARACTER; /* A few lines below */ } } /* The check that the two values are in the correct order happens in the pre-pass. Optimize one-character ranges */ if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */ /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless matching, we have to use an XCLASS with extra data items. Caseless matching for characters > 127 is available only if UCP support is available. */ if ((d > 255 || ((options & IgnoreCaseOption) && d > 127))) { class_utf8 = true; /* With UCP support, we can find the other case equivalents of the relevant characters. There may be several ranges. Optimize how they fit with the basic range. */ if (options & IgnoreCaseOption) { int occ, ocd; int cc = c; int origd = d; while (getOthercaseRange(&cc, origd, &occ, &ocd)) { if (occ >= c && ocd <= d) continue; /* Skip embedded ranges */ if (occ < c && ocd >= c - 1) /* Extend the basic range */ { /* if there is overlap, */ c = occ; /* noting that if occ < c */ continue; /* we can't have ocd > d */ } /* because a subrange is */ if (ocd > d && occ <= d + 1) /* always shorter than */ { /* the basic range. */ d = ocd; continue; } if (occ == ocd) *class_utf8data++ = XCL_SINGLE; else { *class_utf8data++ = XCL_RANGE; class_utf8data += encodeUTF8(occ, class_utf8data); } class_utf8data += encodeUTF8(ocd, class_utf8data); } } /* Now record the original range, possibly modified for UCP caseless overlapping ranges. */ *class_utf8data++ = XCL_RANGE; class_utf8data += encodeUTF8(c, class_utf8data); class_utf8data += encodeUTF8(d, class_utf8data); /* With UCP support, we are done. Without UCP support, there is no caseless matching for UTF-8 characters > 127; we can use the bit map for the smaller ones. */ continue; /* With next character in the class */ } /* We use the bit map for all cases when not in UTF-8 mode; else ranges that lie entirely within 0-127 when there is UCP support; else for partial ranges without UCP support. */ for (; c <= d; c++) { classbits[c/8] |= (1 << (c&7));
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -