📄 pcre.c
字号:
if ((cc[1] << 8) + cc[2] != (cc[3] << 8) + cc[4]) return -1; branchlength += (cc[1] << 8) + cc[2]; cc += 5; break; default: branchlength++; } break; /* Anything else is variable length */ default: return -1; } }/* Control never gets here */}/************************************************** Check for POSIX class syntax **************************************************//* This function is called when the sequence "[:" or "[." or "[=" isencountered in a character class. It checks whether this is followed by anoptional ^ and then a sequence of letters, terminated by a matching ":]" or".]" or "=]".Argument: ptr pointer to the initial [ endptr where to return the end pointer cd pointer to compile dataReturns: TRUE or FALSE*/static BOOLcheck_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd){int terminator; /* Don't combine these lines; the Solaris cc */terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */if (*(++ptr) == '^') ptr++;while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;if (*ptr == terminator && ptr[1] == ']') { *endptr = ptr; return TRUE; }return FALSE;}/************************************************** Check POSIX class name **************************************************//* This function is called to check the name given in a POSIX-style class entrysuch as [:alnum:].Arguments: ptr points to the first letter len the length of the nameReturns: a value representing the name, or -1 if unknown*/static intcheck_posix_name(const uschar *ptr, int len){register int yield = 0;while (posix_name_lengths[yield] != 0) { if (len == posix_name_lengths[yield] && strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield; yield++; }return -1;}/************************************************** Compile one branch **************************************************//* Scan the pattern, compiling it into the code vector.Arguments: options the option bits brackets points to number of extracting brackets used code points to the pointer to the current code point ptrptr points to the current pattern pointer errorptr points to pointer to error message optchanged set to the value of the last OP_OPT item compiled reqchar set to the last literal character required, else -1 countlits set to count of mandatory literal characters cd contains pointers to tablesReturns: TRUE on success FALSE, with *errorptr set on error*/static BOOLcompile_branch(int options, int *brackets, uschar **codeptr, const uschar **ptrptr, const char **errorptr, int *optchanged, int *reqchar, int *countlits, compile_data *cd){int repeat_type, op_type;int repeat_min, repeat_max;int bravalue, length;int greedy_default, greedy_non_default;int prevreqchar;int condcount = 0;int subcountlits = 0;register int c;register uschar *code = *codeptr;uschar *tempcode;const uschar *ptr = *ptrptr;const uschar *tempptr;uschar *previous = NULL;uschar class[32];/* Set up the default and non-default settings for greediness */greedy_default = ((options & PCRE_UNGREEDY) != 0);greedy_non_default = greedy_default ^ 1;/* Initialize no required char, and count of literals */*reqchar = prevreqchar = -1;*countlits = 0;/* Switch on next character until the end of the branch */for (;; ptr++) { BOOL negate_class; int class_charcount; int class_lastchar; int newoptions; int skipbytes; int subreqchar; c = *ptr; if ((options & PCRE_EXTENDED) != 0) { if ((cd->ctypes[c] & ctype_space) != 0) continue; if (c == '#') { /* The space before the ; is to avoid a warning on a silly compiler on the Macintosh. */ while ((c = *(++ptr)) != 0 && c != NEWLINE) ; continue; } } switch(c) { /* The branch terminates at end of string, |, or ). */ case 0: case '|': case ')': *codeptr = code; *ptrptr = ptr; return TRUE; /* Handle single-character metacharacters */ case '^': previous = NULL; *code++ = OP_CIRC; break; case '$': previous = NULL; *code++ = OP_DOLL; break; case '.': previous = code; *code++ = OP_ANY; break; /* Character classes. These always build a 32-byte bitmap of the permitted characters, except in the special case where there is only one character. For negated classes, we build the map as usual, then invert it at the end. */ case '[': previous = code; *code++ = OP_CLASS; /* If the first character is '^', set the negation flag and skip it. */ if ((c = *(++ptr)) == '^') { negate_class = TRUE; c = *(++ptr); } else negate_class = FALSE; /* Keep a count of chars so that we can optimize the case of just a single character. */ class_charcount = 0; class_lastchar = -1; /* Initialize the 32-char bit map to all zeros. We have to build the map in a temporary bit of store, in case the class contains only 1 character, because in that case the compiled code doesn't use the bit map. */ memset(class, 0, 32 * sizeof(uschar)); /* Process characters until ] is reached. By writing this as a "do" it means that an initial ] is taken as a data character. */ do { if (c == 0) { *errorptr = ERR6; goto FAILED; } /* Handle POSIX class names. Perl allows a negation extension of the form [:^name]. A square bracket that doesn't match the syntax is treated as a literal. We also recognize the POSIX constructions [.ch.] and [=ch=] ("collating elements") and fault them, as Perl 5.6 does. */ if (c == '[' && (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') && check_posix_syntax(ptr, &tempptr, cd)) { BOOL local_negate = FALSE; int posix_class, i; register const uschar *cbits = cd->cbits; if (ptr[1] != ':') { *errorptr = ERR31; goto FAILED; } ptr += 2; if (*ptr == '^') { local_negate = TRUE; ptr++; } posix_class = check_posix_name(ptr, tempptr - ptr); if (posix_class < 0) { *errorptr = ERR30; goto FAILED; } /* If matching is caseless, upper and lower are converted to alpha. This relies on the fact that the class table starts with alpha, lower, upper as the first 3 entries. */ if ((options & PCRE_CASELESS) != 0 && posix_class <= 2) posix_class = 0; /* Or into the map we are building up to 3 of the static class tables, or their negations. */ posix_class *= 3; for (i = 0; i < 3; i++) { int taboffset = posix_class_maps[posix_class + i]; if (taboffset < 0) break; if (local_negate) for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset]; else for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset]; } ptr = tempptr + 1; class_charcount = 10; /* Set > 1; assumes more than 1 per class */ continue; } /* Backslash may introduce a single character, or it may introduce one of the specials, which just set a flag. Escaped items are checked for validity in the pre-compiling pass. The sequence \b is a special case. Inside a class (and only there) it is treated as backspace. Elsewhere it marks a word boundary. Other escapes have preset maps ready to or into the one we are building. We assume they have more than one character in them, so set class_count bigger than one. */ if (c == '\\') { c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd); if (-c == ESC_b) c = '\b'; else if (c < 0) { register const uschar *cbits = cd->cbits; class_charcount = 10; switch (-c) { case ESC_d: for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit]; continue; case ESC_D: for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit]; continue; case ESC_w: for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_word]; continue; case ESC_W: for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_word]; continue; case ESC_s: for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space]; continue; case ESC_S: for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space]; continue; default: *errorptr = ERR7; goto FAILED; } } /* Fall through if single character, but don't at present allow chars > 255 in UTF-8 mode. */#ifdef SUPPORT_UTF8 if (c > 255) { *errorptr = ERR33; goto FAILED; }#endif } /* A single character may be followed by '-' to form a range. However, Perl does not permit ']' to be the end of the range. A '-' character here is treated as a literal. */ if (ptr[1] == '-' && ptr[2] != ']') { int d; ptr += 2; d = *ptr; if (d == 0) { *errorptr = ERR6; goto FAILED; } /* The second part of a range can be a single-character escape, but not any of the other escapes. Perl 5.6 treats a hyphen as a literal in such circumstances. */ if (d == '\\') { const uschar *oldptr = ptr; d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);#ifdef SUPPORT_UTF8 if (d > 255) { *errorptr = ERR33; goto FAILED; }#endif /* \b is backslash; any other special means the '-' was literal */ if (d < 0) { if (d == -ESC_b) d = '\b'; else { ptr = oldptr - 2; goto SINGLE_CHARACTER; /* A few lines below */ } } } if (d < c) { *errorptr = ERR8; goto FAILED; } for (; c <= d; c++) { class[c/8] |= (1 << (c&7)); if ((options & PCRE_CASELESS) != 0) { int uc = cd->fcc[c]; /* flip case */ class[uc/8] |= (1 << (uc&7)); } class_charcount++; /* in case a one-char range */ class_lastchar = c; } continue; /* Go get the next char in the class */ } /* Handle a lone single character - we can get here for a normal non-escape char, or after \ that introduces a single character. */ SINGLE_CHARACTER: class [c/8] |= (1 << (c&7)); if ((options & PCRE_CASELESS) != 0) { c = cd->fcc[c]; /* flip case */ class[c/8] |= (1 << (c&7)); } class_charcount++; class_lastchar = c; } /* Loop until ']' reached; the check for end of string happens inside the loop. This "while" is the end of the "do" above. */ while ((c = *(++ptr)) != ']'); /* If class_charcount is 1 and class_lastchar is not negative, we saw precisely one character. This doesn't need the whole 32-byte bit map. We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if it's negative. */ if (class_charcount == 1 && class_lastchar >= 0) { if (negate_class) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -