📄 pcre.c
字号:
case '!': /* Negative lookahead */ bravalue = OP_ASSERT_NOT; ptr++; break; case '<': /* Lookbehinds */ switch (*(++ptr)) { case '=': /* Positive lookbehind */ bravalue = OP_ASSERTBACK; ptr++; break; case '!': /* Negative lookbehind */ bravalue = OP_ASSERTBACK_NOT; ptr++; break; default: /* Syntax error */ *errorptr = ERR24; goto FAILED; } break; case '>': /* One-time brackets */ bravalue = OP_ONCE; ptr++; break; case 'R': /* Pattern recursion */ *code++ = OP_RECURSE; ptr++; continue; default: /* Option setting */ set = unset = 0; optset = &set; while (*ptr != ')' && *ptr != ':') { switch (*ptr++) { case '-': optset = &unset; break; case 'i': *optset |= PCRE_CASELESS; break; case 'm': *optset |= PCRE_MULTILINE; break; case 's': *optset |= PCRE_DOTALL; break; case 'x': *optset |= PCRE_EXTENDED; break; case 'U': *optset |= PCRE_UNGREEDY; break; case 'X': *optset |= PCRE_EXTRA; break; default: *errorptr = ERR12; goto FAILED; } } /* Set up the changed option bits, but don't change anything yet. */ newoptions = (options | set) & (~unset); /* If the options ended with ')' this is not the start of a nested group with option changes, so the options change at this level. At top level there is nothing else to be done (the options will in fact have been set from the start of compiling as a result of the first pass) but at an inner level we must compile code to change the ims options if necessary, and pass the new setting back so that it can be put at the start of any following branches, and when this group ends, a resetting item can be compiled. */ if (*ptr == ')') { if ((options & PCRE_INGROUP) != 0 && (options & PCRE_IMS) != (newoptions & PCRE_IMS)) { *code++ = OP_OPT; *code++ = *optchanged = newoptions & PCRE_IMS; } options = newoptions; /* Change options at this level */ previous = NULL; /* This item can't be repeated */ continue; /* It is complete */ } /* If the options ended with ':' we are heading into a nested group with possible change of options. 
Such groups are non-capturing and are not assertions of any kind. All we need to do is skip over the ':'; the newoptions value is handled below. */ bravalue = OP_BRA; ptr++; } } /* Else we have a referencing group; adjust the opcode. If the bracket number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and arrange for the true number to follow later, in an OP_BRANUMBER item. */ else { if (++(*brackets) > EXTRACT_BASIC_MAX) { bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1; code[3] = OP_BRANUMBER; code[4] = *brackets >> 8; code[5] = *brackets & 255; skipbytes = 3; } else bravalue = OP_BRA + *brackets; } /* Process nested bracketed re. Assertions may not be repeated, but other kinds can be. We copy code into a non-register variable in order to be able to pass its address because some compilers complain otherwise. Pass in a new setting for the ims options if they have changed. */ previous = (bravalue >= OP_ONCE)? code : NULL; *code = bravalue; tempcode = code; if (!compile_regex( options | PCRE_INGROUP, /* Set for all nested groups */ ((options & PCRE_IMS) != (newoptions & PCRE_IMS))? newoptions & PCRE_IMS : -1, /* Pass ims options if changed */ brackets, /* Extracting bracket count */ &tempcode, /* Where to put code (updated) */ &ptr, /* Input pointer (updated) */ errorptr, /* Where to put an error message */ (bravalue == OP_ASSERTBACK || bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */ skipbytes, /* Skip over OP_COND/OP_BRANUMBER */ &subreqchar, /* For possible last char */ &subcountlits, /* For literal count */ cd)) /* Tables block */ goto FAILED; /* At the end of compiling, code is still pointing to the start of the group, while tempcode has been updated to point past the end of the group and any option resetting that may follow it. The pattern pointer (ptr) is on the bracket. */ /* If this is a conditional bracket, check that there are no more than two branches in the group. 
*/ else if (bravalue == OP_COND) { uschar *tc = code; condcount = 0; do { condcount++; tc += (tc[1] << 8) | tc[2]; } while (*tc != OP_KET); if (condcount > 2) { *errorptr = ERR27; goto FAILED; } } /* Handle updating of the required character. If the subpattern didn't set one, leave it as it was. Otherwise, update it for normal brackets of all kinds, forward assertions, and conditions with two branches. Don't update the literal count for forward assertions, however. If the bracket is followed by a quantifier with zero repeat, we have to back off. Hence the definition of prevreqchar and subcountlits outside the main loop so that they can be accessed for the back off. */ if (subreqchar > 0 && (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_ASSERT || (bravalue == OP_COND && condcount == 2))) { prevreqchar = *reqchar; *reqchar = subreqchar; if (bravalue != OP_ASSERT) *countlits += subcountlits; } /* Now update the main code pointer to the end of the group. */ code = tempcode; /* Error if hit end of pattern */ if (*ptr != ')') { *errorptr = ERR14; goto FAILED; } break; /* Check \ for being a real metacharacter; if not, fall through and handle it as a data character at the start of a string. Escape items are checked for validity in the pre-compiling pass. */ case '\\': tempptr = ptr; c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd); /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values are arranged to be the negation of the corresponding OP_values. For the back references, the values are ESC_REF plus the reference number. Only back references and those types that consume a character may be repeated. We can test for values between ESC_b and ESC_Z for the latter; this may have to change if any new ones are ever created. */ if (c < 0) { if (-c >= ESC_REF) { int number = -c - ESC_REF; previous = code; *code++ = OP_REF; *code++ = number >> 8; *code++ = number & 255; } else { previous = (-c > ESC_b && -c < ESC_Z)? 
code : NULL; *code++ = -c; } continue; } /* Data character: reset and fall through */ ptr = tempptr; c = '\\'; /* Handle a run of data characters until a metacharacter is encountered. The first character is guaranteed not to be whitespace or # when the extended flag is set. */ NORMAL_CHAR: default: previous = code; *code = OP_CHARS; code += 2; length = 0; do { if ((options & PCRE_EXTENDED) != 0) { if ((cd->ctypes[c] & ctype_space) != 0) continue; if (c == '#') { /* The space before the ; is to avoid a warning on a silly compiler on the Macintosh. */ while ((c = *(++ptr)) != 0 && c != NEWLINE) ; if (c == 0) break; continue; } } /* Backslash may introduce a data char or a metacharacter. Escaped items are checked for validity in the pre-compiling pass. Stop the string before a metaitem. */ if (c == '\\') { tempptr = ptr; c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd); if (c < 0) { ptr = tempptr; break; } /* If a character is > 127 in UTF-8 mode, we have to turn it into two or more characters in the UTF-8 encoding. */#ifdef SUPPORT_UTF8 if (c > 127 && (options & PCRE_UTF8) != 0) { uschar buffer[8]; int len = ord2utf8(c, buffer); for (c = 0; c < len; c++) *code++ = buffer[c]; length += len; continue; }#endif } /* Ordinary character or single-char escape */ *code++ = c; length++; } /* This "while" is the end of the "do" above. */ while (length < MAXLIT && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0); /* Update the last character and the count of literals */ prevreqchar = (length > 1)? code[-2] : *reqchar; *reqchar = code[-1]; *countlits += length; /* Compute the length and set it in the data vector, and advance to the next state. */ previous[1] = length; if (length < MAXLIT) ptr--; break; } } /* end of big loop *//* Control never reaches here by falling through, only by a goto for all theerror states. Pass back the position in the pattern so that it can be displayedto the user for diagnosing the error. 
*/FAILED:*ptrptr = ptr;return FALSE;}/************************************************** Compile sequence of alternatives **************************************************//* On entry, ptr is pointing past the bracket character, but on return it points to the closing bracket, or vertical bar, or end of string. The code variable is pointing at the byte into which the BRA operator has been stored. If the ims options are changed at the start (for a (?ims: group) or during any branch, we need to insert an OP_OPT item at the start of every following branch to ensure they get set correctly at run time, and also pass the new options into every subsequent branch compile. Argument: options the option bits optchanged new ims options to set as if (?ims) were at the start, or -1 for no change brackets -> int containing the number of extracting brackets used codeptr -> the address of the current code pointer ptrptr -> the address of the current pattern pointer errorptr -> pointer to error message lookbehind TRUE if this is a lookbehind assertion skipbytes skip this many bytes at start (for OP_COND, OP_BRANUMBER) reqchar -> place to put the last required character, or a negative number countlits -> place to put the shortest literal count of any branch cd points to the data block with tables pointers Returns: TRUE on success*/static BOOLcompile_regex(int options, int optchanged, int *brackets, uschar **codeptr, const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int skipbytes, int *reqchar, int *countlits, compile_data *cd){const uschar *ptr = *ptrptr;uschar *code = *codeptr;uschar *last_branch = code;uschar *start_bracket = code;uschar *reverse_count = NULL;int oldoptions = options & PCRE_IMS;int branchreqchar, branchcountlits;*reqchar = -1;*countlits = INT_MAX;code += 3 + skipbytes;/* Loop for each alternative branch */for (;;) { int length; /* Handle change of options */ if (optchanged >= 0) { *code++ = OP_OPT; *code++ = optchanged; options = (options & ~PCRE_IMS) | 
optchanged; } /* Set up dummy OP_REVERSE if lookbehind assertion */ if (lookbehind) { *code++ = OP_REVERSE; reverse_count = code; *code++ = 0; *code++ = 0; } /* Now compile the branch */ if (!compile_branch(options, brackets, &code, &ptr, errorptr, &optchanged, &branchreqchar, &branchcountlits, cd)) { *ptrptr = ptr; return FALSE; } /* Fill in the length of the last branch */ length = code - last_branch; last_branch[1] = length >> 8; last_branch[2] = length & 255; /* Save the last required character if all branches have the same; a current value of -1 means unset, while -2 means "previous branch had no last required char". */ if (*reqchar != -2) { if (branchreqchar >= 0) { if (*reqchar == -1) *reqchar = branchreqchar; else if (*reqchar != branchreqchar) *reqchar = -2; } else *reqchar = -2; } /* Keep the shortest literal count */ if (branchcountlits < *countlits) *countlits = branchcountlits; DPRINTF(("literal count = %d min=%d\n", branchcountlits, *countlits)); /* If lookbehind, check that this branch matches a fixed-length string, and put the length into the OP_REVERSE item. Temporarily mark the end of the branch with OP_END. */ if (lookbehind) { *code = OP_END; length = find_fixedlength(last_branch, options); DPRINTF(("fixed length = %d\n", length)); if (length < 0) { *errorptr = ERR25; *ptrptr = ptr; return FALSE; } reverse_count[0] = (length >> 8); reverse_count[1] = length & 255; } /* Reach
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -