📄 pcre.c
字号:
#ifdef DEBUG/************************************************** Debugging function to print chars **************************************************//* Print a sequence of chars in printable format, stopping at the end of thesubject if the requested.Arguments: p points to characters length number to print is_subject TRUE if printing from within md->start_subject md pointer to matching data block, if is_subject is TRUEReturns: nothing*/static voidpchars(const uschar *p, int length, BOOL is_subject, match_data *md){int c;if (is_subject && length > md->end_subject - p) length = md->end_subject - p;while (length-- > 0) if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);}#endif/************************************************** Handle escapes **************************************************//* This function is called when a \ has been encountered. It either returns apositive value for a simple escape such as \n, or a negative value whichencodes one of the more complicated things such as \d. When UTF-8 is enabled,a positive value greater than 255 may be returned. On entry, ptr is pointing atthe \. On exit, it is on the final character of the escape sequence.Arguments: ptrptr points to the pattern position pointer errorptr points to the pointer to the error message bracount number of previous extracting brackets options the options bits isclass TRUE if inside a character class cd pointer to char tables blockReturns: zero or positive => a data character negative => a special escape sequence on error, errorptr is set*/static intcheck_escape(const uschar **ptrptr, const char **errorptr, int bracount, int options, BOOL isclass, compile_data *cd){const uschar *ptr = *ptrptr;int c, i;/* If backslash is at the end of the pattern, it's an error. */c = *(++ptr);if (c == 0) *errorptr = ERR1;/* Digits or letters may have special meaning; all others are literals. */else if (c < '0' || c > 'z') {}/* Do an initial lookup in a table. A non-zero result is something that can bereturned immediately. Otherwise further processing may be required. */else if ((i = escapes[c - '0']) != 0) c = i;/* Escapes that need further processing, or are illegal. */else { const uschar *oldptr; switch (c) { /* The handling of escape sequences consisting of a string of digits starting with one that is not zero is not straightforward. By experiment, the way Perl works seems to be as follows: Outside a character class, the digits are read as a decimal number. If the number is less than 10, or if there are that many previous extracting left brackets, then it is a back reference. Otherwise, up to three octal digits are read to form an escaped byte. Thus \123 is likely to be octal 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal value is greater than 377, the least significant 8 bits are taken. Inside a character class, \ followed by a digit is always an octal number. */ case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': if (!isclass) { oldptr = ptr; c -= '0'; while ((cd->ctypes[ptr[1]] & ctype_digit) != 0) c = c * 10 + *(++ptr) - '0'; if (c < 10 || c <= bracount) { c = -(ESC_REF + c); break; } ptr = oldptr; /* Put the pointer back and fall through */ } /* Handle an octal number following \. If the first digit is 8 or 9, Perl generates a binary zero byte and treats the digit as a following literal. Thus we have to pull back the pointer by one. */ if ((c = *ptr) >= '8') { ptr--; c = 0; break; } /* \0 always starts an octal number, but we may drop through to here with a larger first octal digit. */ case '0': c -= '0'; while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 && ptr[1] != '8' && ptr[1] != '9') c = c * 8 + *(++ptr) - '0'; c &= 255; /* Take least significant 8 bits */ break; /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number which can be greater than 0xff, but only if the ddd are hex digits. */ case 'x':#ifdef SUPPORT_UTF8 if (ptr[1] == '{' && (options & PCRE_UTF8) != 0) { const uschar *pt = ptr + 2; register int count = 0; c = 0; while ((cd->ctypes[*pt] & ctype_xdigit) != 0) { count++; c = c * 16 + cd->lcc[*pt] - (((cd->ctypes[*pt] & ctype_digit) != 0)? '0' : 'W'); pt++; } if (*pt == '}') { if (c < 0 || count > 8) *errorptr = ERR34; ptr = pt; break; } /* If the sequence of hex digits does not end with '}', then we don't recognize this construct; fall through to the normal \x handling. */ }#endif /* Read just a single hex char */ c = 0; while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0) { ptr++; c = c * 16 + cd->lcc[*ptr] - (((cd->ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W'); } break; /* Other special escapes not starting with a digit are straightforward */ case 'c': c = *(++ptr); if (c == 0) { *errorptr = ERR2; return 0; } /* A letter is upper-cased; then the 0x40 bit is flipped */ if (c >= 'a' && c <= 'z') c = cd->fcc[c]; c ^= 0x40; break; /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any other alphameric following \ is an error if PCRE_EXTRA was set; otherwise, for Perl compatibility, it is a literal. This code looks a bit odd, but there used to be some cases other than the default, and there may be again in future, so I haven't "optimized" it. */ default: if ((options & PCRE_EXTRA) != 0) switch(c) { default: *errorptr = ERR3; break; } break; } }*ptrptr = ptr;return c;}/************************************************** Check for counted repeat **************************************************//* This function is called when a '{' is encountered in a place where it mightstart a quantifier. It looks ahead to see if it really is a quantifier or not.It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}where the ddds are digits.Arguments: p pointer to the first char after '{' cd pointer to char tables blockReturns: TRUE or FALSE*/static BOOLis_counted_repeat(const uschar *p, compile_data *cd){if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;while ((cd->ctypes[*p] & ctype_digit) != 0) p++;if (*p == '}') return TRUE;if (*p++ != ',') return FALSE;if (*p == '}') return TRUE;if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;while ((cd->ctypes[*p] & ctype_digit) != 0) p++;return (*p == '}');}/************************************************** Read repeat counts **************************************************//* Read an item of the form {n,m} and return the values. This is called onlyafter is_counted_repeat() has confirmed that a repeat-count quantifier exists,so the syntax is guaranteed to be correct, but we need to check the values.Arguments: p pointer to first char after '{' minp pointer to int for min maxp pointer to int for max returned as -1 if no max errorptr points to pointer to error message cd pointer to character tables clockReturns: pointer to '}' on success; current ptr on error, with errorptr set*/static const uschar *read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr, compile_data *cd){int min = 0;int max = -1;while ((cd->ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';if (*p == '}') max = min; else { if (*(++p) != '}') { max = 0; while((cd->ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0'; if (max < min) { *errorptr = ERR4; return p; } } }/* Do paranoid checks, then fill in the required variables, and pass back thepointer to the terminating '}'. */if (min > 65535 || max > 65535) *errorptr = ERR5;else { *minp = min; *maxp = max; }return p;}/************************************************** Find the fixed length of a pattern **************************************************//* Scan a pattern and compute the fixed length of subject that will match it,if the length is fixed. This is needed for dealing with backward assertions.Arguments: code points to the start of the pattern (the bracket) options the compiling optionsReturns: the fixed length, or -1 if there is no fixed length*/static intfind_fixedlength(uschar *code, int options){int length = -1;register int branchlength = 0;register uschar *cc = code + 3;/* Scan along the opcodes for this branch. If we get to the end of thebranch, check the length against that of the other branches. */for (;;) { int d; register int op = *cc; if (op >= OP_BRA) op = OP_BRA; switch (op) { case OP_BRA: case OP_ONCE: case OP_COND: d = find_fixedlength(cc, options); if (d < 0) return -1; branchlength += d; do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT); cc += 3; break; /* Reached end of a branch; if it's a ket it is the end of a nested call. If it's ALT it is an alternation in a nested call. If it is END it's the end of the outer call. All can be handled by the same code. */ case OP_ALT: case OP_KET: case OP_KETRMAX: case OP_KETRMIN: case OP_END: if (length < 0) length = branchlength; else if (length != branchlength) return -1; if (*cc != OP_ALT) return length; cc += 3; branchlength = 0; break; /* Skip over assertive subpatterns */ case OP_ASSERT: case OP_ASSERT_NOT: case OP_ASSERTBACK: case OP_ASSERTBACK_NOT: do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT); cc += 3; break; /* Skip over things that don't match chars */ case OP_REVERSE: case OP_BRANUMBER: case OP_CREF: cc++; /* Fall through */ case OP_OPT: cc++; /* Fall through */ case OP_SOD: case OP_EOD: case OP_EODN: case OP_CIRC: case OP_DOLL: case OP_NOT_WORD_BOUNDARY: case OP_WORD_BOUNDARY: cc++; break; /* Handle char strings. In UTF-8 mode we must count characters, not bytes. This requires a scan of the string, unfortunately. We assume valid UTF-8 strings, so all we do is reduce the length by one for byte whose bits are 10xxxxxx. */ case OP_CHARS: branchlength += *(++cc);#ifdef SUPPORT_UTF8 for (d = 1; d <= *cc; d++) if ((cc[d] & 0xc0) == 0x80) branchlength--;#endif cc += *cc + 1; break; /* Handle exact repetitions */ case OP_EXACT: case OP_TYPEEXACT: branchlength += (cc[1] << 8) + cc[2]; cc += 4; break; /* Handle single-char matchers */ case OP_NOT_DIGIT: case OP_DIGIT: case OP_NOT_WHITESPACE: case OP_WHITESPACE: case OP_NOT_WORDCHAR: case OP_WORDCHAR: case OP_ANY: branchlength++; cc++; break; /* Check a class for variable quantification */ case OP_CLASS: cc += 33; switch (*cc) { case OP_CRSTAR: case OP_CRMINSTAR: case OP_CRQUERY: case OP_CRMINQUERY: return -1; case OP_CRRANGE: case OP_CRMINRANGE:
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -