📄 pcre_compile.c
字号:
c = c * 10 + *(++ptr) - '0'; if (c < 0) /* Integer overflow */ { *errorcodeptr = ERR61; break; } if (braced && *(++ptr) != '}') { *errorcodeptr = ERR57; break; } if (c == 0) { *errorcodeptr = ERR58; break; } if (negated) { if (c > bracount) { *errorcodeptr = ERR15; break; } c = bracount - (c - 1); } c = -(ESC_REF + c); break; /* The handling of escape sequences consisting of a string of digits starting with one that is not zero is not straightforward. By experiment, the way Perl works seems to be as follows: Outside a character class, the digits are read as a decimal number. If the number is less than 10, or if there are that many previous extracting left brackets, then it is a back reference. Otherwise, up to three octal digits are read to form an escaped byte. Thus \123 is likely to be octal 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal value is greater than 377, the least significant 8 bits are taken. Inside a character class, \ followed by a digit is always an octal number. */ case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': if (!isclass) { oldptr = ptr; c -= '0'; while (g_ascii_isdigit(ptr[1]) != 0) c = c * 10 + *(++ptr) - '0'; if (c < 0) /* Integer overflow */ { *errorcodeptr = ERR61; break; } if (c < 10 || c <= bracount) { c = -(ESC_REF + c); break; } ptr = oldptr; /* Put the pointer back and fall through */ } /* Handle an octal number following \. If the first digit is 8 or 9, Perl generates a binary zero byte and treats the digit as a following literal. Thus we have to pull back the pointer by one. */ if ((c = *ptr) >= '8') { ptr--; c = 0; break; } /* \0 always starts an octal number, but we may drop through to here with a larger first octal digit. The original code used just to take the least significant 8 bits of octal numbers (I think this is what early Perls used to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more than 3 octal digits. */ case '0': c -= '0'; while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7') c = c * 8 + *(++ptr) - '0'; if (!utf8 && c > 255) *errorcodeptr = ERR51; break; /* \x is complicated. \x{ddd} is a character number which can be greater than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is treated as a data character. */ case 'x': if (ptr[1] == '{') { const uschar *pt = ptr + 2; int count = 0; c = 0; while (g_ascii_isxdigit(*pt) != 0) { register int cc = *pt++; if (c == 0 && cc == '0') continue; /* Leading zeroes */ count++;#ifndef EBCDIC /* ASCII coding */ if (cc >= 'a') cc -= 32; /* Convert to upper case */ c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));#else /* EBCDIC coding */ if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */ c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));#endif } if (*pt == '}') { if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34; ptr = pt; break; } /* If the sequence of hex digits does not end with '}', then we don't recognize this construct; fall through to the normal \x handling. */ } /* Read just a single-byte hex-defined char */ c = 0; while (i++ < 2 && g_ascii_isxdigit(ptr[1]) != 0) { int cc; /* Some compilers don't like ++ */ cc = *(++ptr); /* in initializers */#ifndef EBCDIC /* ASCII coding */ if (cc >= 'a') cc -= 32; /* Convert to upper case */ c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));#else /* EBCDIC coding */ if (cc <= 'z') cc += 64; /* Convert to upper case */ c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));#endif } break; /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped. This coding is ASCII-specific, but then the whole concept of \cx is ASCII-specific. (However, an EBCDIC equivalent has now been added.) */ case 'c': c = *(++ptr); if (c == 0) { *errorcodeptr = ERR2; break; }#ifndef EBCDIC /* ASCII coding */ if (c >= 'a' && c <= 'z') c -= 32; c ^= 0x40;#else /* EBCDIC coding */ if (c >= 'a' && c <= 'z') c += 64; c ^= 0xC0;#endif break; /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any other alphanumeric following \ is an error if PCRE_EXTRA was set; otherwise, for Perl compatibility, it is a literal. This code looks a bit odd, but there used to be some cases other than the default, and there may be again in future, so I haven't "optimized" it. */ default: if ((options & PCRE_EXTRA) != 0) switch(c) { default: *errorcodeptr = ERR3; break; } break; } }*ptrptr = ptr;return c;}#ifdef SUPPORT_UCP/************************************************** Handle \P and \p **************************************************//* This function is called after \P or \p has been encountered, provided thatPCRE is compiled with support for Unicode properties. On entry, ptrptr ispointing at the P or p. On exit, it is pointing at the final character of theescape sequence.Argument: ptrptr points to the pattern position pointer negptr points to a boolean that is set TRUE for negation else FALSE dptr points to an int that is set to the detailed property value errorcodeptr points to the error code variableReturns: type value from ucp_type_table, or -1 for an invalid type*/static intget_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr){int c, i, bot, top;const uschar *ptr = *ptrptr;char name[32];c = *(++ptr);if (c == 0) goto ERROR_RETURN;*negptr = FALSE;/* \P or \p can be followed by a name in {}, optionally preceded by ^ fornegation. */if (c == '{') { if (ptr[1] == '^') { *negptr = TRUE; ptr++; } for (i = 0; i < (int)sizeof(name) - 1; i++) { c = *(++ptr); if (c == 0) goto ERROR_RETURN; if (c == '}') break; name[i] = c; } if (c !='}') goto ERROR_RETURN; name[i] = 0; }/* Otherwise there is just one following character */else { name[0] = c; name[1] = 0; }*ptrptr = ptr;/* Search for a recognized property name using binary chop */bot = 0;top = _pcre_utt_size;while (bot < top) { i = (bot + top) >> 1; c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset); if (c == 0) { *dptr = _pcre_utt[i].value; return _pcre_utt[i].type; } if (c > 0) bot = i + 1; else top = i; }*errorcodeptr = ERR47;*ptrptr = ptr;return -1;ERROR_RETURN:*errorcodeptr = ERR46;*ptrptr = ptr;return -1;}#endif/************************************************** Check for counted repeat **************************************************//* This function is called when a '{' is encountered in a place where it mightstart a quantifier. It looks ahead to see if it really is a quantifier or not.It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}where the ddds are digits.Arguments: p pointer to the first char after '{'Returns: TRUE or FALSE*/static BOOLis_counted_repeat(const uschar *p){if (g_ascii_isdigit(*p++) == 0) return FALSE;while (g_ascii_isdigit(*p) != 0) p++;if (*p == '}') return TRUE;if (*p++ != ',') return FALSE;if (*p == '}') return TRUE;if (g_ascii_isdigit(*p++) == 0) return FALSE;while (g_ascii_isdigit(*p) != 0) p++;return (*p == '}');}/************************************************** Read repeat counts **************************************************//* Read an item of the form {n,m} and return the values. This is called onlyafter is_counted_repeat() has confirmed that a repeat-count quantifier exists,so the syntax is guaranteed to be correct, but we need to check the values.Arguments: p pointer to first char after '{' minp pointer to int for min maxp pointer to int for max returned as -1 if no max errorcodeptr points to error code variableReturns: pointer to '}' on success; current ptr on error, with errorcodeptr set non-zero*/static const uschar *read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr){int min = 0;int max = -1;/* Read the minimum value and do a paranoid check: a negative value indicatesan integer overflow. */while (g_ascii_isdigit(*p) != 0) min = min * 10 + *p++ - '0';if (min < 0 || min > 65535) { *errorcodeptr = ERR5; return p; }/* Read the maximum value if there is one, and again do a paranoid on its size.Also, max must not be less than min. */if (*p == '}') max = min; else { if (*(++p) != '}') { max = 0; while(g_ascii_isdigit(*p) != 0) max = max * 10 + *p++ - '0'; if (max < 0 || max > 65535) { *errorcodeptr = ERR5; return p; } if (max < min) { *errorcodeptr = ERR4; return p; } } }/* Fill in the required variables, and pass back the pointer to the terminating'}'. */*minp = min;*maxp = max;return p;}/************************************************** Find forward referenced subpattern **************************************************//* This function scans along a pattern's text looking for capturingsubpatterns, and counting them. If it finds a named pattern that matches thename it is given, it returns its number. Alternatively, if the name is NULL, itreturns when it reaches a given numbered subpattern. This is used for forwardreferences to subpatterns. We know that if (?P< is encountered, the name willbe terminated by '>' because that is checked in the first pass.Arguments: ptr current position in the pattern cd compile background data name name to seek, or NULL if seeking a numbered subpattern lorn name length, or subpattern number if name is NULL xmode TRUE if we are in /x modeReturns: the number of the named subpattern, or -1 if not found*/static intfind_parens(const uschar *ptr, compile_data *cd, const uschar *name, int lorn, BOOL xmode){const uschar *thisname;int count = cd->bracount;for (; *ptr != 0; ptr++) { int term; /* Skip over backslashed characters and also entire \Q...\E */ if (*ptr == '\\') { if (*(++ptr) == 0) return -1; if (*ptr == 'Q') for (;;) { while (*(++ptr) != 0 && *ptr != '\\'); if (*ptr == 0) return -1; if (*(++ptr) == 'E') break; } continue; } /* Skip over character classes; this logic must be similar to the way they are handled for real. If the first character is '^', skip it. Also, if the first few characters (either before or after ^) are \Q\E or \E we skip them too. This makes for compatibility with Perl. */ if (*ptr == '[') { BOOL negate_class = FALSE; for (;;) { int c = *(++ptr); if (c == '\\') { if (ptr[1] == 'E') ptr++; else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3; else break; } else if (!negate_class && c == '^') negate_class = TRUE; else break; } /* If the next character is ']', it is a data character that must be skipped, except in JavaScript compatibility mode. */ if (ptr[1] == ']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0) ptr++; while (*(++ptr) != ']') { if (*ptr == 0) return -1;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -