📄 pcre_compile.c
字号:
*************************************************//* The error texts are now all in one long string, to save on relocations. Assome of the text is of unknown length, we can't use a table of offsets.Instead, just count through the strings. This is not a performance issuebecause it happens only when there has been a compilation error.Argument: the error numberReturns: pointer to the error string*/static const char *find_error_text(int n){const char *s = error_texts;for (; n > 0; n--) while (*s++ != 0);return s;}/************************************************** Handle escapes **************************************************//* This function is called when a \ has been encountered. It either returns apositive value for a simple escape such as \n, or a negative value whichencodes one of the more complicated things such as \d. A backreference to groupn is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. WhenUTF-8 is enabled, a positive value greater than 255 may be returned. On entry,ptr is pointing at the \. On exit, it is on the final character of the escapesequence.Arguments: ptrptr points to the pattern position pointer errorcodeptr points to the errorcode variable bracount number of previous extracting brackets options the options bits isclass TRUE if inside a character classReturns: zero or positive => a data character negative => a special escape sequence on error, errorcodeptr is set*/static intcheck_escape(const uschar **ptrptr, int *errorcodeptr, int bracount, int options, BOOL isclass){BOOL utf8 = (options & PCRE_UTF8) != 0;const uschar *ptr = *ptrptr + 1;int c, i;GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */ptr--; /* Set pointer back to the last byte *//* If backslash is at the end of the pattern, it's an error. */if (c == 0) *errorcodeptr = ERR1;/* Non-alphamerics are literals. For digits or letters, do an initial lookup ina table. A non-zero result is something that can be returned immediately.Otherwise further processing may be required. */#ifndef EBCDIC /* ASCII coding */else if (c < '0' || c > 'z') {} /* Not alphameric */else if ((i = escapes[c - '0']) != 0) c = i;#else /* EBCDIC coding */else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */else if ((i = escapes[c - 0x48]) != 0) c = i;#endif/* Escapes that need further processing, or are illegal. */else { const uschar *oldptr; BOOL braced, negated; switch (c) { /* A number of Perl escapes are not handled by PCRE. We give an explicit error. */ case 'l': case 'L': case 'N': case 'u': case 'U': *errorcodeptr = ERR37; break; /* \g must be followed by a number, either plain or braced. If positive, it is an absolute backreference. If negative, it is a relative backreference. This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a reference to a named group. This is part of Perl's movement towards a unified syntax for back references. As this is synonymous with \k{name}, we fudge it up by pretending it really was \k. */ case 'g': if (ptr[1] == '{') { const uschar *p; for (p = ptr+2; *p != 0 && *p != '}'; p++) if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break; if (*p != 0 && *p != '}') { c = -ESC_k; break; } braced = TRUE; ptr++; } else braced = FALSE; if (ptr[1] == '-') { negated = TRUE; ptr++; } else negated = FALSE; c = 0; while ((digitab[ptr[1]] & ctype_digit) != 0) c = c * 10 + *(++ptr) - '0'; if (c < 0) { *errorcodeptr = ERR61; break; } if (c == 0 || (braced && *(++ptr) != '}')) { *errorcodeptr = ERR57; break; } if (negated) { if (c > bracount) { *errorcodeptr = ERR15; break; } c = bracount - (c - 1); } c = -(ESC_REF + c); break; /* The handling of escape sequences consisting of a string of digits starting with one that is not zero is not straightforward. By experiment, the way Perl works seems to be as follows: Outside a character class, the digits are read as a decimal number. If the number is less than 10, or if there are that many previous extracting left brackets, then it is a back reference. Otherwise, up to three octal digits are read to form an escaped byte. Thus \123 is likely to be octal 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal value is greater than 377, the least significant 8 bits are taken. Inside a character class, \ followed by a digit is always an octal number. */ case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': if (!isclass) { oldptr = ptr; c -= '0'; while ((digitab[ptr[1]] & ctype_digit) != 0) c = c * 10 + *(++ptr) - '0'; if (c < 0) { *errorcodeptr = ERR61; break; } if (c < 10 || c <= bracount) { c = -(ESC_REF + c); break; } ptr = oldptr; /* Put the pointer back and fall through */ } /* Handle an octal number following \. If the first digit is 8 or 9, Perl generates a binary zero byte and treats the digit as a following literal. Thus we have to pull back the pointer by one. */ if ((c = *ptr) >= '8') { ptr--; c = 0; break; } /* \0 always starts an octal number, but we may drop through to here with a larger first octal digit. The original code used just to take the least significant 8 bits of octal numbers (I think this is what early Perls used to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more than 3 octal digits. */ case '0': c -= '0'; while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7') c = c * 8 + *(++ptr) - '0'; if (!utf8 && c > 255) *errorcodeptr = ERR51; break; /* \x is complicated. \x{ddd} is a character number which can be greater than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is treated as a data character. */ case 'x': if (ptr[1] == '{') { const uschar *pt = ptr + 2; int count = 0; c = 0; while ((digitab[*pt] & ctype_xdigit) != 0) { register int cc = *pt++; if (c == 0 && cc == '0') continue; /* Leading zeroes */ count++;#ifndef EBCDIC /* ASCII coding */ if (cc >= 'a') cc -= 32; /* Convert to upper case */ c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));#else /* EBCDIC coding */ if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */ c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));#endif } if (*pt == '}') { if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34; ptr = pt; break; } /* If the sequence of hex digits does not end with '}', then we don't recognize this construct; fall through to the normal \x handling. */ } /* Read just a single-byte hex-defined char */ c = 0; while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0) { int cc; /* Some compilers don't like ++ */ cc = *(++ptr); /* in initializers */#ifndef EBCDIC /* ASCII coding */ if (cc >= 'a') cc -= 32; /* Convert to upper case */ c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));#else /* EBCDIC coding */ if (cc <= 'z') cc += 64; /* Convert to upper case */ c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));#endif } break; /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped. This coding is ASCII-specific, but then the whole concept of \cx is ASCII-specific. (However, an EBCDIC equivalent has now been added.) */ case 'c': c = *(++ptr); if (c == 0) { *errorcodeptr = ERR2; break; }#ifndef EBCDIC /* ASCII coding */ if (c >= 'a' && c <= 'z') c -= 32; c ^= 0x40;#else /* EBCDIC coding */ if (c >= 'a' && c <= 'z') c += 64; c ^= 0xC0;#endif break; /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any other alphameric following \ is an error if PCRE_EXTRA was set; otherwise, for Perl compatibility, it is a literal. This code looks a bit odd, but there used to be some cases other than the default, and there may be again in future, so I haven't "optimized" it. */ default: if ((options & PCRE_EXTRA) != 0) switch(c) { default: *errorcodeptr = ERR3; break; } break; } }*ptrptr = ptr;return c;}#ifdef SUPPORT_UCP/************************************************** Handle \P and \p **************************************************//* This function is called after \P or \p has been encountered, provided thatPCRE is compiled with support for Unicode properties. On entry, ptrptr ispointing at the P or p. On exit, it is pointing at the final character of theescape sequence.Argument: ptrptr points to the pattern position pointer negptr points to a boolean that is set TRUE for negation else FALSE dptr points to an int that is set to the detailed property value errorcodeptr points to the error code variableReturns: type value from ucp_type_table, or -1 for an invalid type*/static intget_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr){int c, i, bot, top;const uschar *ptr = *ptrptr;char name[32];c = *(++ptr);if (c == 0) goto ERROR_RETURN;*negptr = FALSE;/* \P or \p can be followed by a name in {}, optionally preceded by ^ fornegation. */if (c == '{') { if (ptr[1] == '^') { *negptr = TRUE; ptr++; } for (i = 0; i < (int)sizeof(name) - 1; i++) { c = *(++ptr); if (c == 0) goto ERROR_RETURN; if (c == '}') break; name[i] = c; } if (c !='}') goto ERROR_RETURN; name[i] = 0; }/* Otherwise there is just one following character */else { name[0] = c; name[1] = 0; }*ptrptr = ptr;/* Search for a recognized property name using binary chop */bot = 0;top = _pcre_utt_size;while (bot < top) { i = (bot + top) >> 1; c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset); if (c == 0) { *dptr = _pcre_utt[i].value; return _pcre_utt[i].type; } if (c > 0) bot = i + 1; else top = i; }*errorcodeptr = ERR47;*ptrptr = ptr;return -1;ERROR_RETURN:*errorcodeptr = ERR46;*ptrptr = ptr;return -1;}#endif/************************************************** Check for counted repeat **************************************************//* This function is called when a '{' is encountered in a place where it mightstart a quantifier. It looks ahead to see if it really is a quantifier or not.It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}where the ddds are digits.Arguments: p pointer to the first char after '{'Returns: TRUE or FALSE*/static BOOLis_counted_repeat(const uschar *p){if ((digitab[*p++] & ctype_digit) == 0) return FALSE;while ((digitab[*p] & ctype_digit) != 0) p++;if (*p == '}') return TRUE;if (*p++ != ',') return FALSE;if (*p == '}') return TRUE;if ((digitab[*p++] & ctype_digit) == 0) return FALSE;while ((digitab[*p] & ctype_digit) != 0) p++;return (*p == '}');}/************************************************** Read repeat counts **************************************************//* Read an item of the form {n,m} and return the values. This is called onlyafter is_counted_repeat() has confirmed that a repeat-count quantifier exists,
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -