📄 pcre_compile.c
字号:
/************************************************** Perl-Compatible Regular Expressions **************************************************//* PCRE is a library of functions to support regular expressions whose syntaxand semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel Copyright (c) 1997-2008 University of Cambridge-----------------------------------------------------------------------------Redistribution and use in source and binary forms, with or withoutmodification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the University of Cambridge nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THEIMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSEARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BELIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, ORCONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OFSUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESSINTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER INCONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THEPOSSIBILITY OF SUCH DAMAGE.-----------------------------------------------------------------------------*//* This module contains the external function pcre_compile(), along withsupporting internal functions that are not used by other modules. */#ifdef HAVE_CONFIG_H#include "config.h"#endif#define NLBLOCK cd /* Block containing newline information */#define PSSTART start_pattern /* Field containing processed string start */#define PSEND end_pattern /* Field containing processed string end */#include "pcre_internal.h"/* When DEBUG is defined, we need the pcre_printint() function, which is alsoused by pcretest. DEBUG is not defined when building a production library. */#ifdef DEBUG#include "pcre_printint.src"#endif/* Macro for setting individual bits in class bitmaps. */#define SETBIT(a,b) a[b/8] |= (1 << (b%8))/* Maximum length value to check against when making sure that the integer thatholds the compiled pattern length does not overflow. We make it a bit less thanINT_MAX to allow for adding in group terminating bytes, so that we don't haveto check them every time. */#define OFLOW_MAX (INT_MAX - 20)/************************************************** Code parameters and static tables **************************************************//* This value specifies the size of stack workspace that is used during thefirst pre-compile phase that determines how much memory is required. The regexis partly compiled into this space, but the compiled parts are discarded assoon as they can be, so that hopefully there will never be an overrun. The codedoes, however, check for an overrun. The largest amount I've seen used is 218,so this number is very generous.The same workspace is used during the second, actual compile phase forremembering forward references to groups so that they can be filled in at theend. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZEis 4 there is plenty of room. */#define COMPILE_WORK_SIZE (4096)/* Table for handling escaped characters in the range '0'-'z'. Positive returnsare simple data values; negative values are for special things like \d and soon. Zero means further processing is needed (for things like \x), or the escapeis invalid. */#ifndef EBCDIC /* This is the "normal" table for ASCII systems */static const short int escapes[] = { 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */ 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */ '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */-ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */-ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */-ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */ '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */-ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */-ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */ 0, 0, -ESC_z /* x - z */};#else /* This is the "abnormal" table for EBCDIC systems */static const short int escapes[] = {/* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',/* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,/* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',/* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,/* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',/* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,/* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',/* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,/* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,/* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,/* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,/* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,/* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,/* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,/* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',/* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,/* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,/* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,/* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,/* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,/* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,/* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,/* F8 */ 0, 0, 0, 0, 0, 0, 0, 0};#endif/* Table of special "verbs" like (*PRUNE). This is a short table, so it issearched linearly. Put all the names into a single string, in order to reducethe number of relocations when a shared library is dynamically linked. */typedef struct verbitem { int len; int op;} verbitem;static const char verbnames[] = "ACCEPT\0" "COMMIT\0" "F\0" "FAIL\0" "PRUNE\0" "SKIP\0" "THEN";static const verbitem verbs[] = { { 6, OP_ACCEPT }, { 6, OP_COMMIT }, { 1, OP_FAIL }, { 4, OP_FAIL }, { 5, OP_PRUNE }, { 4, OP_SKIP }, { 4, OP_THEN }};static const int verbcount = sizeof(verbs)/sizeof(verbitem);/* Tables of names of POSIX character classes and their lengths. The names arenow all in a single string, to reduce the number of relocations when a sharedlibrary is dynamically loaded. The list of lengths is terminated by a zerolength entry. The first three must be alpha, lower, upper, as this is assumedfor handling case independence. */static const char posix_names[] = "alpha\0" "lower\0" "upper\0" "alnum\0" "ascii\0" "blank\0" "cntrl\0" "digit\0" "graph\0" "print\0" "punct\0" "space\0" "word\0" "xdigit";static const uschar posix_name_lengths[] = { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };/* Table of class bit maps for each POSIX class. Each class is formed from abase map, with an optional addition or removal of another map. Then, for someclasses, there is some additional tweaking: for [:blank:] the vertical spacecharacters are removed, and for [:alpha:] and [:alnum:] the underscorecharacter is removed. The triples in the table consist of the base map offset,second map offset or -1 if no second map, and a non-negative value for mapaddition or a negative value for map subtraction (if there are two maps). Theabsolute value of the third field has these meanings: 0 => no tweaking, 1 =>remove vertical space characters, 2 => remove underscore. */static const int posix_class_maps[] = { cbit_word, cbit_digit, -2, /* alpha */ cbit_lower, -1, 0, /* lower */ cbit_upper, -1, 0, /* upper */ cbit_word, -1, 2, /* alnum - word without underscore */ cbit_print, cbit_cntrl, 0, /* ascii */ cbit_space, -1, 1, /* blank - a GNU extension */ cbit_cntrl, -1, 0, /* cntrl */ cbit_digit, -1, 0, /* digit */ cbit_graph, -1, 0, /* graph */ cbit_print, -1, 0, /* print */ cbit_punct, -1, 0, /* punct */ cbit_space, -1, 0, /* space */ cbit_word, -1, 0, /* word - a Perl extension */ cbit_xdigit,-1, 0 /* xdigit */};#define STRING(a) # a#define XSTRING(s) STRING(s)/* The texts of compile-time error messages. These are "char *" because theyare passed to the outside world. Do not ever re-use any error number, becausethey are documented. Always add a new error instead. Messages marked DEAD beloware no longer used. This used to be a table of strings, but in order to reducethe number of relocations needed when a shared library is loaded dynamically,it is now one long string. We cannot use a table of offsets, because thelengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, wesimply count through to the one we want - this isn't a performance issuebecause these strings are used only when there is a compilation error. */static const char error_texts[] = "no error\0" "\\ at end of pattern\0" "\\c at end of pattern\0" "unrecognized character follows \\\0" "numbers out of order in {} quantifier\0" /* 5 */ "number too big in {} quantifier\0" "missing terminating ] for character class\0" "invalid escape sequence in character class\0" "range out of order in character class\0" "nothing to repeat\0" /* 10 */ "operand of unlimited repeat could match the empty string\0" /** DEAD **/ "internal error: unexpected repeat\0" "unrecognized character after (? or (?-\0" "POSIX named classes are supported only within a class\0" "missing )\0" /* 15 */ "reference to non-existent subpattern\0" "erroffset passed as NULL\0" "unknown option bit(s) set\0" "missing ) after comment\0" "parentheses nested too deeply\0" /** DEAD **/ /* 20 */ "regular expression is too large\0" "failed to get memory\0" "unmatched parentheses\0" "internal error: code overflow\0" "unrecognized character after (?<\0" /* 25 */ "lookbehind assertion is not fixed length\0" "malformed number or name after (?(\0" "conditional group contains more than two branches\0" "assertion expected after (?(\0" "(?R or (?[+-]digits must be followed by )\0" /* 30 */ "unknown POSIX class name\0" "POSIX collating elements are not supported\0" "this version of PCRE is not compiled with PCRE_UTF8 support\0" "spare error\0" /** DEAD **/ "character value in \\x{...} sequence is too large\0" /* 35 */ "invalid condition (?(0)\0" "\\C not allowed in lookbehind assertion\0" "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0" "number after (?C is > 255\0" "closing ) for (?C expected\0" /* 40 */ "recursive call could loop indefinitely\0" "unrecognized character after (?P\0" "syntax error in subpattern name (missing terminator)\0" "two named subpatterns have the same name\0" "invalid UTF-8 string\0" /* 45 */ "support for \\P, \\p, and \\X has not been compiled\0" "malformed \\P or \\p sequence\0" "unknown property name after \\P or \\p\0" "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0" "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0" /* 50 */ "repeated subpattern is too long\0" /** DEAD **/ "octal value is greater than \\377 (not in UTF-8 mode)\0" "internal error: overran compiling workspace\0" "internal error: previously-checked referenced subpattern not found\0" "DEFINE group contains more than one branch\0" /* 55 */ "repeating a DEFINE group is not allowed\0" "inconsistent NEWLINE options\0" "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0" "a numbered reference must not be zero\0" "(*VERB) with an argument is not supported\0" /* 60 */ "(*VERB) not recognized\0" "number is too big\0" "subpattern name expected\0" "digit expected after (?+\0" "] is an invalid data character in JavaScript compatibility mode";/* Definition to allow mutual recursion */static BOOL compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int, int *, int *, branch_chain *, compile_data *, int *);/************************************************** Find an error text **************************************************//* The error texts are now all in one long string, to save on relocations. Assome of the text is of unknown length, we can't use a table of offsets.Instead, just count through the strings. This is not a performance issuebecause it happens only when there has been a compilation error.Argument: the error numberReturns: pointer to the error string*/static const char *find_error_text(int n){const char *s = error_texts;for (; n > 0; n--) while (*s++ != 0);return s;}/************************************************** Handle escapes **************************************************//* This function is called when a \ has been encountered. It either returns apositive value for a simple escape such as \n, or a negative value whichencodes one of the more complicated things such as \d. A backreference to groupn is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. WhenUTF-8 is enabled, a positive value greater than 255 may be returned. On entry,ptr is pointing at the \. On exit, it is on the final character of the escapesequence.Arguments: ptrptr points to the pattern position pointer errorcodeptr points to the errorcode variable bracount number of previous extracting brackets options the options bits isclass TRUE if inside a character classReturns: zero or positive => a data character negative => a special escape sequence on error, errorcodeptr is set*/static intcheck_escape(const uschar **ptrptr, int *errorcodeptr, int bracount, int options, BOOL isclass){BOOL utf8 = (options & PCRE_UTF8) != 0;const uschar *ptr = *ptrptr + 1;int c, i;GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */ptr--; /* Set pointer back to the last byte *//* If backslash is at the end of the pattern, it's an error. */if (c == 0) *errorcodeptr = ERR1;/* Non-alphanumerics are literals. For digits or letters, do an initial lookupin a table. A non-zero result is something that can be returned immediately.Otherwise further processing may be required. */#ifndef EBCDIC /* ASCII coding */else if (c < '0' || c > 'z') {} /* Not alphanumeric */else if ((i = escapes[c - '0']) != 0) c = i;#else /* EBCDIC coding */else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */else if ((i = escapes[c - 0x48]) != 0) c = i;#endif/* Escapes that need further processing, or are illegal. */else { const uschar *oldptr; BOOL braced, negated; switch (c) { /* A number of Perl escapes are not handled by PCRE. We give an explicit error. */ case 'l': case 'L': case 'N': case 'u': case 'U': *errorcodeptr = ERR37; break; /* \g must be followed by one of a number of specific things: (1) A number, either plain or braced. If positive, it is an absolute backreference. If negative, it is a relative backreference. This is a Perl 5.10 feature. (2) Perl 5.10 also supports \g{name} as a reference to a named group. This is part of Perl's movement towards a unified syntax for back references. As this is synonymous with \k{name}, we fudge it up by pretending it really was \k. (3) For Oniguruma compatibility we also support \g followed by a name or a number either in angle brackets or in single quotes. However, these are (possibly recursive) subroutine calls, _not_ backreferences. Just return the -ESC_g code (cf \k). */ case 'g': if (ptr[1] == '<' || ptr[1] == '\'') { c = -ESC_g; break; } /* Handle the Perl-compatible cases */ if (ptr[1] == '{') { const uschar *p; for (p = ptr+2; *p != 0 && *p != '}'; p++) if (*p != '-' && g_ascii_isdigit(*p) == 0) break; if (*p != 0 && *p != '}') { c = -ESC_k; break; } braced = TRUE; ptr++; } else braced = FALSE; if (ptr[1] == '-') { negated = TRUE; ptr++; } else negated = FALSE; c = 0; while (g_ascii_isdigit(ptr[1]) != 0)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -