📄 pcre_compile.c
字号:
/************************************************** Perl-Compatible Regular Expressions **************************************************//* PCRE is a library of functions to support regular expressions whose syntaxand semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel Copyright (c) 1997-2007 University of Cambridge-----------------------------------------------------------------------------Redistribution and use in source and binary forms, with or withoutmodification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the University of Cambridge nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THEIMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSEARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BELIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, ORCONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OFSUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESSINTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER INCONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THEPOSSIBILITY OF SUCH DAMAGE.-----------------------------------------------------------------------------*//* This module contains the external function pcre_compile(), along withsupporting internal functions that are not used by other modules. */#ifdef HAVE_CONFIG_H#include "config.h"#endif#define NLBLOCK cd /* Block containing newline information */#define PSSTART start_pattern /* Field containing processed string start */#define PSEND end_pattern /* Field containing processed string end */#include "pcre_internal.h"/* When DEBUG is defined, we need the pcre_printint() function, which is alsoused by pcretest. DEBUG is not defined when building a production library. */#ifdef DEBUG#include "pcre_printint.src"#endif/* Macro for setting individual bits in class bitmaps. */#define SETBIT(a,b) a[b/8] |= (1 << (b%8))/* Maximum length value to check against when making sure that the integer thatholds the compiled pattern length does not overflow. We make it a bit less thanINT_MAX to allow for adding in group terminating bytes, so that we don't haveto check them every time. */#define OFLOW_MAX (INT_MAX - 20)/************************************************** Code parameters and static tables **************************************************//* This value specifies the size of stack workspace that is used during thefirst pre-compile phase that determines how much memory is required. The regexis partly compiled into this space, but the compiled parts are discarded assoon as they can be, so that hopefully there will never be an overrun. The codedoes, however, check for an overrun. The largest amount I've seen used is 218,so this number is very generous.The same workspace is used during the second, actual compile phase forremembering forward references to groups so that they can be filled in at theend. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZEis 4 there is plenty of room. */#define COMPILE_WORK_SIZE (4096)/* Table for handling escaped characters in the range '0'-'z'. Positive returnsare simple data values; negative values are for special things like \d and soon. Zero means further processing is needed (for things like \x), or the escapeis invalid. */#ifndef EBCDIC /* This is the "normal" table for ASCII systems */static const short int escapes[] = { 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */ 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */ '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */-ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */-ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */-ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */ '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */-ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */-ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */ 0, 0, -ESC_z /* x - z */};#else /* This is the "abnormal" table for EBCDIC systems */static const short int escapes[] = {/* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',/* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,/* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',/* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,/* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',/* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,/* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',/* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,/* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,/* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,/* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,/* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,/* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,/* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,/* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',/* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,/* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,/* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,/* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,/* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,/* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,/* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,/* F8 */ 0, 0, 0, 0, 0, 0, 0, 0};#endif/* Table of special "verbs" like (*PRUNE). This is a short table, so it issearched linearly. Put all the names into a single string, in order to reducethe number of relocations when a shared library is dynamically linked. */typedef struct verbitem { int len; int op;} verbitem;static const char verbnames[] = "ACCEPT\0" "COMMIT\0" "F\0" "FAIL\0" "PRUNE\0" "SKIP\0" "THEN";static verbitem verbs[] = { { 6, OP_ACCEPT }, { 6, OP_COMMIT }, { 1, OP_FAIL }, { 4, OP_FAIL }, { 5, OP_PRUNE }, { 4, OP_SKIP }, { 4, OP_THEN }};static int verbcount = sizeof(verbs)/sizeof(verbitem);/* Tables of names of POSIX character classes and their lengths. The names arenow all in a single string, to reduce the number of relocations when a sharedlibrary is dynamically loaded. The list of lengths is terminated by a zerolength entry. The first three must be alpha, lower, upper, as this is assumedfor handling case independence. */static const char posix_names[] = "alpha\0" "lower\0" "upper\0" "alnum\0" "ascii\0" "blank\0" "cntrl\0" "digit\0" "graph\0" "print\0" "punct\0" "space\0" "word\0" "xdigit";static const uschar posix_name_lengths[] = { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };/* Table of class bit maps for each POSIX class. Each class is formed from abase map, with an optional addition or removal of another map. Then, for someclasses, there is some additional tweaking: for [:blank:] the vertical spacecharacters are removed, and for [:alpha:] and [:alnum:] the underscorecharacter is removed. The triples in the table consist of the base map offset,second map offset or -1 if no second map, and a non-negative value for mapaddition or a negative value for map subtraction (if there are two maps). Theabsolute value of the third field has these meanings: 0 => no tweaking, 1 =>remove vertical space characters, 2 => remove underscore. */static const int posix_class_maps[] = { cbit_word, cbit_digit, -2, /* alpha */ cbit_lower, -1, 0, /* lower */ cbit_upper, -1, 0, /* upper */ cbit_word, -1, 2, /* alnum - word without underscore */ cbit_print, cbit_cntrl, 0, /* ascii */ cbit_space, -1, 1, /* blank - a GNU extension */ cbit_cntrl, -1, 0, /* cntrl */ cbit_digit, -1, 0, /* digit */ cbit_graph, -1, 0, /* graph */ cbit_print, -1, 0, /* print */ cbit_punct, -1, 0, /* punct */ cbit_space, -1, 0, /* space */ cbit_word, -1, 0, /* word - a Perl extension */ cbit_xdigit,-1, 0 /* xdigit */};#define STRING(a) # a#define XSTRING(s) STRING(s)/* The texts of compile-time error messages. These are "char *" because theyare passed to the outside world. Do not ever re-use any error number, becausethey are documented. Always add a new error instead. Messages marked DEAD beloware no longer used. This used to be a table of strings, but in order to reducethe number of relocations needed when a shared library is loaded dynamically,it is now one long string. We cannot use a table of offsets, because thelengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, wesimply count through to the one we want - this isn't a performance issuebecause these strings are used only when there is a compilation error. */static const char error_texts[] = "no error\0" "\\ at end of pattern\0" "\\c at end of pattern\0" "unrecognized character follows \\\0" "numbers out of order in {} quantifier\0" /* 5 */ "number too big in {} quantifier\0" "missing terminating ] for character class\0" "invalid escape sequence in character class\0" "range out of order in character class\0" "nothing to repeat\0" /* 10 */ "operand of unlimited repeat could match the empty string\0" /** DEAD **/ "internal error: unexpected repeat\0" "unrecognized character after (?\0" "POSIX named classes are supported only within a class\0" "missing )\0" /* 15 */ "reference to non-existent subpattern\0" "erroffset passed as NULL\0" "unknown option bit(s) set\0" "missing ) after comment\0" "parentheses nested too deeply\0" /** DEAD **/ /* 20 */ "regular expression is too large\0" "failed to get memory\0" "unmatched parentheses\0" "internal error: code overflow\0" "unrecognized character after (?<\0" /* 25 */ "lookbehind assertion is not fixed length\0" "malformed number or name after (?(\0" "conditional group contains more than two branches\0" "assertion expected after (?(\0" "(?R or (?[+-]digits must be followed by )\0" /* 30 */ "unknown POSIX class name\0" "POSIX collating elements are not supported\0" "this version of PCRE is not compiled with PCRE_UTF8 support\0" "spare error\0" /** DEAD **/ "character value in \\x{...} sequence is too large\0" /* 35 */ "invalid condition (?(0)\0" "\\C not allowed in lookbehind assertion\0" "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0" "number after (?C is > 255\0" "closing ) for (?C expected\0" /* 40 */ "recursive call could loop indefinitely\0" "unrecognized character after (?P\0" "syntax error in subpattern name (missing terminator)\0" "two named subpatterns have the same name\0" "invalid UTF-8 string\0" /* 45 */ "support for \\P, \\p, and \\X has not been compiled\0" "malformed \\P or \\p sequence\0" "unknown property name after \\P or \\p\0" "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0" "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0" /* 50 */ "repeated subpattern is too long\0" /** DEAD **/ "octal value is greater than \\377 (not in UTF-8 mode)\0" "internal error: overran compiling workspace\0" "internal error: previously-checked referenced subpattern not found\0" "DEFINE group contains more than one branch\0" /* 55 */ "repeating a DEFINE group is not allowed\0" "inconsistent NEWLINE options\0" "\\g is not followed by a braced name or an optionally braced non-zero number\0" "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number\0" "(*VERB) with an argument is not supported\0" /* 60 */ "(*VERB) not recognized\0" "number is too big";/* Table to identify digits and hex digits. This is used when compilingpatterns. Note that the tables in chartables are dependent on the locale, andmay mark arbitrary characters as digits - but the PCRE compiling code expectsto handle only 0-9, a-z, and A-Z as digits when compiling. That is why we havea private table here. It costs 256 bytes, but it is a lot faster than doingcharacter value tests (at least in some simple cases I timed), and in someapplications one wants PCRE to compile efficiently as well as matchefficiently.For convenience, we use the same bit definitions as in chartables: 0x04 decimal digit 0x08 hexadecimal digitThen we can use ctype_digit and ctype_xdigit in the code. */#ifndef EBCDIC /* This is the "normal" case, for ASCII systems */static const unsigned char digitab[] = { 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */ 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */ 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */#else /* This is the "abnormal" case, for EBCDIC systems */static const unsigned char digitab[] = { 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */ 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */ 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */ 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */ 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */ 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */ 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */ 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */ 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */ 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */ 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */ 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */ 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */ 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */ 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */ 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */ 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */ 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */ 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */ 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */ 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */#endif/* Definition to allow mutual recursion */static BOOL compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int, int *, int *, branch_chain *, compile_data *, int *);/************************************************** Find an error text *
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -