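/* pcre_compile.c (excerpt) */

/*************************************************
*       Find forward referenced subpattern       *
*************************************************/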
/* Helper for find_parens(). On entry ptr points at a backslash in the pattern
text. The function steps over the escape: either the single escaped character,
or, for \Q, everything up to and including the terminating \E. Inside a quoted
run a backslash is only special when it is immediately followed by 'E', so
"\\E" is a literal backslash followed by the terminator.

Argument:
  ptr        points at the backslash

Returns:     pointer to the last character of the escape (the escaped
               character, or the 'E' of \E), or NULL if the pattern text ends
               before the escape is complete
*/

static const uschar *
find_parens_skip_escape(const uschar *ptr)
{
if (*(++ptr) == 0) return NULL;
if (*ptr != 'Q') return ptr;
for (;;)
  {
  while (*(++ptr) != 0 && *ptr != '\\');
  if (*ptr == 0) return NULL;
  /* Peek rather than consume, so that a trailing backslash or a doubled
  backslash cannot carry the scan past the terminating zero or past \E. */
  if (ptr[1] == 'E') return ptr + 1;
  }
}


/* This function scans along a pattern's text looking for capturing
subpatterns, and counting them. If it finds a named pattern that matches the
name it is given, it returns its number. Alternatively, if the name is NULL, it
returns when it reaches a given numbered subpattern. This is used for forward
references to subpatterns. The caller is expected to have verified the pattern
already, but the scan nevertheless never reads beyond the terminating zero: if
the text ends inside an escape, a \Q...\E run, a character class, an /x
comment, or a group name, the function gives up and returns -1.

Arguments:
  ptr          current position in the pattern
  count        current count of capturing parens so far encountered
  name         name to seek, or NULL if seeking a numbered subpattern
  lorn         name length, or subpattern number if name is NULL
  xmode        TRUE if we are in /x mode

Returns:       the number of the named subpattern, or -1 if not found
*/

static int
find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
  BOOL xmode)
{
const uschar *thisname;

for (; *ptr != 0; ptr++)
  {
  int term;

  /* Skip over backslashed characters and also entire \Q...\E */

  if (*ptr == '\\')
    {
    ptr = find_parens_skip_escape(ptr);
    if (ptr == NULL) return -1;
    continue;
    }

  /* Skip over character classes. A ']' that is the first character of the
  class (after an optional '^') is a data character, not the terminator. */

  if (*ptr == '[')
    {
    if (ptr[1] == '^') ptr++;
    if (ptr[1] == ']') ptr++;
    while (*(++ptr) != ']')
      {
      if (*ptr == 0) return -1;          /* Unterminated class */
      if (*ptr == '\\')
        {
        ptr = find_parens_skip_escape(ptr);
        if (ptr == NULL) return -1;
        }
      }
    continue;
    }

  /* Skip comments in /x mode */

  if (xmode && *ptr == '#')
    {
    while (*(++ptr) != 0 && *ptr != '\n');
    if (*ptr == 0) return -1;
    continue;
    }

  /* An opening parens must now be a real metacharacter */

  if (*ptr != '(') continue;

  /* A plain '(' is an unnamed capturing group */

  if (ptr[1] != '?')
    {
    count++;
    if (name == NULL && count == lorn) return count;
    continue;
    }

  ptr += 2;
  if (*ptr == 'P') ptr++;                /* Allow optional P */

  /* Stop here if the text ended right after "(?" or "(?P"; otherwise the
  loop increment below would step over the terminating zero. */

  if (*ptr == 0) return -1;

  /* We have to disambiguate (?<! and (?<= from (?<name> */

  if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
       *ptr != '\'')
    continue;

  /* This is a named capturing group */

  count++;
  if (name == NULL && count == lorn) return count;

  term = *ptr++;
  if (term == '<') term = '>';
  thisname = ptr;
  while (*ptr != term)
    {
    if (*ptr == 0) return -1;            /* Unterminated name */
    ptr++;
    }

  if (name != NULL && lorn == ptr - thisname &&
      strncmp((const char *)name, (const char *)thisname, lorn) == 0)
    return count;
  }

return -1;
}
/*************************************************
* Find first significant op code *
*************************************************/
/* Step forward through compiled code, passing over leading items that do not
themselves decide what the first "real" item of a group is, and return a
pointer to the first opcode that is not passed over.

Items passed over unconditionally: OP_OPT, OP_CALLOUT, OP_CREF, OP_RREF and
OP_DEF. When an OP_OPT is passed and optbit is positive, *options is replaced
by the opcode's data byte if that byte differs from *options in the optbit
position.

Items passed over only when skipassert is TRUE: OP_ASSERT_NOT, OP_ASSERTBACK
and OP_ASSERTBACK_NOT (the whole assertion, including all its alternatives),
and the two word-boundary opcodes. When skipassert is FALSE the scan stops at
any of these.

Arguments:
  code         pointer to the start of the group
  options      pointer to external options
  optbit       the option bit whose changing is significant, or
                 zero if none are
  skipassert   TRUE if certain assertions are to be skipped

Returns:       pointer to the first significant opcode
*/

static const uschar*
first_significant_code(const uschar *code, int *options, int optbit,
  BOOL skipassert)
{
const uschar *scan = code;

for (;;)
  {
  const int op = (int)*scan;

  /* Option-setting item: record a change of the watched bit, then move on */

  if (op == OP_OPT)
    {
    const int newopts = (int)scan[1];
    if (optbit > 0 && (newopts & optbit) != (*options & optbit))
      *options = newopts;
    scan += 2;
    continue;
    }

  /* Negative lookahead and both lookbehinds: follow the chain of
  alternatives to the closing item, then step over that too */

  if (op == OP_ASSERT_NOT || op == OP_ASSERTBACK || op == OP_ASSERTBACK_NOT)
    {
    if (!skipassert) break;
    do scan += GET(scan, 1); while (*scan == OP_ALT);
    scan += _pcre_OP_lengths[*scan];
    continue;
    }

  /* Word-boundary assertions are skippable only on request */

  if (op == OP_WORD_BOUNDARY || op == OP_NOT_WORD_BOUNDARY)
    {
    if (!skipassert) break;
    scan += _pcre_OP_lengths[op];
    continue;
    }

  /* Items that are always stepped over */

  if (op == OP_CALLOUT || op == OP_CREF || op == OP_RREF || op == OP_DEF)
    {
    scan += _pcre_OP_lengths[op];
    continue;
    }

  /* Anything else is significant */

  break;
  }

return scan;
}
/*************************************************
* Find the fixed length of a pattern *
*************************************************/
/* Scan a pattern and compute the fixed length of subject that will match it,
if the length is fixed. This is needed for dealing with backward assertions.
In UTF8 mode, the result is in characters rather than bytes.

The function walks the opcodes of each alternative in turn, adding up the
number of characters each item matches. Nested groups are handled by
recursion. Every alternative must come to the same total; as soon as two
differ, or an item whose length is not fixed is met, the scan is abandoned.

Arguments:
  code     points to the start of the pattern (the bracket)
  options  the compiling options

Returns:   the fixed length, or -1 if there is no fixed length,
             or -2 if \C was encountered
*/

static int
find_fixedlength(uschar *code, int options)
{
int length = -1;

register int branchlength = 0;
register uschar *cc = code + 1 + LINK_SIZE;

/* Scan along the opcodes for this branch. If we get to the end of the
branch, check the length against that of the other branches. */

for (;;)
  {
  int d;
  register int op = *cc;

  switch (op)
    {
    /* Nested groups: recurse for the group's own length, then move past all
    its alternatives and the closing ket. A capturing bracket has a two-byte
    number after the link, which the recursive call must step over. */

    case OP_CBRA:
    case OP_BRA:
    case OP_ONCE:
    case OP_COND:
    d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
    if (d < 0) return d;
    branchlength += d;
    do cc += GET(cc, 1); while (*cc == OP_ALT);
    cc += 1 + LINK_SIZE;
    break;

    /* Reached end of a branch; if it's a ket it is the end of a nested
    call. If it's ALT it is an alternation in a nested call. If it is
    END it's the end of the outer call. All can be handled by the same code. */

    case OP_ALT:
    case OP_KET:
    case OP_KETRMAX:
    case OP_KETRMIN:
    case OP_END:
    if (length < 0) length = branchlength;
      else if (length != branchlength) return -1;
    if (*cc != OP_ALT) return length;
    cc += 1 + LINK_SIZE;
    branchlength = 0;
    break;

    /* Skip over assertive subpatterns */

    case OP_ASSERT:
    case OP_ASSERT_NOT:
    case OP_ASSERTBACK:
    case OP_ASSERTBACK_NOT:
    do cc += GET(cc, 1); while (*cc == OP_ALT);
    /* Fall through */

    /* Skip over things that don't match chars */

    case OP_REVERSE:
    case OP_CREF:
    case OP_RREF:
    case OP_DEF:
    case OP_OPT:
    case OP_CALLOUT:
    case OP_SOD:
    case OP_SOM:
    case OP_EOD:
    case OP_EODN:
    case OP_CIRC:
    case OP_DOLL:
    case OP_NOT_WORD_BOUNDARY:
    case OP_WORD_BOUNDARY:
    cc += _pcre_OP_lengths[*cc];
    break;

    /* Handle literal characters. In UTF8 mode the character may occupy more
    than one byte; the extra bytes are continuation bytes (10xxxxxx). */

    case OP_CHAR:
    case OP_CHARNC:
    case OP_NOT:
    branchlength++;
    cc += 2;
#ifdef SUPPORT_UTF8
    if ((options & PCRE_UTF8) != 0)
      {
      while ((*cc & 0xc0) == 0x80) cc++;
      }
#endif
    break;

    /* Handle exact repetitions. The count is already in characters, but we
    need to skip over a multibyte character in UTF8 mode. Only continuation
    bytes are skipped, using the same test as for OP_CHAR above, so that the
    opcode following the character can never be swallowed. */

    case OP_EXACT:
    branchlength += GET2(cc,1);
    cc += 4;
#ifdef SUPPORT_UTF8
    if ((options & PCRE_UTF8) != 0)
      {
      while ((*cc & 0xc0) == 0x80) cc++;
      }
#endif
    break;

    /* Exact repetition of a character type. The item is the opcode, a
    two-byte count and the type opcode. If the type is a Unicode property it
    is followed by the same two data bytes that follow a stand-alone
    OP_PROP/OP_NOTPROP (see below), and these must be skipped as well. */

    case OP_TYPEEXACT:
    branchlength += GET2(cc,1);
    if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
    cc += 4;
    break;

    /* Handle single-char matchers. A property item has two data bytes. */

    case OP_PROP:
    case OP_NOTPROP:
    cc += 2;
    /* Fall through */

    case OP_NOT_DIGIT:
    case OP_DIGIT:
    case OP_NOT_WHITESPACE:
    case OP_WHITESPACE:
    case OP_NOT_WORDCHAR:
    case OP_WORDCHAR:
    case OP_ANY:
    branchlength++;
    cc++;
    break;

    /* The single-byte matcher isn't allowed */

    case OP_ANYBYTE:
    return -2;

    /* Check a class for variable quantification. An XCLASS stores its own
    length; the adjustment by 33 lets it share the fixed-size skip used for
    the bitmap classes. */

#ifdef SUPPORT_UTF8
    case OP_XCLASS:
    cc += GET(cc, 1) - 33;
    /* Fall through */
#endif

    case OP_CLASS:
    case OP_NCLASS:
    cc += 33;

    switch (*cc)
      {
      case OP_CRSTAR:
      case OP_CRMINSTAR:
      case OP_CRQUERY:
      case OP_CRMINQUERY:
      return -1;

      /* A range is fixed-length only when minimum and maximum agree */

      case OP_CRRANGE:
      case OP_CRMINRANGE:
      if (GET2(cc,1) != GET2(cc,3)) return -1;
      branchlength += GET2(cc,1);
      cc += 5;
      break;

      /* No recognised repeat: the class matches one character. The opcode
      at cc is not consumed; if it is some other repeat item, the outer
      switch rejects it via its default case on the next iteration. */

      default:
      branchlength++;
      }
    break;

    /* Anything else is variable length */

    default:
    return -1;
    }
  }
/* Control never gets here */
}
/*************************************************
* Scan compiled regex for numbered bracket *
*************************************************/
/* This little function scans through a compiled pattern until it finds a
capturing bracket with the given number.
Arguments:
code points to start of expression
utf8 TRUE in UTF-8 mode
number the required bracket number
Returns: pointer to the opcode for the bracket, or NULL if not found
*/
static const uschar *
find_bracket(const uschar *code, BOOL utf8, int number)
{
for (;;)
{
register int c = *code;
if (c == OP_END) return NULL;
/* XCLASS is used for classes that cannot be represented just by a bit
map. This includes negated single high-valued characters. The length in
the table is zero; the actual length is stored in the compiled code. */
if (c == OP_XCLASS) code += GET(code, 1);
/* Handle capturing bracket */
else if (c == OP_CBRA)
{
int n = GET2(code, 1+LINK_SIZE);
if (n == number) return (uschar *)code;
code += _pcre_OP_lengths[c];
}
/* In UTF-8 mode, opcodes that are followed by a character may be followed by
a multi-byte character. The length in the table is a minimum, so we have to
arrange to skip the extra bytes. */
else
{
code += _pcre_OP_lengths[c];
#ifdef SUPPORT_UTF8
if (utf8) switch(c)
{
case OP_CHAR:
case OP_CHARNC:
case OP_EXACT:
case OP_UPTO:
case OP_MINUPTO:
case OP_POSUPTO:
case OP_STAR:
case OP_MINSTAR:
case OP_POSSTAR:
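/* NOTE: this excerpt is truncated at this point; the remainder of
find_bracket() is not included. The text that followed here in the extracted
page was code-viewer keyboard-shortcut help (copy, search, full screen, theme,
font size) and was not part of the source file. */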