📄 pcre_compile.c
字号:
case OP_PLUS:
case OP_MINPLUS:
case OP_POSPLUS:
case OP_QUERY:
case OP_MINQUERY:
case OP_POSQUERY:
if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
break;
}
#endif
}
}
}
/*************************************************
* Scan compiled regex for recursion reference *
*************************************************/
/* This little function scans through a compiled pattern until it finds an
instance of OP_RECURSE.
Arguments:
code points to start of expression
utf8 TRUE in UTF-8 mode
Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
*/
static const uschar *
find_recurse(const uschar *code, BOOL utf8)
{
register int op;

/* Step through the compiled items one at a time until the terminating OP_END
is reached. The first OP_RECURSE encountered ends the search. */

while ((op = *code) != OP_END)
  {
  if (op == OP_RECURSE) return code;

  /* An extended class has a zero entry in the length table; its real length
  is held in the compiled code just after the opcode. */

  if (op == OP_XCLASS)
    {
    code += GET(code, 1);
    continue;
    }

  /* Every other item advances by its table length. That length is only a
  minimum for the opcodes listed below when in UTF-8 mode, because the
  character that ends the item may occupy more than one byte. The last byte
  stepped over is then the lead byte of that character, and the table lookup
  supplies how many further bytes must be skipped. */

  code += _pcre_OP_lengths[op];

#ifdef SUPPORT_UTF8
  if (utf8) switch(op)
    {
    case OP_CHAR:
    case OP_CHARNC:
    case OP_EXACT:
    case OP_UPTO:
    case OP_MINUPTO:
    case OP_POSUPTO:
    case OP_STAR:
    case OP_MINSTAR:
    case OP_POSSTAR:
    case OP_PLUS:
    case OP_MINPLUS:
    case OP_POSPLUS:
    case OP_QUERY:
    case OP_MINQUERY:
    case OP_POSQUERY:
    if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
    break;
    }
#endif
  }

return NULL;    /* Reached OP_END without seeing OP_RECURSE */
}
/*************************************************
* Scan compiled branch for non-emptiness *
*************************************************/
/* This function scans through a branch of a compiled pattern to see whether it
can match the empty string or not. It is called from could_be_empty()
below and from compile_branch() when checking for an unlimited repeat of a
group that can match nothing. Note that first_significant_code() skips over
assertions. If we hit an unclosed bracket, we return "empty" - this means we've
struck an inner bracket whose current branch will already have been scanned.
Arguments:
code points to start of search
endcode points to where to stop
utf8 TRUE if in UTF8 mode
Returns: TRUE if what is matched could be empty
*/
static BOOL
could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
{
register int c;

/* Each iteration looks at one item of the branch. The "for" increment steps
over the item just examined, using the table length of the opcode held in c,
and then lets first_significant_code() move on to the next item to test. */

for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
     code < endcode;
     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
  {
  const uschar *ccode;

  c = *code;

  if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)
    {
    BOOL empty_branch;
    if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */

    /* Scan a closed bracket. The group can match the empty string if any one
    of its alternatives can, so once one such alternative has been found the
    remaining ones are stepped over without being examined. */

    empty_branch = FALSE;
    do
      {
      if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
        empty_branch = TRUE;
      code += GET(code, 1);
      }
    while (*code == OP_ALT);

    if (!empty_branch) return FALSE;   /* All branches are non-empty */

    /* Move past the KET and fudge things so that the increment in the "for"
    above has no effect: the length that the increment is about to add for c
    is subtracted here in advance. */

    c = OP_END;
    code += 1 + LINK_SIZE - _pcre_OP_lengths[c];
    continue;
    }

  /* Handle the other opcodes */

  switch (c)
    {
    /* Check for quantifiers after a class. ccode is set to the item that
    follows the class, which is where a repeat opcode would be. */

#ifdef SUPPORT_UTF8
    case OP_XCLASS:
    ccode = code + GET(code, 1);
    goto CHECK_CLASS_REPEAT;
#endif

    case OP_CLASS:
    case OP_NCLASS:
    ccode = code + 33;

#ifdef SUPPORT_UTF8
    CHECK_CLASS_REPEAT:
#endif

    switch (*ccode)
      {
      case OP_CRSTAR:            /* These could be empty; continue */
      case OP_CRMINSTAR:
      case OP_CRQUERY:
      case OP_CRMINQUERY:
      break;

      default:                   /* Non-repeat => class must match */
      case OP_CRPLUS:            /* These repeats aren't empty */
      case OP_CRMINPLUS:
      return FALSE;

      case OP_CRRANGE:
      case OP_CRMINRANGE:
      if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
      break;
      }
    break;

    /* Opcodes that must match a character */

    case OP_PROP:
    case OP_NOTPROP:
    case OP_EXTUNI:
    case OP_NOT_DIGIT:
    case OP_DIGIT:
    case OP_NOT_WHITESPACE:
    case OP_WHITESPACE:
    case OP_NOT_WORDCHAR:
    case OP_WORDCHAR:
    case OP_ANY:
    case OP_ANYBYTE:
    case OP_CHAR:
    case OP_CHARNC:
    case OP_NOT:
    case OP_PLUS:
    case OP_MINPLUS:
    case OP_POSPLUS:
    case OP_EXACT:
    case OP_NOTPLUS:
    case OP_NOTMINPLUS:
    case OP_NOTPOSPLUS:
    case OP_NOTEXACT:
    case OP_TYPEPLUS:
    case OP_TYPEMINPLUS:
    case OP_TYPEPOSPLUS:
    case OP_TYPEEXACT:
    return FALSE;

    /* End of branch */

    case OP_KET:
    case OP_KETRMAX:
    case OP_KETRMIN:
    case OP_ALT:
    return TRUE;

    /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
    MINUPTO, and POSUPTO may be followed by a multibyte character. The table
    length covers only the first byte of that character, so the lead byte is
    the last byte of the minimum-length item. This matters for the UPTO
    variants, where a 2-byte repeat count sits between the opcode and the
    character: testing a fixed offset of 2 would look at the count instead of
    the character. When the lead byte announces a multibyte character, code
    is advanced here by the number of extra bytes so that the "for" increment
    lands on the next item. */

#ifdef SUPPORT_UTF8
    case OP_STAR:
    case OP_MINSTAR:
    case OP_POSSTAR:
    case OP_QUERY:
    case OP_MINQUERY:
    case OP_POSQUERY:
    case OP_UPTO:
    case OP_MINUPTO:
    case OP_POSUPTO:
    if (utf8)
      {
      const uschar *lead = code + _pcre_OP_lengths[c] - 1;
      if (*lead >= 0xc0) code += _pcre_utf8_table4[*lead & 0x3f];
      }
    break;
#endif

    /* Anything else is simply stepped over by the "for" increment. */

    default:
    break;
    }
  }

return TRUE;
}
/*************************************************
* Scan compiled regex for non-emptiness *
*************************************************/
/* This function is called to check for left recursive calls. We want to check
the current branch of the current pattern to see if it could match the empty
string. If it could, we must look outwards for branches at other levels,
stopping when we pass beyond the bracket which is the subject of the recursion.
Arguments:
code points to start of the recursion
endcode points to where to stop (current RECURSE item)
bcptr points to the chain of current (unclosed) branch starts
utf8 TRUE if in UTF-8 mode
Returns: TRUE if what is matched could be empty
*/
static BOOL
could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
  BOOL utf8)
{
branch_chain *level;

/* Walk outwards along the chain of unclosed branches, but only over those
whose current branch starts at or after the start of the recursion. A single
branch that cannot match the empty string settles the answer. */

for (level = bcptr;
     level != NULL && level->current >= code;
     level = level->outer)
  {
  if (!could_be_empty_branch(level->current, endcode, utf8)) return FALSE;
  }

return TRUE;
}
/*************************************************
* Check for POSIX class syntax *
*************************************************/
/* This function is called when the sequence "[:" or "[." or "[=" is
encountered in a character class. It checks whether this is followed by an
optional ^ and then a sequence of letters, terminated by a matching ":]" or
".]" or "=]".
Argument:
ptr pointer to the initial [
endptr where to return the end pointer
cd pointer to compile data
Returns: TRUE or FALSE
*/
static BOOL
check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
{
const uschar *scan;
int terminator;

/* The character straight after the opening '[' is the one that must appear
again, followed by ']', to close the item. The declarations and assignments
are kept separate because the Solaris cc compiler warns about "non-constant"
initializers. */

scan = ptr + 1;
terminator = *scan;

/* Step to the start of the name, allowing for an optional '^'. */

scan++;
if (*scan == '^') scan++;

/* Pass over the run of letters that makes up the name. */

for (; (cd->ctypes[*scan] & ctype_letter) != 0; scan++) ;

/* Anything other than the terminator and ']' at this point is a failure. */

if (*scan != terminator || scan[1] != ']') return FALSE;

*endptr = scan;
return TRUE;
}
/*************************************************
* Check POSIX class name *
*************************************************/
/* This function is called to check the name given in a POSIX-style class entry
such as [:alnum:].
Arguments:
ptr points to the first letter
len the length of the name
Returns: a value representing the name, or -1 if unknown
*/
static int
check_posix_name(const uschar *ptr, int len)
{
int idx;

/* The table of name lengths ends with a zero entry. A name matches only when
both its length and its characters agree; the index of the matching entry is
the value handed back. */

for (idx = 0; posix_name_lengths[idx] != 0; idx++)
  {
  if (posix_name_lengths[idx] != len) continue;
  if (strncmp((const char *)ptr, posix_names[idx], len) == 0) return idx;
  }

return -1;    /* Unknown name */
}
/*************************************************
* Adjust OP_RECURSE items in repeated group *
*************************************************/
/* OP_RECURSE items contain an offset from the start of the regex to the group
that is referenced. This means that groups can be replicated for fixed
repetition simply by copying (because the recursion is allowed to refer to
earlier groups that are outside the current group). However, when a group is
optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
it, after it has been compiled. This means that any OP_RECURSE items within it
that refer to the group itself or any contained groups have to have their
offsets adjusted. That is one of the jobs of this function. Before it is called,
the partially compiled regex must be temporarily terminated with OP_END.
This function has been extended with the possibility of forward references for
recursions and subroutine calls. It must also check the list of such references
for the group we are dealing with. If it finds that one of the recursions in
the current group is on this list, it adjusts the offset in the list, not the
value in the reference (which is a group number).
Arguments:
group points to the start of the group
adjust the amount by which the group is to be moved
utf8 TRUE in UTF-8 mode
cd contains pointers to tables etc.
save_hwm the hwm forward reference pointer at the start of the group
Returns: nothing
*/
static void
adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
  uschar *save_hwm)
{
uschar *rec;

/* Visit every OP_RECURSE from the start of the group onwards. After each one
has been dealt with, the search resumes just beyond its link field. */

for (rec = (uschar *)find_recurse(group, utf8);
     rec != NULL;
     rec = (uschar *)find_recurse(rec + 1 + LINK_SIZE, utf8))
  {
  BOOL on_forward_list = FALSE;
  uschar *hc;

  /* Look through the forward reference entries lying between save_hwm and
  the current hwm. If one of them records the position of this recursion's
  link field, it is the list entry that is adjusted, not the recursion. */

  for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
    {
    int listed = GET(hc, 0);
    if (cd->start_code + listed == rec + 1)
      {
      PUT(hc, 0, listed + adjust);
      on_forward_list = TRUE;
      break;
      }
    }

  /* Otherwise the link field holds an offset from the start of the regex.
  It is adjusted only when it points at or after the start of this group. */

  if (!on_forward_list)
    {
    int target = GET(rec, 1);
    if (cd->start_code + target >= group) PUT(rec, 1, target + adjust);
    }
  }
}
/*************************************************
* Insert an automatic callout point *
*************************************************/
/* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
callout points before each pattern item.
Arguments:
code current code pointer
ptr current pattern pointer
cd pointers to tables etc
Returns: new code pointer
*/
static uschar *
auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
{
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -