📄 pypcre.c
字号:
}
/* If the maximum is unlimited, set a repeater in the final copy. */
if (repeat_max == -1) code[-3] = OP_KETRMAX + repeat_type;
}
/* Else there's some kind of shambles */
else
{
*errorptr = ERR11;
goto FAILED;
}
/* In all cases we no longer have a previous item. */
previous = NULL;
break;
/* Start of nested bracket sub-expression, or comment or lookahead.
First deal with special things that can come after a bracket; all are
introduced by ?, and the appearance of any of them means that this is not a
referencing group. They were checked for validity in the first pass over
the string, so we don't have to check for syntax errors here. */
case '(':
previous = code; /* Only real brackets can be repeated */
if (*(++ptr) == '?')
{
bravalue = OP_BRA;
switch (*(++ptr))
{
case '#':
case 'i':
case 'L':
case 'm':
case 's':
case 'x':
ptr++;
while (*ptr != ')') ptr++;
previous = NULL;
continue;
case ':': /* Non-extracting bracket */
ptr++;
break;
case '=': /* Assertions can't be repeated */
bravalue = OP_ASSERT;
ptr++;
previous = NULL;
break;
case '!':
bravalue = OP_ASSERT_NOT;
ptr++;
previous = NULL;
break;
case ('P'):
ptr++;
if (*ptr=='<')
{
/* (?P<groupname>...) */
int idlen;
PyObject *string, *intobj;
ptr++;
idlen = get_group_id(ptr, '>', errorptr);
if (*errorptr) {
goto FAILED;
}
string = PyString_FromStringAndSize((char*)ptr, idlen);
intobj = PyInt_FromLong( brackets[0] + 1 );
if (intobj == NULL || string == NULL)
{
Py_XDECREF(string);
Py_XDECREF(intobj);
*errorptr = "exception raised";
goto FAILED;
}
PyDict_SetItem(dictionary, string, intobj);
Py_DECREF(string); Py_DECREF(intobj); /* XXX DECREF commented out! */
ptr += idlen+1; /* Point to rest of expression */
goto do_grouping_bracket;
}
if (*ptr=='=')
{
/* (?P=groupname) */
int idlen, refnum;
PyObject *string, *intobj;
ptr++;
idlen = get_group_id(ptr, ')', errorptr);
if (*errorptr) {
goto FAILED;
}
string = PyString_FromStringAndSize((char *)ptr, idlen);
if (string==NULL) {
*errorptr = "exception raised";
goto FAILED;
}
intobj = PyDict_GetItem(dictionary, string);
if (intobj==NULL) {
Py_DECREF(string);
*errorptr = "?P= group identifier isn't defined";
goto FAILED;
}
refnum = PyInt_AsLong(intobj);
Py_DECREF(string);
/* The caller doesn't own the reference to the value
returned from PyDict_GetItem, so intobj is not
DECREF'ed. */
*code++ = OP_REF;
*code++ = refnum;
/* The continue will cause the top-level for() loop to
be resumed, so ptr will be immediately incremented.
Therefore, the following line adds just idlen, not
idlen+1 */
ptr += idlen;
continue;
}
/* The character after ?P is neither < nor =, so
report an error. Add more Python-extensions here. */
*errorptr="unknown after (?P";
goto FAILED;
case '>': /* "Match once" brackets */
if ((options & PCRE_EXTRA) != 0) /* Not yet standard */
{
bravalue = OP_ONCE;
ptr++;
previous = NULL;
break;
}
/* Else fall through */
default:
*errorptr = ERR12;
goto FAILED;
}
}
/* Else we have a referencing group */
else
{
do_grouping_bracket:
if (++(*brackets) > EXTRACT_MAX)
{
*errorptr = ERR13;
goto FAILED;
}
bravalue = OP_BRA + *brackets;
}
/* Process nested bracketed re; at end pointer is on the bracket. We copy
code into a non-register variable in order to be able to pass its address
because some compilers complain otherwise. */
*code = bravalue;
{
uschar *mcode = code;
if (!compile_regex(options, brackets, &mcode, &ptr, errorptr, dictionary))
goto FAILED;
code = mcode;
}
if (*ptr != ')')
{
*errorptr = ERR14;
goto FAILED;
}
break;
/* Check \ for being a real metacharacter; if not, fall through and handle
it as a data character at the start of a string. Escape items are checked
for validity in the pre-compiling pass. */
case '\\':
oldptr = ptr;
c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
/* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
are arranged to be the negation of the corresponding OP_values. For the
back references, the values are ESC_REF plus the reference number. Only
back references and those types that consume a character may be repeated.
We can test for values between ESC_b and ESC_Z for the latter; this may
have to change if any new ones are ever created. */
if (c < 0)
{
if (-c >= ESC_REF)
{
int refnum = -c - ESC_REF;
if (*brackets < refnum)
{
*errorptr = ERR15;
goto FAILED;
}
previous = code;
*code++ = OP_REF;
*code++ = refnum;
}
else
{
previous = (-c > ESC_b && -c < ESC_X)? code : NULL;
if ( (options & PCRE_LOCALE) != 0)
{
switch (c)
{
case (-ESC_b): c = -OP_WORD_BOUNDARY_L; break;
case (-ESC_B): c = -OP_NOT_WORD_BOUNDARY_L; break;
case (-ESC_w): c = -OP_WORDCHAR_L; break;
case (-ESC_W): c = -OP_NOT_WORDCHAR_L; break;
}
}
*code++ = -c;
}
continue;
}
/* Data character: Reset and fall through */
ptr = oldptr;
c = '\\';
/* Handle a run of data characters until a metacharacter is encountered.
The first character is guaranteed not to be whitespace or # when the
extended flag is set. */
NORMAL_CHAR:
default:
previous = code;
*code = OP_CHARS;
code += 2;
length = 0;
do
{
if ((options & PCRE_EXTENDED) != 0)
{
if ((pcre_ctypes[c] & ctype_space) != 0) continue;
if (c == '#')
{
while ((c = *(++ptr)) != 0 && c != '\n');
if (c == 0) break;
continue;
}
}
/* Backslash may introduce a data char or a metacharacter. Escaped items
are checked for validity in the pre-compiling pass. Stop the string
before a metaitem. */
if (c == '\\')
{
oldptr = ptr;
c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
if (c < 0) { ptr = oldptr; break; }
}
/* Ordinary character or single-char escape */
*code++ = c;
length++;
}
/* This "while" is the end of the "do" above. */
while (length < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
/* Compute the length and set it in the data vector, and advance to
the next state. */
previous[1] = length;
if (length < 255) ptr--;
break;
}
} /* end of big loop */
/* Control never reaches here by falling through, only by a goto for all the
error states. Pass back the position in the pattern so that it can be displayed
to the user for diagnosing the error. */
FAILED:
*ptrptr = ptr;
return FALSE;
}
/*************************************************
*        Compile sequence of alternatives        *
*************************************************/

/* Compile one bracketed group, i.e. one or more branches separated by '|'.

On entry, *ptrptr points past the bracket character and *codeptr points at
the byte into which the BRA operator has been stored. Every branch is
compiled by compile_branch() behind a 3-byte header: the opcode byte followed
by a two-byte length (high byte first) measured from the start of that
header to the end of the branch. Second and later branches have OP_ALT
written into their opcode byte. After the last branch an OP_KET is emitted,
followed by the two-byte distance from the start of the group to the OP_KET.

Arguments:
  options     the option bits
  brackets    -> int containing the number of extracting brackets used
  codeptr     -> the address of the current code pointer
  ptrptr      -> the address of the current pattern pointer
  errorptr    -> pointer to error message
  dictionary  handed on unchanged to compile_branch()

Returns:      TRUE on success, with *codeptr advanced past the OP_KET item
              and *ptrptr left on the character that ended the final branch
              (anything other than '|'); FALSE if compile_branch() fails, in
              which case only *ptrptr is updated
*/

static BOOL
compile_regex(int options, int *brackets, uschar **codeptr,
  const uschar **ptrptr, const char **errorptr, PyObject *dictionary)
{
  const uschar *pattern = *ptrptr;
  uschar *out = *codeptr;
  uschar *group_start = out;
  uschar *branch_start;
  int span;

  for (;;)
  {
    /* Reserve the 3-byte branch header; its opcode byte is already set. */
    branch_start = out;
    out += 3;

    if (!compile_branch(options, brackets, &out, &pattern, errorptr,
          dictionary))
    {
      *ptrptr = pattern;
      return FALSE;
    }

    /* Back-fill the length of the branch just compiled. */
    span = out - branch_start;
    branch_start[1] = span >> 8;
    branch_start[2] = span & 255;

    /* Anything other than '|' ends the group. */
    if (*pattern != '|') break;

    /* Another branch follows: mark its header as an alternative and step
    over the vertical bar. */
    *out = OP_ALT;
    pattern++;
  }

  /* Terminating ket plus the length of the whole bracketed item. */
  span = out - group_start;
  out[0] = OP_KET;
  out[1] = span >> 8;
  out[2] = span & 255;

  *codeptr = out + 3;
  *ptrptr = pattern;
  return TRUE;
}
/*************************************************
*         Check for anchored expression          *
*************************************************/

/* Decide whether a compiled expression is anchored by examining the first
opcode of every alternative branch (the byte just after each branch's 3-byte
header). A branch counts as anchored when that opcode is

  . a bracket (any value >= OP_BRA), OP_ASSERT or OP_ONCE whose own
    alternatives are all anchored (checked recursively);
  . OP_TYPESTAR or OP_TYPEMINSTAR applied to OP_ANY, i.e. a leading .*,
    because that already tries the rest of the pattern at every possible
    matching point;
  . OP_SOD; or
  . OP_CIRC, but only when the pattern is not multiline, since in multiline
    mode OP_CIRC can match in the middle of the subject.

The next alternative is located by adding the two-byte length stored at
code[1] and code[2]; the scan continues while an OP_ALT is found there.

Arguments:
  code        points to start of expression (the bracket)
  multiline   TRUE if this is a multiline pattern

Returns:      TRUE if every alternative is anchored, otherwise FALSE
*/

static BOOL
is_anchored(register const uschar *code, BOOL multiline)
{
  for (;;)
  {
    const int first = (int)code[3];
    BOOL anchored;

    if (first >= OP_BRA || first == OP_ASSERT || first == OP_ONCE)
      anchored = is_anchored(code + 3, multiline);
    else if (first == OP_TYPESTAR || first == OP_TYPEMINSTAR)
      anchored = (code[4] == OP_ANY);
    else
      anchored = (first == OP_SOD || (!multiline && first == OP_CIRC));

    if (!anchored) return FALSE;

    /* Step to whatever follows this branch; stop unless it is another
    alternative. */
    code += (code[1] << 8) + code[2];
    if (*code != OP_ALT) return TRUE;
  }
}
/*************************************************
*     Check for start-of-line expression         *
*************************************************/

/* This is called for multiline expressions to try to find out if every
branch starts with ^ so that "first char" processing can be done to speed
things up. For each alternative the opcode just after the 3-byte branch
header is inspected: a bracket (any value >= OP_BRA) or OP_ASSERT is checked
recursively, and anything else must be OP_CIRC. The next alternative is
found by adding the two-byte length held in code[1] and code[2].

Argument:   points to start of expression (the bracket)
Returns:    TRUE if every alternative starts with OP_CIRC, otherwise FALSE
*/

static BOOL
is_startline(const uschar *code)
{
  for (;;)
  {
    const uschar *first = code + 3;

    if ((int)*first >= OP_BRA || *first == OP_ASSERT)
    {
      if (!is_startline(first)) return FALSE;
    }
    else if (*first != OP_CIRC)
    {
      return FALSE;
    }

    /* Move on to the item after this branch; only OP_ALT keeps us going. */
    code += (code[1] << 8) + code[2];
    if (*code != OP_ALT) break;
  }

  return TRUE;
}
/*************************************************
* Check for fixed first char *
*************************************************/
/* Try to find out if there is a fixed first character. This is called for
unanchored expressions, as it speeds up their processing quite considerably.
Consider each alternative branch. If they all start with the same char, or with
a bracket all of whose alternatives start with the same char (recurse ad lib),
then
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -