📄 pypcre.c
字号:
case OP_CLASS:
case OP_NEGCLASS:
case OP_REF:
case OP_CLASS_L:
switch(*cc)
{
case (OP_REF): cc += 2; break;
case (OP_CLASS): case (OP_NEGCLASS): cc += 1+32; break;
case (OP_CLASS_L): cc += 1+1+32; break;
}
switch (*cc)
{
case OP_CRSTAR:
case OP_CRMINSTAR:
case OP_CRQUERY:
case OP_CRMINQUERY:
cc++;
break;
case OP_CRRANGE:
case OP_CRMINRANGE:
if ((cc[1] << 8) + cc[2] != 0) goto NEXT_BRANCH;
cc += 3;
break;
default:
goto NEXT_BRANCH;
}
break;
/* Anything else matches at least one character */
default:
goto NEXT_BRANCH;
}
}
NEXT_BRANCH:
code += (code[1] << 8) + code[2];
}
while (*code == OP_ALT);
/* No branches match the empty string */
return FALSE;
}
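/* Determine the length of a group ID in an expression like
(?P<foo_123>...)
Arguments:
ptr pattern position pointer (say that 3 times fast); it points at
the first character of the ID
finalchar the character that will mark the end of the ID
errorptr points to the pointer to the error message
Returns: the number of characters in the ID (finalchar not included);
0 on error, with *errorptr set
*/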
static int
get_group_id(const uschar *ptr, char finalchar, const char **errorptr)
{
  const uschar *cursor = ptr;
  int first_flags = pcre_ctypes[*cursor];

  /* The ID has to open with a word character that is not a digit. */
  if ((first_flags & ctype_word) == 0 || (first_flags & ctype_digit) != 0)
    {
    *errorptr = "(?P identifier must start with a letter or underscore";
    return 0;
    }

  /* Walk over the remaining word characters; the walk ends at a null byte,
  at finalchar, or at the first character that is not in \w. */
  cursor++;
  while (*cursor != 0 && *cursor != finalchar &&
         (pcre_ctypes[*cursor] & ctype_word) != 0)
    cursor++;

  /* Stopping on finalchar is the only successful outcome: the distance
  walked is the length of the ID. */
  if (*cursor == finalchar)
    return cursor - ptr;

  /* Otherwise say whether the pattern ran out or a bad character was hit. */
  *errorptr = (*cursor == 0)
    ? "unterminated (?P identifier"
    : "illegal character in (?P identifier";
  return 0;
}
/*************************************************
* Handle escapes *
*************************************************/
/* This function is called when a \ has been encountered. It either returns a
positive value for a simple escape such as \n, or a negative value which
encodes one of the more complicated things such as \d. On entry, ptr is
pointing at the \. On exit, it is on the final character of the escape
sequence.
Arguments:
ptrptr points to the pattern position pointer
errorptr points to the pointer to the error message
bracount number of previous extracting brackets
options the options bits
isclass TRUE if inside a character class
Returns: zero or positive => a data character
negative => a special escape sequence
on error, errorptr is set
*/
static int
check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
int options, BOOL isclass)
{
/* Decode the escape that starts at the \ pointed to by *ptrptr. The result is
returned as the function value; *ptrptr is always written back at the end,
left on the last character consumed. Errors are reported only through
*errorptr. Note that bracount is not referenced anywhere in this body. */
const uschar *ptr = *ptrptr;
int c = *(++ptr) & 255; /* Step past the \ and fetch the next character; the mask ensures > 0 on signed-char systems */
int i;
/* A \ followed by the terminating null byte is an error. */
if (c == 0) *errorptr = ERR1;
/* Digits or letters may have special meaning; all others are literals. The
range test below also lets through the few punctuation characters that lie
between '0' and 'z'; they go on to the table lookup. */
else if (c < '0' || c > 'z') {}
/* Do an initial lookup in a table. A non-zero result is something that can be
returned immediately. Otherwise further processing may be required. Note that
when the lookup yields zero, i is left holding 0; the '0' case below relies on
that as the starting value of its digit counter. */
else if ((i = escapes[c - '0']) != 0) c = i;
/* Escapes that need further processing, or are illegal. */
else
{
switch (c)
{
/* Escapes that start with a non-zero digit. This code does not follow the
Perl rule; the PYTHON rule implemented below is:
First, up to three characters starting at the digit are read as octal digits.
If exactly three octal digits were found, or if we are inside a character
class and at least one was found, the escape is a data character whose value
is the octal number (only the least significant 8 bits are kept).
Otherwise the escape is a back reference: the first digit, plus one more
decimal digit if present, is read as a decimal number and returned as
-(ESC_REF + number). */
case '1': case '2': case '3': case '4': case '5':
case '6': case '7': case '8': case '9':
{
/* PYTHON: Try to compute an octal value for a character. ptr[0] is the
first digit after the \; i counts how many octal digits were accepted. */
for(c=0, i=0; ptr[i]!=0 && i<3; i++)
{
if (( pcre_ctypes[ ptr[i] ] & ctype_odigit) != 0)
c = (c * 8 + ptr[i]-'0') & 255;
else
break; /* Non-octal character--break out of the loop */
}
/* It's a character if there were exactly 3 octal digits, or if
we're inside a character class and there was at least one
octal digit. */
if ( (i == 3) || (isclass && i!=0) )
{
ptr += i-1; /* Leave ptr on the last octal digit that was used */
break;
}
/* Not an octal character, so treat it as a back reference. Any partial octal
value in c is discarded and the number is re-read in decimal. */
c = ptr[0]; /* Restore the first character after the \ */
c -= '0'; i = 1;
/* Accept at most one further decimal digit (i < 2), so a back reference
number has one or two digits. */
while (i<2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0)
{
c = c * 10 + ptr[1] - '0';
ptr++; i++;
}
/* The error is only flagged via errorptr; the encoded value is still
computed and returned below. */
if (c > 255 - ESC_REF) *errorptr = "back reference too big";
c = -(ESC_REF + c);
}
break;
/* \0 always starts an octal number. The case above ends with a break, so
control never falls through to here; this case is reached only when the
character after the \ is '0'. At this point i is 0 (set by the failed table
lookup above), so at most two further octal digits are consumed. */
case '0':
c -= '0';
while(i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0 &&
ptr[1] != '8' && ptr[1] != '9')
c = (c * 8 + *(++ptr) - '0') & 255;
break;
/* Special escapes not starting with a digit are straightforward */
/* \x is followed by any number of hex digits (possibly none, giving 0); only
the least significant 8 bits of the value are kept. In ASCII, 'W' is 'a' - 10,
so subtracting it turns 'a'..'f' into 10..15; pcre_lcc is presumably the
lower-casing table, which would make 'A'..'F' work the same way. */
case 'x':
c = 0;
while ( (pcre_ctypes[ptr[1]] & ctype_xdigit) != 0)
{
ptr++;
c = c * 16 + pcre_lcc[*ptr] -
(((pcre_ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
c &= 255;
}
break;
/* PCRE_EXTRA enables extensions to Perl in the matter of escapes. With
PCRE_EXTRA set, \X yields -ESC_X and any other character reaching this point
is an error (ERR3). Without PCRE_EXTRA, c is left unchanged, so for Perl
compatibility the character is returned as a literal. */
default:
if ((options & PCRE_EXTRA) != 0) switch(c)
{
case 'X':
c = -ESC_X; /* This could be a lookup if it ever got into Perl */
break;
default:
*errorptr = ERR3;
break;
}
break;
}
}
/* Pass back the updated pattern position on every path, errors included. */
*ptrptr = ptr;
return c;
}
/*************************************************
* Check for counted repeat *
*************************************************/
/* This function is called when a '{' is encountered in a place where it might
start a quantifier. It looks ahead to see if it really is a quantifier or not.
It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
where the ddds are digits.
Arguments:
p pointer to the first char after '{'
Returns: TRUE or FALSE
*/
static BOOL
is_counted_repeat(const uschar *p)
{
  const uschar *scan = p;

  /* The minimum is mandatory: at least one digit must follow the '{'. */
  if ((pcre_ctypes[*scan] & ctype_digit) == 0) return FALSE;
  do scan++; while ((pcre_ctypes[*scan] & ctype_digit) != 0);

  /* Form {ddd} */
  if (*scan == '}') return TRUE;

  /* Anything but a comma after the first number rules out a quantifier. */
  if (*scan != ',') return FALSE;
  scan++;

  /* Form {ddd,} */
  if (*scan == '}') return TRUE;

  /* Form {ddd,ddd}: a second run of digits, then the closing brace. */
  if ((pcre_ctypes[*scan] & ctype_digit) == 0) return FALSE;
  do scan++; while ((pcre_ctypes[*scan] & ctype_digit) != 0);

  return (*scan == '}');
}
/*************************************************
* Read repeat counts *
*************************************************/
/* Read an item of the form {n,m} and return the values. This is called only
after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
so the syntax is guaranteed to be correct, but we need to check the values.
Arguments:
p pointer to first char after '{'
minp pointer to int for min
maxp pointer to int for max
returned as -1 if no max
errorptr points to pointer to error message
Returns: pointer to '}' on success;
current ptr on error, with errorptr set
*/
/* Read the run of decimal digits at *pp as a non-negative int and advance *pp
past it. The value saturates at the largest int instead of overflowing, so a
count written with an arbitrarily long digit string is still seen as "too big"
by the caller rather than wrapping round to some small number.
Arguments:
  pp        points to the pattern pointer; updated to the first non-digit
Returns:    the value read, or the largest int if it does not fit
*/
static int
scan_repeat_value(const uschar **pp)
{
  const uschar *p = *pp;
  /* Largest int, computed locally so that no extra header is needed; this
  assumes int and unsigned int have the same width, as on usual platforms. */
  const int ceiling = (int)(~0u >> 1);
  int value = 0;

  while ((pcre_ctypes[*p] & ctype_digit) != 0)
    {
    int digit = *p++ - '0';
    /* value * 10 + digit fits exactly when value <= (ceiling - digit) / 10 */
    if (value > (ceiling - digit) / 10) value = ceiling;
    else value = value * 10 + digit;
    }

  *pp = p;
  return value;
}

/* Read the counts of a {n}, {n,} or {n,m} quantifier. The syntax has already
been accepted by is_counted_repeat(), so only the values need checking.
Counts too large for an int are treated as too big (ERR5) instead of
overflowing; every count that fits in an int is handled as before.
Arguments:
  p         pointer to first char after '{'
  minp      pointer to int for min (written only on success)
  maxp      pointer to int for max (written only on success);
              set to -1 if no max was given
  errorptr  points to pointer to error message
Returns:    pointer to '}' on success;
            current ptr on error, with *errorptr set
*/
static const uschar *
read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)
{
  int min = scan_repeat_value(&p);
  int max = -1;               /* stays -1 for the open-ended form {n,} */

  if (*p == '}')
    max = min;                /* {n} means exactly n */
  else if (*(++p) != '}')     /* step over the ',' and look for a maximum */
    {
    max = scan_repeat_value(&p);
    if (max < min)
      {
      *errorptr = ERR4;
      return p;
      }
    }

  /* Do paranoid checks, then fill in the required variables, and pass back
  the pointer to the terminating '}'. */
  if (min > 65535 || max > 65535)
    *errorptr = ERR5;
  else
    {
    *minp = min;
    *maxp = max;
    }
  return p;
}
/*************************************************
* Compile one branch *
*************************************************/
/* Scan the pattern, compiling it into the code vector.
Arguments:
options the option bits
brackets points to number of brackets used
codeptr points to the pointer to the current code point
ptrptr points to the current pattern pointer
errorptr points to pointer to error message
dictionary a PyObject pointer supplied by the caller
Returns: TRUE on success
FALSE, with *errorptr set on error
*/
static BOOL
compile_branch(int options, int *brackets, uschar **codeptr,
const uschar **ptrptr, const char **errorptr, PyObject *dictionary)
{
int repeat_type, op_type;
int repeat_min, repeat_max;
int bravalue, length;
int greedy_default, greedy_non_default;
register int c;
register uschar *code = *codeptr;
const uschar *ptr = *ptrptr;
const uschar *oldptr;
uschar *previous = NULL;
uschar class[32];
uschar *class_flag; /* Pointer to the single-byte flag for OP_CLASS_L */
/* Set up the default and non-default settings for greediness */
greedy_default = ((options & PCRE_UNGREEDY) != 0);
greedy_non_default = greedy_default ^ 1;
/* Switch on next character until the end of the branch */
for (;; ptr++)
{
BOOL negate_class;
int class_charcount;
int class_lastchar;
c = *ptr;
if ((options & PCRE_EXTENDED) != 0)
{
if ((pcre_ctypes[c] & ctype_space) != 0) continue;
if (c == '#')
{
while ((c = *(++ptr)) != 0 && c != '\n');
continue;
}
}
switch(c)
{
/* The branch terminates at end of string, |, or ). */
case 0:
case '|':
case ')':
*codeptr = code;
*ptrptr = ptr;
return TRUE;
/* Handle single-character metacharacters */
case '^':
previous = NULL;
*code++ = OP_CIRC;
break;
case '$':
previous = NULL;
*code++ = OP_DOLL;
break;
case '.':
previous = code;
*code++ = OP_ANY;
break;
/* Character classes. These always build a 32-byte bitmap of the permitted
characters, except in the special case where there is only one character.
For negated classes, we build the map as usual, then invert it at the end.
*/
case '[':
previous = code;
if (options & PCRE_LOCALE)
{
*code++ = OP_CLASS_L;
/* Set the flag for localized classes (like \w) to 0 */
class_flag = code;
*class_flag = 0;
}
else
{
*code++ = OP_CLASS;
class_flag = NULL;
}
/* If the first character is '^', set the negation flag, and use a
different opcode. This only matters if caseless matching is specified at
runtime. */
if ((c = *(++ptr)) == '^')
{
negate_class = TRUE;
if (*(code-1)==OP_CLASS) *(code-1) = OP_NEGCLASS;
c = *(++ptr);
}
else negate_class = FALSE;
/* Keep a count of chars so that we can optimize the case of just a single
character. */
class_charcount = 0;
class_lastchar = -1;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -