📄 pypcre.c
字号:
/* Initialize the 32-char bit map to all zeros. We have to build the
map in a temporary bit of store, in case the class contains only 1
character, because in that case the compiled code doesn't use the
bit map. */
memset(class, 0, 32 * sizeof(uschar));
/* Process characters until ] is reached. By writing this as a "do" it
means that an initial ] is taken as a data character. */
do
{
if (c == 0)
{
*errorptr = ERR6;
goto FAILED;
}
/* Backslash may introduce a single character, or it may introduce one
of the specials, which just set a flag. Escaped items are checked for
validity in the pre-compiling pass. The sequence \b is a special case.
Inside a class (and only there) it is treated as backspace. Elsewhere
it marks a word boundary. Other escapes have preset maps ready to
or into the one we are building. We assume they have more than one
character in them, so set class_count bigger than one. */
if (c == '\\')
{
c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
if (-c == ESC_b) c = '\b';
else if (c < 0)
{
class_charcount = 10;
switch (-c)
{
case ESC_d:
{
for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_digit];
}
continue;
case ESC_D:
{
for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_digit];
}
continue;
case ESC_w:
if (options & PCRE_LOCALE)
{
*class_flag |= 1;
}
else
{
for (c = 0; c < 32; c++)
class[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
}
continue;
case ESC_W:
if (options & PCRE_LOCALE)
{
*class_flag |= 2;
}
else
{
for (c = 0; c < 32; c++)
class[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
}
continue;
case ESC_s:
{
for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_space];
}
continue;
case ESC_S:
{
for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_space];
}
continue;
default:
*errorptr = ERR7;
goto FAILED;
}
}
/* Fall through if single character */
}
/* A single character may be followed by '-' to form a range. However,
Perl does not permit ']' to be the end of the range, so a '-' that is
immediately followed by ']' is treated as a literal. */
if (ptr[1] == '-' && ptr[2] != ']')
{
int d;
ptr += 2;
d = *ptr;
if (d == 0)
{
*errorptr = ERR6;
goto FAILED;
}
/* The second part of a range can be a single-character escape, but
not any of the other escapes. */
if (d == '\\')
{
d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
if (d < 0)
{
if (d == -ESC_b) d = '\b'; else
{
*errorptr = ERR7;
goto FAILED;
}
}
}
if (d < c)
{
*errorptr = ERR8;
goto FAILED;
}
for (; c <= d; c++)
{
class[c/8] |= (1 << (c&7));
if ((options & PCRE_CASELESS) != 0)
{
int uc = pcre_fcc[c]; /* flip case */
class[uc/8] |= (1 << (uc&7));
}
class_charcount++; /* in case a one-char range */
class_lastchar = c;
}
continue; /* Go get the next char in the class */
}
/* Handle a lone single character - we can get here for a normal
non-escape char, or after \ that introduces a single character. */
class [c/8] |= (1 << (c&7));
if ((options & PCRE_CASELESS) != 0)
{
c = pcre_fcc[c]; /* flip case */
class[c/8] |= (1 << (c&7));
}
class_charcount++;
class_lastchar = c;
}
/* Loop until ']' reached; the check for end of string happens inside the
loop. This "while" is the end of the "do" above. */
while ((c = *(++ptr)) != ']');
/* If class_charcount is 1 and class_lastchar is not negative, we saw
precisely one character. This doesn't need the whole 32-byte bit map.
We turn it into a 1-character OP_CHARS if the class is not negated, or
OP_NOT if it is negated. */
if (class_charcount == 1 && class_lastchar >= 0)
{
if (negate_class)
{
code[-1] = OP_NOT;
}
else
{
code[-1] = OP_CHARS;
*code++ = 1;
}
*code++ = class_lastchar;
}
/* Otherwise, negate the 32-byte map if necessary, and copy it into
the code vector. */
else
{
/* If this is a localized opcode, bump the code pointer up */
if (class_flag) code++;
if (negate_class)
{
if (class_flag) *class_flag = (*class_flag) ^ 63;
for (c = 0; c < 32; c++) code[c] = ~class[c];
}
else
memcpy(code, class, 32);
code += 32;
}
break;
/* Various kinds of repeat */
case '{':
if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
if (*errorptr != NULL) goto FAILED;
goto REPEAT;
case '*':
repeat_min = 0;
repeat_max = -1;
goto REPEAT;
case '+':
repeat_min = 1;
repeat_max = -1;
goto REPEAT;
case '?':
repeat_min = 0;
repeat_max = 1;
REPEAT:
if (previous == NULL)
{
*errorptr = ERR9;
goto FAILED;
}
/* If the next character is '?' this is a minimizing repeat, by default,
but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
next character. */
if (ptr[1] == '?')
{ repeat_type = greedy_non_default; ptr++; }
else repeat_type = greedy_default;
/* If the maximum is zero then the minimum must also be zero; Perl allows
this case, so we do too - by simply omitting the item altogether. */
if (repeat_max == 0) code = previous;
/* If previous was a string of characters, chop off the last one and use it
as the subject of the repeat. If there was only one character, we can
abolish the previous item altogether. */
else if (*previous == OP_CHARS)
{
int len = previous[1];
if (len == 1)
{
c = previous[2];
code = previous;
}
else
{
c = previous[len+1];
previous[1]--;
code--;
}
op_type = 0; /* Use single-char op codes */
goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
}
/* If previous was a single negated character ([^a] or similar), we use
one of the special opcodes, replacing it. The code is shared with single-
character repeats by adding a suitable offset into repeat_type. */
else if ((int)*previous == OP_NOT)
{
op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
c = previous[1];
code = previous;
goto OUTPUT_SINGLE_REPEAT;
}
/* If previous was a character type match (\d or similar), abolish it and
create a suitable repeat item. The code is shared with single-character
repeats by adding a suitable offset into repeat_type. */
else if ((int)*previous < OP_CIRC || *previous == OP_ANY)
{
op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
c = *previous;
code = previous;
OUTPUT_SINGLE_REPEAT:
repeat_type += op_type; /* Combine both values for many cases */
/* A minimum of zero is handled either as the special case * or ?, or as
an UPTO, with the maximum given. */
if (repeat_min == 0)
{
if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
else
{
*code++ = OP_UPTO + repeat_type;
*code++ = repeat_max >> 8;
*code++ = (repeat_max & 255);
}
}
/* The case {1,} is handled as the special case + */
else if (repeat_min == 1 && repeat_max == -1)
*code++ = OP_PLUS + repeat_type;
/* The case {n,n} is just an EXACT, while the general case {n,m} is
handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
else
{
if (repeat_min != 1)
{
*code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
*code++ = repeat_min >> 8;
*code++ = (repeat_min & 255);
}
/* If the minimum is 1 and the previous item was a character string,
we either have to put back the item that got canceled if the string
length was 1, or add the character back onto the end of a longer
string. For a character type nothing need be done; it will just get
put back naturally. Note that the final character is always going to
get added below. */
else if (*previous == OP_CHARS)
{
if (code == previous) code += 2; else previous[1]++;
}
/* For a single negated character we also have to put back the
item that got canceled. */
else if (*previous == OP_NOT) code++;
/* If the maximum is unlimited, insert an OP_STAR. */
if (repeat_max < 0)
{
*code++ = c;
*code++ = OP_STAR + repeat_type;
}
/* Else insert an UPTO if the max is greater than the min. */
else if (repeat_max != repeat_min)
{
*code++ = c;
repeat_max -= repeat_min;
*code++ = OP_UPTO + repeat_type;
*code++ = repeat_max >> 8;
*code++ = (repeat_max & 255);
}
}
/* The character or character type itself comes last in all cases. */
*code++ = c;
}
/* If previous was a character class or a back reference, we put the repeat
stuff after it. */
else if (*previous == OP_CLASS || *previous == OP_NEGCLASS ||
*previous==OP_CLASS_L || *previous == OP_REF)
{
if (repeat_min == 0 && repeat_max == -1)
*code++ = OP_CRSTAR + repeat_type;
else if (repeat_min == 1 && repeat_max == -1)
*code++ = OP_CRPLUS + repeat_type;
else if (repeat_min == 0 && repeat_max == 1)
*code++ = OP_CRQUERY + repeat_type;
else
{
*code++ = OP_CRRANGE + repeat_type;
*code++ = repeat_min >> 8;
*code++ = repeat_min & 255;
if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
*code++ = repeat_max >> 8;
*code++ = repeat_max & 255;
}
}
/* If previous was a bracket group, we may have to replicate it in certain
cases. If the maximum repeat count is unlimited, check that the bracket
group cannot match the empty string, and diagnose an error if it can. */
else if ((int)*previous >= OP_BRA)
{
int i;
int len = code - previous;
if (repeat_max == -1 && could_be_empty(previous))
{
*errorptr = ERR10;
goto FAILED;
}
/* If the minimum is greater than zero, and the maximum is unlimited or
equal to the minimum, the first copy remains where it is, and is
replicated up to the minimum number of times. This case includes the +
repeat, but of course no replication is needed in that case. */
if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))
{
for (i = 1; i < repeat_min; i++)
{
memcpy(code, previous, len);
code += len;
}
}
/* If the minimum is zero, stick BRAZERO in front of the first copy.
Then, if there is a fixed upper limit, replicate it up to that many times,
sticking BRAZERO in front of all the optional ones. */
else
{
if (repeat_min == 0)
{
memmove(previous+1, previous, len);
code++;
*previous++ = OP_BRAZERO + repeat_type;
}
for (i = 1; i < repeat_min; i++)
{
memcpy(code, previous, len);
code += len;
}
for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)
{
*code++ = OP_BRAZERO + repeat_type;
memcpy(code, previous, len);
code += len;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -