📄 regexpr.c
字号:
regexp_quoted_ops['\''] = Rendbuf;
}
if (regexp_syntax & RE_ANSI_HEX)
regexp_quoted_ops['v'] = Rextended_memory;
for (a = 0; a < Rnum_ops; a++)
regexp_precedences[a] = 4;
if (regexp_syntax & RE_TIGHT_VBAR)
{
regexp_precedences[Ror] = 3;
regexp_precedences[Rbol] = 2;
regexp_precedences[Reol] = 2;
}
else
{
regexp_precedences[Ror] = 2;
regexp_precedences[Rbol] = 3;
regexp_precedences[Reol] = 3;
}
regexp_precedences[Rclosepar] = 1;
regexp_precedences[Rend] = 0;
regexp_context_indep_ops = (regexp_syntax & RE_CONTEXT_INDEP_OPS) != 0;
regexp_ansi_sequences = (regexp_syntax & RE_ANSI_HEX) != 0;
}
int re_set_syntax(int syntax)
{
int ret;
ret = regexp_syntax;
regexp_syntax = syntax;
re_syntax = syntax; /* Exported copy */
re_compile_initialize();
return ret;
}
static int hex_char_to_decimal(int ch)
{
if (ch >= '0' && ch <= '9')
return ch - '0';
if (ch >= 'a' && ch <= 'f')
return ch - 'a' + 10;
if (ch >= 'A' && ch <= 'F')
return ch - 'A' + 10;
return 16;
}
static void re_compile_fastmap_aux(unsigned char *code, int pos,
unsigned char *visited,
unsigned char *can_be_null,
unsigned char *fastmap)
{
int a;
int b;
int syntaxcode;
if (visited[pos])
return; /* we have already been here */
visited[pos] = 1;
for (;;)
switch (code[pos++]) {
case Cend:
{
*can_be_null = 1;
return;
}
case Cbol:
case Cbegbuf:
case Cendbuf:
case Cwordbeg:
case Cwordend:
case Cwordbound:
case Cnotwordbound:
{
for (a = 0; a < 256; a++)
fastmap[a] = 1;
break;
}
case Csyntaxspec:
{
syntaxcode = code[pos++];
for (a = 0; a < 256; a++)
if (SYNTAX(a) & syntaxcode)
fastmap[a] = 1;
return;
}
case Cnotsyntaxspec:
{
syntaxcode = code[pos++];
for (a = 0; a < 256; a++)
if (!(SYNTAX(a) & syntaxcode) )
fastmap[a] = 1;
return;
}
case Ceol:
{
fastmap['\n'] = 1;
if (*can_be_null == 0)
*can_be_null = 2; /* can match null, but only at end of buffer*/
return;
}
case Cset:
{
for (a = 0; a < 256/8; a++)
if (code[pos + a] != 0)
for (b = 0; b < 8; b++)
if (code[pos + a] & (1 << b))
fastmap[(a << 3) + b] = 1;
pos += 256/8;
return;
}
case Cexact:
{
fastmap[(unsigned char)code[pos]] = 1;
return;
}
case Canychar:
{
for (a = 0; a < 256; a++)
if (a != '\n')
fastmap[a] = 1;
return;
}
case Cstart_memory:
case Cend_memory:
{
pos++;
break;
}
case Cmatch_memory:
{
for (a = 0; a < 256; a++)
fastmap[a] = 1;
*can_be_null = 1;
return;
}
case Cjump:
case Cdummy_failure_jump:
case Cupdate_failure_jump:
case Cstar_jump:
{
a = (unsigned char)code[pos++];
a |= (unsigned char)code[pos++] << 8;
pos += (int)SHORT(a);
if (visited[pos])
{
/* argh... the regexp contains empty loops. This is not
good, as this may cause a failure stack overflow when
matching. Oh well. */
/* this path leads nowhere; pursue other paths. */
return;
}
visited[pos] = 1;
break;
}
case Cfailure_jump:
{
a = (unsigned char)code[pos++];
a |= (unsigned char)code[pos++] << 8;
a = pos + (int)SHORT(a);
re_compile_fastmap_aux(code, a, visited, can_be_null, fastmap);
break;
}
case Crepeat1:
{
pos += 2;
break;
}
default:
{
PyErr_SetString(PyExc_SystemError, "Unknown regex opcode: memory corrupted?");
return;
/*NOTREACHED*/
}
}
}
static int re_do_compile_fastmap(unsigned char *buffer, int used, int pos,
unsigned char *can_be_null,
unsigned char *fastmap)
{
unsigned char small_visited[512], *visited;
if (used <= sizeof(small_visited))
visited = small_visited;
else
{
visited = malloc(used);
if (!visited)
return 0;
}
*can_be_null = 0;
memset(fastmap, 0, 256);
memset(visited, 0, used);
re_compile_fastmap_aux(buffer, pos, visited, can_be_null, fastmap);
if (visited != small_visited)
free(visited);
return 1;
}
void re_compile_fastmap(regexp_t bufp)
{
if (!bufp->fastmap || bufp->fastmap_accurate)
return;
assert(bufp->used > 0);
if (!re_do_compile_fastmap(bufp->buffer,
bufp->used,
0,
&bufp->can_be_null,
bufp->fastmap))
return;
if (PyErr_Occurred()) return;
if (bufp->buffer[0] == Cbol)
bufp->anchor = 1; /* begline */
else
if (bufp->buffer[0] == Cbegbuf)
bufp->anchor = 2; /* begbuf */
else
bufp->anchor = 0; /* none */
bufp->fastmap_accurate = 1;
}
/*
* star is coded as:
* 1: failure_jump 2
* ... code for operand of star
* star_jump 1
* 2: ... code after star
*
* We change the star_jump to update_failure_jump if we can determine
* that it is safe to do so; otherwise we change it to an ordinary
* jump.
*
* plus is coded as
*
* jump 2
* 1: failure_jump 3
* 2: ... code for operand of plus
* star_jump 1
* 3: ... code after plus
*
* For star_jump considerations this is processed identically to star.
*
*/
static int re_optimize_star_jump(regexp_t bufp, unsigned char *code)
{
unsigned char map[256];
unsigned char can_be_null;
unsigned char *p1;
unsigned char *p2;
unsigned char ch;
int a;
int b;
int num_instructions = 0;
a = (unsigned char)*code++;
a |= (unsigned char)*code++ << 8;
a = (int)SHORT(a);
p1 = code + a + 3; /* skip the failure_jump */
/* Check that the jump is within the pattern */
if (p1<bufp->buffer || bufp->buffer+bufp->used<p1)
{
PyErr_SetString(PyExc_SystemError, "Regex VM jump out of bounds (failure_jump opt)");
return 0;
}
assert(p1[-3] == Cfailure_jump);
p2 = code;
/* p1 points inside loop, p2 points to after loop */
if (!re_do_compile_fastmap(bufp->buffer, bufp->used,
(int)(p2 - bufp->buffer),
&can_be_null, map))
goto make_normal_jump;
/* If we might introduce a new update point inside the
* loop, we can't optimize because then update_jump would
* update a wrong failure point. Thus we have to be
* quite careful here.
*/
/* loop until we find something that consumes a character */
loop_p1:
num_instructions++;
switch (*p1++)
{
case Cbol:
case Ceol:
case Cbegbuf:
case Cendbuf:
case Cwordbeg:
case Cwordend:
case Cwordbound:
case Cnotwordbound:
{
goto loop_p1;
}
case Cstart_memory:
case Cend_memory:
{
p1++;
goto loop_p1;
}
case Cexact:
{
ch = (unsigned char)*p1++;
if (map[(int)ch])
goto make_normal_jump;
break;
}
case Canychar:
{
for (b = 0; b < 256; b++)
if (b != '\n' && map[b])
goto make_normal_jump;
break;
}
case Cset:
{
for (b = 0; b < 256; b++)
if ((p1[b >> 3] & (1 << (b & 7))) && map[b])
goto make_normal_jump;
p1 += 256/8;
break;
}
default:
{
goto make_normal_jump;
}
}
/* now we know that we can't backtrack. */
while (p1 != p2 - 3)
{
num_instructions++;
switch (*p1++)
{
case Cend:
{
return 0;
}
case Cbol:
case Ceol:
case Canychar:
case Cbegbuf:
case Cendbuf:
case Cwordbeg:
case Cwordend:
case Cwordbound:
case Cnotwordbound:
{
break;
}
case Cset:
{
p1 += 256/8;
break;
}
case Cexact:
case Cstart_memory:
case Cend_memory:
case Cmatch_memory:
case Csyntaxspec:
case Cnotsyntaxspec:
{
p1++;
break;
}
case Cjump:
case Cstar_jump:
case Cfailure_jump:
case Cupdate_failure_jump:
case Cdummy_failure_jump:
{
goto make_normal_jump;
}
default:
{
return 0;
}
}
}
/* make_update_jump: */
code -= 3;
a += 3; /* jump to after the Cfailure_jump */
code[0] = Cupdate_failure_jump;
code[1] = a & 0xff;
code[2] = a >> 8;
if (num_instructions > 1)
return 1;
assert(num_instructions == 1);
/* if the only instruction matches a single character, we can do
* better */
p1 = code + 3 + a; /* start of sole instruction */
if (*p1 == Cset || *p1 == Cexact || *p1 == Canychar ||
*p1 == Csyntaxspec || *p1 == Cnotsyntaxspec)
code[0] = Crepeat1;
return 1;
make_normal_jump:
code -= 3;
*code = Cjump;
return 1;
}
static int re_optimize(regexp_t bufp)
{
unsigned char *code;
code = bufp->buffer;
while(1)
{
switch (*code++)
{
case Cend:
{
return 1;
}
case Canychar:
case Cbol:
case Ceol:
case Cbegbuf:
case Cendbuf:
case Cwordbeg:
case Cwordend:
case Cwordbound:
case Cnotwordbound:
{
break;
}
case Cset:
{
code += 256/8;
break;
}
case Cexact:
case Cstart_memory:
case Cend_memory:
case Cmatch_memory:
case Csyntaxspec:
case Cnotsyntaxspec:
{
code++;
break;
}
case Cstar_jump:
{
if (!re_optimize_star_jump(bufp, code))
{
return 0;
}
/* fall through */
}
case Cupdate_failure_jump:
case Cjump:
case Cdummy_failure_jump:
case Cfailure_jump:
case Crepeat1:
{
code += 2;
break;
}
default:
{
return 0;
}
}
}
}
#define NEXTCHAR(var) \
{ \
if (pos >= size) \
goto ends_prematurely; \
(var) = regex[pos]; \
pos++; \
}
#define ALLOC(amount) \
{ \
if (pattern_offset+(amount) > alloc) \
{ \
alloc += 256 + (amount); \
pattern = realloc(pattern, alloc); \
if (!pattern) \
goto out_of_memory; \
} \
}
#define STORE(ch) pattern[pattern_offset++] = (ch)
#define CURRENT_LEVEL_START (starts[starts_base + current_level])
#define SET_LEVEL_START starts[starts_base + current_level] = pattern_offset
#define PUSH_LEVEL_STARTS \
if (starts_base < (MAX_NESTING-1)*NUM_LEVELS) \
starts_base += NUM_LEVELS; \
else \
goto too_complex \
#define POP_LEVEL_STARTS starts_base -= NUM_LEVELS
#define PUT_ADDR(offset,addr) \
{ \
int disp = (addr) - (offset) - 2; \
pattern[(offset)] = disp & 0xff; \
pattern[(offset)+1] = (disp>>8) & 0xff; \
}
#define INSERT_JUMP(pos,type,addr) \
{ \
int a, p = (pos), t = (type), ad = (addr); \
for (a = pattern_offset - 1; a >= p; a--) \
pattern[a + 3] = pattern[a]; \
pattern[p] = t; \
PUT_ADDR(p+1,ad); \
pattern_offset += 3; \
}
#define SETBIT(buf,offset,bit) (buf)[(offset)+(bit)/8] |= (1<<((bit) & 7))
#define SET_FIELDS \
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -