📄 pypcre.c
字号:
} /* End of switch */
} /* End of try_next loop */
code += (code[1] << 8) + code[2]; /* Advance to next branch */
}
while (*code == OP_ALT);
return TRUE;
}
/*************************************************
* Study a compiled expression *
*************************************************/
/* This function is handed a compiled expression that it must study to produce
information that will speed up the matching. It returns a pcre_extra block
which then gets handed back to pcre_exec().
Arguments:
re points to the compiled expression
options contains option bits
errorptr points to where to place error messages;
set NULL unless error
Returns: pointer to a pcre_extra block,
NULL on error or if no optimization possible
*/
pcre_extra *
pcre_study(const pcre *external_re, int options, const char **errorptr)
{
  const real_pcre *re = (const real_pcre *)external_re;
  real_pcre_extra *extra;
  uschar start_bits[32];
  BOOL caseless;

  /* Begin with "no error"; each failing path below replaces this. */
  *errorptr = NULL;

  /* Refuse anything that does not carry the compiled-pattern magic number. */
  if (re == NULL || re->magic_number != MAGIC_NUMBER)
    {
    *errorptr = "argument is not a compiled regular expression";
    return NULL;
    }

  /* Only the publicly documented study options are acceptable. */
  if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
    {
    *errorptr = "unknown or incorrect option bit(s) set";
    return NULL;
    }

  /* Caselessness may come from the compiled pattern or from the study
  options. */
  caseless = ((re->options | options) & PCRE_CASELESS) != 0;

  /* Nothing further is done for anchored patterns, patterns with a known
  first character, or patterns that match only at line starts. */
  if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
    return NULL;

  /* Try to build the bit map of possible starting characters; give up
  quietly when set_start_bits() reports that no fixed set exists. */
  memset(start_bits, 0, sizeof(start_bits));
  if (!set_start_bits(re->code, start_bits)) return NULL;

  /* For caseless studying, every letter already in the map also gets the bit
  for its other-case counterpart (taken from pcre_fcc) set. */
  if (caseless)
    {
    int c;
    for (c = 0; c < 256; c++)
      {
      int d;
      if ((start_bits[c/8] & (1 << (c&7))) == 0) continue;
      if ((pcre_ctypes[c] & ctype_letter) == 0) continue;
      d = pcre_fcc[c];
      start_bits[d/8] |= (1 << (d&7));
      }
    }

  /* Allocate the "extra" block through the pcre_malloc indirection and fill
  it in. */
  extra = (real_pcre_extra *)(pcre_malloc)(sizeof(real_pcre_extra));
  if (extra == NULL)
    {
    *errorptr = "failed to get memory";
    return NULL;
    }

  extra->options = PCRE_STUDY_MAPPED;
  if (caseless) extra->options |= PCRE_STUDY_CASELESS;
  memcpy(extra->start_bits, start_bits, sizeof(start_bits));

  return (pcre_extra *)extra;
}
/* End of study.c */
/*************************************************
* Perl-Compatible Regular Expressions *
*************************************************/
/*
This is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language. See
the file Tech.Notes for some information on the internals.
Written by: Philip Hazel <ph10@cam.ac.uk>
Copyright (c) 1998 University of Cambridge
-----------------------------------------------------------------------------
Permission is granted to anyone to use this software for any purpose on any
computer system, and to redistribute it freely, subject to the following
restrictions:
1. This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
2. The origin of this software must not be misrepresented, either by
explicit claim or by omission.
3. Altered versions must be plainly marked as such, and must not be
misrepresented as being the original software.
-----------------------------------------------------------------------------
*/
/* Define DEBUG to get debugging output on stdout. */
/* #define DEBUG */
/* Use a macro for debugging printing, because that eliminates the use of
#ifdef inline, and there are *still* stupid compilers about that don't like
indented pre-processor statements. I suppose it's only been 10 years...

DPRINTF takes one argument: a complete, parenthesized printf argument list,
e.g. DPRINTF(("count %d\n", n)). With DEBUG defined it expands to a printf
call; otherwise it expands to nothing. */
#ifdef DEBUG
#define DPRINTF(p) printf p
#else
#define DPRINTF(p) /*nothing*/
#endif
/* Include the internals header, which itself includes Standard C headers plus
the external pcre header.
NOTE(review): no #include directive follows this comment in the text as seen
here - confirm where the internals header is actually pulled in. */
#ifndef Py_eval_input
/* For Python 1.4, graminit.h has to be explicitly included.
Fallback: when Py_eval_input is not already defined, map it onto eval_input. */
#define Py_eval_input eval_input
#endif /* Py_eval_input */
/* Allow compilation as C++ source code, should anybody want to do that.
"class" is a reserved word in C++, so when a C++ compiler is in use any later
occurrence of it as an identifier is renamed to pcre_class. */
#ifdef __cplusplus
#define class pcre_class
#endif
/* Min and max values for the common repeats; for the maxima, 0 => infinity.
Each table has six entries.
NOTE(review): the entry order appears to correspond to the repeat sequence
*, *?, +, +?, ?, ?? - confirm against the code that indexes these tables. */
static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
/* Text forms of OP_ values and things, for debugging (not all used).
Only compiled when DEBUG is defined.
NOTE(review): presumably indexed by opcode value, in which case the order of
the strings must stay in step with the OP_ definitions - confirm against the
header that declares them before adding or removing entries. */
#ifdef DEBUG
static const char *OP_names[] = {
"End", "\\A", "\\B", "\\b", "\\D", "\\d",
"\\S", "\\s", "\\W", "\\w", "Cut", "\\Z",
"localized \\B", "localized \\b", "localized \\W", "localized \\w",
"^", "$", "Any", "chars",
"not",
"*", "*?", "+", "+?", "?", "??", "{", "{", "{",
"*", "*?", "+", "+?", "?", "??", "{", "{", "{",
"*", "*?", "+", "+?", "?", "??", "{", "{", "{",
"*", "*?", "+", "+?", "?", "??", "{", "{",
"class", "negclass", "classL", "Ref",
"Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", "Once",
"Brazero", "Braminzero", "Bra"
};
#endif
/* Table for handling escaped characters in the range '0'-'z'. Positive returns
are simple data values; negative values are for special things like \d and so
on. Zero means further processing is needed (for things like \x), or the escape
is invalid.

The table has 75 entries, one per character from '0' to 'z'; the row comments
give the characters each row covers and assume ASCII ordering. Punctuation
characters map to themselves, and 'a' maps to 7.
NOTE(review): presumably indexed by (c - '0') - confirm at the use site. */
static const short int escapes[] = {
0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
'@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
'`', 7, -ESC_b, 0, -ESC_d, 0, '\f', 0, /* ` - g */
0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
0, 0, '\r', -ESC_s, '\t', 0, '\v', -ESC_w, /* p - w */
0, 0, 0 /* x - z */
};
/* Definition to allow mutual recursion. This is a forward declaration only,
so that code appearing before the body of compile_regex() can call it.
NOTE(review): the parameters are unnamed here; refer to the function's
definition for what each one means. */
static BOOL
compile_regex(int, int *, uschar **, const uschar **, const char **,
PyObject *);
/* Structure for passing "static" information around between the functions
doing the matching, so that they are thread-safe.

The block groups the match-time option flags, the bounds of the subject
string, the offset vector used for extractions, two jmp_buf environments
used with longjmp(), and the bookkeeping (length, point and six base
pointers) for a set of stacks. */
typedef struct match_data {
int errorcode; /* As it says */
int *offset_vector; /* Offset vector */
int offset_end; /* One past the end */
BOOL offset_overflow; /* Set if too many extractions */
BOOL caseless; /* Case-independent flag */
BOOL runtime_caseless; /* Caseless forced at run time */
BOOL multiline; /* Multiline flag */
BOOL notbol; /* NOTBOL flag */
BOOL noteol; /* NOTEOL flag */
BOOL dotall; /* Dot matches any char */
BOOL endonly; /* Dollar not before final \n */
const uschar *start_subject; /* Start of the subject string */
const uschar *end_subject; /* End of the subject string */
jmp_buf fail_env; /* Environment for longjump() break out */
const uschar *end_match_ptr; /* Subject position at end match */
int end_offset_top; /* Highwater mark at end of match */
jmp_buf error_env; /* For longjmp() if an error occurs deep inside a
matching operation */
int length; /* Length of the allocated stacks */
int point; /* Point to add next item pushed onto stacks */
/* Pointers to the 6 stacks: four of int and two of subject/code pointers */
int *off_num, *offset_top, *r1, *r2;
const uschar **eptr, **ecode;
} match_data;
/*************************************************
* Global variables *
*************************************************/
/* PCRE is thread-clean and doesn't use any global variables in the normal
sense. However, it calls memory allocation and free functions via the two
indirections below, which can be changed by the caller, but are shared
between all threads. They default to the standard malloc() and free(). */
void *(*pcre_malloc)(size_t) = malloc;
void (*pcre_free)(void *) = free;
/*************************************************
* Return version string *
*************************************************/
const char *
pcre_version(void)
{
  /* The version text comes straight from the PCRE_VERSION macro. */
  const char *version_text = PCRE_VERSION;

  return version_text;
}
/*************************************************
* Return info about a compiled pattern *
*************************************************/
/* This function picks potentially useful data out of the private
structure.
Arguments:
external_re points to compiled code
optptr where to pass back the options
first_char where to pass back the first character,
or -1 if multiline and all branches start ^,
or -2 otherwise
Returns: number of identifying extraction brackets
or negative values on error
*/
int
pcre_info(const pcre *external_re, int *optptr, int *first_char)
{
  /* Keep the const qualifier when converting to the internal type: the
  pattern is only read here, and this matches the cast used in pcre_study(). */
  const real_pcre *re = (const real_pcre *)external_re;

  /* Validate the handle before touching any of its fields. */
  if (re == NULL) return PCRE_ERROR_NULL;
  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;

  /* Both output pointers are optional; a NULL pointer means "not wanted". */
  if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);

  if (first_char != NULL)
    {
    if ((re->options & PCRE_FIRSTSET) != 0)
      *first_char = re->first_char;   /* a fixed first character is known */
    else if ((re->options & PCRE_STARTLINE) != 0)
      *first_char = -1;               /* matches only at line starts */
    else
      *first_char = -2;               /* nothing known */
    }

  /* The number of identifying extraction brackets is the success result. */
  return re->top_bracket;
}
#ifdef DEBUG
/*************************************************
* Debugging function to print chars *
*************************************************/
/* Print a sequence of chars in printable format, stopping at the end of the
subject if requested.
Arguments:
p points to characters
length number to print
is_subject TRUE if printing from within md->start_subject
md pointer to matching data block, if is_subject is TRUE
Returns: nothing
*/
static void
pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
{
  /* When printing from within the subject, never run past its end. */
  if (is_subject && length > md->end_subject - p)
    {
    length = md->end_subject - p;
    }

  /* Printable characters are written as they are; anything else is shown as
  a two-digit \x escape. */
  for (; length > 0; length--)
    {
    int c = *p++;
    if (isprint(c))
      {
      printf("%c", c);
      }
    else
      {
      printf("\\x%02x", c);
      }
    }
}
#endif
/*************************************************
* Check subpattern for empty operand *
*************************************************/
/* This function checks a bracketed subpattern to see if any of the paths
through it could match an empty string. This is used to diagnose an error if
such a subpattern is followed by a quantifier with an unlimited upper bound.
Argument:
code points to the opening bracket
Returns: TRUE or FALSE
*/
static BOOL
could_be_empty(uschar *code)
{
do {
uschar *cc = code + 3;
/* Scan along the opcodes for this branch; as soon as we find something
that matches a non-empty string, break out and advance to test the next
branch. If we get to the end of the branch, return TRUE for the whole
sub-expression. */
for (;;)
{
/* Test an embedded subpattern; if it could not be empty, break the
loop. Otherwise carry on in the branch. */
if ((int)(*cc) >= OP_BRA || (int)(*cc) == OP_ONCE)
{
if (!could_be_empty(cc)) break;
do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
cc += 3;
}
else switch (*cc)
{
/* Reached end of a branch: the subpattern may match the empty string */
case OP_ALT:
case OP_KET:
case OP_KETRMAX:
case OP_KETRMIN:
return TRUE;
/* Skip over entire bracket groups with zero lower bound */
case OP_BRAZERO:
case OP_BRAMINZERO:
cc++;
/* Fall through */
/* Skip over assertive subpatterns */
case OP_ASSERT:
case OP_ASSERT_NOT:
do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
cc += 3;
break;
/* Skip over things that don't match chars */
case OP_SOD:
case OP_EOD:
case OP_CIRC:
case OP_DOLL:
case OP_NOT_WORD_BOUNDARY:
case OP_WORD_BOUNDARY:
case OP_NOT_WORD_BOUNDARY_L:
case OP_WORD_BOUNDARY_L:
cc++;
break;
/* Skip over simple repeats with zero lower bound */
case OP_STAR:
case OP_MINSTAR:
case OP_QUERY:
case OP_MINQUERY:
case OP_NOTSTAR:
case OP_NOTMINSTAR:
case OP_NOTQUERY:
case OP_NOTMINQUERY:
case OP_TYPESTAR:
case OP_TYPEMINSTAR:
case OP_TYPEQUERY:
case OP_TYPEMINQUERY:
cc += 2;
break;
/* Skip over UPTOs (lower bound is zero) */
case OP_UPTO:
case OP_MINUPTO:
case OP_TYPEUPTO:
case OP_TYPEMINUPTO:
cc += 4;
break;
/* Check a class or a back reference for a zero minimum */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -