📄 _sre.c
字号:
/* Portions Copyright (c) 2005 Nokia Corporation */
/*
* Secret Labs' Regular Expression Engine
*
* regular expression matching engine
*
* partial history:
* 1999-10-24 fl created (based on existing template matcher code)
* 2000-03-06 fl first alpha, sort of
* 2000-06-30 fl added fast search optimization
* 2000-06-30 fl added assert (lookahead) primitives, etc
* 2000-07-02 fl added charset optimizations, etc
* 2000-07-03 fl store code in pattern object, lookbehind, etc
* 2000-07-08 fl added regs attribute
* 2000-07-21 fl reset lastindex in scanner methods
* 2000-08-01 fl fixes for 1.6b1
* 2000-08-03 fl added recursion limit
* 2000-08-07 fl use PyOS_CheckStack() if available
* 2000-08-08 fl changed findall to return empty strings instead of None
* 2000-08-27 fl properly propagate memory errors
* 2000-09-02 fl return -1 instead of None for start/end/span
* 2000-09-20 fl added expand method
* 2000-09-21 fl don't use the buffer interface for unicode strings
* 2000-10-03 fl fixed assert_not primitive; support keyword arguments
* 2000-10-24 fl really fixed assert_not; reset groups in findall
* 2000-12-21 fl fixed memory leak in groupdict
* 2001-01-02 fl properly reset pointer after failed assertion in MIN_UNTIL
* 2001-01-15 fl avoid recursion for MIN_UNTIL; fixed uppercase literal bug
* 2001-01-16 fl fixed memory leak in pattern destructor
* 2001-03-20 fl lots of fixes for 2.1b2
* 2001-04-15 fl export copyright as Python attribute, not global
* 2001-04-28 fl added __copy__ methods (work in progress)
* 2001-05-14 fl fixes for 1.5.2
* 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
* 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
* 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
* 2001-10-21 fl added sub/subn primitive
* 2001-10-22 fl check for literal sub/subn templates
* 2001-10-24 fl added finditer primitive (for 2.2 only)
* 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
*
* Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
*
* This version of the SRE library can be redistributed under CNRI's
* Python 1.6 license. For any other use, please contact Secret Labs
* AB (info@pythonware.com).
*
* Portions of this engine have been developed in cooperation with
* CNRI. Hewlett-Packard provided funding for 1.6 integration and
* other compatibility work.
*/
#ifndef SRE_RECURSIVE
static const char copyright[] =
" SRE 2.2.1 Copyright (c) 1997-2001 by Secret Labs AB ";
#include "Python.h"
#include "structmember.h" /* offsetof */
#include "sre.h"
#include <ctype.h>
/* name of this module, minus the leading underscore */
#if !defined(SRE_MODULE)
#define SRE_MODULE "sre"
#endif
/* defining this one enables tracing */
#undef VERBOSE
#if PY_VERSION_HEX >= 0x01060000
#if PY_VERSION_HEX < 0x02020000 || defined(Py_USING_UNICODE)
/* defining this enables unicode support (default under 1.6a1 and later) */
#define HAVE_UNICODE
#endif
#endif
/* -------------------------------------------------------------------- */
/* optional features */
/* prevent run-away recursion (bad patterns on long strings) */
#if !defined(USE_STACKCHECK)
#if defined(MS_WIN64) || defined(__LP64__) || defined(_LP64)
/* require smaller recursion limit for a number of 64-bit platforms:
Win64 (MS_WIN64), Linux64 (__LP64__), Monterey (64-bit AIX) (_LP64) */
/* FIXME: maybe the limit should be 40000 / sizeof(void*) ? */
#define USE_RECURSION_LIMIT 7500
#else
#define USE_RECURSION_LIMIT 10000
#endif
#endif
#if defined(SYMBIAN)
#define USE_RECURSION_LIMIT 5000
#endif
/* enables fast searching */
#define USE_FAST_SEARCH
/* enables aggressive inlining (always on for Visual C) */
#undef USE_INLINE
/* enables copy/deepcopy handling (work in progress) */
#undef USE_BUILTIN_COPY
#if PY_VERSION_HEX < 0x01060000
#define PyObject_DEL(op) PyMem_DEL((op))
#endif
/* -------------------------------------------------------------------- */
#if defined(_MSC_VER)
#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
/* fastest possible local call under MSVC */
#define LOCAL(type) static __inline type __fastcall
#elif defined(USE_INLINE)
#define LOCAL(type) static inline type
#else
#define LOCAL(type) static type
#endif
/* error codes */
#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
#define SRE_ERROR_STATE -2 /* illegal state */
#define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
#define SRE_ERROR_MEMORY -9 /* out of memory */
#if defined(VERBOSE)
#define TRACE(v) printf v
#else
#define TRACE(v)
#endif
/* -------------------------------------------------------------------- */
/* search engine state */
/* default character predicates (run sre_chars.py to regenerate tables) */
#define SRE_DIGIT_MASK 1
#define SRE_SPACE_MASK 2
#define SRE_LINEBREAK_MASK 4
#define SRE_ALNUM_MASK 8
#define SRE_WORD_MASK 16
/* FIXME: this assumes ASCII. create tables in init_sre() instead */
static const char sre_char_info[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
25, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
0, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };
static const char sre_char_lower[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
61, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
120, 121, 122, 123, 124, 125, 126, 127 };
#define SRE_IS_DIGIT(ch)\
((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
#define SRE_IS_SPACE(ch)\
((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
#define SRE_IS_LINEBREAK(ch)\
((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
#define SRE_IS_ALNUM(ch)\
((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
#define SRE_IS_WORD(ch)\
((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
static unsigned int sre_lower(unsigned int ch)
{
return ((ch) < 128 ? sre_char_lower[ch] : ch);
}
/* locale-specific character predicates */
#define SRE_LOC_IS_DIGIT(ch) ((ch) < 256 ? isdigit((ch)) : 0)
#define SRE_LOC_IS_SPACE(ch) ((ch) < 256 ? isspace((ch)) : 0)
#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
#define SRE_LOC_IS_ALNUM(ch) ((ch) < 256 ? isalnum((ch)) : 0)
#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
static unsigned int sre_lower_locale(unsigned int ch)
{
return ((ch) < 256 ? tolower((ch)) : ch);
}
/* unicode-specific character predicates */
#if defined(HAVE_UNICODE)
#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
static unsigned int sre_lower_unicode(unsigned int ch)
{
return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
}
#endif
LOCAL(int)
sre_category(SRE_CODE category, unsigned int ch)
{
switch (category) {
case SRE_CATEGORY_DIGIT:
return SRE_IS_DIGIT(ch);
case SRE_CATEGORY_NOT_DIGIT:
return !SRE_IS_DIGIT(ch);
case SRE_CATEGORY_SPACE:
return SRE_IS_SPACE(ch);
case SRE_CATEGORY_NOT_SPACE:
return !SRE_IS_SPACE(ch);
case SRE_CATEGORY_WORD:
return SRE_IS_WORD(ch);
case SRE_CATEGORY_NOT_WORD:
return !SRE_IS_WORD(ch);
case SRE_CATEGORY_LINEBREAK:
return SRE_IS_LINEBREAK(ch);
case SRE_CATEGORY_NOT_LINEBREAK:
return !SRE_IS_LINEBREAK(ch);
case SRE_CATEGORY_LOC_WORD:
return SRE_LOC_IS_WORD(ch);
case SRE_CATEGORY_LOC_NOT_WORD:
return !SRE_LOC_IS_WORD(ch);
#if defined(HAVE_UNICODE)
case SRE_CATEGORY_UNI_DIGIT:
return SRE_UNI_IS_DIGIT(ch);
case SRE_CATEGORY_UNI_NOT_DIGIT:
return !SRE_UNI_IS_DIGIT(ch);
case SRE_CATEGORY_UNI_SPACE:
return SRE_UNI_IS_SPACE(ch);
case SRE_CATEGORY_UNI_NOT_SPACE:
return !SRE_UNI_IS_SPACE(ch);
case SRE_CATEGORY_UNI_WORD:
return SRE_UNI_IS_WORD(ch);
case SRE_CATEGORY_UNI_NOT_WORD:
return !SRE_UNI_IS_WORD(ch);
case SRE_CATEGORY_UNI_LINEBREAK:
return SRE_UNI_IS_LINEBREAK(ch);
case SRE_CATEGORY_UNI_NOT_LINEBREAK:
return !SRE_UNI_IS_LINEBREAK(ch);
#else
case SRE_CATEGORY_UNI_DIGIT:
return SRE_IS_DIGIT(ch);
case SRE_CATEGORY_UNI_NOT_DIGIT:
return !SRE_IS_DIGIT(ch);
case SRE_CATEGORY_UNI_SPACE:
return SRE_IS_SPACE(ch);
case SRE_CATEGORY_UNI_NOT_SPACE:
return !SRE_IS_SPACE(ch);
case SRE_CATEGORY_UNI_WORD:
return SRE_LOC_IS_WORD(ch);
case SRE_CATEGORY_UNI_NOT_WORD:
return !SRE_LOC_IS_WORD(ch);
case SRE_CATEGORY_UNI_LINEBREAK:
return SRE_IS_LINEBREAK(ch);
case SRE_CATEGORY_UNI_NOT_LINEBREAK:
return !SRE_IS_LINEBREAK(ch);
#endif
}
return 0;
}
/* helpers */
static void
mark_fini(SRE_STATE* state)
{
if (state->mark_stack) {
PyMem_FREE(state->mark_stack);
state->mark_stack = NULL;
}
state->mark_stack_size = state->mark_stack_base = 0;
}
static int
mark_save(SRE_STATE* state, int lo, int hi)
{
void* stack;
int size;
int minsize, newsize;
if (hi <= lo)
return 0;
size = (hi - lo) + 1;
newsize = state->mark_stack_size;
minsize = state->mark_stack_base + size;
if (newsize < minsize) {
/* create new stack */
if (!newsize) {
newsize = 512;
if (newsize < minsize)
newsize = minsize;
TRACE(("allocate stack %d\n", newsize));
stack = PyMem_MALLOC(sizeof(void*) * newsize);
} else {
/* grow the stack */
while (newsize < minsize)
newsize += newsize;
TRACE(("grow stack to %d\n", newsize));
stack = PyMem_REALLOC(state->mark_stack, sizeof(void*) * newsize);
}
if (!stack) {
mark_fini(state);
return SRE_ERROR_MEMORY;
}
state->mark_stack = stack;
state->mark_stack_size = newsize;
}
TRACE(("copy %d:%d to %d (%d)\n", lo, hi, state->mark_stack_base, size));
memcpy(state->mark_stack + state->mark_stack_base, state->mark + lo,
size * sizeof(void*));
state->mark_stack_base += size;
return 0;
}
static int
mark_restore(SRE_STATE* state, int lo, int hi)
{
int size;
if (hi <= lo)
return 0;
size = (hi - lo) + 1;
state->mark_stack_base -= size;
TRACE(("copy %d:%d from %d\n", lo, hi, state->mark_stack_base));
memcpy(state->mark + lo, state->mark_stack + state->mark_stack_base,
size * sizeof(void*));
return 0;
}
/* generate 8-bit version */
#define SRE_CHAR unsigned char
#define SRE_AT sre_at
#define SRE_COUNT sre_count
#define SRE_CHARSET sre_charset
#define SRE_INFO sre_info
#define SRE_MATCH sre_match
#define SRE_SEARCH sre_search
#define SRE_LITERAL_TEMPLATE sre_literal_template
#if defined(HAVE_UNICODE)
#define SRE_RECURSIVE
#include "_sre.c"
#undef SRE_RECURSIVE
#undef SRE_LITERAL_TEMPLATE
#undef SRE_SEARCH
#undef SRE_MATCH
#undef SRE_INFO
#undef SRE_CHARSET
#undef SRE_COUNT
#undef SRE_AT
#undef SRE_CHAR
/* generate 16-bit unicode version */
#define SRE_CHAR Py_UNICODE
#define SRE_AT sre_uat
#define SRE_COUNT sre_ucount
#define SRE_CHARSET sre_ucharset
#define SRE_INFO sre_uinfo
#define SRE_MATCH sre_umatch
#define SRE_SEARCH sre_usearch
#define SRE_LITERAL_TEMPLATE sre_uliteral_template
#endif
#endif /* SRE_RECURSIVE */
/* -------------------------------------------------------------------- */
/* String matching engine */
/* the following section is compiled twice, with different character
settings */
LOCAL(int)
SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
{
/* check if pointer is at given position */
int this, that;
switch (at) {
case SRE_AT_BEGINNING:
case SRE_AT_BEGINNING_STRING:
return ((void*) ptr == state->beginning);
case SRE_AT_BEGINNING_LINE:
return ((void*) ptr == state->beginning ||
SRE_IS_LINEBREAK((int) ptr[-1]));
case SRE_AT_END:
return (((void*) (ptr+1) == state->end &&
SRE_IS_LINEBREAK((int) ptr[0])) ||
((void*) ptr == state->end));
case SRE_AT_END_LINE:
return ((void*) ptr == state->end ||
SRE_IS_LINEBREAK((int) ptr[0]));
case SRE_AT_END_STRING:
return ((void*) ptr == state->end);
case SRE_AT_BOUNDARY:
if (state->beginning == state->end)
return 0;
that = ((void*) ptr > state->beginning) ?
SRE_IS_WORD((int) ptr[-1]) : 0;
this = ((void*) ptr < state->end) ?
SRE_IS_WORD((int) ptr[0]) : 0;
return this != that;
case SRE_AT_NON_BOUNDARY:
if (state->beginning == state->end)
return 0;
that = ((void*) ptr > state->beginning) ?
SRE_IS_WORD((int) ptr[-1]) : 0;
this = ((void*) ptr < state->end) ?
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -