📄 regexpr.c
字号:
/* Portions Copyright (c) 2005 Nokia Corporation */
/* regexpr.c
*
* Author: Tatu Ylonen <ylo@ngs.fi>
*
* Copyright (c) 1991 Tatu Ylonen, Espoo, Finland
*
* Permission to use, copy, modify, distribute, and sell this software
* and its documentation for any purpose is hereby granted without
* fee, provided that the above copyright notice appear in all copies.
* This software is provided "as is" without express or implied
* warranty.
*
* Created: Thu Sep 26 17:14:05 1991 ylo
* Last modified: Mon Nov 4 17:06:48 1991 ylo
* Ported to Think C: 19 Jan 1992 guido@cwi.nl
*
* This code draws many ideas from the regular expression packages by
* Henry Spencer of the University of Toronto and Richard Stallman of
* the Free Software Foundation.
*
* Emacs-specific code and syntax table code is almost directly borrowed
* from GNU regexp.
*
* Bugs fixed and lots of reorganization by Jeffrey C. Ollie, April
* 1997 Thanks for bug reports and ideas from Andrew Kuchling, Tim
* Peters, Guido van Rossum, Ka-Ping Yee, Sjoerd Mullender, and
* probably one or two others that I'm forgetting.
*
* $Id: regexpr.c,v 1.1 2003/10/27 08:23:53 riva Exp $ */
#include "Python.h"
#include "regexpr.h"
/* The original code blithely assumed that sizeof(short) == 2. Not
* always true. Original instances of "(short)x" were replaced by
* SHORT(x), where SHORT is #defined below. */
#define SHORT(x) ((x) & 0x8000 ? (x) - 0x10000 : (x))
/* The stack implementation is taken from an idea by Andrew Kuchling.
* It's a doubly linked list of arrays. The advantages of this over a
* simple linked list are that the number of mallocs required are
* reduced. It also makes it possible to statically allocate enough
* space so that small patterns don't ever need to call malloc.
*
* The advantages over a single array is that is periodically
* realloced when more space is needed is that we avoid ever copying
* the stack. */
/* item_t is the basic stack element. Defined as a union of
* structures so that both registers, failure points, and counters can
* be pushed/popped from the stack. There's nothing built into the
* item to keep track of whether a certain stack item is a register, a
* failure point, or a counter. */
typedef union item_t
{
struct
{
int num;
int level;
unsigned char *start;
unsigned char *end;
} reg;
struct
{
int count;
int level;
int phantom;
unsigned char *code;
unsigned char *text;
} fail;
struct
{
int num;
int level;
int count;
} cntr;
} item_t;
#define STACK_PAGE_SIZE 256
#define NUM_REGISTERS 256
/* A 'page' of stack items. */
typedef struct item_page_t
{
item_t items[STACK_PAGE_SIZE];
struct item_page_t *prev;
struct item_page_t *next;
} item_page_t;
typedef struct match_state
{
/* The number of registers that have been pushed onto the stack
* since the last failure point. */
int count;
/* Used to control when registers need to be pushed onto the
* stack. */
int level;
/* The number of failure points on the stack. */
int point;
/* Storage for the registers. Each register consists of two
* pointers to characters. So register N is represented as
* start[N] and end[N]. The pointers must be converted to
* offsets from the beginning of the string before returning the
* registers to the calling program. */
unsigned char *start[NUM_REGISTERS];
unsigned char *end[NUM_REGISTERS];
/* Keeps track of whether a register has changed recently. */
int changed[NUM_REGISTERS];
/* Structure to encapsulate the stack. */
struct
{
/* index into the current page. If index == 0 and you need
* to pop an item, move to the previous page and set index
* = STACK_PAGE_SIZE - 1. Otherwise decrement index to
* push a page. If index == STACK_PAGE_SIZE and you need
* to push a page move to the next page and set index =
* 0. If there is no new next page, allocate a new page
* and link it in. Otherwise, increment index to push a
* page. */
int index;
item_page_t *current; /* Pointer to the current page. */
item_page_t first; /* First page is statically allocated. */
} stack;
} match_state;
/* Initialize a state object */
/* #define NEW_STATE(state) \ */
/* memset(&state, 0, (void *)(&state.stack) - (void *)(&state)); \ */
/* state.stack.current = &state.stack.first; \ */
/* state.stack.first.prev = NULL; \ */
/* state.stack.first.next = NULL; \ */
/* state.stack.index = 0; \ */
/* state.level = 1 */
#define NEW_STATE(state, nregs) \
{ \
int i; \
for (i = 0; i < nregs; i++) \
{ \
state.start[i] = NULL; \
state.end[i] = NULL; \
state.changed[i] = 0; \
} \
state.stack.current = &state.stack.first; \
state.stack.first.prev = NULL; \
state.stack.first.next = NULL; \
state.stack.index = 0; \
state.level = 1; \
state.count = 0; \
state.level = 0; \
state.point = 0; \
}
/* Free any memory that might have been malloc'd */
#define FREE_STATE(state) \
while(state.stack.first.next != NULL) \
{ \
state.stack.current = state.stack.first.next; \
state.stack.first.next = state.stack.current->next; \
free(state.stack.current); \
}
/* Discard the top 'count' stack items. */
#define STACK_DISCARD(stack, count, on_error) \
stack.index -= count; \
while (stack.index < 0) \
{ \
if (stack.current->prev == NULL) \
on_error; \
stack.current = stack.current->prev; \
stack.index += STACK_PAGE_SIZE; \
}
/* Store a pointer to the previous item on the stack. Used to pop an
* item off of the stack. */
#define STACK_PREV(stack, top, on_error) \
if (stack.index == 0) \
{ \
if (stack.current->prev == NULL) \
on_error; \
stack.current = stack.current->prev; \
stack.index = STACK_PAGE_SIZE - 1; \
} \
else \
{ \
stack.index--; \
} \
top = &(stack.current->items[stack.index])
/* Store a pointer to the next item on the stack. Used to push an item
* on to the stack. */
#define STACK_NEXT(stack, top, on_error) \
if (stack.index == STACK_PAGE_SIZE) \
{ \
if (stack.current->next == NULL) \
{ \
stack.current->next = (item_page_t *)malloc(sizeof(item_page_t)); \
if (stack.current->next == NULL) \
on_error; \
stack.current->next->prev = stack.current; \
stack.current->next->next = NULL; \
} \
stack.current = stack.current->next; \
stack.index = 0; \
} \
top = &(stack.current->items[stack.index++])
/* Store a pointer to the item that is 'count' items back in the
* stack. STACK_BACK(stack, top, 1, on_error) is equivalent to
* STACK_TOP(stack, top, on_error). */
#define STACK_BACK(stack, top, count, on_error) \
{ \
int index; \
item_page_t *current; \
current = stack.current; \
index = stack.index - (count); \
while (index < 0) \
{ \
if (current->prev == NULL) \
on_error; \
current = current->prev; \
index += STACK_PAGE_SIZE; \
} \
top = &(current->items[index]); \
}
/* Store a pointer to the top item on the stack. Execute the
* 'on_error' code if there are no items on the stack. */
#define STACK_TOP(stack, top, on_error) \
if (stack.index == 0) \
{ \
if (stack.current->prev == NULL) \
on_error; \
top = &(stack.current->prev->items[STACK_PAGE_SIZE - 1]); \
} \
else \
{ \
top = &(stack.current->items[stack.index - 1]); \
}
/* Test to see if the stack is empty */
#define STACK_EMPTY(stack) ((stack.index == 0) && \
(stack.current->prev == NULL))
/* Return the start of register 'reg' */
#define GET_REG_START(state, reg) (state.start[reg])
/* Return the end of register 'reg' */
#define GET_REG_END(state, reg) (state.end[reg])
/* Set the start of register 'reg'. If the state of the register needs
* saving, push it on the stack. */
#define SET_REG_START(state, reg, text, on_error) \
if(state.changed[reg] < state.level) \
{ \
item_t *item; \
STACK_NEXT(state.stack, item, on_error); \
item->reg.num = reg; \
item->reg.start = state.start[reg]; \
item->reg.end = state.end[reg]; \
item->reg.level = state.changed[reg]; \
state.changed[reg] = state.level; \
state.count++; \
} \
state.start[reg] = text
/* Set the end of register 'reg'. If the state of the register needs
* saving, push it on the stack. */
#define SET_REG_END(state, reg, text, on_error) \
if(state.changed[reg] < state.level) \
{ \
item_t *item; \
STACK_NEXT(state.stack, item, on_error); \
item->reg.num = reg; \
item->reg.start = state.start[reg]; \
item->reg.end = state.end[reg]; \
item->reg.level = state.changed[reg]; \
state.changed[reg] = state.level; \
state.count++; \
} \
state.end[reg] = text
#define PUSH_FAILURE(state, xcode, xtext, on_error) \
{ \
item_t *item; \
STACK_NEXT(state.stack, item, on_error); \
item->fail.code = xcode; \
item->fail.text = xtext; \
item->fail.count = state.count; \
item->fail.level = state.level; \
item->fail.phantom = 0; \
state.count = 0; \
state.level++; \
state.point++; \
}
/* Update the last failure point with a new position in the text. */
#define UPDATE_FAILURE(state, xtext, on_error) \
{ \
item_t *item; \
STACK_BACK(state.stack, item, state.count + 1, on_error); \
if (!item->fail.phantom) \
{ \
item_t *item2; \
STACK_NEXT(state.stack, item2, on_error); \
item2->fail.code = item->fail.code; \
item2->fail.text = xtext; \
item2->fail.count = state.count; \
item2->fail.level = state.level; \
item2->fail.phantom = 1; \
state.count = 0; \
state.level++; \
state.point++; \
} \
else \
{ \
STACK_DISCARD(state.stack, state.count, on_error); \
STACK_TOP(state.stack, item, on_error); \
item->fail.text = xtext; \
state.count = 0; \
state.level++; \
} \
}
#define POP_FAILURE(state, xcode, xtext, on_empty, on_error) \
{ \
item_t *item; \
do \
{ \
while(state.count > 0) \
{ \
STACK_PREV(state.stack, item, on_error); \
state.start[item->reg.num] = item->reg.start; \
state.end[item->reg.num] = item->reg.end; \
state.changed[item->reg.num] = item->reg.level; \
state.count--; \
} \
STACK_PREV(state.stack, item, on_empty); \
xcode = item->fail.code; \
xtext = item->fail.text; \
state.count = item->fail.count; \
state.level = item->fail.level; \
state.point--; \
} \
while (item->fail.text == NULL); \
}
enum regexp_compiled_ops /* opcodes for compiled regexp */
{
Cend, /* end of pattern reached */
Cbol, /* beginning of line */
Ceol, /* end of line */
Cset, /* character set. Followed by 32 bytes of set. */
Cexact, /* followed by a byte to match */
Canychar, /* matches any character except newline */
Cstart_memory, /* set register start addr (followed by reg number) */
Cend_memory, /* set register end addr (followed by reg number) */
Cmatch_memory, /* match a duplicate of reg contents (regnum follows)*/
Cjump, /* followed by two bytes (lsb,msb) of displacement. */
Cstar_jump, /* will change to jump/update_failure_jump at runtime */
Cfailure_jump, /* jump to addr on failure */
Cupdate_failure_jump, /* update topmost failure point and jump */
Cdummy_failure_jump, /* push a dummy failure point and jump */
Cbegbuf, /* match at beginning of buffer */
Cendbuf, /* match at end of buffer */
Cwordbeg, /* match at beginning of word */
Cwordend, /* match at end of word */
Cwordbound, /* match if at word boundary */
Cnotwordbound, /* match if not at word boundary */
Csyntaxspec, /* matches syntax code (1 byte follows) */
Cnotsyntaxspec, /* matches if syntax code does not match (1 byte follows) */
Crepeat1
};
enum regexp_syntax_op /* syntax codes for plain and quoted characters */
{
Rend, /* special code for end of regexp */
Rnormal, /* normal character */
Ranychar, /* any character except newline */
Rquote, /* the quote character */
Rbol, /* match beginning of line */
Reol, /* match end of line */
Roptional, /* match preceding expression optionally */
Rstar, /* match preceding expr zero or more times */
Rplus, /* match preceding expr one or more times */
Ror, /* match either of alternatives */
Ropenpar, /* opening parenthesis */
Rclosepar, /* closing parenthesis */
Rmemory, /* match memory register */
Rextended_memory, /* \vnn to match registers 10-99 */
Ropenset, /* open set. Internal syntax hard-coded below. */
/* the following are gnu extensions to "normal" regexp syntax */
Rbegbuf, /* beginning of buffer */
Rendbuf, /* end of buffer */
Rwordchar, /* word character */
Rnotwordchar, /* not word character */
Rwordbeg, /* beginning of word */
Rwordend, /* end of word */
Rwordbound, /* word bound */
Rnotwordbound, /* not word bound */
Rnum_ops
};
static int re_compile_initialized = 0;
static int regexp_syntax = 0;
int re_syntax = 0; /* Exported copy of regexp_syntax */
static unsigned char regexp_plain_ops[256];
static unsigned char regexp_quoted_ops[256];
static unsigned char regexp_precedences[Rnum_ops];
static int regexp_context_indep_ops;
static int regexp_ansi_sequences;
#define NUM_LEVELS 5 /* number of precedence levels in use */
#define MAX_NESTING 100 /* max nesting level of operators */
#define SYNTAX(ch) re_syntax_table[(unsigned char)(ch)]
unsigned char re_syntax_table[256];
void re_compile_initialize(void)
{
int a;
static int syntax_table_inited = 0;
if (!syntax_table_inited)
{
syntax_table_inited = 1;
memset(re_syntax_table, 0, 256);
for (a = 'a'; a <= 'z'; a++)
re_syntax_table[a] = Sword;
for (a = 'A'; a <= 'Z'; a++)
re_syntax_table[a] = Sword;
for (a = '0'; a <= '9'; a++)
re_syntax_table[a] = Sword | Sdigit | Shexdigit;
for (a = '0'; a <= '7'; a++)
re_syntax_table[a] |= Soctaldigit;
for (a = 'A'; a <= 'F'; a++)
re_syntax_table[a] |= Shexdigit;
for (a = 'a'; a <= 'f'; a++)
re_syntax_table[a] |= Shexdigit;
re_syntax_table['_'] = Sword;
for (a = 9; a <= 13; a++)
re_syntax_table[a] = Swhitespace;
re_syntax_table[' '] = Swhitespace;
}
re_compile_initialized = 1;
for (a = 0; a < 256; a++)
{
regexp_plain_ops[a] = Rnormal;
regexp_quoted_ops[a] = Rnormal;
}
for (a = '0'; a <= '9'; a++)
regexp_quoted_ops[a] = Rmemory;
regexp_plain_ops['\134'] = Rquote;
if (regexp_syntax & RE_NO_BK_PARENS)
{
regexp_plain_ops['('] = Ropenpar;
regexp_plain_ops[')'] = Rclosepar;
}
else
{
regexp_quoted_ops['('] = Ropenpar;
regexp_quoted_ops[')'] = Rclosepar;
}
if (regexp_syntax & RE_NO_BK_VBAR)
regexp_plain_ops['\174'] = Ror;
else
regexp_quoted_ops['\174'] = Ror;
regexp_plain_ops['*'] = Rstar;
if (regexp_syntax & RE_BK_PLUS_QM)
{
regexp_quoted_ops['+'] = Rplus;
regexp_quoted_ops['?'] = Roptional;
}
else
{
regexp_plain_ops['+'] = Rplus;
regexp_plain_ops['?'] = Roptional;
}
if (regexp_syntax & RE_NEWLINE_OR)
regexp_plain_ops['\n'] = Ror;
regexp_plain_ops['\133'] = Ropenset;
regexp_plain_ops['\136'] = Rbol;
regexp_plain_ops['$'] = Reol;
regexp_plain_ops['.'] = Ranychar;
if (!(regexp_syntax & RE_NO_GNU_EXTENSIONS))
{
regexp_quoted_ops['w'] = Rwordchar;
regexp_quoted_ops['W'] = Rnotwordchar;
regexp_quoted_ops['<'] = Rwordbeg;
regexp_quoted_ops['>'] = Rwordend;
regexp_quoted_ops['b'] = Rwordbound;
regexp_quoted_ops['B'] = Rnotwordbound;
regexp_quoted_ops['`'] = Rbegbuf;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -