📄 regexpr.c

📁 python s60 1.4.5版本的源代码
💻 C
📖 第 1 页 / 共 4 页
字号:
12 3 4 下一页
/* Portions Copyright (c) 2005 Nokia Corporation */
/* regexpr.c
 *
 * Author: Tatu Ylonen <ylo@ngs.fi>
 *
 * Copyright (c) 1991 Tatu Ylonen, Espoo, Finland
 *
 * Permission to use, copy, modify, distribute, and sell this software
 * and its documentation for any purpose is hereby granted without
 * fee, provided that the above copyright notice appear in all copies.
 * This software is provided "as is" without express or implied
 * warranty.
 *
 * Created: Thu Sep 26 17:14:05 1991 ylo
 * Last modified: Mon Nov  4 17:06:48 1991 ylo
 * Ported to Think C: 19 Jan 1992 guido@cwi.nl
 *
 * This code draws many ideas from the regular expression packages by
 * Henry Spencer of the University of Toronto and Richard Stallman of
 * the Free Software Foundation.
 *
 * Emacs-specific code and syntax table code is almost directly borrowed
 * from GNU regexp.
 *
 * Bugs fixed and lots of reorganization by Jeffrey C. Ollie, April
 * 1997 Thanks for bug reports and ideas from Andrew Kuchling, Tim
 * Peters, Guido van Rossum, Ka-Ping Yee, Sjoerd Mullender, and
 * probably one or two others that I'm forgetting.
 *
 * $Id: regexpr.c,v 1.1 2003/10/27 08:23:53 riva Exp $ */

#include "Python.h"
#include "regexpr.h"

/* The original code blithely assumed that sizeof(short) == 2.  Not
 * always true.  Original instances of "(short)x" were replaced by
 * SHORT(x), where SHORT is #defined below.  */

#define SHORT(x) ((x) & 0x8000 ? (x) - 0x10000 : (x))

/* The stack implementation is taken from an idea by Andrew Kuchling.
 * It's a doubly linked list of arrays. The advantages of this over a
 * simple linked list are that the number of mallocs required are
 * reduced. It also makes it possible to statically allocate enough
 * space so that small patterns don't ever need to call malloc.
 *
 * The advantages over a single array is that is periodically
 * realloced when more space is needed is that we avoid ever copying
 * the stack. */

/* item_t is the basic stack element.  Defined as a union of
 * structures so that both registers, failure points, and counters can
 * be pushed/popped from the stack.  There's nothing built into the
 * item to keep track of whether a certain stack item is a register, a
 * failure point, or a counter. */

typedef union item_t
{
	struct
	{
		int num;
		int level;
		unsigned char *start;
		unsigned char *end;
	} reg;
	struct
	{
		int count;
		int level;
		int phantom;
		unsigned char *code;
		unsigned char *text;
	} fail;
	struct
	{
		int num;
		int level;
		int count;
	} cntr;
} item_t;

#define STACK_PAGE_SIZE 256
#define NUM_REGISTERS 256

/* A 'page' of stack items. */

typedef struct item_page_t
{
	item_t items[STACK_PAGE_SIZE];
	struct item_page_t *prev;
	struct item_page_t *next;
} item_page_t;


typedef struct match_state
{
	/* The number of registers that have been pushed onto the stack
	 * since the last failure point. */

	int count;

	/* Used to control when registers need to be pushed onto the
	 * stack. */
	
	int level;
	
	/* The number of failure points on the stack. */
	
	int point;
	
	/* Storage for the registers.  Each register consists of two
	 * pointers to characters.  So register N is represented as
	 * start[N] and end[N].  The pointers must be converted to
	 * offsets from the beginning of the string before returning the
	 * registers to the calling program. */
	
	unsigned char *start[NUM_REGISTERS];
	unsigned char *end[NUM_REGISTERS];
	
	/* Keeps track of whether a register has changed recently. */
	
	int changed[NUM_REGISTERS];
	
	/* Structure to encapsulate the stack. */
	struct
	{
		/* index into the current page.  If index == 0 and you need
		 * to pop an item, move to the previous page and set index
		 * = STACK_PAGE_SIZE - 1.  Otherwise decrement index to
		 * push a page. If index == STACK_PAGE_SIZE and you need
		 * to push a page move to the next page and set index =
		 * 0. If there is no new next page, allocate a new page
		 * and link it in. Otherwise, increment index to push a
		 * page. */
		
		int index;
		item_page_t *current; /* Pointer to the current page. */
		item_page_t first; /* First page is statically allocated. */
	} stack;
} match_state;

/* Initialize a state object */

/* #define NEW_STATE(state) \ */
/* memset(&state, 0, (void *)(&state.stack) - (void *)(&state)); \ */
/* state.stack.current = &state.stack.first; \ */
/* state.stack.first.prev = NULL; \ */
/* state.stack.first.next = NULL; \ */
/* state.stack.index = 0; \ */
/* state.level = 1 */

#define NEW_STATE(state, nregs) \
{ \
	int i; \
	for (i = 0; i < nregs; i++) \
	{ \
		state.start[i] = NULL; \
		state.end[i] = NULL; \
		state.changed[i] = 0; \
	} \
	state.stack.current = &state.stack.first; \
	state.stack.first.prev = NULL; \
	state.stack.first.next = NULL; \
	state.stack.index = 0; \
	state.level = 1; \
	state.count = 0; \
	state.level = 0; \
	state.point = 0; \
}

/* Free any memory that might have been malloc'd */

#define FREE_STATE(state) \
while(state.stack.first.next != NULL) \
{ \
	state.stack.current = state.stack.first.next; \
	state.stack.first.next = state.stack.current->next; \
	free(state.stack.current); \
}

/* Discard the top 'count' stack items. */

#define STACK_DISCARD(stack, count, on_error) \
stack.index -= count; \
while (stack.index < 0) \
{ \
	if (stack.current->prev == NULL) \
		on_error; \
	stack.current = stack.current->prev; \
	stack.index += STACK_PAGE_SIZE; \
}

/* Store a pointer to the previous item on the stack. Used to pop an
 * item off of the stack. */

#define STACK_PREV(stack, top, on_error) \
if (stack.index == 0) \
{ \
	if (stack.current->prev == NULL) \
		on_error; \
	stack.current = stack.current->prev; \
	stack.index = STACK_PAGE_SIZE - 1; \
} \
else \
{ \
	stack.index--; \
} \
top = &(stack.current->items[stack.index])

/* Store a pointer to the next item on the stack. Used to push an item
 * on to the stack. */

#define STACK_NEXT(stack, top, on_error) \
if (stack.index == STACK_PAGE_SIZE) \
{ \
	if (stack.current->next == NULL) \
	{ \
		stack.current->next = (item_page_t *)malloc(sizeof(item_page_t)); \
		if (stack.current->next == NULL) \
			on_error; \
		stack.current->next->prev = stack.current; \
		stack.current->next->next = NULL; \
	} \
	stack.current = stack.current->next; \
	stack.index = 0; \
} \
top = &(stack.current->items[stack.index++])

/* Store a pointer to the item that is 'count' items back in the
 * stack. STACK_BACK(stack, top, 1, on_error) is equivalent to
 * STACK_TOP(stack, top, on_error).  */

#define STACK_BACK(stack, top, count, on_error) \
{ \
	int index; \
	item_page_t *current; \
	current = stack.current; \
	index = stack.index - (count); \
	while (index < 0) \
	{ \
		if (current->prev == NULL) \
			on_error; \
		current = current->prev; \
		index += STACK_PAGE_SIZE; \
	} \
	top = &(current->items[index]); \
}

/* Store a pointer to the top item on the stack. Execute the
 * 'on_error' code if there are no items on the stack. */

#define STACK_TOP(stack, top, on_error) \
if (stack.index == 0) \
{ \
	if (stack.current->prev == NULL) \
		on_error; \
	top = &(stack.current->prev->items[STACK_PAGE_SIZE - 1]); \
} \
else \
{ \
	top = &(stack.current->items[stack.index - 1]); \
}

/* Test to see if the stack is empty */

#define STACK_EMPTY(stack) ((stack.index == 0) && \
			    (stack.current->prev == NULL))

/* Return the start of register 'reg' */

#define GET_REG_START(state, reg) (state.start[reg])

/* Return the end of register 'reg' */

#define GET_REG_END(state, reg) (state.end[reg])

/* Set the start of register 'reg'. If the state of the register needs
 * saving, push it on the stack. */

#define SET_REG_START(state, reg, text, on_error) \
if(state.changed[reg] < state.level) \
{ \
	item_t *item; \
	STACK_NEXT(state.stack, item, on_error); \
	item->reg.num = reg; \
	item->reg.start = state.start[reg]; \
	item->reg.end = state.end[reg]; \
	item->reg.level = state.changed[reg]; \
	state.changed[reg] = state.level; \
	state.count++; \
} \
state.start[reg] = text

/* Set the end of register 'reg'. If the state of the register needs
 * saving, push it on the stack. */

#define SET_REG_END(state, reg, text, on_error) \
if(state.changed[reg] < state.level) \
{ \
	item_t *item; \
	STACK_NEXT(state.stack, item, on_error); \
	item->reg.num = reg; \
	item->reg.start = state.start[reg]; \
	item->reg.end = state.end[reg]; \
	item->reg.level = state.changed[reg]; \
	state.changed[reg] = state.level; \
	state.count++; \
} \
state.end[reg] = text

#define PUSH_FAILURE(state, xcode, xtext, on_error) \
{ \
	item_t *item; \
	STACK_NEXT(state.stack, item, on_error); \
	item->fail.code = xcode; \
	item->fail.text = xtext; \
	item->fail.count = state.count; \
	item->fail.level = state.level; \
	item->fail.phantom = 0; \
	state.count = 0; \
	state.level++; \
	state.point++; \
}

/* Update the last failure point with a new position in the text. */

#define UPDATE_FAILURE(state, xtext, on_error) \
{ \
	item_t *item; \
	STACK_BACK(state.stack, item, state.count + 1, on_error); \
	if (!item->fail.phantom) \
	{ \
		item_t *item2; \
		STACK_NEXT(state.stack, item2, on_error); \
		item2->fail.code = item->fail.code; \
		item2->fail.text = xtext; \
		item2->fail.count = state.count; \
		item2->fail.level = state.level; \
		item2->fail.phantom = 1; \
		state.count = 0; \
		state.level++; \
		state.point++; \
	} \
	else \
	{ \
		STACK_DISCARD(state.stack, state.count, on_error); \
		STACK_TOP(state.stack, item, on_error); \
		item->fail.text = xtext; \
		state.count = 0; \
		state.level++; \
	} \
}

#define POP_FAILURE(state, xcode, xtext, on_empty, on_error) \
{ \
	item_t *item; \
	do \
	{ \
		while(state.count > 0) \
		{ \
			STACK_PREV(state.stack, item, on_error); \
			state.start[item->reg.num] = item->reg.start; \
			state.end[item->reg.num] = item->reg.end; \
			state.changed[item->reg.num] = item->reg.level; \
			state.count--; \
		} \
		STACK_PREV(state.stack, item, on_empty); \
		xcode = item->fail.code; \
		xtext = item->fail.text; \
		state.count = item->fail.count; \
		state.level = item->fail.level; \
		state.point--; \
	} \
	while (item->fail.text == NULL); \
}

enum regexp_compiled_ops /* opcodes for compiled regexp */
{
	Cend,		      /* end of pattern reached */
	Cbol,		      /* beginning of line */
	Ceol,		      /* end of line */
	Cset,		      /* character set.  Followed by 32 bytes of set. */
	Cexact,		      /* followed by a byte to match */
	Canychar,	      /* matches any character except newline */
	Cstart_memory,	      /* set register start addr (followed by reg number) */
	Cend_memory,	      /* set register end addr (followed by reg number) */
	Cmatch_memory,	      /* match a duplicate of reg contents (regnum follows)*/
	Cjump,		      /* followed by two bytes (lsb,msb) of displacement. */
	Cstar_jump,	      /* will change to jump/update_failure_jump at runtime */
	Cfailure_jump,	      /* jump to addr on failure */
	Cupdate_failure_jump, /* update topmost failure point and jump */
	Cdummy_failure_jump,  /* push a dummy failure point and jump */
	Cbegbuf,	      /* match at beginning of buffer */
	Cendbuf,	      /* match at end of buffer */
	Cwordbeg,	      /* match at beginning of word */
	Cwordend,	      /* match at end of word */
	Cwordbound,	      /* match if at word boundary */
	Cnotwordbound,        /* match if not at word boundary */
	Csyntaxspec,	      /* matches syntax code (1 byte follows) */
	Cnotsyntaxspec,       /* matches if syntax code does not match (1 byte follows) */
	Crepeat1
};

enum regexp_syntax_op	/* syntax codes for plain and quoted characters */
{
	Rend,		  /* special code for end of regexp */
	Rnormal,	  /* normal character */
	Ranychar,	  /* any character except newline */
	Rquote,		  /* the quote character */
	Rbol,		  /* match beginning of line */
	Reol,		  /* match end of line */
	Roptional,	  /* match preceding expression optionally */
	Rstar,		  /* match preceding expr zero or more times */
	Rplus,		  /* match preceding expr one or more times */
	Ror,		  /* match either of alternatives */
	Ropenpar,	  /* opening parenthesis */
	Rclosepar,	  /* closing parenthesis */
	Rmemory,	  /* match memory register */
	Rextended_memory, /* \vnn to match registers 10-99 */
	Ropenset,	  /* open set.  Internal syntax hard-coded below. */
	/* the following are gnu extensions to "normal" regexp syntax */
	Rbegbuf,	  /* beginning of buffer */
	Rendbuf,	  /* end of buffer */
	Rwordchar,	  /* word character */
	Rnotwordchar,	  /* not word character */
	Rwordbeg,	  /* beginning of word */
	Rwordend,	  /* end of word */
	Rwordbound,	  /* word bound */
	Rnotwordbound,	  /* not word bound */
	Rnum_ops
};

static int re_compile_initialized = 0;
static int regexp_syntax = 0;
int re_syntax = 0; /* Exported copy of regexp_syntax */
static unsigned char regexp_plain_ops[256];
static unsigned char regexp_quoted_ops[256];
static unsigned char regexp_precedences[Rnum_ops];
static int regexp_context_indep_ops;
static int regexp_ansi_sequences;

#define NUM_LEVELS  5    /* number of precedence levels in use */
#define MAX_NESTING 100  /* max nesting level of operators */

#define SYNTAX(ch) re_syntax_table[(unsigned char)(ch)]

unsigned char re_syntax_table[256];

void re_compile_initialize(void)
{
	int a;
  
	static int syntax_table_inited = 0;

	if (!syntax_table_inited)
	{
		syntax_table_inited = 1;
		memset(re_syntax_table, 0, 256);
		for (a = 'a'; a <= 'z'; a++)
			re_syntax_table[a] = Sword;
		for (a = 'A'; a <= 'Z'; a++)
			re_syntax_table[a] = Sword;
		for (a = '0'; a <= '9'; a++)
			re_syntax_table[a] = Sword | Sdigit | Shexdigit;
		for (a = '0'; a <= '7'; a++)
			re_syntax_table[a] |= Soctaldigit;
		for (a = 'A'; a <= 'F'; a++)
			re_syntax_table[a] |= Shexdigit;
		for (a = 'a'; a <= 'f'; a++)
			re_syntax_table[a] |= Shexdigit;
		re_syntax_table['_'] = Sword;
		for (a = 9; a <= 13; a++)
			re_syntax_table[a] = Swhitespace;
		re_syntax_table[' '] = Swhitespace;
	}
	re_compile_initialized = 1;
	for (a = 0; a < 256; a++)
	{
		regexp_plain_ops[a] = Rnormal;
		regexp_quoted_ops[a] = Rnormal;
	}
	for (a = '0'; a <= '9'; a++)
		regexp_quoted_ops[a] = Rmemory;
	regexp_plain_ops['\134'] = Rquote;
	if (regexp_syntax & RE_NO_BK_PARENS)
	{
		regexp_plain_ops['('] = Ropenpar;
		regexp_plain_ops[')'] = Rclosepar;
	}
	else
	{
		regexp_quoted_ops['('] = Ropenpar;
		regexp_quoted_ops[')'] = Rclosepar;
	}
	if (regexp_syntax & RE_NO_BK_VBAR)
		regexp_plain_ops['\174'] = Ror;
	else
		regexp_quoted_ops['\174'] = Ror;
	regexp_plain_ops['*'] = Rstar;
	if (regexp_syntax & RE_BK_PLUS_QM)
	{
		regexp_quoted_ops['+'] = Rplus;
		regexp_quoted_ops['?'] = Roptional;
	}
	else
	{
		regexp_plain_ops['+'] = Rplus;
		regexp_plain_ops['?'] = Roptional;
	}
	if (regexp_syntax & RE_NEWLINE_OR)
		regexp_plain_ops['\n'] = Ror;
	regexp_plain_ops['\133'] = Ropenset;
	regexp_plain_ops['\136'] = Rbol;
	regexp_plain_ops['$'] = Reol;
	regexp_plain_ops['.'] = Ranychar;
	if (!(regexp_syntax & RE_NO_GNU_EXTENSIONS))
	{
		regexp_quoted_ops['w'] = Rwordchar;
		regexp_quoted_ops['W'] = Rnotwordchar;
		regexp_quoted_ops['<'] = Rwordbeg;
		regexp_quoted_ops['>'] = Rwordend;
		regexp_quoted_ops['b'] = Rwordbound;
		regexp_quoted_ops['B'] = Rnotwordbound;
		regexp_quoted_ops['`'] = Rbegbuf;
12 3 4 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -