regexp.c

来自「操作系统源代码」· C语言代码 · 共 935 行 · 第 1/2 页
935 行
/* regexp.c *//* This file contains the code that compiles regular expressions and executes * them.  It supports the same syntax and features as vi's regular expression * code.  Specifically, the meta characters are: *	^	matches the beginning of a line *	$	matches the end of a line *	\<	matches the beginning of a word *	\>	matches the end of a word *	.	matches any single character *	[]	matches any character in a character class *	\(	delimits the start of a subexpression *	\)	delimits the end of a subexpression *	*	repeats the preceding 0 or more times * NOTE: You cannot follow a \) with a *. * * The physical structure of a compiled RE is as follows: *	- First, there is a one-byte value that says how many character classes *	  are used in this regular expression *	- Next, each character class is stored as a bitmap that is 256 bits *	  (32 bytes) long. *	- A mixture of literal characters and compiled meta characters follows. *	  This begins with M_BEGIN(0) and ends with M_END(0).  All meta chars *	  are stored as a \n followed by a one-byte code, so they take up two *	  bytes apiece.  Literal characters take up one byte apiece.  \n can't *	  be used as a literal character. * * If NO_MAGIC is defined, then a different set of functions is used instead. * That right, this file contains TWO versions of the code. */#include <setjmp.h>#include "config.h"#include "ctype.h"#include "vi.h"#include "regexp.h"static char	*previous;	/* the previous regexp, used when null regexp is given */#ifndef NO_MAGIC/* THE REAL REGEXP PACKAGE IS USED UNLESS "NO_MAGIC" IS DEFINED *//* These are used to classify or recognize meta-characters */#define META		'\0'#define BASE_META(m)	((m) - 256)#define INT_META(c)	((c) + 256)#define IS_META(m)	((m) >= 256)#define IS_CLASS(m)	((m) >= M_CLASS(0) && (m) <= M_CLASS(9))#define IS_START(m)	((m) >= M_START(0) && (m) <= M_START(9))#define IS_END(m)	((m) >= M_END(0) && (m) <= M_END(9))#define IS_CLOSURE(m)	((m) >= M_SPLAT && (m) <= M_RANGE)#define ADD_META(s,m)	(*(s)++ = META, *(s)++ = BASE_META(m))#define GET_META(s)	(*(s) == META ? INT_META(*++(s)) : *s)/* These are the internal codes used for each type of meta-character */#define M_BEGLINE	256		/* internal code for ^ */#define M_ENDLINE	257		/* internal code for $ */#define M_BEGWORD	258		/* internal code for \< */#define M_ENDWORD	259		/* internal code for \> */#define M_ANY		260		/* internal code for . */#define M_SPLAT		261		/* internal code for * */#define M_PLUS		262		/* internal code for \+ */#define M_QMARK		263		/* internal code for \? */#define M_RANGE		264		/* internal code for \{ */#define M_CLASS(n)	(265+(n))	/* internal code for [] */#define M_START(n)	(275+(n))	/* internal code for \( */#define M_END(n)	(285+(n))	/* internal code for \) *//* These are used during compilation */static int	class_cnt;	/* used to assign class IDs */static int	start_cnt;	/* used to assign start IDs */static int	end_stk[NSUBEXP];/* used to assign end IDs */static int	end_sp;static char	*retext;	/* points to the text being compiled *//* error-handling stuff */jmp_buf	errorhandler;#define FAIL(why)	regerror(why); longjmp(errorhandler, 1)/* This function builds a bitmap for a particular class */static char *makeclass(text, bmap)	REG char	*text;	/* start of the class */	REG char	*bmap;	/* the bitmap */{	REG int		i;	int		complement = 0;	/* zero the bitmap */	for (i = 0; bmap && i < 32; i++)	{		bmap[i] = 0;	}	/* see if we're going to complement this class */	if (*text == '^')	{		text++;		complement = 1;	}	/* add in the characters */	while (*text && *text != ']')	{		/* is this a span of characters? */		if (text[1] == '-' && text[2])		{			/* spans can't be backwards */			if (text[0] > text[2])			{				FAIL("Backwards span in []");			}			/* add each character in the span to the bitmap */			for (i = text[0]; bmap && i <= text[2]; i++)			{				bmap[i >> 3] |= (1 << (i & 7));			}			/* move past this span */			text += 3;		}		else		{			/* add this single character to the span */			i = *text++;			if (bmap)			{				bmap[i >> 3] |= (1 << (i & 7));			}		}	}	/* make sure the closing ] is missing */	if (*text++ != ']')	{		FAIL("] missing");	}	/* if we're supposed to complement this class, then do so */	if (complement && bmap)	{		for (i = 0; i < 32; i++)		{			bmap[i] = ~bmap[i];		}	}	return text;}/* This function gets the next character or meta character from a string. * The pointer is incremented by 1, or by 2 for \-quoted characters.  For [], * a bitmap is generated via makeclass() (if re is given), and the * character-class text is skipped. */static int gettoken(sptr, re)	char	**sptr;	regexp	*re;{	int	c;	c = **sptr;	++*sptr;	if (c == '\\')	{		c = **sptr;		++*sptr;		switch (c)		{		  case '<':			return M_BEGWORD;		  case '>':			return M_ENDWORD;		  case '(':			if (start_cnt >= NSUBEXP)			{				FAIL("Too many \\(s");			}			end_stk[end_sp++] = start_cnt;			return M_START(start_cnt++);		  case ')':			if (end_sp <= 0)			{				FAIL("Mismatched \\)");			}			return M_END(end_stk[--end_sp]);		  case '*':			return (*o_magic ? c : M_SPLAT);		  case '.':			return (*o_magic ? c : M_ANY);		  case '+':			return M_PLUS;		  case '?':			return M_QMARK;#ifndef CRUNCH		  case '{':			return M_RANGE;#endif		  default:			return c;		}	}	else if (*o_magic)	{		switch (c)		{		  case '^':			if (*sptr == retext + 1)			{				return M_BEGLINE;			}			return c;		  case '$':			if (!**sptr)			{				return M_ENDLINE;			}			return c;		  case '.':			return M_ANY;		  case '*':			return M_SPLAT;		  case '[':			/* make sure we don't have too many classes */			if (class_cnt >= 10)			{				FAIL("Too many []s");			}			/* process the character list for this class */			if (re)			{				/* generate the bitmap for this class */				*sptr = makeclass(*sptr, re->program + 1 + 32 * class_cnt);			}			else			{				/* skip to end of the class */				*sptr = makeclass(*sptr, (char *)0);			}			return M_CLASS(class_cnt++);		  default:			return c;		}	}	else	/* unquoted nomagic */	{		switch (c)		{		  case '^':			if (*sptr == retext + 1)			{				return M_BEGLINE;			}			return c;		  case '$':			if (!**sptr)			{				return M_ENDLINE;			}			return c;		  default:			return c;		}	}	/*NOTREACHED*/}/* This function calculates the number of bytes that will be needed for a * compiled RE.  Its argument is the uncompiled version.  It is not clever * about catching syntax errors; that is done in a later pass. */static unsigned calcsize(text)	char		*text;{	unsigned	size;	int		token;	retext = text;	class_cnt = 0;	start_cnt = 1;	end_sp = 0;	size = 5;	while ((token = gettoken(&text, (regexp *)0)) != 0)	{		if (IS_CLASS(token))		{			size += 34;		}#ifndef CRUNCH		else if (token == M_RANGE)		{			size += 4;			while ((token = gettoken(&text, (regexp *)0)) != 0			    && token != '}')			{			}			if (!token)			{				return size;			}		}#endif		else if (IS_META(token))		{			size += 2;		}		else		{			size++;		}	}	return size;}/* This function compiles a regexp. */regexp *regcomp(exp)	char		*exp;{	int		needfirst;	unsigned	size;	int		token;	int		peek;	char		*build;	regexp		*re;#ifndef CRUNCH	int		from;	int		to;	int		digit;#endif	/* prepare for error handling */	re = (regexp *)0;	if (setjmp(errorhandler))	{		if (re)		{			free(re);		}		return (regexp *)0;	}	/* if an empty regexp string was given, use the previous one */	if (*exp == 0)	{		if (!previous)		{			FAIL("No previous RE");		}		exp = previous;	}	else /* non-empty regexp given, so remember it */	{		if (previous)			free(previous);		previous = (char *)malloc((unsigned)(strlen(exp) + 1));		if (previous)			strcpy(previous, exp);	}	/* allocate memory */	class_cnt = 0;	start_cnt = 1;	end_sp = 0;	retext = exp;	size = calcsize(exp) + sizeof(regexp) + 10; /* !!! 10 bytes for slop */#ifdef lint	re = ((regexp *)0) + size;#else	re = (regexp *)malloc((unsigned)size);#endif	if (!re)	{		FAIL("Not enough memory for this RE");	}	/* compile it */	build = &re->program[1 + 32 * class_cnt];	re->program[0] = class_cnt;	for (token = 0; token < NSUBEXP; token++)	{		re->startp[token] = re->endp[token] = (char *)0;	}	re->first = 0;	re->bol = 0;	re->minlen = 0;	needfirst = 1;	class_cnt = 0;	start_cnt = 1;	end_sp = 0;	retext = exp;	for (token = M_START(0), peek = gettoken(&exp, re);	     token;	     token = peek, peek = gettoken(&exp, re))	{		/* special processing for the closure operator */		if (IS_CLOSURE(peek))		{			/* detect misuse of closure operator */			if (IS_START(token))			{				FAIL("Closure operator follows nothing");			}			else if (IS_META(token) && token != M_ANY && !IS_CLASS(token))			{				FAIL("Closure operators can only follow a normal character or . or []");			}#ifndef CRUNCH			/* if \{ \} then read the range */			if (peek == M_RANGE)			{				from = 0;				for (digit = gettoken(&exp, re);				     !IS_META(digit) && isdigit(digit);				     digit = gettoken(&exp, re))				{					from = from * 10 + digit - '0';				}				if (digit == '}')				{					to = from;				}				else if (digit == ',')				{					to = 0;					for (digit = gettoken(&exp, re);					     !IS_META(digit) && isdigit(digit);					     digit = gettoken(&exp, re))					{						to = to * 10 + digit - '0';					}					if (to == 0)					{						to = 255;					}
regexp.c - 源码说明

本页面展示了「操作系统源代码」中的 regexp.c 源码文件，采用 C语言编程语言编写，共 935 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与操作系统相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?