📄 regexp.c

📁 VIM文本编辑器
💻 C
📖 第 1 页 / 共 5 页
字号:
12 3 4 5 下一页
/* vi:set ts=8 sts=4 sw=4:
 *
 * Handling of regular expressions: vim_regcomp(), vim_regexec(), vim_regsub()
 *
 * NOTICE:
 *
 * This is NOT the original regular expression code as written by Henry
 * Spencer.  This code has been modified specifically for use with the VIM
 * editor, and should not be used apart from compiling VIM.  If you want a
 * good regular expression library, get the original code.  The copyright
 * notice that follows is from the original.
 *
 * END NOTICE
 *
 *	Copyright (c) 1986 by University of Toronto.
 *	Written by Henry Spencer.  Not derived from licensed software.
 *
 *	Permission is granted to anyone to use this software for any
 *	purpose on any computer system, and to redistribute it freely,
 *	subject to the following restrictions:
 *
 *	1. The author is not responsible for the consequences of use of
 *		this software, no matter how awful, even if they arise
 *		from defects in it.
 *
 *	2. The origin of this software must not be misrepresented, either
 *		by explicit claim or by omission.
 *
 *	3. Altered versions must be plainly marked as such, and must not
 *		be misrepresented as being the original software.
 *
 * Beware that some of this code is subtly aware of the way operator
 * precedence is structured in regular expressions.  Serious changes in
 * regular-expression syntax might require a total rethink.
 *
 * Changes have been made by Tony Andrews, Olaf 'Rhialto' Seibert, Robert Webb
 * and Bram Moolenaar.
 * Named character class support added by Walter Briscoe (1998 Jul 01)
 */

#include "vim.h"

#undef DEBUG

#include <stdio.h>
#include "option.h"

/*
 * Get around a problem with #defined char class functions.
 */
#ifdef isalnum
static int myisalnum __ARGS((int c));
static int myisalnum(c) int c; { return isalnum(c); }
# undef isalnum
# define isalnum myisalnum
#endif
#ifdef isalpha
static int myisalpha __ARGS((int c));
static int myisalpha(c) int c; { return isalpha(c); }
# undef isalpha
# define isalpha myisalpha
#endif
#ifdef iscntrl
static int myiscntrl __ARGS((int c));
static int myiscntrl(c) int c; { return iscntrl(c); }
# undef iscntrl
# define iscntrl myiscntrl
#endif
#ifdef isdigit
static int myisdigit __ARGS((int c));
static int myisdigit(c) int c; { return isdigit(c); }
# undef isdigit
# define isdigit myisdigit
#endif
# ifdef isgraph
static int myisgraph __ARGS((int c));
static int myisgraph(c) int c; { return isgraph(c); }
# undef isgraph
# define isgraph myisgraph
#endif
#ifdef islower
static int myislower __ARGS((int c));
static int myislower(c) int c; { return islower(c); }
# undef islower
# define islower myislower
#endif
#ifdef ispunct
static int myispunct __ARGS((int c));
static int myispunct(c) int c; { return ispunct(c); }
# undef ispunct
# define ispunct myispunct
#endif
#ifdef isupper
static int myisupper __ARGS((int c));
static int myisupper(c) int c; { return isupper(c); }
# undef isupper
# define isupper myisupper
#endif
#ifdef isxdigit
static int myisxdigit __ARGS((int c));
static int myisxdigit(c) int c; { return isxdigit(c); }
# undef isxdigit
# define isxdigit myisxdigit
#endif

/*
 * The "internal use only" fields in regexp.h are present to pass info from
 * compile to execute that permits the execute phase to run lots faster on
 * simple cases.  They are:
 *
 * regstart	char that must begin a match; '\0' if none obvious
 * reganch	is the match anchored (at beginning-of-line only)?
 * regmust	string (pointer into program) that match must include, or NULL
 * regmlen	length of regmust string
 *
 * Regstart and reganch permit very fast decisions on suitable starting points
 * for a match, cutting down the work a lot.  Regmust permits fast rejection
 * of lines that cannot possibly match.  The regmust tests are costly enough
 * that vim_regcomp() supplies a regmust only if the r.e. contains something
 * potentially expensive (at present, the only such thing detected is * or +
 * at the start of the r.e., which can involve a lot of backup).  Regmlen is
 * supplied because the test in vim_regexec() needs it and vim_regcomp() is
 * computing it anyway.
 */

/*
 * Structure for regexp "program".  This is essentially a linear encoding
 * of a nondeterministic finite-state machine (aka syntax charts or
 * "railroad normal form" in parsing technology).  Each node is an opcode
 * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
 * all nodes except BRANCH and BRACES_COMPLEX implement concatenation; a "next"
 * pointer with a BRANCH on both ends of it is connecting two alternatives.
 * (Here we have one of the subtle syntax dependencies:	an individual BRANCH
 * (as opposed to a collection of them) is never concatenated with anything
 * because of operator precedence).  The "next" pointer of a BRACES_COMPLEX
 * node points to the node after the stuff to be repeated.  The operand of some
 * types of node is a literal string; for others, it is a node leading into a
 * sub-FSM.  In particular, the operand of a BRANCH node is the first node of
 * the branch.	(NB this is *not* a tree structure: the tail of the branch
 * connects to the thing following the set of BRANCHes.)  The opcodes are:
 */

/* definition	number		   opnd?    meaning */
#define END		0	/* no	End of program. */
#define BOL		1	/* no	Match "" at beginning of line. */
#define EOL		2	/* no	Match "" at end of line. */
#define ANY		3	/* no	Match any one character. */
#define ANYOF		4	/* str	Match any character in this string. */
#define ANYBUT		5	/* str	Match any character not in this
				 *	string. */
#define BRANCH		6	/* node Match this alternative, or the
				 *	next... */
#define BACK		7	/* no	Match "", "next" ptr points backward. */
#define EXACTLY		8	/* str	Match this string. */
#define NOTHING		9	/* no	Match empty string. */
#define STAR		10	/* node Match this (simple) thing 0 or more
				 *	times. */
#define PLUS		11	/* node Match this (simple) thing 1 or more
				 *	times. */
#define BRACE_SIMPLE	12	/* node Match this (simple) thing between m and
				 *	n times (\{m,n\}). */
#define BOW		13	/* no	Match "" after [^a-zA-Z0-9_] */
#define EOW		14	/* no	Match "" at    [^a-zA-Z0-9_] */
#define IDENT		15	/* no	Match identifier char */
#define KWORD		16	/* no	Match keyword char */
#define FNAME		17	/* no	Match file name char */
#define PRINT		18	/* no	Match printable char */
#define SIDENT		19	/* no	Match identifier char but no digit */
#define SWORD		20	/* no	Match word char but no digit */
#define SFNAME		21	/* no	Match file name char but no digit */
#define SPRINT		22	/* no	Match printable char but no digit */
#define BRACE_LIMITS	23	/* 2int	define the min & max for BRACE_SIMPLE
				 *	and BRACE_COMPLEX. */
#define WHITE		24	/* no	Match whitespace char */
#define NWHITE		25	/* no	Match non-whitespace char */
#define DIGIT		26	/* no	Match digit char */
#define NDIGIT		27	/* no	Match non-digit char */
#define HEX		28	/* no   Match hex char */
#define NHEX		29	/* no   Match non-hex char */
#define OCTAL		30	/* no	Match octal char */
#define NOCTAL		31	/* no	Match non-octal char */
#define WORD		32	/* no	Match word char */
#define NWORD		33	/* no	Match non-word char */
#define HEAD		34	/* no	Match head char */
#define NHEAD		35	/* no	Match non-head char */
#define ALPHA		36	/* no	Match alpha char */
#define NALPHA		37	/* no	Match non-alpha char */
#define LOWER		38	/* no	Match lowercase char */
#define NLOWER		39	/* no	Match non-lowercase char */
#define UPPER		40	/* no	Match uppercase char */
#define NUPPER		41	/* no	Match non-uppercase char */
#define MOPEN		60 /* -69  no	Mark this point in input as start of
				 *	\( subexpr. */
#define MCLOSE		70 /* -79  no	Analogous to MOPEN. */
#define BACKREF		80 /* -89  node Match same string again \1-\9 */
#define BRACE_COMPLEX	90 /* -99  node Match nodes between m & n times */

#define Magic(x)    ((x) | ('\\' << 8))

/*
 * Opcode notes:
 *
 * BRANCH	The set of branches constituting a single choice are hooked
 *		together with their "next" pointers, since precedence prevents
 *		anything being concatenated to any individual branch.  The
 *		"next" pointer of the last BRANCH in a choice points to the
 *		thing following the whole choice.  This is also where the
 *		final "next" pointer of each individual branch points; each
 *		branch starts with the operand node of a BRANCH node.
 *
 * BACK		Normal "next" pointers all implicitly point forward; BACK
 *		exists to make loop structures possible.
 *
 * STAR,PLUS	'=', and complex '*' and '+', are implemented as circular
 *		BRANCH structures using BACK.  Simple cases (one character
 *		per match) are implemented with STAR and PLUS for speed
 *		and to minimize recursive plunges.
 *		Note: We would like to use "\?" instead of "\=", but a "\?"
 *		can be part of a pattern to escape the special meaning of '?'
 *		at the end of the pattern in "?pattern?e".
 *
 * BRACE_LIMITS	This is always followed by a BRACE_SIMPLE or BRACE_COMPLEX
 *		node, and defines the min and max limits to be used for that
 *		node.
 *
 * MOPEN,MCLOSE	...are numbered at compile time.
 */

/*
 * A node is one char of opcode followed by two chars of "next" pointer.
 * "Next" pointers are stored as two 8-bit pieces, high order first.  The
 * value is a positive offset from the opcode of the node containing it.
 * An operand, if any, simply follows the node.  (Note that much of the
 * code generation knows about this implicit relationship.)
 *
 * Using two bytes for the "next" pointer is vast overkill for most things,
 * but allows patterns to get big without disasters.
 */
#define OP(p)		((int)*(p))
#define NEXT(p)		(((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
#define OPERAND(p)	((p) + 3)
#define OPERAND_MIN(p)	(((*((p)+3)&0377)<<8) + (*((p)+4)&0377))
#define OPERAND_MAX(p)	(((*((p)+5)&0377)<<8) + (*((p)+6)&0377))

/*
 * Utility definitions.
 */
#ifndef CHARBITS
# define UCHARAT(p)	((int)*(unsigned char *)(p))
#else
# define UCHARAT(p)	((int)*(p)&CHARBITS)
#endif

#define EMSG_RETURN(m) { emsg(m); rc_did_emsg = TRUE; return NULL; }

#define MAX_LIMIT	32767

static int re_ismult __ARGS((int));
static int cstrncmp __ARGS((char_u *s1, char_u *s2, int n));
static char_u *cstrchr __ARGS((char_u *, int));

#ifdef DEBUG
static void	regdump __ARGS((char_u *, vim_regexp *));
static char_u	*regprop __ARGS((char_u *));
#endif

    static int
re_ismult(c)
    int c;
{
    return (c == Magic('*') || c == Magic('+') || c == Magic('=') ||
	    c == Magic('{'));
}

/*
 * Flags to be passed up and down.
 */
#define HASWIDTH	01	/* Known never to match null string. */
#define SIMPLE		02	/* Simple enough to be STAR/PLUS operand. */
#define SPSTART		04	/* Starts with * or +. */
#define WORST		0	/* Worst case. */

/*
 * When regcode is set to this value, code is not emitted and size is computed
 * instead.
 */
#define JUST_CALC_SIZE	((char_u *) -1)

static char_u		*reg_prev_sub;

/*
 * REGEXP_INRANGE contains all characters which are always special in a []
 * range after '\'.
 * REGEXP_ABBR contains all characters which act as abbreviations after '\'.
 * These are:
 *  \r	- New line (CR).
 *  \t	- Tab (TAB).
 *  \e	- Escape (ESC).
 *  \b	- Backspace (Ctrl('H')).
 */
static char_u REGEXP_INRANGE[] = "]^-\\";
static char_u REGEXP_ABBR[] = "rteb";

static int	backslash_trans __ARGS((int c));
static int	my_isblank __ARGS((int c));
static int	(* skip_class_name __ARGS((char_u **pp)))__ARGS((int));
static char_u * skip_range __ARGS((char_u *p));
static void	init_class_tab __ARGS((void));

    static int
backslash_trans(c)
    int	    c;
{
    switch (c)
    {
	case 'r':   return CR;
	case 't':   return TAB;
	case 'e':   return ESC;
	case 'b':   return Ctrl('H');
    }
    return c;
}

/*
 * Function version of the macro vim_iswhite().
 */
    static int
my_isblank(c)
    int		c;
{
    return vim_iswhite(c);
}

/*
 * Check for a character class name.  "pp" is at the '['.
 * If not: NULL is returned; If so, a function of the sort is* is returned and
 * the name is skipped.
 */
    static int (*
skip_class_name(pp))__ARGS((int))
    char_u	**pp;
{
    typedef struct
    {
	size_t	    len;
	int	    (*func)__ARGS((int));
	char_u	    name[sizeof("xdigit:]")];
    } namedata_t;

#define t(n, func) { sizeof(n) - 1, func, n }
    static const namedata_t class_names[] =
    {
	t("alnum:]", isalnum),		t("alpha:]", isalpha),
	t("blank:]", my_isblank),	t("cntrl:]", iscntrl),
	t("digit:]", isdigit),		t("graph:]", isgraph),
	t("lower:]", islower),		t("print:]", vim_isprintc),
	t("punct:]", ispunct),		t("space:]", vim_isspace),
	t("upper:]", isupper),		t("xdigit:]", isxdigit)
    };
#undef t

    const namedata_t *np;

    if ((*pp)[1] != ':')
	return NULL;
    for (   np = class_names;
	    np < class_names + sizeof(class_names) / sizeof(*class_names);
	    np++)
	if (STRNCMP(*pp+2, np->name, np->len) == 0)
	{
	    *pp += np->len + 2;
	    return np->func;
	}
    return NULL;
}

/*
 * Skip over a "[]" range.
 * "p" must point to the character after the '['.
 * The returned pointer is on the matching ']', or the terminating NUL.
 */
    static char_u *
skip_range(p)
    char_u	*p;
{
    int		    cpo_lit;	    /* 'cpoptions' contains 'l' flag */

    cpo_lit = (!reg_syn && vim_strchr(p_cpo, CPO_LITERAL) != NULL);

    if (*p == '^')	/* Complement of range. */
	++p;
    if (*p == ']' || *p == '-')
	++p;
    while (*p != NUL && *p != ']')
    {
	if (*p == '-')
	{
	    ++p;
	    if (*p != ']' && *p != NUL)
		++p;
	}
	else if (*p == '\\'
		&& (vim_strchr(REGEXP_INRANGE, p[1]) != NULL
		    || (!cpo_lit && vim_strchr(REGEXP_ABBR, p[1]) != NULL)))
	    p += 2;
	else if (*p == '[')
	{
	    if (skip_class_name(&p) == NULL)
		++p; /* It was not a class name */
	}
	else
	    ++p;
    }

    return p;
}

/*
 * Specific version of character class functions.
 * Using a table to keep this fast.
 */
static char_u	class_tab[256];

#define	    RI_DIGIT	0x01
#define	    RI_HEX	0x02
#define	    RI_OCTAL	0x04
#define	    RI_WORD	0x08
#define	    RI_HEAD	0x10
#define	    RI_ALPHA	0x20
#define	    RI_LOWER	0x40
#define	    RI_UPPER	0x80

    static void
init_class_tab()
{
    int		i;
    static int	done = FALSE;

    if (done)
	return;

    for (i = 0; i < 256; ++i)
    {
	if (i >= '0' && i <= '7')
	    class_tab[i] = RI_DIGIT + RI_HEX + RI_OCTAL + RI_WORD;
	else if (i >= '8' && i <= '9')
	    class_tab[i] = RI_DIGIT + RI_HEX + RI_WORD;
	else if (i >= 'a' && i <= 'f')
	    class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
	else if (i >= 'g' && i <= 'z')
	    class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
	else if (i >= 'A' && i <= 'F')
	    class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
	else if (i >= 'G' && i <= 'Z')
	    class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
	else if (i == '_')
	    class_tab[i] = RI_WORD + RI_HEAD;
    }
    done = TRUE;
}

#define ri_digit(c)	(class_tab[c] & RI_DIGIT)
#define ri_hex(c)	(class_tab[c] & RI_HEX)
#define ri_octal(c)	(class_tab[c] & RI_OCTAL)
#define ri_word(c)	(class_tab[c] & RI_WORD)
#define ri_head(c)	(class_tab[c] & RI_HEAD)
#define ri_alpha(c)	(class_tab[c] & RI_ALPHA)
#define ri_lower(c)	(class_tab[c] & RI_LOWER)
#define ri_upper(c)	(class_tab[c] & RI_UPPER)

/*
 * Global work variables for vim_regcomp().
 */

static char_u	*regparse;  /* Input-scan pointer. */
static int	num_complex_braces; /* Complex \{...} count */
static int	regnpar;	/* () count. */
static char_u	*regcode;	/* Code-emit pointer, or JUST_CALC_SIZE */
static long	regsize;	/* Code size. */
static char_u	**regendp;	/* Pointer to endp array */
static int	brace_min[10];	/* Minimums for complex brace repeats */
static int	brace_max[10];	/* Maximums for complex brace repeats */
static int	brace_count[10]; /* Current counts for complex brace repeats */
static int	had_eol;	/* TRUE when EOL found by vim_regcomp() */
static int	reg_magic;	/* p_magic passed to vim_regexec() */

/*
 * META contains all characters that may be magic, except '^' and '$'.
 */

static char_u META[] = ".[()|=+*<>iIkKfFpPsSdDxXoOwWhHaAlLuU~123456789{";

/*
 * Forward declarations for vim_regcomp()'s friends.
 */
static void	initchr __ARGS((char_u *));
static int	getchr __ARGS((void));
static int	peekchr __ARGS((void));
#define PeekChr() curchr    /* shortcut only when last action was peekchr() */
static int	curchr;
static void	skipchr __ARGS((void));
static void	ungetchr __ARGS((void));
static char_u	*reg __ARGS((int, int *));
static char_u	*regbranch __ARGS((int *));
static char_u	*regpiece __ARGS((int *));
static char_u	*regatom __ARGS((int *));
static char_u	*regnode __ARGS((int));
static char_u	*regnext __ARGS((char_u *));
static void	regc __ARGS((int));
static void	unregc __ARGS((void));
static void	reginsert __ARGS((int, char_u *));
static void	reginsert_limits __ARGS((int, int, int, char_u *));
static int	read_limits __ARGS((int, int, int *, int *));
static void	regtail __ARGS((char_u *, char_u *));
static void	regoptail __ARGS((char_u *, char_u *));

/*
 * Skip past regular expression.
 * Stop at end of 'p' of where 'dirc' is found ('/', '?', etc).
 * Take care of characters with a backslash in front of it.
 * Skip strings inside [ and ].
 */
    char_u *
skip_regexp(p, dirc, magic)
    char_u  *p;
    int	    dirc;
    int	    magic;
{
    for (; p[0] != NUL; ++p)
    {
12 3 4 5 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -