📄 regexp.c
字号:
/* vi:set ts=8 sts=4 sw=4:
*
* Handling of regular expressions: vim_regcomp(), vim_regexec(), vim_regsub()
*
* NOTICE:
*
* This is NOT the original regular expression code as written by Henry
* Spencer. This code has been modified specifically for use with the VIM
* editor, and should not be used apart from compiling VIM. If you want a
* good regular expression library, get the original code. The copyright
* notice that follows is from the original.
*
* END NOTICE
*
* Copyright (c) 1986 by University of Toronto.
* Written by Henry Spencer. Not derived from licensed software.
*
* Permission is granted to anyone to use this software for any
* purpose on any computer system, and to redistribute it freely,
* subject to the following restrictions:
*
* 1. The author is not responsible for the consequences of use of
* this software, no matter how awful, even if they arise
* from defects in it.
*
* 2. The origin of this software must not be misrepresented, either
* by explicit claim or by omission.
*
* 3. Altered versions must be plainly marked as such, and must not
* be misrepresented as being the original software.
*
* Beware that some of this code is subtly aware of the way operator
* precedence is structured in regular expressions. Serious changes in
* regular-expression syntax might require a total rethink.
*
* Changes have been made by Tony Andrews, Olaf 'Rhialto' Seibert, Robert Webb
* and Bram Moolenaar.
* Named character class support added by Walter Briscoe (1998 Jul 01)
*/
#include "vim.h"
#undef DEBUG
#include <stdio.h>
#include "option.h"
/*
* Get around a problem with #defined char class functions.
*/
#ifdef isalnum
static int myisalnum __ARGS((int c));
static int myisalnum(c) int c; { return isalnum(c); }
# undef isalnum
# define isalnum myisalnum
#endif
#ifdef isalpha
static int myisalpha __ARGS((int c));
static int myisalpha(c) int c; { return isalpha(c); }
# undef isalpha
# define isalpha myisalpha
#endif
#ifdef iscntrl
static int myiscntrl __ARGS((int c));
static int myiscntrl(c) int c; { return iscntrl(c); }
# undef iscntrl
# define iscntrl myiscntrl
#endif
#ifdef isdigit
static int myisdigit __ARGS((int c));
static int myisdigit(c) int c; { return isdigit(c); }
# undef isdigit
# define isdigit myisdigit
#endif
# ifdef isgraph
static int myisgraph __ARGS((int c));
static int myisgraph(c) int c; { return isgraph(c); }
# undef isgraph
# define isgraph myisgraph
#endif
#ifdef islower
static int myislower __ARGS((int c));
static int myislower(c) int c; { return islower(c); }
# undef islower
# define islower myislower
#endif
#ifdef ispunct
static int myispunct __ARGS((int c));
static int myispunct(c) int c; { return ispunct(c); }
# undef ispunct
# define ispunct myispunct
#endif
#ifdef isupper
static int myisupper __ARGS((int c));
static int myisupper(c) int c; { return isupper(c); }
# undef isupper
# define isupper myisupper
#endif
#ifdef isxdigit
static int myisxdigit __ARGS((int c));
static int myisxdigit(c) int c; { return isxdigit(c); }
# undef isxdigit
# define isxdigit myisxdigit
#endif
/*
* The "internal use only" fields in regexp.h are present to pass info from
* compile to execute that permits the execute phase to run lots faster on
* simple cases. They are:
*
* regstart char that must begin a match; '\0' if none obvious
* reganch is the match anchored (at beginning-of-line only)?
* regmust string (pointer into program) that match must include, or NULL
* regmlen length of regmust string
*
* Regstart and reganch permit very fast decisions on suitable starting points
* for a match, cutting down the work a lot. Regmust permits fast rejection
* of lines that cannot possibly match. The regmust tests are costly enough
* that vim_regcomp() supplies a regmust only if the r.e. contains something
* potentially expensive (at present, the only such thing detected is * or +
* at the start of the r.e., which can involve a lot of backup). Regmlen is
* supplied because the test in vim_regexec() needs it and vim_regcomp() is
* computing it anyway.
*/
/*
* Structure for regexp "program". This is essentially a linear encoding
* of a nondeterministic finite-state machine (aka syntax charts or
* "railroad normal form" in parsing technology). Each node is an opcode
* plus a "next" pointer, possibly plus an operand. "Next" pointers of
* all nodes except BRANCH and BRACES_COMPLEX implement concatenation; a "next"
* pointer with a BRANCH on both ends of it is connecting two alternatives.
* (Here we have one of the subtle syntax dependencies: an individual BRANCH
* (as opposed to a collection of them) is never concatenated with anything
* because of operator precedence). The "next" pointer of a BRACES_COMPLEX
* node points to the node after the stuff to be repeated. The operand of some
* types of node is a literal string; for others, it is a node leading into a
* sub-FSM. In particular, the operand of a BRANCH node is the first node of
* the branch. (NB this is *not* a tree structure: the tail of the branch
* connects to the thing following the set of BRANCHes.) The opcodes are:
*/
/* definition number opnd? meaning */
#define END 0 /* no End of program. */
#define BOL 1 /* no Match "" at beginning of line. */
#define EOL 2 /* no Match "" at end of line. */
#define ANY 3 /* no Match any one character. */
#define ANYOF 4 /* str Match any character in this string. */
#define ANYBUT 5 /* str Match any character not in this
* string. */
#define BRANCH 6 /* node Match this alternative, or the
* next... */
#define BACK 7 /* no Match "", "next" ptr points backward. */
#define EXACTLY 8 /* str Match this string. */
#define NOTHING 9 /* no Match empty string. */
#define STAR 10 /* node Match this (simple) thing 0 or more
* times. */
#define PLUS 11 /* node Match this (simple) thing 1 or more
* times. */
#define BRACE_SIMPLE 12 /* node Match this (simple) thing between m and
* n times (\{m,n\}). */
#define BOW 13 /* no Match "" after [^a-zA-Z0-9_] */
#define EOW 14 /* no Match "" at [^a-zA-Z0-9_] */
#define IDENT 15 /* no Match identifier char */
#define KWORD 16 /* no Match keyword char */
#define FNAME 17 /* no Match file name char */
#define PRINT 18 /* no Match printable char */
#define SIDENT 19 /* no Match identifier char but no digit */
#define SWORD 20 /* no Match word char but no digit */
#define SFNAME 21 /* no Match file name char but no digit */
#define SPRINT 22 /* no Match printable char but no digit */
#define BRACE_LIMITS 23 /* 2int define the min & max for BRACE_SIMPLE
* and BRACE_COMPLEX. */
#define WHITE 24 /* no Match whitespace char */
#define NWHITE 25 /* no Match non-whitespace char */
#define DIGIT 26 /* no Match digit char */
#define NDIGIT 27 /* no Match non-digit char */
#define HEX 28 /* no Match hex char */
#define NHEX 29 /* no Match non-hex char */
#define OCTAL 30 /* no Match octal char */
#define NOCTAL 31 /* no Match non-octal char */
#define WORD 32 /* no Match word char */
#define NWORD 33 /* no Match non-word char */
#define HEAD 34 /* no Match head char */
#define NHEAD 35 /* no Match non-head char */
#define ALPHA 36 /* no Match alpha char */
#define NALPHA 37 /* no Match non-alpha char */
#define LOWER 38 /* no Match lowercase char */
#define NLOWER 39 /* no Match non-lowercase char */
#define UPPER 40 /* no Match uppercase char */
#define NUPPER 41 /* no Match non-uppercase char */
#define MOPEN 60 /* -69 no Mark this point in input as start of
* \( subexpr. */
#define MCLOSE 70 /* -79 no Analogous to MOPEN. */
#define BACKREF 80 /* -89 node Match same string again \1-\9 */
#define BRACE_COMPLEX 90 /* -99 node Match nodes between m & n times */
#define Magic(x) ((x) | ('\\' << 8))
/*
* Opcode notes:
*
* BRANCH The set of branches constituting a single choice are hooked
* together with their "next" pointers, since precedence prevents
* anything being concatenated to any individual branch. The
* "next" pointer of the last BRANCH in a choice points to the
* thing following the whole choice. This is also where the
* final "next" pointer of each individual branch points; each
* branch starts with the operand node of a BRANCH node.
*
* BACK Normal "next" pointers all implicitly point forward; BACK
* exists to make loop structures possible.
*
* STAR,PLUS '=', and complex '*' and '+', are implemented as circular
* BRANCH structures using BACK. Simple cases (one character
* per match) are implemented with STAR and PLUS for speed
* and to minimize recursive plunges.
* Note: We would like to use "\?" instead of "\=", but a "\?"
* can be part of a pattern to escape the special meaning of '?'
* at the end of the pattern in "?pattern?e".
*
* BRACE_LIMITS This is always followed by a BRACE_SIMPLE or BRACE_COMPLEX
* node, and defines the min and max limits to be used for that
* node.
*
* MOPEN,MCLOSE ...are numbered at compile time.
*/
/*
* A node is one char of opcode followed by two chars of "next" pointer.
* "Next" pointers are stored as two 8-bit pieces, high order first. The
* value is a positive offset from the opcode of the node containing it.
* An operand, if any, simply follows the node. (Note that much of the
* code generation knows about this implicit relationship.)
*
* Using two bytes for the "next" pointer is vast overkill for most things,
* but allows patterns to get big without disasters.
*/
#define OP(p) ((int)*(p))
#define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
#define OPERAND(p) ((p) + 3)
#define OPERAND_MIN(p) (((*((p)+3)&0377)<<8) + (*((p)+4)&0377))
#define OPERAND_MAX(p) (((*((p)+5)&0377)<<8) + (*((p)+6)&0377))
/*
* Utility definitions.
*/
#ifndef CHARBITS
# define UCHARAT(p) ((int)*(unsigned char *)(p))
#else
# define UCHARAT(p) ((int)*(p)&CHARBITS)
#endif
#define EMSG_RETURN(m) { emsg(m); rc_did_emsg = TRUE; return NULL; }
#define MAX_LIMIT 32767
static int re_ismult __ARGS((int));
static int cstrncmp __ARGS((char_u *s1, char_u *s2, int n));
static char_u *cstrchr __ARGS((char_u *, int));
#ifdef DEBUG
static void regdump __ARGS((char_u *, vim_regexp *));
static char_u *regprop __ARGS((char_u *));
#endif
static int
re_ismult(c)
int c;
{
return (c == Magic('*') || c == Magic('+') || c == Magic('=') ||
c == Magic('{'));
}
/*
* Flags to be passed up and down.
*/
#define HASWIDTH 01 /* Known never to match null string. */
#define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
#define SPSTART 04 /* Starts with * or +. */
#define WORST 0 /* Worst case. */
/*
* When regcode is set to this value, code is not emitted and size is computed
* instead.
*/
#define JUST_CALC_SIZE ((char_u *) -1)
static char_u *reg_prev_sub;
/*
* REGEXP_INRANGE contains all characters which are always special in a []
* range after '\'.
* REGEXP_ABBR contains all characters which act as abbreviations after '\'.
* These are:
* \r - New line (CR).
* \t - Tab (TAB).
* \e - Escape (ESC).
* \b - Backspace (Ctrl('H')).
*/
static char_u REGEXP_INRANGE[] = "]^-\\";
static char_u REGEXP_ABBR[] = "rteb";
static int backslash_trans __ARGS((int c));
static int my_isblank __ARGS((int c));
static int (* skip_class_name __ARGS((char_u **pp)))__ARGS((int));
static char_u * skip_range __ARGS((char_u *p));
static void init_class_tab __ARGS((void));
static int
backslash_trans(c)
int c;
{
switch (c)
{
case 'r': return CR;
case 't': return TAB;
case 'e': return ESC;
case 'b': return Ctrl('H');
}
return c;
}
/*
* Function version of the macro vim_iswhite().
*/
static int
my_isblank(c)
int c;
{
return vim_iswhite(c);
}
/*
* Check for a character class name. "pp" is at the '['.
* If not: NULL is returned; If so, a function of the sort is* is returned and
* the name is skipped.
*/
static int (*
skip_class_name(pp))__ARGS((int))
char_u **pp;
{
typedef struct
{
size_t len;
int (*func)__ARGS((int));
char_u name[sizeof("xdigit:]")];
} namedata_t;
#define t(n, func) { sizeof(n) - 1, func, n }
static const namedata_t class_names[] =
{
t("alnum:]", isalnum), t("alpha:]", isalpha),
t("blank:]", my_isblank), t("cntrl:]", iscntrl),
t("digit:]", isdigit), t("graph:]", isgraph),
t("lower:]", islower), t("print:]", vim_isprintc),
t("punct:]", ispunct), t("space:]", vim_isspace),
t("upper:]", isupper), t("xdigit:]", isxdigit)
};
#undef t
const namedata_t *np;
if ((*pp)[1] != ':')
return NULL;
for ( np = class_names;
np < class_names + sizeof(class_names) / sizeof(*class_names);
np++)
if (STRNCMP(*pp+2, np->name, np->len) == 0)
{
*pp += np->len + 2;
return np->func;
}
return NULL;
}
/*
* Skip over a "[]" range.
* "p" must point to the character after the '['.
* The returned pointer is on the matching ']', or the terminating NUL.
*/
static char_u *
skip_range(p)
char_u *p;
{
int cpo_lit; /* 'cpoptions' contains 'l' flag */
cpo_lit = (!reg_syn && vim_strchr(p_cpo, CPO_LITERAL) != NULL);
if (*p == '^') /* Complement of range. */
++p;
if (*p == ']' || *p == '-')
++p;
while (*p != NUL && *p != ']')
{
if (*p == '-')
{
++p;
if (*p != ']' && *p != NUL)
++p;
}
else if (*p == '\\'
&& (vim_strchr(REGEXP_INRANGE, p[1]) != NULL
|| (!cpo_lit && vim_strchr(REGEXP_ABBR, p[1]) != NULL)))
p += 2;
else if (*p == '[')
{
if (skip_class_name(&p) == NULL)
++p; /* It was not a class name */
}
else
++p;
}
return p;
}
/*
* Specific version of character class functions.
* Using a table to keep this fast.
*/
static char_u class_tab[256];
#define RI_DIGIT 0x01
#define RI_HEX 0x02
#define RI_OCTAL 0x04
#define RI_WORD 0x08
#define RI_HEAD 0x10
#define RI_ALPHA 0x20
#define RI_LOWER 0x40
#define RI_UPPER 0x80
static void
init_class_tab()
{
int i;
static int done = FALSE;
if (done)
return;
for (i = 0; i < 256; ++i)
{
if (i >= '0' && i <= '7')
class_tab[i] = RI_DIGIT + RI_HEX + RI_OCTAL + RI_WORD;
else if (i >= '8' && i <= '9')
class_tab[i] = RI_DIGIT + RI_HEX + RI_WORD;
else if (i >= 'a' && i <= 'f')
class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
else if (i >= 'g' && i <= 'z')
class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
else if (i >= 'A' && i <= 'F')
class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
else if (i >= 'G' && i <= 'Z')
class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
else if (i == '_')
class_tab[i] = RI_WORD + RI_HEAD;
}
done = TRUE;
}
#define ri_digit(c) (class_tab[c] & RI_DIGIT)
#define ri_hex(c) (class_tab[c] & RI_HEX)
#define ri_octal(c) (class_tab[c] & RI_OCTAL)
#define ri_word(c) (class_tab[c] & RI_WORD)
#define ri_head(c) (class_tab[c] & RI_HEAD)
#define ri_alpha(c) (class_tab[c] & RI_ALPHA)
#define ri_lower(c) (class_tab[c] & RI_LOWER)
#define ri_upper(c) (class_tab[c] & RI_UPPER)
/*
* Global work variables for vim_regcomp().
*/
static char_u *regparse; /* Input-scan pointer. */
static int num_complex_braces; /* Complex \{...} count */
static int regnpar; /* () count. */
static char_u *regcode; /* Code-emit pointer, or JUST_CALC_SIZE */
static long regsize; /* Code size. */
static char_u **regendp; /* Pointer to endp array */
static int brace_min[10]; /* Minimums for complex brace repeats */
static int brace_max[10]; /* Maximums for complex brace repeats */
static int brace_count[10]; /* Current counts for complex brace repeats */
static int had_eol; /* TRUE when EOL found by vim_regcomp() */
static int reg_magic; /* p_magic passed to vim_regexec() */
/*
* META contains all characters that may be magic, except '^' and '$'.
*/
static char_u META[] = ".[()|=+*<>iIkKfFpPsSdDxXoOwWhHaAlLuU~123456789{";
/*
* Forward declarations for vim_regcomp()'s friends.
*/
static void initchr __ARGS((char_u *));
static int getchr __ARGS((void));
static int peekchr __ARGS((void));
#define PeekChr() curchr /* shortcut only when last action was peekchr() */
static int curchr;
static void skipchr __ARGS((void));
static void ungetchr __ARGS((void));
static char_u *reg __ARGS((int, int *));
static char_u *regbranch __ARGS((int *));
static char_u *regpiece __ARGS((int *));
static char_u *regatom __ARGS((int *));
static char_u *regnode __ARGS((int));
static char_u *regnext __ARGS((char_u *));
static void regc __ARGS((int));
static void unregc __ARGS((void));
static void reginsert __ARGS((int, char_u *));
static void reginsert_limits __ARGS((int, int, int, char_u *));
static int read_limits __ARGS((int, int, int *, int *));
static void regtail __ARGS((char_u *, char_u *));
static void regoptail __ARGS((char_u *, char_u *));
/*
* Skip past regular expression.
* Stop at end of 'p' of where 'dirc' is found ('/', '?', etc).
* Take care of characters with a backslash in front of it.
* Skip strings inside [ and ].
*/
char_u *
skip_regexp(p, dirc, magic)
char_u *p;
int dirc;
int magic;
{
for (; p[0] != NUL; ++p)
{
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -