📄 regexp.c
字号:
if (p[0] == dirc) /* found end of regexp */
break;
if ((p[0] == '[' && magic) || (p[0] == '\\' && p[1] == '[' && !magic))
{
p = skip_range(p + 1);
if (p[0] == NUL)
break;
}
else if (p[0] == '\\' && p[1] != NUL)
++p; /* skip next character */
}
return p;
}
/*
 * vim_regcomp - compile a regular expression into internal code
 *
 * The size of the compiled program is only known after compiling, but
 * compiling needs a buffer to write to.  The way out is to run the compiler
 * twice: the first pass has regcode set to JUST_CALC_SIZE and only checks
 * the pattern and counts the size, the second pass emits the code into a
 * freshly allocated vim_regexp.  Nothing is allocated for a pattern that
 * fails the first pass, and the program never has to be moved, which would
 * invalidate pointers into it.  The result is a single allocation, so that
 * vim_free() can release all of it.
 *
 * "exp" is the pattern text; "magic" is stored in reg_magic.
 * Returns the compiled program, or NULL when the pattern does not compile
 * or the allocation fails.
 *
 * Does not use reg_ic, see vim_regexec() for that.
 *
 * Beware: the optimization-preparation code at the end depends on the
 * layout of the compiled program.
 */
vim_regexp *
vim_regcomp(exp, magic)
    char_u      *exp;
    int         magic;
{
    vim_regexp  *prog;          /* the program being built */
    char_u      *node;          /* node under inspection */
    char_u      *after;         /* node following a BOW/EOW */
    char_u      *must;          /* best regmust candidate so far */
    int         must_len;       /* length of "must" */
    int         flags;

    if (exp == NULL)
        EMSG_RETURN(e_null);

    reg_magic = magic;
    init_class_tab();

    /* Pass 1: check that the pattern is legal and find out its size. */
    initchr(exp);
    num_complex_braces = 0;
    regnpar = 1;
    regsize = 0L;
    regcode = JUST_CALC_SIZE;
    regendp = NULL;
    had_eol = FALSE;
    regc(MAGIC);
    if (reg(0, &flags) == NULL)
        return NULL;

    /* Does the size fit the pointer-storage convention? */
#ifdef SMALL_MALLOC             /* 16 bit storage allocation */
    if (regsize >= 65536L - 256L)
        EMSG_RETURN(e_toolong);
#endif

    /* Get room for the header plus the program. */
    prog = (vim_regexp *)lalloc(sizeof(vim_regexp) + regsize, TRUE);
    if (prog == NULL)
        return NULL;

    /* Pass 2: same walk over the pattern, now emitting code. */
    initchr(exp);
    num_complex_braces = 0;
    regnpar = 1;
    regcode = prog->program;
    regendp = prog->endp;
    regc(MAGIC);
    if (reg(0, &flags) == NULL)
    {
        vim_free(prog);
        return NULL;
    }

    /* Collect hints for the matcher; begin with the worst-case values. */
    prog->regstart = '\0';
    prog->reganch = 0;
    prog->regmust = NULL;
    prog->regmlen = 0;

    node = prog->program + 1;           /* the first BRANCH */
    if (OP(regnext(node)) == END)       /* just one top-level alternative */
    {
        node = OPERAND(node);

        /* Where can a match start? */
        if (OP(node) == BOL)
        {
            ++prog->reganch;
            node = regnext(node);
        }
        if (OP(node) == EXACTLY)
            prog->regstart = *OPERAND(node);
        else if (OP(node) == BOW || OP(node) == EOW)
        {
            /* A literal right after the word boundary also fixes the
             * first character. */
            after = regnext(node);
            if (OP(after) == EXACTLY)
                prog->regstart = *OPERAND(after);
        }

        /*
         * When the pattern contains something expensive, remember the
         * longest literal string that has to be present as the regmust.
         * On a tie the later string is taken: the regstart check already
         * covers the start of the pattern, so not duplicating it makes the
         * combined check stronger.  Not a strong reason, but good enough
         * without a better one.
         *
         * A pattern starting with BOW (or EOW) also gets a regmust, because
         * looking for it first is faster.  Used a lot for the "#" and "*"
         * commands.  (Added by mool).
         */
        if ((flags & SPSTART) || OP(node) == BOW || OP(node) == EOW)
        {
            must = NULL;
            must_len = 0;
            while (node != NULL)
            {
                if (OP(node) == EXACTLY)
                {
                    size_t  cur_len = STRLEN(OPERAND(node));

                    /* ">=" makes a later string win a tie. */
                    if (cur_len >= (size_t)must_len)
                    {
                        must = OPERAND(node);
                        must_len = cur_len;
                    }
                }
                node = regnext(node);
            }
            prog->regmust = must;
            prog->regmlen = must_len;
        }
    }
#ifdef DEBUG
    regdump(exp, prog);
#endif
    return prog;
}
/*
 * Tell whether the EOL item "$" was encountered during the most recent
 * call to vim_regcomp().  The answer is the file-level flag had_eol.
 * Passing the information through a global is messy, but it works fine.
 */
int
vim_regcomp_had_eol()
{
    return (had_eol);
}
/*
* reg - regular expression, i.e. main body or parenthesized thing
*
* Caller must absorb opening parenthesis.
*
* Combining parenthesis handling with the base level of regular expression
* is a trifle forced, but the need to tie the tails of the branches to what
* follows makes it hard to avoid.
*/
static char_u *
reg(paren, flagp)
int paren; /* Parenthesized? */
int *flagp;
{
char_u *ret;
char_u *br;
char_u *ender;
int parno = 0;
int flags;
*flagp = HASWIDTH; /* Tentatively. */
/* Make an MOPEN node, if parenthesized. */
if (paren)
{
if (regnpar >= NSUBEXP)
EMSG_RETURN(e_toombra);
parno = regnpar;
regnpar++;
ret = regnode(MOPEN + parno);
if (regendp)
regendp[parno] = NULL; /* haven't seen the close paren yet */
}
else
ret = NULL;
/* Pick up the branches, linking them together. */
br = regbranch(&flags);
if (br == NULL)
return NULL;
if (ret != NULL)
regtail(ret, br); /* MOPEN -> first. */
else
ret = br;
if (!(flags & HASWIDTH))
*flagp &= ~HASWIDTH;
*flagp |= flags & SPSTART;
while (peekchr() == Magic('|'))
{
skipchr();
br = regbranch(&flags);
if (br == NULL)
return NULL;
regtail(ret, br); /* BRANCH -> BRANCH. */
if (!(flags & HASWIDTH))
*flagp &= ~HASWIDTH;
*flagp |= flags & SPSTART;
}
/* Make a closing node, and hook it on the end. */
ender = regnode((paren) ? MCLOSE + parno : END);
regtail(ret, ender);
/* Hook the tails of the branches to the closing node. */
for (br = ret; br != NULL; br = regnext(br))
regoptail(br, ender);
/* Check for proper termination. */
if (paren && getchr() != Magic(')'))
EMSG_RETURN(e_toombra)
else if (!paren && peekchr() != '\0')
{
if (PeekChr() == Magic(')'))
EMSG_RETURN(e_toomket)
else
EMSG_RETURN(e_trailing) /* "Can't happen". */
/* NOTREACHED */
}
/*
* Here we set the flag allowing back references to this set of
* parentheses.
*/
if (paren && regendp)
regendp[parno] = ender; /* have seen the close paren */
return ret;
}
/*
 * regbranch - one alternative of an | operator
 *
 * Implements the concatenation operator: pieces are compiled one after the
 * other and linked into a chain, until the end of the pattern, a "|" or a
 * ")" is found.
 *
 * "*flagp" starts as WORST, gets HASWIDTH when any piece has it, and gets
 * SPSTART when the first piece has it.
 * Returns the BRANCH node, or NULL when a piece fails to compile.
 */
static char_u *
regbranch(flagp)
    int         *flagp;
{
    char_u      *branch;        /* the BRANCH node, returned to the caller */
    char_u      *prev;          /* previous piece, NULL before the first */
    char_u      *piece;         /* piece just compiled */
    int         pflags;         /* flags of that piece */

    *flagp = WORST;             /* nothing is known yet */

    branch = regnode(BRANCH);
    prev = NULL;
    for (;;)
    {
        /* peekchr() must come first, the PeekChr() uses follow it. */
        if (peekchr() == '\0' || PeekChr() == Magic('|')
                                            || PeekChr() == Magic(')'))
            break;

        piece = regpiece(&pflags);
        if (piece == NULL)
            return NULL;

        *flagp |= pflags & HASWIDTH;
        if (prev != NULL)
            regtail(prev, piece);
        else
            *flagp |= pflags & SPSTART; /* only the first piece counts */
        prev = piece;
    }

    /* No piece at all: the branch consists of a NOTHING node. */
    if (prev == NULL)
        (void)regnode(NOTHING);

    return branch;
}
/*
* regpiece - something followed by possible [*+=]
*
* Note that the branching code sequences used for = and the general cases
* of * and + are somewhat optimized: they use the same NOTHING node as
* both the endmarker for their branch list and the body of the last branch.
* It might seem that this node could be dispensed with entirely, but the
* endmarker role is not redundant.
*/
static char_u *
regpiece(flagp)
int *flagp;
{
char_u *ret;
int op;
char_u *next;
int flags;
int minval;
int maxval;
ret = regatom(&flags);
if (ret == NULL)
return NULL;
op = peekchr();
if (!re_ismult(op))
{
*flagp = flags;
return ret;
}
if (!(flags & HASWIDTH) && op != Magic('='))
EMSG_RETURN((char_u *)"*, \\+, or \\{ operand could be empty");
*flagp = (WORST | SPSTART); /* default flags */
skipchr();
if (op == Magic('*') && (flags & SIMPLE))
reginsert(STAR, ret);
else if (op == Magic('*'))
{
/* Emit x* as (x&|), where & means "self". */
reginsert(BRANCH, ret); /* Either x */
regoptail(ret, regnode(BACK)); /* and loop */
regoptail(ret, ret); /* back */
regtail(ret, regnode(BRANCH)); /* or */
regtail(ret, regnode(NOTHING)); /* null. */
}
else if (op == Magic('+') && (flags & SIMPLE))
{
reginsert(PLUS, ret);
*flagp = (WORST | HASWIDTH);
}
else if (op == Magic('+'))
{
/* Emit x+ as x(&|), where & means "self". */
next = regnode(BRANCH); /* Either */
regtail(ret, next);
regtail(regnode(BACK), ret); /* loop back */
regtail(next, regnode(BRANCH)); /* or */
regtail(ret, regnode(NOTHING)); /* null. */
*flagp = (WORST | HASWIDTH);
}
else if (op == Magic('='))
{
/* Emit x= as (x|) */
reginsert(BRANCH, ret); /* Either x */
regtail(ret, regnode(BRANCH)); /* or */
next = regnode(NOTHING);/* null. */
regtail(ret, next);
regoptail(ret, next);
}
else if (op == Magic('{') && (flags & SIMPLE))
{
if (!read_limits('{', '}', &minval, &maxval))
return NULL;
reginsert(BRACE_SIMPLE, ret);
reginsert_limits(BRACE_LIMITS, minval, maxval, ret);
if (minval > 0)
*flagp = (WORST | HASWIDTH);
}
else if (op == Magic('{'))
{
if (!read_limits('{', '}', &minval, &maxval))
return NULL;
if (num_complex_braces >= 10)
EMSG_RETURN((char_u *)"Too many complex \\{...}s");
reginsert(BRACE_COMPLEX + num_complex_braces, ret);
regoptail(ret, regnode(BACK));
regoptail(ret, ret);
reginsert_limits(BRACE_LIMITS, minval, maxval, ret);
if (minval > 0)
*flagp = (WORST | HASWIDTH);
++num_complex_braces;
}
if (re_ismult(peekchr()))
EMSG_RETURN((char_u *)"Nested *, \\=, \\+, or \\{");
return ret;
}
/*
* regatom - the lowest level
*
* Optimization: gobbles an entire sequence of ordinary characters so that
* it can turn them into a single node, which is smaller to store and
* faster to run.
*/
static char_u *
regatom(flagp)
int *flagp;
{
char_u *ret;
int flags;
int cpo_lit; /* 'cpoptions' contains 'l' flag */
*flagp = WORST; /* Tentatively. */
cpo_lit = (!reg_syn && vim_strchr(p_cpo, CPO_LITERAL) != NULL);
switch (getchr())
{
case Magic('^'):
ret = regnode(BOL);
break;
case Magic('$'):
ret = regnode(EOL);
had_eol = TRUE;
break;
case Magic('<'):
ret = regnode(BOW);
break;
case Magic('>'):
ret = regnode(EOW);
break;
case Magic('.'):
ret = regnode(ANY);
*flagp |= HASWIDTH | SIMPLE;
break;
case Magic('i'):
ret = regnode(IDENT);
*flagp |= HASWIDTH | SIMPLE;
break;
case Magic('k'):
ret = regnode(KWORD);
*flagp |= HASWIDTH | SIMPLE;
break;
case Magic('I'):
ret = regnode(SIDENT);
*flagp |= HASWIDTH | SIMPLE;
break;
case Magic('K'):
ret = regnode(SWORD);
*flagp |= HASWIDTH | SIMPLE;
break;
case Magic('f'):
ret = regnode(FNAME);
*flagp |= HASWIDTH | SIMPLE;
break;
case Magic('F'):
ret = regnode(SFNAME);
*flagp |= HASWIDTH | SIMPLE;
break;
case Magic('p'):
ret = regnode(PRINT);
*flagp |= HASWIDTH | SIMPLE;
break;
case Magic('P'):
ret = regnode(SPRINT);
*flagp |= HASWIDTH | SIMPLE;
break;
case Magic('s'):
ret = regnode(WHITE);
*flagp |= HASWIDTH | SIMPLE;
break;
case Magic('S'):
ret = regnode(NWHITE);
*flagp |= HASWIDTH | SIMPLE;
break;
case Magic('d'):
ret = regnode(DIGIT);
*flagp |= HASWIDTH | SIMPLE;
break;
case Magic('D'):
ret = regnode(NDIGIT);
*flagp |= HASWIDTH | SIMPLE;
break;
case Magic('x'):
ret = regnode(HEX);
*flagp |= HASWIDTH | SIMPLE;
break;
case Magic('X'):
ret = regnode(NHEX);
*flagp |= HASWIDTH | SIMPLE;
break;
case Magic('o'):
ret = regnode(OCTAL);
*flagp |= HASWIDTH | SIMPLE;
break;
case Magic('O'):
ret = regnode(NOCTAL);
*flagp |= HASWIDTH | SIMPLE;
break;
case Magic('w'):
ret = regnode(WORD);
*flagp |= HASWIDTH | SIMPLE;
break;
case Magic('W'):
ret = regnode(NWORD);
*flagp |= HASWIDTH | SIMPLE;
break;
case Magic('h'):
ret = regnode(HEAD);
*flagp |= HASWIDTH | SIMPLE;
break;
case Magic('H'):
ret = regnode(NHEAD);
*flagp |= HASWIDTH | SIMPLE;
break;
case Magic('a'):
ret = regnode(ALPHA);
*flagp |= HASWIDTH | SIMPLE;
break;
case Magic('A'):
ret = regnode(NALPHA);
*flagp |= HASWIDTH | SIMPLE;
break;
case Magic('l'):
ret = regnode(LOWER);
*flagp |= HASWIDTH | SIMPLE;
break;
case Magic('L'):
ret = regnode(NLOWER);
*flagp |= HASWIDTH | SIMPLE;
break;
case Magic('u'):
ret = regnode(UPPER);
*flagp |= HASWIDTH | SIMPLE;
break;
case Magic('U'):
ret = regnode(NUPPER);
*flagp |= HASWIDTH | SIMPLE;
break;
case Magic('('):
ret = reg(1, &flags);
if (ret == NULL)
return NULL;
*flagp |= flags & (HASWIDTH | SPSTART);
break;
case '\0':
case Magic('|'):
case Magic(')'):
EMSG_RETURN(e_internal) /* Supposed to be caught earlier. */
/* NOTREACHED */
case Magic('='):
EMSG_RETURN((char_u *)"\\= follows nothing")
/* NOTREACHED */
case Magic('+'):
EMSG_RETURN((char_u *)"\\+ follows nothing")
/* NOTREACHED */
case Magic('{'):
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -