📄 regexp.c

📁 VIM文本编辑器
💻 C
📖 第 1 页 / 共 5 页
字号:
	if (p[0] == dirc)	/* found end of regexp */
	    break;
	if ((p[0] == '[' && magic) || (p[0] == '\\' && p[1] == '[' && !magic))
	{
	    p = skip_range(p + 1);
	    if (p[0] == NUL)
		break;
	}
	else if (p[0] == '\\' && p[1] != NUL)
	    ++p;    /* skip next character */
    }
    return p;
}

/*
 * vim_regcomp - compile a regular expression into internal code
 *
 * We can't allocate space until we know how big the compiled form will be,
 * but we can't compile it (and thus know how big it is) until we've got a
 * place to put the code.  So we cheat:  we compile it twice, once with code
 * generation turned off and size counting turned on, and once "for real".
 * This also means that we don't allocate space until we are sure that the
 * thing really will compile successfully, and we never have to move the
 * code and thus invalidate pointers into it.  (Note that it has to be in
 * one piece because vim_free() must be able to free it all.)
 *
 * Does not use reg_ic, see vim_regexec() for that.
 *
 * Beware that the optimization-preparation code in here knows about some
 * of the structure of the compiled regexp.
 */
    vim_regexp *
vim_regcomp(exp, magic)
    char_u	*exp;
    int		magic;
{
    vim_regexp	*r;
    char_u	*scan;
    char_u	*longest;
    int		len;
    int		flags;

    if (exp == NULL)
	EMSG_RETURN(e_null);

    reg_magic = magic;
    init_class_tab();

    /* First pass: determine size, legality. */
    initchr((char_u *)exp);
    num_complex_braces = 0;
    regnpar = 1;
    regsize = 0L;
    regcode = JUST_CALC_SIZE;
    regendp = NULL;
    had_eol = FALSE;
    regc(MAGIC);
    if (reg(0, &flags) == NULL)
	return NULL;

    /* Small enough for pointer-storage convention? */
#ifdef SMALL_MALLOC		/* 16 bit storage allocation */
    if (regsize >= 65536L - 256L)
	EMSG_RETURN(e_toolong);
#endif

    /* Allocate space. */
    r = (vim_regexp *)lalloc(sizeof(vim_regexp) + regsize, TRUE);
    if (r == NULL)
	return NULL;

    /* Second pass: emit code. */
    initchr((char_u *)exp);
    num_complex_braces = 0;
    regnpar = 1;
    regcode = r->program;
    regendp = r->endp;
    regc(MAGIC);
    if (reg(0, &flags) == NULL)
    {
	vim_free(r);
	return NULL;
    }

    /* Dig out information for optimizations. */
    r->regstart = '\0';		/* Worst-case defaults. */
    r->reganch = 0;
    r->regmust = NULL;
    r->regmlen = 0;
    scan = r->program + 1;	/* First BRANCH. */
    if (OP(regnext(scan)) == END)   /* Only one top-level choice. */
    {
	scan = OPERAND(scan);

	/* Starting-point info. */
	if (OP(scan) == BOL)
	{
	    r->reganch++;
	    scan = regnext(scan);
	}
	if (OP(scan) == EXACTLY)
	    r->regstart = *OPERAND(scan);
	else if ((OP(scan) == BOW || OP(scan) == EOW)
		 && OP(regnext(scan)) == EXACTLY)
	    r->regstart = *OPERAND(regnext(scan));

	/*
	 * If there's something expensive in the r.e., find the longest
	 * literal string that must appear and make it the regmust.  Resolve
	 * ties in favor of later strings, since the regstart check works
	 * with the beginning of the r.e. and avoiding duplication
	 * strengthens checking.  Not a strong reason, but sufficient in the
	 * absence of others.
	 */
	/*
	 * When the r.e. starts with BOW, it is faster to look for a regmust
	 * first. Used a lot for "#" and "*" commands. (Added by mool).
	 */
	if (flags & SPSTART || OP(scan) == BOW || OP(scan) == EOW)
	{
	    longest = NULL;
	    len = 0;
	    for (; scan != NULL; scan = regnext(scan))
		if (OP(scan) == EXACTLY && STRLEN(OPERAND(scan)) >= (size_t)len)
		{
		    longest = OPERAND(scan);
		    len = STRLEN(OPERAND(scan));
		}
	    r->regmust = longest;
	    r->regmlen = len;
	}
    }
#ifdef DEBUG
    regdump(exp, r);
#endif
    return r;
}

/*
 * Tell whether the EOL item "$" was encountered during the previous call to
 * vim_regcomp().  The answer comes from the had_eol flag, which
 * vim_regcomp() clears before parsing and which is set when an EOL node is
 * created.  Messy, but it works fine.
 */
    int
vim_regcomp_had_eol()
{
    int		seen_eol = had_eol;

    return seen_eol;
}

/*
 * reg - regular expression, i.e. main body or parenthesized thing
 *
 * Caller must absorb opening parenthesis.
 *
 * Combining parenthesis handling with the base level of regular expression
 * is a trifle forced, but the need to tie the tails of the branches to what
 * follows makes it hard to avoid.
 */
    static char_u *
reg(paren, flagp)
    int		    paren;	/* Parenthesized? */
    int		    *flagp;
{
    char_u	*ret;
    char_u	*br;
    char_u	*ender;
    int		parno = 0;
    int		flags;

    *flagp = HASWIDTH;		/* Tentatively. */

    /* Make an MOPEN node, if parenthesized. */
    if (paren)
    {
	if (regnpar >= NSUBEXP)
	    EMSG_RETURN(e_toombra);
	parno = regnpar;
	regnpar++;
	ret = regnode(MOPEN + parno);
	if (regendp)
	    regendp[parno] = NULL;  /* haven't seen the close paren yet */
    }
    else
	ret = NULL;

    /* Pick up the branches, linking them together. */
    br = regbranch(&flags);
    if (br == NULL)
	return NULL;
    if (ret != NULL)
	regtail(ret, br);	/* MOPEN -> first. */
    else
	ret = br;
    if (!(flags & HASWIDTH))
	*flagp &= ~HASWIDTH;
    *flagp |= flags & SPSTART;
    while (peekchr() == Magic('|'))
    {
	skipchr();
	br = regbranch(&flags);
	if (br == NULL)
	    return NULL;
	regtail(ret, br);	/* BRANCH -> BRANCH. */
	if (!(flags & HASWIDTH))
	    *flagp &= ~HASWIDTH;
	*flagp |= flags & SPSTART;
    }

    /* Make a closing node, and hook it on the end. */
    ender = regnode((paren) ? MCLOSE + parno : END);
    regtail(ret, ender);

    /* Hook the tails of the branches to the closing node. */
    for (br = ret; br != NULL; br = regnext(br))
	regoptail(br, ender);

    /* Check for proper termination. */
    if (paren && getchr() != Magic(')'))
	EMSG_RETURN(e_toombra)
    else if (!paren && peekchr() != '\0')
    {
	if (PeekChr() == Magic(')'))
	    EMSG_RETURN(e_toomket)
	else
	    EMSG_RETURN(e_trailing)	/* "Can't happen". */
	/* NOTREACHED */
    }
    /*
     * Here we set the flag allowing back references to this set of
     * parentheses.
     */
    if (paren && regendp)
	regendp[parno] = ender;	/* have seen the close paren */
    return ret;
}

/*
 * regbranch - one alternative of an | operator
 *
 * Implements the concatenation operator: emits a BRANCH node followed by
 * the pieces up to the end of the pattern, a "|" or a ")", each piece
 * chained to the previous one.  An empty alternative gets a NOTHING node.
 *
 * "*flagp" starts as WORST; HASWIDTH is added when any piece has it, SPSTART
 * only when the first piece has it.
 *
 * Returns the BRANCH node, or NULL when a piece fails to parse.
 */
    static char_u    *
regbranch(flagp)
    int		    *flagp;
{
    char_u	    *head;
    char_u	    *prev = NULL;	/* previous piece, NULL before the first */
    char_u	    *piece;
    int		    pflags;

    *flagp = WORST;		/* Tentatively. */
    head = regnode(BRANCH);

    for (;;)
    {
	/* Stop at the end of this alternative. */
	if (peekchr() == '\0' || PeekChr() == Magic('|')
						  || PeekChr() == Magic(')'))
	    break;

	piece = regpiece(&pflags);
	if (piece == NULL)
	    return NULL;

	*flagp |= pflags & HASWIDTH;
	if (prev != NULL)
	    regtail(prev, piece);
	else
	    *flagp |= pflags & SPSTART;	/* First piece. */
	prev = piece;
    }

    if (prev == NULL)		/* No pieces at all. */
	(void) regnode(NOTHING);

    return head;
}

/*
 * regpiece - something followed by possible [*+=]
 *
 * Note that the branching code sequences used for = and the general cases
 * of * and + are somewhat optimized:  they use the same NOTHING node as
 * both the endmarker for their branch list and the body of the last branch.
 * It might seem that this node could be dispensed with entirely, but the
 * endmarker role is not redundant.
 */
    static char_u *
regpiece(flagp)
    int		    *flagp;
{
    char_u	    *ret;
    int		    op;
    char_u	    *next;
    int		    flags;
    int		    minval;
    int		    maxval;

    ret = regatom(&flags);
    if (ret == NULL)
	return NULL;

    op = peekchr();
    if (!re_ismult(op))
    {
	*flagp = flags;
	return ret;
    }
    if (!(flags & HASWIDTH) && op != Magic('='))
	EMSG_RETURN((char_u *)"*, \\+, or \\{ operand could be empty");
    *flagp = (WORST | SPSTART);		    /* default flags */

    skipchr();
    if (op == Magic('*') && (flags & SIMPLE))
	reginsert(STAR, ret);
    else if (op == Magic('*'))
    {
	/* Emit x* as (x&|), where & means "self". */
	reginsert(BRANCH, ret); /* Either x */
	regoptail(ret, regnode(BACK));	/* and loop */
	regoptail(ret, ret);	/* back */
	regtail(ret, regnode(BRANCH));	/* or */
	regtail(ret, regnode(NOTHING)); /* null. */
    }
    else if (op == Magic('+') && (flags & SIMPLE))
    {
	reginsert(PLUS, ret);
	*flagp = (WORST | HASWIDTH);
    }
    else if (op == Magic('+'))
    {
	/* Emit x+ as x(&|), where & means "self". */
	next = regnode(BRANCH); /* Either */
	regtail(ret, next);
	regtail(regnode(BACK), ret);	/* loop back */
	regtail(next, regnode(BRANCH)); /* or */
	regtail(ret, regnode(NOTHING)); /* null. */
	*flagp = (WORST | HASWIDTH);
    }
    else if (op == Magic('='))
    {
	/* Emit x= as (x|) */
	reginsert(BRANCH, ret); /* Either x */
	regtail(ret, regnode(BRANCH));	/* or */
	next = regnode(NOTHING);/* null. */
	regtail(ret, next);
	regoptail(ret, next);
    }
    else if (op == Magic('{') && (flags & SIMPLE))
    {
	if (!read_limits('{', '}', &minval, &maxval))
	    return NULL;
	reginsert(BRACE_SIMPLE, ret);
	reginsert_limits(BRACE_LIMITS, minval, maxval, ret);
	if (minval > 0)
	    *flagp = (WORST | HASWIDTH);
    }
    else if (op == Magic('{'))
    {
	if (!read_limits('{', '}', &minval, &maxval))
	    return NULL;
	if (num_complex_braces >= 10)
	    EMSG_RETURN((char_u *)"Too many complex \\{...}s");
	reginsert(BRACE_COMPLEX + num_complex_braces, ret);
	regoptail(ret, regnode(BACK));
	regoptail(ret, ret);
	reginsert_limits(BRACE_LIMITS, minval, maxval, ret);
	if (minval > 0)
	    *flagp = (WORST | HASWIDTH);
	++num_complex_braces;
    }
    if (re_ismult(peekchr()))
	EMSG_RETURN((char_u *)"Nested *, \\=, \\+, or \\{");

    return ret;
}

/*
 * regatom - the lowest level
 *
 * Optimization:  gobbles an entire sequence of ordinary characters so that
 * it can turn them into a single node, which is smaller to store and
 * faster to run.
 */
    static char_u *
regatom(flagp)
    int		   *flagp;
{
    char_u	    *ret;
    int		    flags;
    int		    cpo_lit;	    /* 'cpoptions' contains 'l' flag */

    *flagp = WORST;		/* Tentatively. */
    cpo_lit = (!reg_syn && vim_strchr(p_cpo, CPO_LITERAL) != NULL);

    switch (getchr())
    {
      case Magic('^'):
	ret = regnode(BOL);
	break;
      case Magic('$'):
	ret = regnode(EOL);
	had_eol = TRUE;
	break;
      case Magic('<'):
	ret = regnode(BOW);
	break;
      case Magic('>'):
	ret = regnode(EOW);
	break;
      case Magic('.'):
	ret = regnode(ANY);
	*flagp |= HASWIDTH | SIMPLE;
	break;
      case Magic('i'):
	ret = regnode(IDENT);
	*flagp |= HASWIDTH | SIMPLE;
	break;
      case Magic('k'):
	ret = regnode(KWORD);
	*flagp |= HASWIDTH | SIMPLE;
	break;
      case Magic('I'):
	ret = regnode(SIDENT);
	*flagp |= HASWIDTH | SIMPLE;
	break;
      case Magic('K'):
	ret = regnode(SWORD);
	*flagp |= HASWIDTH | SIMPLE;
	break;
      case Magic('f'):
	ret = regnode(FNAME);
	*flagp |= HASWIDTH | SIMPLE;
	break;
      case Magic('F'):
	ret = regnode(SFNAME);
	*flagp |= HASWIDTH | SIMPLE;
	break;
      case Magic('p'):
	ret = regnode(PRINT);
	*flagp |= HASWIDTH | SIMPLE;
	break;
      case Magic('P'):
	ret = regnode(SPRINT);
	*flagp |= HASWIDTH | SIMPLE;
	break;
      case Magic('s'):
	ret = regnode(WHITE);
	*flagp |= HASWIDTH | SIMPLE;
	break;
      case Magic('S'):
	ret = regnode(NWHITE);
	*flagp |= HASWIDTH | SIMPLE;
	break;
      case Magic('d'):
	ret = regnode(DIGIT);
	*flagp |= HASWIDTH | SIMPLE;
	break;
      case Magic('D'):
	ret = regnode(NDIGIT);
	*flagp |= HASWIDTH | SIMPLE;
	break;
      case Magic('x'):
	ret = regnode(HEX);
	*flagp |= HASWIDTH | SIMPLE;
	break;
      case Magic('X'):
	ret = regnode(NHEX);
	*flagp |= HASWIDTH | SIMPLE;
	break;
      case Magic('o'):
	ret = regnode(OCTAL);
	*flagp |= HASWIDTH | SIMPLE;
	break;
      case Magic('O'):
	ret = regnode(NOCTAL);
	*flagp |= HASWIDTH | SIMPLE;
	break;
      case Magic('w'):
	ret = regnode(WORD);
	*flagp |= HASWIDTH | SIMPLE;
	break;
      case Magic('W'):
	ret = regnode(NWORD);
	*flagp |= HASWIDTH | SIMPLE;
	break;
      case Magic('h'):
	ret = regnode(HEAD);
	*flagp |= HASWIDTH | SIMPLE;
	break;
      case Magic('H'):
	ret = regnode(NHEAD);
	*flagp |= HASWIDTH | SIMPLE;
	break;
      case Magic('a'):
	ret = regnode(ALPHA);
	*flagp |= HASWIDTH | SIMPLE;
	break;
      case Magic('A'):
	ret = regnode(NALPHA);
	*flagp |= HASWIDTH | SIMPLE;
	break;
      case Magic('l'):
	ret = regnode(LOWER);
	*flagp |= HASWIDTH | SIMPLE;
	break;
      case Magic('L'):
	ret = regnode(NLOWER);
	*flagp |= HASWIDTH | SIMPLE;
	break;
      case Magic('u'):
	ret = regnode(UPPER);
	*flagp |= HASWIDTH | SIMPLE;
	break;
      case Magic('U'):
	ret = regnode(NUPPER);
	*flagp |= HASWIDTH | SIMPLE;
	break;
      case Magic('('):
	ret = reg(1, &flags);
	if (ret == NULL)
	    return NULL;
	*flagp |= flags & (HASWIDTH | SPSTART);
	break;
      case '\0':
      case Magic('|'):
      case Magic(')'):
	EMSG_RETURN(e_internal)	    /* Supposed to be caught earlier. */
	/* NOTREACHED */
      case Magic('='):
	EMSG_RETURN((char_u *)"\\= follows nothing")
	/* NOTREACHED */
      case Magic('+'):
	EMSG_RETURN((char_u *)"\\+ follows nothing")
	/* NOTREACHED */
      case Magic('{'):
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -