📄 regcomp.c
字号:
#include <sys/types.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <limits.h>
#include <stdlib.h>
#include <regex.h>
#include "utils.h"
#include "regex2.h"
#include "cclass.h"
#include "cname.h"
/*
* parse structure, passed up and down to avoid global variables and
* other clumsinesses
*/
struct parse {
char *next; /* next character in RE */
char *end; /* end of string (-> NUL normally) */
int error; /* has an error been seen? */
sop *strip; /* malloced strip */
sopno ssize; /* malloced strip size (allocated) */
sopno slen; /* malloced strip length (used) */
int ncsalloc; /* number of csets allocated */
struct re_guts *g;
# define NPAREN 10 /* we need to remember () 1-9 for back refs */
sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
sopno pend[NPAREN]; /* -> ) ([0] unused) */
};
#include "regcomp.ih"
static char nuls[10]; /* place to point scanner in event of error */
/*
* macros for use with parse structure
* BEWARE: these know that the parse structure is named `p' !!!
*/
#define PEEK() (*p->next)
#define PEEK2() (*(p->next+1))
#define MORE() (p->next < p->end)
#define MORE2() (p->next+1 < p->end)
#define SEE(c) (MORE() && PEEK() == (c))
#define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
#define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
#define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
#define NEXT() (p->next++)
#define NEXT2() (p->next += 2)
#define NEXTn(n) (p->next += (n))
#define GETNEXT() (*p->next++)
#define SETERROR(e) seterr(p, (e))
#define REQUIRE(co, e) ((co) || SETERROR(e))
#define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
#define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e))
#define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e))
#define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
#define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
#define AHEAD(pos) dofwd(p, pos, HERE()-(pos))
#define ASTERN(sop, pos) EMIT(sop, HERE()-pos)
#define HERE() (p->slen)
#define THERE() (p->slen - 1)
#define THERETHERE() (p->slen - 2)
#define DROP(n) (p->slen -= (n))
#ifndef NDEBUG
static int never = 0; /* for use in asserts; shuts lint up */
#else
#define never 0 /* some <assert.h>s have bugs too */
#endif
/*
- regcomp - interface for parser and compilation
= extern int regcomp(regex_t *, const char *, int);
= #define REG_BASIC 0000
= #define REG_EXTENDED 0001
= #define REG_ICASE 0002
= #define REG_NOSUB 0004
= #define REG_NEWLINE 0010
= #define REG_NOSPEC 0020
= #define REG_PEND 0040
= #define REG_DUMP 0200
*/
int /* 0 success, otherwise REG_something */
regcomp(preg, pattern, cflags)
regex_t *preg;
const char *pattern;
int cflags;
{
struct parse pa;
register struct re_guts *g;
register struct parse *p = &pa;
register int i;
register size_t len;
#ifdef REDEBUG
# define GOODFLAGS(f) (f)
#else
# define GOODFLAGS(f) ((f)&~REG_DUMP)
#endif
cflags = GOODFLAGS(cflags);
if ((cflags®_EXTENDED) && (cflags®_NOSPEC))
return(REG_INVARG);
if (cflags®_PEND) {
if (preg->re_endp < pattern)
return(REG_INVARG);
len = preg->re_endp - pattern;
} else
len = strlen((char *)pattern);
/* do the mallocs early so failure handling is easy */
g = (struct re_guts *)malloc(sizeof(struct re_guts) +
(NC-1)*sizeof(cat_t));
if (g == NULL)
return(REG_ESPACE);
p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
p->strip = (sop *)malloc(p->ssize * sizeof(sop));
p->slen = 0;
if (p->strip == NULL) {
free((char *)g);
return(REG_ESPACE);
}
/* set things up */
p->g = g;
p->next = (char *)pattern; /* convenience; we do not modify it */
p->end = p->next + len;
p->error = 0;
p->ncsalloc = 0;
for (i = 0; i < NPAREN; i++) {
p->pbegin[i] = 0;
p->pend[i] = 0;
}
g->csetsize = NC;
g->sets = NULL;
g->setbits = NULL;
g->ncsets = 0;
g->cflags = cflags;
g->iflags = 0;
g->nbol = 0;
g->neol = 0;
g->must = NULL;
g->mlen = 0;
g->nsub = 0;
g->ncategories = 1; /* category 0 is "everything else" */
g->categories = &g->catspace[-(CHAR_MIN)];
(void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
g->backrefs = 0;
/* do it */
EMIT(OEND, 0);
g->firststate = THERE();
if (cflags®_EXTENDED)
p_ere(p, OUT);
else if (cflags®_NOSPEC)
p_str(p);
else
p_bre(p, OUT, OUT);
EMIT(OEND, 0);
g->laststate = THERE();
/* tidy up loose ends and fill things in */
categorize(p, g);
stripsnug(p, g);
findmust(p, g);
g->nplus = pluscount(p, g);
g->magic = MAGIC2;
preg->re_nsub = g->nsub;
preg->re_g = g;
preg->re_magic = MAGIC1;
#ifndef REDEBUG
/* not debugging, so can't rely on the assert() in regexec() */
if (g->iflags&BAD)
SETERROR(REG_ASSERT);
#endif
/* win or lose, we're done */
if (p->error != 0) /* lose */
regfree(preg);
return(p->error);
}
/*
- p_ere - ERE parser top level, concatenation and alternation
== static void p_ere(register struct parse *p, int stop);
*/
static void
p_ere(p, stop)
register struct parse *p;
int stop; /* character this ERE should end at */
{
register char c;
register sopno prevback;
register sopno prevfwd;
register sopno conc;
register int first = 1; /* is this the first alternative? */
for (;;) {
/* do a bunch of concatenated expressions */
conc = HERE();
while (MORE() && (c = PEEK()) != '|' && c != stop)
p_ere_exp(p);
REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */
if (!EAT('|'))
break; /* NOTE BREAK OUT */
if (first) {
INSERT(OCH_, conc); /* offset is wrong */
prevfwd = conc;
prevback = conc;
first = 0;
}
ASTERN(OOR1, prevback);
prevback = THERE();
AHEAD(prevfwd); /* fix previous offset */
prevfwd = HERE();
EMIT(OOR2, 0); /* offset is very wrong */
}
if (!first) { /* tail-end fixups */
AHEAD(prevfwd);
ASTERN(O_CH, prevback);
}
assert(!MORE() || SEE(stop));
}
/*
- p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
== static void p_ere_exp(register struct parse *p);
*/
static void
p_ere_exp(p)
register struct parse *p;
{
register char c;
register sopno pos;
register int count;
register int count2;
register sopno subno;
int wascaret = 0;
assert(MORE()); /* caller should have ensured this */
c = GETNEXT();
pos = HERE();
switch (c) {
case '(':
REQUIRE(MORE(), REG_EPAREN);
p->g->nsub++;
subno = p->g->nsub;
if (subno < NPAREN)
p->pbegin[subno] = HERE();
EMIT(OLPAREN, subno);
if (!SEE(')'))
p_ere(p, ')');
if (subno < NPAREN) {
p->pend[subno] = HERE();
assert(p->pend[subno] != 0);
}
EMIT(ORPAREN, subno);
MUSTEAT(')', REG_EPAREN);
break;
#ifndef POSIX_MISTAKE
case ')': /* happens only if no current unmatched ( */
/*
* You may ask, why the ifndef? Because I didn't notice
* this until slightly too late for 1003.2, and none of the
* other 1003.2 regular-expression reviewers noticed it at
* all. So an unmatched ) is legal POSIX, at least until
* we can get it fixed.
*/
SETERROR(REG_EPAREN);
break;
#endif
case '^':
EMIT(OBOL, 0);
p->g->iflags |= USEBOL;
p->g->nbol++;
wascaret = 1;
break;
case '$':
EMIT(OEOL, 0);
p->g->iflags |= USEEOL;
p->g->neol++;
break;
case '|':
SETERROR(REG_EMPTY);
break;
case '*':
case '+':
case '?':
SETERROR(REG_BADRPT);
break;
case '.':
if (p->g->cflags®_NEWLINE)
nonnewline(p);
else
EMIT(OANY, 0);
break;
case '[':
p_bracket(p);
break;
case '\\':
REQUIRE(MORE(), REG_EESCAPE);
c = GETNEXT();
ordinary(p, c);
break;
case '{': /* okay as ordinary except if digit follows */
REQUIRE(!MORE() || !isdigit(PEEK()), REG_BADRPT);
/* FALLTHROUGH */
default:
ordinary(p, c);
break;
}
if (!MORE())
return;
c = PEEK();
/* we call { a repetition if followed by a digit */
if (!( c == '*' || c == '+' || c == '?' ||
(c == '{' && MORE2() && isdigit(PEEK2())) ))
return; /* no repetition, we're done */
NEXT();
REQUIRE(!wascaret, REG_BADRPT);
switch (c) {
case '*': /* implemented as +? */
/* this case does not require the (y|) trick, noKLUDGE */
INSERT(OPLUS_, pos);
ASTERN(O_PLUS, pos);
INSERT(OQUEST_, pos);
ASTERN(O_QUEST, pos);
break;
case '+':
INSERT(OPLUS_, pos);
ASTERN(O_PLUS, pos);
break;
case '?':
/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
INSERT(OCH_, pos); /* offset slightly wrong */
ASTERN(OOR1, pos); /* this one's right */
AHEAD(pos); /* fix the OCH_ */
EMIT(OOR2, 0); /* offset very wrong... */
AHEAD(THERE()); /* ...so fix it */
ASTERN(O_CH, THERETHERE());
break;
case '{':
count = p_count(p);
if (EAT(',')) {
if (isdigit(PEEK())) {
count2 = p_count(p);
REQUIRE(count <= count2, REG_BADBR);
} else /* single number with comma */
count2 = INFINITY;
} else /* just a single number */
count2 = count;
repeat(p, pos, count, count2);
if (!EAT('}')) { /* error heuristics */
while (MORE() && PEEK() != '}')
NEXT();
REQUIRE(MORE(), REG_EBRACE);
SETERROR(REG_BADBR);
}
break;
}
if (!MORE())
return;
c = PEEK();
if (!( c == '*' || c == '+' || c == '?' ||
(c == '{' && MORE2() && isdigit(PEEK2())) ) )
return;
SETERROR(REG_BADRPT);
}
/*
- p_str - string (no metacharacters) "parser"
== static void p_str(register struct parse *p);
*/
static void
p_str(p)
register struct parse *p;
{
REQUIRE(MORE(), REG_EMPTY);
while (MORE())
ordinary(p, GETNEXT());
}
/*
- p_bre - BRE parser top level, anchoring and concatenation
== static void p_bre(register struct parse *p, register int end1, \
== register int end2);
* Giving end1 as OUT essentially eliminates the end1/end2 check.
*
* This implementation is a bit of a kludge, in that a trailing $ is first
* taken as an ordinary character and then revised to be an anchor. The
* only undesirable side effect is that '$' gets included as a character
* category in such cases. This is fairly harmless; not worth fixing.
* The amount of lookahead needed to avoid this kludge is excessive.
*/
static void
p_bre(p, end1, end2)
register struct parse *p;
register int end1; /* first terminating character */
register int end2; /* second terminating character */
{
register sopno start = HERE();
register int first = 1; /* first subexpression? */
register int wasdollar = 0;
if (EAT('^')) {
EMIT(OBOL, 0);
p->g->iflags |= USEBOL;
p->g->nbol++;
}
while (MORE() && !SEETWO(end1, end2)) {
wasdollar = p_simp_re(p, first);
first = 0;
}
if (wasdollar) { /* oops, that was a trailing anchor */
DROP(1);
EMIT(OEOL, 0);
p->g->iflags |= USEEOL;
p->g->neol++;
}
REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */
}
/*
- p_simp_re - parse a simple RE, an atom possibly followed by a repetition
== static int p_simp_re(register struct parse *p, int starordinary);
*/
static int /* was the simple RE an unbackslashed $? */
p_simp_re(p, starordinary)
register struct parse *p;
int starordinary; /* is a leading * an ordinary character? */
{
register int c;
register int count;
register int count2;
register sopno pos;
register int i;
register sopno subno;
# define BACKSL (1<<CHAR_BIT)
pos = HERE(); /* repetion op, if any, covers from here */
assert(MORE()); /* caller should have ensured this */
c = GETNEXT();
if (c == '\\') {
REQUIRE(MORE(), REG_EESCAPE);
c = BACKSL | (unsigned char)GETNEXT();
}
switch (c) {
case '.':
if (p->g->cflags®_NEWLINE)
nonnewline(p);
else
EMIT(OANY, 0);
break;
case '[':
p_bracket(p);
break;
case BACKSL|'{':
SETERROR(REG_BADRPT);
break;
case BACKSL|'(':
p->g->nsub++;
subno = p->g->nsub;
if (subno < NPAREN)
p->pbegin[subno] = HERE();
EMIT(OLPAREN, subno);
/* the MORE here is an error heuristic */
if (MORE() && !SEETWO('\\', ')'))
p_bre(p, '\\', ')');
if (subno < NPAREN) {
p->pend[subno] = HERE();
assert(p->pend[subno] != 0);
}
EMIT(ORPAREN, subno);
REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
break;
case BACKSL|')': /* should not get here -- must be user */
case BACKSL|'}':
SETERROR(REG_EPAREN);
break;
case BACKSL|'1':
case BACKSL|'2':
case BACKSL|'3':
case BACKSL|'4':
case BACKSL|'5':
case BACKSL|'6':
case BACKSL|'7':
case BACKSL|'8':
case BACKSL|'9':
i = (c&~BACKSL) - '0';
assert(i < NPAREN);
if (p->pend[i] != 0) {
assert(i <= p->g->nsub);
EMIT(OBACK_, i);
assert(p->pbegin[i] != 0);
assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
assert(OP(p->strip[p->pend[i]]) == ORPAREN);
(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
EMIT(O_BACK, i);
} else
SETERROR(REG_ESUBREG);
p->g->backrefs = 1;
break;
case '*':
REQUIRE(starordinary, REG_BADRPT);
/* FALLTHROUGH */
default:
ordinary(p, (char)c); /* takes off BACKSL, if any */
break;
}
if (EAT('*')) { /* implemented as +? */
/* this case does not require the (y|) trick, noKLUDGE */
INSERT(OPLUS_, pos);
ASTERN(O_PLUS, pos);
INSERT(OQUEST_, pos);
ASTERN(O_QUEST, pos);
} else if (EATTWO('\\', '{')) {
count = p_count(p);
if (EAT(',')) {
if (MORE() && isdigit(PEEK())) {
count2 = p_count(p);
REQUIRE(count <= count2, REG_BADBR);
} else /* single number with comma */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -