📄 regexp.c
字号:
/* regexp.c *//* This file contains the code that compiles regular expressions and executes * them. It supports the same syntax and features as vi's regular expression * code. Specifically, the meta characters are: * ^ matches the beginning of a line * $ matches the end of a line * \< matches the beginning of a word * \> matches the end of a word * . matches any single character * [] matches any character in a character class * \( delimits the start of a subexpression * \) delimits the end of a subexpression * * repeats the preceding 0 or more times * NOTE: You cannot follow a \) with a *. * * The physical structure of a compiled RE is as follows: * - First, there is a one-byte value that says how many character classes * are used in this regular expression * - Next, each character class is stored as a bitmap that is 256 bits * (32 bytes) long. * - A mixture of literal characters and compiled meta characters follows. * This begins with M_BEGIN(0) and ends with M_END(0). All meta chars * are stored as a \n followed by a one-byte code, so they take up two * bytes apiece. Literal characters take up one byte apiece. \n can't * be used as a literal character. * * If NO_MAGIC is defined, then a different set of functions is used instead. * That right, this file contains TWO versions of the code. */#include <setjmp.h>#include "config.h"#include "ctype.h"#include "vi.h"#include "regexp.h"static char *previous; /* the previous regexp, used when null regexp is given */#ifndef NO_MAGIC/* THE REAL REGEXP PACKAGE IS USED UNLESS "NO_MAGIC" IS DEFINED *//* These are used to classify or recognize meta-characters */#define META '\0'#define BASE_META(m) ((m) - 256)#define INT_META(c) ((c) + 256)#define IS_META(m) ((m) >= 256)#define IS_CLASS(m) ((m) >= M_CLASS(0) && (m) <= M_CLASS(9))#define IS_START(m) ((m) >= M_START(0) && (m) <= M_START(9))#define IS_END(m) ((m) >= M_END(0) && (m) <= M_END(9))#define IS_CLOSURE(m) ((m) >= M_SPLAT && (m) <= M_RANGE)#define ADD_META(s,m) (*(s)++ = META, *(s)++ = BASE_META(m))#define GET_META(s) (*(s) == META ? INT_META(*++(s)) : *s)/* These are the internal codes used for each type of meta-character */#define M_BEGLINE 256 /* internal code for ^ */#define M_ENDLINE 257 /* internal code for $ */#define M_BEGWORD 258 /* internal code for \< */#define M_ENDWORD 259 /* internal code for \> */#define M_ANY 260 /* internal code for . */#define M_SPLAT 261 /* internal code for * */#define M_PLUS 262 /* internal code for \+ */#define M_QMARK 263 /* internal code for \? */#define M_RANGE 264 /* internal code for \{ */#define M_CLASS(n) (265+(n)) /* internal code for [] */#define M_START(n) (275+(n)) /* internal code for \( */#define M_END(n) (285+(n)) /* internal code for \) *//* These are used during compilation */static int class_cnt; /* used to assign class IDs */static int start_cnt; /* used to assign start IDs */static int end_stk[NSUBEXP];/* used to assign end IDs */static int end_sp;static char *retext; /* points to the text being compiled *//* error-handling stuff */jmp_buf errorhandler;#define FAIL(why) regerror(why); longjmp(errorhandler, 1)/* This function builds a bitmap for a particular class */static char *makeclass(text, bmap) REG char *text; /* start of the class */ REG char *bmap; /* the bitmap */{ REG int i; int complement = 0; /* zero the bitmap */ for (i = 0; bmap && i < 32; i++) { bmap[i] = 0; } /* see if we're going to complement this class */ if (*text == '^') { text++; complement = 1; } /* add in the characters */ while (*text && *text != ']') { /* is this a span of characters? */ if (text[1] == '-' && text[2]) { /* spans can't be backwards */ if (text[0] > text[2]) { FAIL("Backwards span in []"); } /* add each character in the span to the bitmap */ for (i = text[0]; bmap && i <= text[2]; i++) { bmap[i >> 3] |= (1 << (i & 7)); } /* move past this span */ text += 3; } else { /* add this single character to the span */ i = *text++; if (bmap) { bmap[i >> 3] |= (1 << (i & 7)); } } } /* make sure the closing ] is missing */ if (*text++ != ']') { FAIL("] missing"); } /* if we're supposed to complement this class, then do so */ if (complement && bmap) { for (i = 0; i < 32; i++) { bmap[i] = ~bmap[i]; } } return text;}/* This function gets the next character or meta character from a string. * The pointer is incremented by 1, or by 2 for \-quoted characters. For [], * a bitmap is generated via makeclass() (if re is given), and the * character-class text is skipped. */static int gettoken(sptr, re) char **sptr; regexp *re;{ int c; c = **sptr; ++*sptr; if (c == '\\') { c = **sptr; ++*sptr; switch (c) { case '<': return M_BEGWORD; case '>': return M_ENDWORD; case '(': if (start_cnt >= NSUBEXP) { FAIL("Too many \\(s"); } end_stk[end_sp++] = start_cnt; return M_START(start_cnt++); case ')': if (end_sp <= 0) { FAIL("Mismatched \\)"); } return M_END(end_stk[--end_sp]); case '*': return (*o_magic ? c : M_SPLAT); case '.': return (*o_magic ? c : M_ANY); case '+': return M_PLUS; case '?': return M_QMARK;#ifndef CRUNCH case '{': return M_RANGE;#endif default: return c; } } else if (*o_magic) { switch (c) { case '^': if (*sptr == retext + 1) { return M_BEGLINE; } return c; case '$': if (!**sptr) { return M_ENDLINE; } return c; case '.': return M_ANY; case '*': return M_SPLAT; case '[': /* make sure we don't have too many classes */ if (class_cnt >= 10) { FAIL("Too many []s"); } /* process the character list for this class */ if (re) { /* generate the bitmap for this class */ *sptr = makeclass(*sptr, re->program + 1 + 32 * class_cnt); } else { /* skip to end of the class */ *sptr = makeclass(*sptr, (char *)0); } return M_CLASS(class_cnt++); default: return c; } } else /* unquoted nomagic */ { switch (c) { case '^': if (*sptr == retext + 1) { return M_BEGLINE; } return c; case '$': if (!**sptr) { return M_ENDLINE; } return c; default: return c; } } /*NOTREACHED*/}/* This function calculates the number of bytes that will be needed for a * compiled RE. Its argument is the uncompiled version. It is not clever * about catching syntax errors; that is done in a later pass. */static unsigned calcsize(text) char *text;{ unsigned size; int token; retext = text; class_cnt = 0; start_cnt = 1; end_sp = 0; size = 5; while ((token = gettoken(&text, (regexp *)0)) != 0) { if (IS_CLASS(token)) { size += 34; }#ifndef CRUNCH else if (token == M_RANGE) { size += 4; while ((token = gettoken(&text, (regexp *)0)) != 0 && token != '}') { } if (!token) { return size; } }#endif else if (IS_META(token)) { size += 2; } else { size++; } } return size;}/* This function compiles a regexp. */regexp *regcomp(exp) char *exp;{ int needfirst; unsigned size; int token; int peek; char *build; regexp *re;#ifndef CRUNCH int from; int to; int digit;#endif /* prepare for error handling */ re = (regexp *)0; if (setjmp(errorhandler)) { if (re) { free(re); } return (regexp *)0; } /* if an empty regexp string was given, use the previous one */ if (*exp == 0) { if (!previous) { FAIL("No previous RE"); } exp = previous; } else /* non-empty regexp given, so remember it */ { if (previous) free(previous); previous = (char *)malloc((unsigned)(strlen(exp) + 1)); if (previous) strcpy(previous, exp); } /* allocate memory */ class_cnt = 0; start_cnt = 1; end_sp = 0; retext = exp; size = calcsize(exp) + sizeof(regexp) + 10; /* !!! 10 bytes for slop */#ifdef lint re = ((regexp *)0) + size;#else re = (regexp *)malloc((unsigned)size);#endif if (!re) { FAIL("Not enough memory for this RE"); } /* compile it */ build = &re->program[1 + 32 * class_cnt]; re->program[0] = class_cnt; for (token = 0; token < NSUBEXP; token++) { re->startp[token] = re->endp[token] = (char *)0; } re->first = 0; re->bol = 0; re->minlen = 0; needfirst = 1; class_cnt = 0; start_cnt = 1; end_sp = 0; retext = exp; for (token = M_START(0), peek = gettoken(&exp, re); token; token = peek, peek = gettoken(&exp, re)) { /* special processing for the closure operator */ if (IS_CLOSURE(peek)) { /* detect misuse of closure operator */ if (IS_START(token)) { FAIL("Closure operator follows nothing"); } else if (IS_META(token) && token != M_ANY && !IS_CLASS(token)) { FAIL("Closure operators can only follow a normal character or . or []"); }#ifndef CRUNCH /* if \{ \} then read the range */ if (peek == M_RANGE) { from = 0; for (digit = gettoken(&exp, re); !IS_META(digit) && isdigit(digit); digit = gettoken(&exp, re)) { from = from * 10 + digit - '0'; } if (digit == '}') { to = from; } else if (digit == ',') { to = 0; for (digit = gettoken(&exp, re); !IS_META(digit) && isdigit(digit); digit = gettoken(&exp, re)) { to = to * 10 + digit - '0'; } if (to == 0) { to = 255; }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -