📄 regex.c
字号:
#include <pmachine.h>

#ifndef HAVE_RE_COMP

/*
 * These routines are BSD regex(3)/ed(1) compatible regular-expression
 * routines written by Ozan S. Yigit, Computer Science, York University.
 * Parts of the code that are not needed by Prospero have been removed,
 * but most of the accompanying information has been left intact.
 * This file is to be included on those operating systems that do not
 * support re_comp and re_exec.
 */

/*
 * regex - Regular expression pattern matching
 *         and replacement
 *
 * by:  Ozan S. Yigit (oz@nexus.yorku.ca)
 *      Dept. of Computing Services
 *      York University
 *
 * These routines are the PUBLIC DOMAIN equivalents
 * of regex routines as found in 4.nBSD UN*X, with minor
 * extensions.
 *
 * Modification history:
 *
 * $Log: regex.c,v $
 * Revision 1.4 1996/04/25 20:06:29 blob
 * Added const's so that it will compile on OSs that have const in the prototype in
 * unistd.h
 *
 * Revision 1.3 1996/04/11 06:52:27 blob
 * *** empty log message ***
 *
 * Revision 1.2 1996/04/11 06:51:34 blob
 * Cleaned up warnings...
 *
 * Revision 1.1.1.2 1996/03/14 22:06:31 blob
 * Try 1000000...
 *
 * Revision 1.1.1.1 1996/03/13 20:34:40 blob
 * Initial Socks5 Beta import.
 *
 * Revision 1.1 1991/11/20 02:32:13 brendan
 * entered into RCS
 *
 * Revision 1.1 1991/11/20 02:32:13 brendan
 * entered into RCS
 *
 * Revision 1.3 89/04/01 14:18:09 oz
 * Change all references to a dfa: this is actually an nfa.
 *
 * Revision 1.2 88/08/28 15:36:04 oz
 * Use a complement bitmap to represent NCL.
 * This removes the need to have separate
 * code in the pmatch case block - it is
 * just CCL code now.
 *
 * Use the actual CCL code in the CLO
 * section of pmatch. No need for a recursive
 * pmatch call.
 *
 * Use a bitmap table to set char bits in an
 * 8-bit chunk.
 *
 * Routines:
 *      re_comp:        compile a regular expression into
 *                      an NFA.
 *
 *                      char *re_comp(s)
 *                      char *s;
 *
 *      re_exec:        execute the NFA to match a pattern.
 *
 *                      int re_exec(s)
 *                      char *s;
 *
 * Regular Expressions:
 *
 *      [1]     char    matches itself, unless it is a special
 *                      character (metachar): . \ [ ] * + ^ $
 *
 *      [2]     .       matches any character.
 *
 *      [3]     \       matches the character following it, except
 *                      when followed by a left or right round bracket,
 *                      a digit 1 to 9 or a left or right angle bracket.
 *                      (see [7], [8] and [9])
 *                      It is used as an escape character for all
 *                      other meta-characters, and itself. When used
 *                      in a set ([4]), it is treated as an ordinary
 *                      character.
 *
 *      [4]     [set]   matches one of the characters in the set.
 *                      If the first character in the set is "^",
 *                      it matches a character NOT in the set, i.e.
 *                      complements the set. A shorthand S-E is
 *                      used to specify a set of characters S up to
 *                      E, inclusive. The special characters "]" and
 *                      "-" have no special meaning if they appear
 *                      as the first chars in the set.
 *                      examples:        match:
 *
 *                              [a-z]    any lowercase alpha
 *
 *                              [^]-]    any char except ] and -
 *
 *                              [^A-Z]   any char except uppercase
 *                                       alpha
 *
 *                              [a-zA-Z] any alpha
 *
 *      [5]     *       any regular expression form [1] to [4], followed by
 *                      closure char (*) matches zero or more matches of
 *                      that form.
 *
 *      [6]     +       same as [5], except it matches one or more.
 *
 *      [7]             a regular expression in the form [1] to [10], enclosed
 *                      as \(form\) matches what form matches. The enclosure
 *                      creates a set of tags, used for [8] and for
 *                      pattern substitution. The tagged forms are numbered
 *                      starting from 1.
 *
 *      [8]             a \ followed by a digit 1 to 9 matches whatever a
 *                      previously tagged regular expression ([7]) matched.
 *
 *      [9]     \<      a regular expression starting with a \< construct
 *              \>      and/or ending with a \> construct, restricts the
 *                      pattern matching to the beginning of a word, and/or
 *                      the end of a word. A word is defined to be a character
 *                      string beginning and/or ending with the characters
 *                      A-Z a-z 0-9 and _. It must also be preceded and/or
 *                      followed by any character outside those mentioned.
 *
 *      [10]            a composite regular expression xy where x and y
 *                      are in the form [1] to [10] matches the longest
 *                      match of x followed by a match for y.
 *
 *      [11]    ^       a regular expression starting with a ^ character
 *              $       and/or ending with a $ character, restricts the
 *                      pattern matching to the beginning of the line,
 *                      or the end of line. [anchors] Elsewhere in the
 *                      pattern, ^ and $ are treated as ordinary characters.
 *
 *
 * Acknowledgements:
 *
 *      HCR's Hugh Redelmeier has been most helpful in various
 *      stages of development. He convinced me to include BOW
 *      and EOW constructs, originally invented by Rob Pike at
 *      the University of Toronto.
 *
 * References:
 *      Software tools                  Kernighan & Plauger
 *      Software tools in Pascal        Kernighan & Plauger
 *      Grep [rsx-11 C dist]            David Conroy
 *      ed - text editor                Un*x Programmer's Manual
 *      Advanced editing on Un*x        B. W. Kernighan
 *      regexp routines                 Henry Spencer
 *
 * Notes:
 *
 *      This implementation uses a bit-set representation for character
 *      classes for speed and compactness. Each character is represented
 *      by one bit in a 128-bit block. Thus, CCL always takes a
 *      constant 16 bytes in the internal nfa, and re_exec does a single
 *      bit comparison to locate the character in the set.
 *
 * Examples:
 *
 *      pattern:        foo*.*
 *      compile:        CHR f CHR o CLO CHR o END CLO ANY END END
 *      matches:        fo foo fooo foobar fobar foxx ...
 *
 *      pattern:        fo[ob]a[rz]
 *      compile:        CHR f CHR o CCL bitset CHR a CCL bitset END
 *      matches:        fobar fooar fobaz fooaz
 *
 *      pattern:        foo\\+
 *      compile:        CHR f CHR o CHR o CHR \ CLO CHR \ END END
 *      matches:        foo\ foo\\ foo\\\ ...
 *
 *      pattern:        \(foo\)[1-3]\1 (same as foo[1-3]foo)
 *      compile:        BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END
 *      matches:        foo1foo foo2foo foo3foo
 *
 *      pattern:        \(fo.*\)-\1
 *      compile:        BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END
 *      matches:        foo-foo fo-fo fob-fob foobar-foobar ...
* */#define MAXNFA 1024#define MAXTAG 10#define OKP 1#define NOP 0#define CHR 1#define ANY 2#define CCL 3#define BOL 4#define EOL 5#define BOT 6#define EOT 7#define BOW 8#define EOW 9#define REF 10#define CLO 11#define END 0/* * The following defines are not meant * to be changeable. They are for readability * only. * */#define MAXCHR 128#define CHRBIT 8#define BITBLK MAXCHR/CHRBIT#define BLKIND 0170#define BITIND 07#define ASCIIB 0177typedef /*unsigned*/ char CHAR;static int tagstk[MAXTAG]; /* subpat tag stack..*/static CHAR nfa[MAXNFA]; /* automaton.. */static int sta = NOP; /* status of lastpat */static CHAR bittab[BITBLK]; /* bit table for CCL */ /* pre-set bits... */static CHAR bitarr[] = {1,2,4,8,16,32,64,128};static int internal_error;static voidchset(c)register CHAR c;{ bittab[((c) & BLKIND) >> 3] |= bitarr[(c) & BITIND];}#define badpat(x) return (*nfa = END, x)#define store(x) *mp++ = x char * re_comp(pat)const char *pat;{ register const char *p; /* pattern pointer */ register CHAR *mp = nfa; /* nfa pointer */ register CHAR *lp; /* saved pointer.. */ register CHAR *sp = nfa; /* another one.. */ register int tagi = 0; /* tag stack index */ register int tagc = 1; /* actual tag count */ register int n; register CHAR mask; /* xor mask -CCL/NCL */ int c1, c2; if (!pat || !*pat) if (sta) return 0; else badpat("No previous regular expression"); sta = NOP; for (p = pat; *p; p++) { lp = mp; switch(*p) { case '.': /* match any char.. */ store(ANY); break; case '^': /* match beginning.. */ if (p == pat) store(BOL); else { store(CHR); store(*p); } break; case '$': /* match endofline.. 
*/ if (!*(p+1)) store(EOL); else { store(CHR); store(*p); } break; case '[': /* match char class..*/ store(CCL); if (*++p == '^') { mask = 0377; p++; } else mask = 0; if (*p == '-') /* real dash */ chset(*p++); if (*p == ']') /* real brac */ chset(*p++); while (*p && *p != ']') { if (*p == '-' && *(p+1) && *(p+1) != ']') { p++; c1 = *(p-2) + 1; c2 = *p++; while (c1 <= c2) chset(c1++); }#ifdef EXTEND else if (*p == '\\' && *(p+1)) { p++; chset(*p++); }#endif else chset(*p++); } if (!*p) badpat("Missing ]"); for (n = 0; n < BITBLK; bittab[n++] = (char) 0) store(mask ^ bittab[n]); break; case '*': /* match 0 or more.. */ case '+': /* match 1 or more.. */ if (p == pat) badpat("Empty closure"); lp = sp; /* previous opcode */ if (*lp == CLO) /* equivalence.. */ break; switch(*lp) { case BOL: case BOT: case EOT: case BOW: case EOW: case REF: badpat("Illegal closure"); default: break; }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -