📄 regex.c
字号:
#include "portable.h"
#if defined( MACOS ) || defined( DOS ) || defined( _WIN32 ) || \
defined( NEED_BSDREGEX ) || defined(PGPSOCKETSLDAP) /* jason */
#include "regex.h"
/*
* regex - Regular expression pattern matching and replacement
*
* By: Ozan S. Yigit (oz)
* Dept. of Computer Science
* York University
*
* These routines are the PUBLIC DOMAIN equivalents of regex
* routines as found in 4.nBSD UN*X, with minor extensions.
*
* These routines are derived from various implementations found
* in software tools books, and Conroy's grep. They are NOT derived
* from licensed/restricted software.
* For more interesting/academic/complicated implementations,
* see Henry Spencer's regexp routines, or GNU Emacs pattern
* matching module.
*
* Modification history:
*
* $Log: regex.c,v $
* Revision 1.3 1998/02/18 13:19:59 jason
* Complete Keyserver API
*
* Revision 1.2 1997/10/21 01:07:12 marcd
* Reformated for 80 char margins
*
* Revision 1.1.1.1 1997/10/04 19:17:00 marcd
* Src tree restructuring after r1.0
*
* Revision 1.3 1997/06/09 18:25:50 heller
* Minor changes for Mac client compilation.
*
* Revision 1.2 1997/06/05 17:53:12 marcd
* Port to NT and initial keyserver functions
*
* Revision 1.12 1996/04/25 16:20:59 mcs
* make re_exec() match "" with ".*" and similar patterns
* hopefully this change doesn't break anything else!
*
* Revision 1.11 1994/12/14 21:33:45 mcs
* use new NEED_BSDREGEX
* fix pmatch() prototype
*
* Revision 1.10 1994/12/12 18:16:39 mcs
* use on NetBSD
*
* Revision 1.9 1994/11/15 19:16:35 mcs
* add (CHAR) cast to make VisualC++ happy
*
* Revision 1.8 1994/11/08 21:14:32 mcs
* WIN32 changes
*
* Revision 1.7 1994/07/23 19:51:24 mcs
* use ANSI-style inline function parameters
*
* Revision 1.6 1993/10/18 01:52:32 tim
* include for VMS
*
* Revision 1.5 1993/09/28 21:37:54 mcs
* HP/UX needs the regex we include (not in its libc)
*
* Revision 1.4 1993/08/27 15:59:52 mcs
* use CHAR for deftab
*
* Revision 1.3 1993/08/27 15:49:47 mcs
* added missing 0 to octal constants
* use unsigned char for CHAR under DOS
*
* Revision 1.2 1993/08/27 14:57:48 mcs
* add proto. for pmatch
*
* Revision 1.1 1993/08/18 21:20:02 mcs
* Initial revision
*
* Revision 1.4 1991/10/17 03:56:42 oz
* miscellaneous changes, small cleanups etc.
*
* Revision 1.3 1989/04/01 14:18:09 oz
* Change all references to a dfa: this is actually an nfa.
*
* Revision 1.2 88/08/28 15:36:04 oz
* Use a complement bitmap to represent NCL.
* This removes the need to have seperate
* code in the pmatch case block - it is
* just CCL code now.
*
* Use the actual CCL code in the CLO
* section of pmatch. No need for a recursive
* pmatch call.
*
* Use a bitmap table to set char bits in an
* 8-bit chunk.
*
* Interfaces:
* re_comp: compile a regular expression into a NFA.
*
* char *re_comp(s)
* char *s;
*
* re_exec: execute the NFA to match a pattern.
*
* int re_exec(s)
* char *s;
*
* re_modw change re_exec's understanding of what a "word"
* looks like (for \< and \>) by adding into the
* hidden word-syntax table.
*
* void re_modw(s)
* char *s;
*
* re_subs: substitute the matched portions in a new string.
*
* int re_subs(src, dst)
* char *src;
* char *dst;
*
* re_fail: failure routine for re_exec.
*
* void re_fail(msg, op)
* char *msg;
* char op;
*
* Regular Expressions:
*
* [1] char matches itself, unless it is a special
* character (metachar): . \ [ ] * + ^ $
*
* [2] . matches any character.
*
* [3] \ matches the character following it, except
* when followed by a left or right round bracket,
* a digit 1 to 9 or a left or right angle bracket.
* (see [7], [8] and [9])
* It is used as an escape character for all
* other meta-characters, and itself. When used
* in a set ([4]), it is treated as an ordinary
* character.
*
* [4] [set] matches one of the characters in the set.
* If the first character in the set is "^",
* it matches a character NOT in the set, i.e.
* complements the set. A shorthand S-E is
* used to specify a set of characters S upto
* E, inclusive. The special characters "]" and
* "-" have no special meaning if they appear
* as the first chars in the set.
* examples: match:
*
* [a-z] any lowercase alpha
*
* [^]-] any char except ] and -
*
* [^A-Z] any char except uppercase
* alpha
*
* [a-zA-Z] any alpha
*
* [5] * any regular expression form [1] to [4], followed by
* closure char (*) matches zero or more matches of
* that form.
*
* [6] + same as [5], except it matches one or more.
*
* [7] a regular expression in the form [1] to [10], enclosed
* as \(form\) matches what form matches. The enclosure
* creates a set of tags, used for [8] and for
* pattern substution. The tagged forms are numbered
* starting from 1.
*
* [8] a \ followed by a digit 1 to 9 matches whatever a
* previously tagged regular expression ([7]) matched.
*
* [9] \< a regular expression starting with a \< construct
* \> and/or ending with a \> construct, restricts the
* pattern matching to the beginning of a word, and/or
* the end of a word. A word is defined to be a character
* string beginning and/or ending with the characters
* A-Z a-z 0-9 and _. It must also be preceded and/or
* followed by any character outside those mentioned.
*
* [10] a composite regular expression xy where x and y
* are in the form [1] to [10] matches the longest
* match of x followed by a match for y.
*
* [11] ^ a regular expression starting with a ^ character
* $ and/or ending with a $ character, restricts the
* pattern matching to the beginning of the line,
* or the end of line. [anchors] Elsewhere in the
* pattern, ^ and $ are treated as ordinary characters.
*
*
* Acknowledgements:
*
* HCR's Hugh Redelmeier has been most helpful in various
* stages of development. He convinced me to include BOW
* and EOW constructs, originally invented by Rob Pike at
* the University of Toronto.
*
* References:
* Software tools Kernighan & Plauger
* Software tools in Pascal Kernighan & Plauger
* Grep [rsx-11 C dist] David Conroy
* ed - text editor Un*x Programmer's Manual
* Advanced editing on Un*x B. W. Kernighan
* RegExp routines Henry Spencer
*
* Notes:
*
* This implementation uses a bit-set representation for character
* classes for speed and compactness. Each character is represented
* by one bit in a 128-bit block. Thus, CCL always takes a
* constant 16 bytes in the internal nfa, and re_exec does a single
* bit comparison to locate the character in the set.
*
* Examples:
*
* pattern: foo*.*
* compile: CHR f CHR o CLO CHR o END CLO ANY END END
* matches: fo foo fooo foobar fobar foxx ...
*
* pattern: fo[ob]a[rz]
* compile: CHR f CHR o CCL bitset CHR a CCL bitset END
* matches: fobar fooar fobaz fooaz
*
* pattern: foo\\+
* compile: CHR f CHR o CHR o CHR \ CLO CHR \ END END
* matches: foo\ foo\\ foo\\\ ...
*
* pattern: \(foo\)[1-3]\1 (same as foo[1-3]foo)
* compile: BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END
* matches: foo1foo foo2foo foo3foo
*
* pattern: \(fo.*\)-\1
* compile: BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END
* matches: foo-foo fo-fo fob-fob foobar-foobar ...
*/
#define MAXNFA 1024
#define MAXTAG 10
#define OKP 1
#define NOP 0
#define CHR 1
#define ANY 2
#define CCL 3
#define BOL 4
#define EOL 5
#define BOT 6
#define EOT 7
#define BOW 8
#define EOW 9
#define REF 10
#define CLO 11
#define END 0
/*
* The following defines are not meant to be changeable.
* They are for readability only.
*/
#define MAXCHR 128
#define CHRBIT 8
#define BITBLK MAXCHR/CHRBIT
#define BLKIND 0170
#define BITIND 07
#define ASCIIB 0177
#if defined( DOS ) || defined( _WIN32 )
typedef unsigned char CHAR;
#else /* DOS */
typedef /*unsigned*/ char CHAR;
#endif /* DOS */
static int tagstk[MAXTAG]; /* subpat tag stack..*/
static CHAR nfa[MAXNFA]; /* automaton.. */
static int sta = NOP; /* status of lastpat */
static CHAR bittab[BITBLK]; /* bit table for CCL */
/* pre-set bits... */
static CHAR bitarr[] = {1,2,4,8,16,32,64,128};
static void
chset(CHAR c)
{
bittab[((c) & BLKIND) >> 3] |= bitarr[(c) & BITIND];
}
#define badpat(x) (*nfa = END, x)
#define store(x) *mp++ = x
char *
re_comp( char *pat )
{
register char *p; /* pattern pointer */
register CHAR *mp=nfa; /* nfa pointer */
register CHAR *lp; /* saved pointer.. */
register CHAR *sp=nfa; /* another one.. */
register int tagi = 0; /* tag stack index */
register int tagc = 1; /* actual tag count */
register int n;
register CHAR mask; /* xor mask -CCL/NCL */
int c1, c2;
if (!pat || !*pat)
if (sta)
return 0;
else
return badpat("No previous regular expression");
sta = NOP;
for (p = pat; *p; p++) {
lp = mp;
switch(*p) {
case '.': /* match any char.. */
store(ANY);
break;
case '^': /* match beginning.. */
if (p == pat)
store(BOL);
else {
store(CHR);
store(*p);
}
break;
case '$': /* match endofline.. */
if (!*(p+1))
store(EOL);
else {
store(CHR);
store(*p);
}
break;
case '[': /* match char class..*/
store(CCL);
if (*++p == '^') {
mask = 0377;
p++;
}
else
mask = 0;
if (*p == '-') /* real dash */
chset(*p++);
if (*p == ']') /* real brac */
chset(*p++);
while (*p && *p != ']') {
if (*p == '-' && *(p+1) && *(p+1) != ']') {
p++;
c1 = *(p-2) + 1;
c2 = *p++;
while (c1 <= c2)
chset((CHAR)c1++);
}
#ifdef EXTEND
else if (*p == '\\' && *(p+1)) {
p++;
chset(*p++);
}
#endif
else
chset(*p++);
}
if (!*p)
return badpat("Missing ]");
for (n = 0; n < BITBLK; bittab[n++] = (char) 0)
store(mask ^ bittab[n]);
break;
case '*': /* match 0 or more.. */
case '+': /* match 1 or more.. */
if (p == pat)
return badpat("Empty closure");
lp = sp; /* previous opcode */
if (*lp == CLO) /* equivalence.. */
break;
switch(*lp) {
case BOL:
case BOT:
case EOT:
case BOW:
case EOW:
case REF:
return badpat("Illegal closure");
default:
break;
}
if (*p == '+')
for (sp = mp; lp < sp; lp++)
store(*lp);
store(END);
store(END);
sp = mp;
while (--mp > lp)
*mp = mp[-1];
store(CLO);
mp = sp;
break;
case '\\': /* tags, backrefs .. */
switch(*++p) {
case '(':
if (tagc < MAXTAG) {
tagstk[++tagi] = tagc;
store(BOT);
store(tagc++);
}
else
return badpat("Too many \\(\\) pairs");
break;
case ')':
if (*sp == BOT)
return badpat("Null pattern inside \\(\\)");
if (tagi > 0) {
store(EOT);
store(tagstk[tagi--]);
}
else
return badpat("Unmatched \\)");
break;
case '<':
store(BOW);
break;
case '>':
if (*sp == BOW)
return badpat("Null pattern inside \\<\\>");
store(EOW);
break;
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
n = *p-'0';
if (tagi > 0 && tagstk[tagi] == n)
return badpat("Cyclical reference");
if (tagc > n) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -