egrep.c
来自「早期freebsd实现」· C语言 代码 · 共 925 行 · 第 1/2 页
C
925 行
/*- * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */#ifndef lintstatic char copyright[] ="@(#) Copyright (c) 1991, 1993\n\ The Regents of the University of California. All rights reserved.\n";#endif /* not lint */#ifndef lintstatic char sccsid[] = "@(#)egrep.c 8.1 (Berkeley) 6/6/93";#endif /* not lint *//* Hybrid Boyer/Moore/Gosper-assisted 'grep/egrep/fgrep' search, with delta0 table as in original paper (CACM, October, 1977). No delta1 or delta2. According to experiment (Horspool, Soft. Prac. Exp., 1982), delta2 is of minimal practical value. However, to improve for worst case input, integrating the improved Galil strategies (Apostolico/Giancarlo, SIAM. J. Comput., Feb. 1986) deserves consideration. Method: extract longest metacharacter-free string from expression. this is done using a side-effect from henry spencer's regcomp(). use boyer-moore to match such, then pass submatching lines to either regexp() or standard 'egrep', depending on certain criteria within execstrategy() below. [this tradeoff is due to the general slowness of the regexp() nondeterministic machine on complex expressions, as well as the startup time of standard 'egrep' on short files.] alternatively, one may change the vendor-supplied 'egrep' automaton to include boyer-moore directly. see accompanying writeup for discussion of kanji expression treatment. late addition: apply trickbag for fast match of simple alternations (sublinear, in common low-cardinality cases). trap fgrep into this lair. gnu additions: -f, newline as |, \< and \> [in regexec()], more comments. inspire better dfa exec() strategy. serious testing and help with special cases. Algorithm amalgam summary: dfa e?grep (aho/thompson) ndfa regexp() (spencer/aho) bmg (boyer/moore/gosper) "superimposed" bmg (jaw) fgrep (aho/corrasick) sorry, but the knuth/morris/pratt machine, horspool's "frequentist" code, and the rabin/karp matcher, however cute, just don't cut it for this production. James A. Woods Copyright (c) 1986 NASA Ames Research Center*/#include <sys/types.h>#include <sys/stat.h>#include <sys/file.h>#include <regexp.h> /* must be henry spencer's version */#include <stdio.h>#include <ctype.h>#include "pathnames.h"#define MIN(A, B) ((A) > (B) ? (B) : (A))#ifdef SLOWSYS#define read xread#endif#define BUFSIZE 8192 /* make higher for cray */#define PATSIZE 6000#define LARGE BUFSIZE + PATSIZE#define NALT 7 /* tied to scanf() size in alternate() */#define NMUSH 6 /* loosely relates to expected alt length */#define FIRSTFEW 33 /* Always do FIRSTFEW matches with regexec() */#define PUNTPERCENT 10 /* After FIRSTFEW, if PUNTPERCENT of the input * was processed by regexp(), exec std egrep. */#define NL '\n'#define EOS '\0'#define NONASCII 0200 /* Bit mask for Kanji non-ascii chars */#define META "\n^$.[]()?+*|\\" /* egrep meta-characters */#define SS2 '\216' /* EUC Katakana (or Chinese2) prefix */#define SS3 '\217' /* EUC Kanji2 (or Chinese3) prefix */extern char *optarg;extern int optind;char *progname;int cflag, iflag, eflag, fflag, lflag, nflag; /* SVID flags */int sflag, hflag; /* v7, v8, bsd */int firstflag; /* Stop at first match */int grepflag; /* Called as "grep" */int fgrepflag; /* Called as "fgrep" */int altflag; /* Simple alternation in pattern */int boyonly; /* No regexp needed -- all simple */int flushflag;int grepold, egrepold, fgrepold;int nalt; /* Number of alternatives */int nsuccess; /* 1 for match, 2 for error */int altmin; /* Minimum length of all the alternate * strings */int firstfile; /* argv index of first file argument */int patind; /* argv index of pattern */long nmatch; /* Number of matches in this file */long incount, counted; /* Amount of input consumed */long rxcount; /* Bytes of input processed by regexec() */int boyfound; /* accumulated partial matches (tripped by * FIRSTFEW) */int prevmatch; /* next three lines aid fast -n */long nline, prevnline;char *prevloc;regexp *rspencer;char *pattern;char *patboy; /* Pattern for simple Boyer-Moore */char *patfile; /* Filename containing pattern(s) */int delta0[256]; /* Boyer-Moore algorithm core */char cmap[256]; /* Usually 0-255, but if -i, maps upper to * lower case */char str[BUFSIZE + 2];int nleftover;char linetemp[BUFSIZE];char *altpat[NALT]; /* alternation component storage */int altlen[NALT];short altset[NMUSH + 1][256];char preamble[200]; /* match prefix (filename, line no.) */int fd;char *strchr(), *strrchr(), *strcpy(), *strncpy(), *strpbrk(), *malloc();char *grepxlat(), *fold(), *pfile(), *alternate(), *isolate();char *gotamatch(), *kanji(), *linesave(), *submatch();char **args;main(argc, argv) int argc; char *argv[];{ int c, oflag; int errflag = 0; args = argv; if ((progname = strrchr(argv[0], '/')) != 0) progname++; else progname = argv[0]; if (strcmp(progname, "grep") == 0) grepflag++; else if (strcmp(progname, "fgrep") == 0) fgrepflag++; oflag = 0; while ((c = getopt(argc, argv, "bchie:f:lnosvwxy1")) != EOF) { switch (c) { case 'f': fflag++; patfile = optarg; continue; case 'b': case 'v': egrepold++; /* boyer-moore of little help here */ continue; case 'c': cflag++; continue; case 'e': eflag++; pattern = optarg; continue; case 'h': hflag++; continue; case 'o': oflag++; continue; case '1': /* Stop at very first match */ firstflag++; /* spead freaks only */ continue; case 'i': iflag++; continue; case 'l': lflag++; continue; case 'n': nflag++; continue; case 's': sflag++; continue; case 'w': case 'y': if (!grepflag) errflag++; grepold++; continue; case 'x': /* needs more work, like -b above */ if (!fgrepflag) errflag++; fgrepold++; continue; case '?': errflag++; } } if (errflag || ((argc <= optind) && !fflag && !eflag)) { if (grepflag)oops("usage: grep [-bchilnosvwy] [-e] pattern [file ...]"); else if (fgrepflag)oops("usage: fgrep [-bchilnosvx] {-f patfile | [-e] strings} [file ...]"); else /* encourage SVID options, though we provide * others */oops("usage: egrep [-bchilnosv] {-f patfile | [-e] pattern} [file ...]"); } if (fflag) pattern = pfile(patfile); else if (!eflag) { patind = optind; pattern = argv[optind++]; } if (!oflag && (argc - optind) <= 1) /* Filename invisible given < 2 files */ hflag++; if (pattern[0] == EOS) kernighan(argv); /* same as it ever was */ /* * 'grep/egrep' merger -- "old" grep is called to handle: tagged * exprs \( \), word matches \< and \>, -w and -y options, char * classes with '-' at end (egrep bug?), and patterns beginning with * an asterisk (don't ask why). otherwise, characters meaningful to * 'egrep' but not to 'grep' are escaped; the entire expr is then * passed to 'egrep'. */ if (grepflag && !grepold) { if (strindex(pattern, "\\(") >= 0 || strindex(pattern, "\\<") >= 0 || strindex(pattern, "\\>") >= 0 || strindex(pattern, "-]") >= 0 || pattern[0] == '*') /* grep bug */ grepold++; else pattern = grepxlat(pattern); } if (grepold || egrepold || fgrepold) kernighan(argv); if (iflag) strcpy(pattern, fold(pattern)); /* * If the pattern is a plain string, just run boyer-moore. If it * consists of meta-free alternatives, run "superimposed" bmg. * Otherwise, find best string, and compile pattern for regexec(). */ if (strpbrk(pattern, META) == NULL) { /* do boyer-moore only */ boyonly++; patboy = pattern; } else { if ((patboy = alternate(pattern)) != NULL) boyonly++; else { if ((patboy = isolate(pattern)) == NULL) kernighan(argv); /* expr too involved */#ifndef NOKANJI for (c = 0; pattern[c] != EOS; c++) if (pattern[c] & NONASCII) /* kanji + meta */ kernighan(argv);#endif if ((rspencer = regcomp(pattern)) == NULL) oops("regcomp failure"); } } gosper(patboy); /* "pre-conditioning is wonderful" * -- v. strassen */ if ((firstfile = optind) >= argc) { /* Grep standard input */ if (lflag) /* We don't know its name! */ exit(1); egsecute((char *) NULL); } else { while (optind < argc) { egsecute(argv[optind]); optind++; if (firstflag && (nsuccess == 1)) break; } } exit((nsuccess == 2) ? 2 : (nsuccess == 0));}char *pfile(pfname) /* absorb expression from file */ char *pfname;{ int fd; struct stat patstat; static char *pat; if ((fd = open(pfname, O_RDONLY, 0)) < 0) oops("can't read pattern file"); if (fstat(fd, &patstat) != 0) oops("can't stat pattern file"); if (patstat.st_size > PATSIZE) { if (fgrepflag) { /* defer to unix version */ fgrepold++; return "dummy"; } else oops("pattern file too big"); } if ((pat = malloc((unsigned) patstat.st_size + 1)) == NULL) oops("out of memory to read pattern file"); if (patstat.st_size != read(fd, pat, (int)patstat.st_size)) oops("error reading pattern file"); (void) close(fd); pat[patstat.st_size] = EOS; if (pat[patstat.st_size - 1] == NL) /* NOP for egrep; helps grep */ pat[patstat.st_size - 1] = EOS; if (nlcount(pat, &pat[patstat.st_size]) > NALT) { if (fgrepflag) fgrepold++; /* "what's it all about, alfie?" */ else egrepold++; } return (pat);}egsecute(file) char *file;{ extern int errno; if (file == NULL) fd = 0; else if ((fd = open(file, O_RDONLY, 0)) <= 0) { fprintf(stderr, "%s: %s: %s\n", progname, file, strerror(errno)); nsuccess = 2; return; } chimaera(file, patboy); if (!boyonly && !flushflag && file != NULL) flushmatches(); if (file != NULL) close(fd);}chimaera(file, pat) /* "reach out and boyer-moore search someone" */ char *file, *pat; /* -- soon-to-be-popular bumper sticker */{ register char *k, *strend, *s; register int j, count; register int *deltazero = delta0; int patlen = altmin; char *t; nleftover = boyfound = flushflag = 0; nline = 1L; prevmatch = 0; nmatch = counted = rxcount = 0L; while ((count = read(fd, str + nleftover, BUFSIZE - nleftover)) > 0) { counted += count; strend = linesave(str, count); for (k = str + patlen - 1; k < strend;) { /* * for a large class of patterns, upwards of 80% of * match time is spent on the next line. we beat * existing microcode (vax 'matchc') this way. */ while ((k += deltazero[*(unsigned char *) k]) < strend); if (k < (str + LARGE)) break; k -= LARGE; if (altflag) { /* * Parallel Boyer-Moore. Check whether each * of the previous <altmin> chars COULD be * from one of the alternative strings. */ s = k - 1; j = altmin; while (altset[--j][(unsigned char) cmap[*(unsigned char *) s--]]); /* * quick test fails. in this life, compare * 'em all. but, a "reverse trie" would * attenuate worst case (linear w/delta2?). */ if (--j < 0) { count = nalt - 1; do { s = k; j = altlen[count]; t = altpat[count]; while (cmap[*(unsigned char *) s--] == t[--j]); if (j < 0) break; } while (count--); } } else { /* One string -- check it */ j = patlen - 1; s = k - 1; while (cmap[*(unsigned char *) s--] == pat[--j]); } /* * delta-less shortcut for literati. short shrift for * genetic engineers? */ if (j >= 0) { k++; /* no match; restart next char */ continue; } k = submatch(file, pat, str, strend, k, count);
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?