📄 newmgrep.c
字号:
/* Copyright (c) 1994 Sun Wu, Udi Manber, Burra Gopal. All Rights Reserved. */
/*
[fix] TG 22.10.97 3.34 in mgrep(): check bounds before memcpy-ing
[chg] TG 21.08.96 now uses ISO_CHAR.H
*/
/* multipattern matcher */
#include <stdio.h>
#include <ctype.h>
#ifdef __EMX__
#include <sys/types.h>
#endif
#include <sys/stat.h>
#include "agrep.h"
#include "codepage.h"
#ifndef _WIN32
#include <sys/time.h>
#else
#include <sys/timeb.h>
#include "config.h"
#endif
extern unsigned char LUT[256];
#define ddebug
#define uchar unsigned char
#undef MAXPAT
#define MAXPAT 256
#undef MAXLINE
#define MAXLINE 1024
#undef MAXSYM
#define MAXSYM 256
#define MAXMEMBER1 32768
/* #define MAXMEMBER1 262144 */ /*2^18 */
#define MAXPATFILE 600000
#define BLOCKSIZE 16384
#define MAXHASH 32768
/* #define MAXHASH 262144 */
#define mask5 32767
#define max_num 40000
#if ISO_CHAR_SET
#define W_DELIM 256
#else
#define W_DELIM 128
#endif
#define L_DELIM 10
#define Hbits 5 /* how much to shift to perform the hash */
extern int LIMITOUTPUT, LIMITPERFILE;
extern int BYTECOUNT, PRINTOFFSET, PRINTRECORD, CurrentByteOffset;
extern int MULTI_OUTPUT; /* used by glimpse only if OR, never for AND */
extern int DELIMITER;
extern CHAR D_pattern[MaxDelimit*2];
extern int D_length;
extern CHAR tc_D_pattern[MaxDelimit*2];
extern int tc_D_length;
extern COUNT, FNAME, SILENT, FILENAMEONLY, prev_num_of_matched, num_of_matched;
extern INVERSE, OUTTAIL;
extern WORDBOUND, WHOLELINE, NOUPPER;
extern ParseTree *AParse;
extern int AComplexPattern;
extern unsigned char CurrentFileName[], Progname[];
extern total_line;
extern agrep_initialfd;
extern int EXITONERROR;
extern int PRINTPATTERN;
extern int agrep_inlen;
extern CHAR *agrep_inbuffer;
extern FILE *agrep_finalfp;
extern int agrep_outpointer;
extern int agrep_outlen;
extern CHAR * agrep_outbuffer;
#ifndef __BORLANDC__
extern int errno;
#endif
extern int NEW_FILE, POST_FILTER;
extern int tuncompressible();
extern int quick_tcompress();
extern int quick_tuncompress();
extern int TCOMPRESSED;
extern int EASYSEARCH;
extern char FREQ_FILE[MAX_LINE_LEN], HASH_FILE[MAX_LINE_LEN], STRING_FILE[MAX_LINE_LEN];
extern char PAT_FILE_NAME[MAX_LINE_LEN];
uchar SHIFT1[MAXMEMBER1];
int LONG = 0;
int SHORT = 0;
int p_size= 0;
uchar tr[MAXSYM];
uchar tr1[MAXSYM];
int HASH[MAXHASH];
int Hash2[max_num];
uchar *PatPtr[max_num];
uchar *pat_spool = NULL; /* [MAXPATFILE+2*max_num+MAXPAT]; */
uchar *patt[max_num];
int pat_len[max_num];
int pat_indices[max_num]; /* pat_indices[p] gives the actual index in matched_teriminals: used only with AParse != 0 */
int num_pat;
extern char amatched_terminals[MAXNUM_PAT]; /* which patterns have been matched in the current line? Used only with AParse != 0, so max_num is not needed */
extern int anum_terminals;
extern int AComplexBoolean;
static void countline();
#ifdef _WIN32
int eval_tree(); /* asplit.c */
int fill_buf(); /* bitap.c */
int monkey1(); /* newmgrep.c */
int m_short(); /* newmgrep.c */
#endif
#if DOTCOMPRESSED
/* Equivalent variables for compression search */
uchar tc_SHIFT1[MAXMEMBER1];
int tc_LONG = 0;
int tc_SHORT = 0;
int tc_p_size= 0;
uchar tc_tr[MAXSYM];
uchar tc_tr1[MAXSYM];
int tc_HASH[MAXHASH];
int tc_Hash2[max_num];
uchar *tc_PatPtr[max_num];
uchar *tc_pat_spool = NULL; /* [MAXPATFILE+2*max_num+MAXPAT]; */
uchar *tc_patt[max_num];
int tc_pat_len[max_num];
int tc_pat_indices[max_num]; /* pat_indices[p] gives the actual index in matched_teriminals: used only with AParse != 0 */
int tc_num_pat; /* must be the same as num_pat */
#endif /*DOTCOMPRESSED*/
static void f_prep();
static void f_prep1();
static void accumulate();
#if DOTCOMPRESSED
static void tc_f_prep();
static void tc_f_prep1();
static void tc_accumulate();
#endif
#ifdef perf_check
int cshift=0, cshift0=0, chash=0;
#endif
/*
* General idea behind output processing with delimiters, inverse, compression, etc.
* CAUTION: In compressed files, we can search ONLY for simple patterns or their ;,.
* Attempts to search for complex patterns / with errors might lead to spurious matches.
* 1. Once we find the match, go back and forward to get the delimiters that surround
* the matched region.
* 2. If it is a compressed file, verify that the match is "real" (compressed files
* can have pseudo matches hence this filtering step is required).
* 3. Increment num_of_matched.
* 4. Process some output options which print stuff before the matched region is
* printed.
* 5. If there is compression, decomress and output the matched region. Otherwise
* just output it as is. Remember, from step (1) we know the matched region.
* 6. If inverse is set, then we must keep track of the end of the last matched region
* in the variable lastout. When there is a match, we must print everything from
* lastout to the beginning of the current matched region (curtextbegin) and then
* update lastout to point to the end of the current matched region (curtextend).
* ALSO: if we exit from the main loops, we must output everything from the end
* of the last matched region to the end of the input buffer.
* 7. Delimiter handling in complex patterns is different: there the search is done
* for a boolean and of the delimiter pattern and the actual pattern.
* 8. For convenience and speed, the multipattern matching routines to handle
* compressed files have been separated from their (normal) counterparts.
* 9. One special note on handling complicated boolean patterns: the parse
* tree will be the same for both compressed and uncomrpessed patterns and the
* same amatched_terminals array will be used in both. BUT, the pat_spool and
* pat_index, etc., will be different as they refer to the individual terminals.
*/
int
prepf(mfp, mbuf, mlen)
int mfp, mlen;
unsigned char *mbuf;
{
int length=0, i, p=1;
uchar *pat_ptr;
unsigned Mask = 31;
int num_read;
unsigned char *buf;
struct stat stbuf;
int j, k; /* to implement \\ */
if ((mfp == -1) && ((mbuf == NULL) || (mlen <= 0))) return -1;
if (mfp != -1) {
if (fstat(mfp, &stbuf) == -1) {
fprintf(stderr, "%s: cannot stat file: %s\n", Progname, PAT_FILE_NAME);
return -1;
}
if (!S_ISREG(stbuf.st_mode)) {
fprintf(stderr, "%s: pattern file not regular file: %s\n", Progname, PAT_FILE_NAME);
return -1;
}
if (stbuf.st_size*2 > MAXPATFILE + 2*max_num) {
fprintf(stderr, "%s: pattern file too large (> %d B): %s\n", Progname, (MAXPATFILE+2*max_num)/2, PAT_FILE_NAME);
return -1;
}
if (pat_spool != NULL) free(pat_spool);
pat_ptr = pat_spool = (unsigned char *)malloc(stbuf.st_size*2 + MAXPAT);
alloc_buf(mfp, &buf, MAXPATFILE+2*BLOCKSIZE);
while((num_read = fill_buf(mfp, buf+length, 2*BLOCKSIZE)) > 0) {
length = length + num_read;
if(length > MAXPATFILE) {
fprintf(stderr, "%s: maximum pattern file size is %d\n", Progname, MAXPATFILE);
if (!EXITONERROR) {
errno = AGREP_ERROR;
free_buf(mfp, buf);
return -1;
}
else exit(2);
}
}
}
else {
buf = mbuf;
length = mlen;
if (mlen*2 > MAXPATFILE + 2*max_num) {
fprintf(stderr, "%s: pattern buffer too large (> %d B)\n", Progname, (MAXPATFILE+2*max_num)/2);
return -1;
}
if (pat_spool != NULL) free(pat_spool);
pat_ptr = pat_spool = (unsigned char *)malloc(mlen*2 + MAXPAT);
}
/* Now all the patterns are in buf */
buf[length] = '\n';
i=0; p=1;
/* removed by Udi 11/8/94 - we now do WORDBOUND "by hand"
*
* if(WORDBOUND) {
* while(i<length) {
* patt[p] = pat_ptr;
* *pat_ptr++ = W_DELIM;
* while((i<length) && ((*pat_ptr = buf[i++]) != '\n')) pat_ptr++;
* *pat_ptr++ = W_DELIM;
* *pat_ptr++ = 0;
* p++;
* }
* }
* else
*/
if(WHOLELINE) {
while(i<length) {
patt[p] = pat_ptr;
*pat_ptr++ = L_DELIM;
while((i<length) && ((*pat_ptr = buf[i++]) != '\n')) pat_ptr++;
*pat_ptr++ = L_DELIM;
*pat_ptr++ = 0;
p++;
}
}
else {
while(i < length) {
patt[p] = pat_ptr;
while((i<length) && ((*pat_ptr = buf[i++]) != '\n')) pat_ptr++;
*pat_ptr++ = 0;
p++;
}
}
/* Now, the patterns have been copied into patt[] */
if(p>max_num) {
fprintf(stderr, "%s: maximum number of patterns is %d\n", Progname, max_num);
if (!EXITONERROR) {
errno = AGREP_ERROR;
free_buf(mfp, buf);
return -1;
}
else exit(2);
}
for(i=1; i<20; i++) *pat_ptr = i; /* boundary safety zone */
/* I might have to keep changing tr s.t. mgrep won't get confused with W_DELIM */
for(i=0; i< MAXSYM; i++) tr[i] = i;
if(NOUPPER) {
for (i=0; i<MAXSYM; i++)
#if ((defined(__EMX__) || defined(_WIN32)) && defined(ISO_CHAR_SET))
tr[i] = tr[LUT[i]];
#else
if (isupper(i)) tr[i] = tr[tolower(i)];
#endif
}
/* if(WORDBOUND) {
for(i=1; i<MAXSYM; i++) if(!isalnum(i)) tr[i] = W_DELIM;
}
removed by Udi 11/8/94 - the trick of using W-delim was too buggy.
we now do it "by hand" after we find a match
*/
for(i=0; i< MAXSYM; i++) tr1[i] = tr[i]&Mask;
num_pat = p-1;
p_size = MAXPAT;
for(i=1; i<=num_pat; i++) {
p = strlen(patt[i]);
if ((patt[i][0] == '^') || (patt[i][0] == '$')) patt[i][0] = '\n';
if ((p > 1) && ((patt[i][p-1] == '^') || (patt[i][p-1] == '$')) && (patt[i][p-2] != '\\')) patt[i][p-1] = '\n';
/* Added by bg, Dec 2nd 1994 */
for (k=0; k<p; k++) {
if (patt[i][k] == '\\') {
for (j=k; j<p; j++)
patt[i][j] = patt[i][j+1]; /* including '\0' */
p--;
}
}
pat_len[i] = p;
/*
pat_len[i] = (WORDBOUND?(p-2>0?p-2:1):p); changed by Udi 11/8/94
*/
#ifdef debug
printf("prepf(): patt[%d]=%s, pat_len[%d]=%d\n", i, patt[i], i, pat_len[i]);
#endif
if(p!=0 && p < p_size) p_size = p; /* MIN */
}
if(p_size == 0) {
fprintf(stderr, "%s: the pattern file is empty\n", Progname);
if (!EXITONERROR) {
errno = AGREP_ERROR;
free_buf(mfp, buf);
return -1;
}
else exit(2);
}
if(length > 400 && p_size > 2) LONG = 1;
if(p_size == 1) SHORT = 1;
for(i=0; i<MAXMEMBER1; i++) SHIFT1[i] = p_size - 1 - LONG;
for(i=0; i<MAXHASH; i++) {
HASH[i] = 0;
}
for(i=1; i<=num_pat; i++) f_prep(i, patt[i]);
accumulate();
memset(pat_indices, '\0', sizeof(int) * (num_pat + 1));
for(i=1; i<=num_pat; i++) f_prep1(i, patt[i]);
#if DOTCOMPRESSED
/* prepf for compression */
if (-1 == tc_prepf(buf, length)) {
free_buf(mfp, buf);
return -1;
}
#endif /*DOTCOMPRESSED*/
free_buf(mfp, buf);
return 0;
}
#if DOTCOMPRESSED
/*
* Compression equivalent of prepf: called right after prepf.
* 1. Read patt and SHIFT1
* 2. Call tcompress on the patterns in patt and put in tc_patt.
* 3. Use these patterns to compute tc_SHIFT (ignore WDELIM, LDELIM, case sensitivity, etc.)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -