📄 newmgrep.c
字号:
/* Copyright (c) 1994 Sun Wu, Udi Manber, Burra Gopal. All Rights Reserved. *//* multipattern matcher */#include <stdio.h>#include <ctype.h>#include <errno.h>#ifdef ultrix#include <sys/types.h>#endif#include <sys/stat.h>#include "agrep.h"#include <sys/time.h>#define ddebug#define uchar unsigned char#undef MAXPAT#define MAXPAT 256#undef MAXLINE#define MAXLINE 1024#undef MAXSYM#define MAXSYM 256#define MAXMEMBER1 32768/* #define MAXMEMBER1 262144 */ /*2^18 */ #define MAXPATFILE 600000#define BLOCKSIZE 16384#define MAXHASH 32768 /* #define MAXHASH 262144 */#define mask5 32767#define max_num MAX_DASHF_FILES#if ISO_CHAR_SET#define W_DELIM 256#else#define W_DELIM 128#endif#define L_DELIM 10 #define Hbits 5 /* how much to shift to perform the hash */extern char aduplicates[MAXNUM_PAT][MAXNUM_PAT]; /* tells what other patterns are exactly equal to the i-th one */extern char tc_aduplicates[MAXNUM_PAT][MAXNUM_PAT]; /* tells what other patterns are exactly equal to the i-th one */extern ParseTree aterminals[MAXNUM_PAT];extern int AComplexBoolean;extern int LIMITOUTPUT, LIMITPERFILE;extern int BYTECOUNT, PRINTOFFSET, PRINTRECORD, CurrentByteOffset;extern int MULTI_OUTPUT; /* used by glimpse only if OR, never for AND */extern int DELIMITER;extern CHAR D_pattern[MaxDelimit*2];extern int D_length;extern CHAR tc_D_pattern[MaxDelimit*2];extern int tc_D_length;extern COUNT, FNAME, SILENT, FILENAMEONLY, prev_num_of_matched, num_of_matched, PRINTFILETIME;extern INVERSE, OUTTAIL;extern WORDBOUND, WHOLELINE, NOUPPER;extern ParseTree *AParse;extern int AComplexPattern;extern unsigned char CurrentFileName[], Progname[]; extern long CurrentFileTime;extern total_line;extern agrep_initialfd;extern int EXITONERROR;extern int PRINTPATTERN;extern int agrep_inlen;extern CHAR *agrep_inbuffer;extern FILE *agrep_finalfp;extern int agrep_outpointer;extern int agrep_outlen;extern CHAR * agrep_outbuffer;extern int errno;extern int NEW_FILE, POST_FILTER;extern int tuncompressible();extern int quick_tcompress();extern int quick_tuncompress();extern int TCOMPRESSED;extern int EASYSEARCH;extern char FREQ_FILE[MAX_LINE_LEN], HASH_FILE[MAX_LINE_LEN], STRING_FILE[MAX_LINE_LEN];extern char PAT_FILE_NAME[MAX_LINE_LEN];uchar SHIFT1[MAXMEMBER1];int LONG = 0;int SHORT = 0;int p_size= 0;uchar tr[MAXSYM];uchar tr1[MAXSYM];int HASH[MAXHASH];int Hash2[max_num];uchar *PatPtr[max_num];uchar *pat_spool = NULL; /* [MAXPATFILE+2*max_num+MAXPAT]; */uchar *patt[max_num];int pat_len[max_num];int pat_indices[max_num]; /* pat_indices[p] gives the actual index in matched_teriminals: used only with AParse != 0 */int num_pat;extern char amatched_terminals[MAXNUM_PAT]; /* which patterns have been matched in the current line? Used only with AParse != 0, so max_num is not needed */extern int anum_terminals;extern int AComplexBoolean;static void countline();void acompute_duplicates();#if DOTCOMPRESSED/* Equivalent variables for compression search */uchar tc_SHIFT1[MAXMEMBER1];int tc_LONG = 0;int tc_SHORT = 0;int tc_p_size= 0;uchar tc_tr[MAXSYM];uchar tc_tr1[MAXSYM];int tc_HASH[MAXHASH];int tc_Hash2[max_num];uchar *tc_PatPtr[max_num];uchar *tc_pat_spool = NULL; /* [MAXPATFILE+2*max_num+MAXPAT]; */uchar *tc_patt[max_num];int tc_pat_len[max_num];int tc_pat_indices[max_num]; /* pat_indices[p] gives the actual index in matched_teriminals: used only with AParse != 0 */int tc_num_pat; /* must be the same as num_pat */#endif /*DOTCOMPRESSED*/static void f_prep();static void f_prep1();static void accumulate();#if DOTCOMPRESSEDstatic void tc_f_prep();static void tc_f_prep1();static void tc_accumulate();#endif#ifdef perf_check int cshift=0, cshift0=0, chash=0;#endif/* * General idea behind output processing with delimiters, inverse, compression, etc. * CAUTION: In compressed files, we can search ONLY for simple patterns or their ;,. * Attempts to search for complex patterns / with errors might lead to spurious matches. * 1. Once we find the match, go back and forward to get the delimiters that surround * the matched region. * 2. If it is a compressed file, verify that the match is "real" (compressed files * can have pseudo matches hence this filtering step is required). * 3. Increment num_of_matched. * 4. Process some output options which print stuff before the matched region is * printed. * 5. If there is compression, decomress and output the matched region. Otherwise * just output it as is. Remember, from step (1) we know the matched region. * 6. If inverse is set, then we must keep track of the end of the last matched region * in the variable lastout. When there is a match, we must print everything from * lastout to the beginning of the current matched region (curtextbegin) and then * update lastout to point to the end of the current matched region (curtextend). * ALSO: if we exit from the main loops, we must output everything from the end * of the last matched region to the end of the input buffer. * 7. Delimiter handling in complex patterns is different: there the search is done * for a boolean and of the delimiter pattern and the actual pattern. * 8. For convenience and speed, the multipattern matching routines to handle * compressed files have been separated from their (normal) counterparts. * 9. One special note on handling complicated boolean patterns: the parse * tree will be the same for both compressed and uncomrpessed patterns and the * same amatched_terminals array will be used in both. BUT, the pat_spool and * pat_index, etc., will be different as they refer to the individual terminals. */intprepf(mfp, mbuf, mlen)int mfp, mlen;unsigned char *mbuf;{ int length=0, i, p=1; uchar *pat_ptr; unsigned Mask = 31; int num_read; unsigned char *buf; struct stat stbuf; int j, k; /* to implement \\ */ if ((mfp == -1) && ((mbuf == NULL) || (mlen <= 0))) return -1; if (mfp != -1) { if (fstat(mfp, &stbuf) == -1) { fprintf(stderr, "%s: cannot stat file: %s\n", Progname, PAT_FILE_NAME); return -1; } if (!S_ISREG(stbuf.st_mode)) { fprintf(stderr, "%s: pattern file not regular file: %s\n", Progname, PAT_FILE_NAME); return -1; } if (stbuf.st_size*2 > MAXPATFILE + 2*max_num) { fprintf(stderr, "%s: pattern file too large (> %d B): %s\n", Progname, (MAXPATFILE+2*max_num)/2, PAT_FILE_NAME); return -1; } if (pat_spool != NULL) free(pat_spool); pat_ptr = pat_spool = (unsigned char *)malloc(stbuf.st_size*2 + MAXPAT); alloc_buf(mfp, &buf, MAXPATFILE+2*BlockSize); while((num_read = fill_buf(mfp, buf+length, 2*BlockSize)) > 0) { length = length + num_read; if(length > MAXPATFILE) { fprintf(stderr, "%s: maximum pattern file size is %d\n", Progname, MAXPATFILE); if (!EXITONERROR) { errno = AGREP_ERROR; free_buf(mfp, buf); return -1; } else exit(2); } } } else { buf = mbuf; length = mlen; if (mlen*2 > MAXPATFILE + 2*max_num) { fprintf(stderr, "%s: pattern buffer too large (> %d B)\n", Progname, (MAXPATFILE+2*max_num)/2); return -1; } if (pat_spool != NULL) free(pat_spool); pat_ptr = pat_spool = (unsigned char *)malloc(mlen*2 + MAXPAT); } /* Now all the patterns are in buf */ buf[length] = '\n'; i=0; p=1;/* removed by Udi 11/8/94 - we now do WORDBOUND "by hand" if(WORDBOUND) { while(i<length) { patt[p] = pat_ptr; *pat_ptr++ = W_DELIM; while((i<length) && ((*pat_ptr = buf[i++]) != '\n')) pat_ptr++; *pat_ptr++ = W_DELIM; *pat_ptr++ = 0; p++; } } else*/ if(WHOLELINE) { while(i<length) { patt[p] = pat_ptr; *pat_ptr++ = L_DELIM; while((i<length) && ((*pat_ptr = buf[i++]) != '\n')) pat_ptr++; *pat_ptr++ = L_DELIM; *pat_ptr++ = 0; p++; } } else { while(i < length) { patt[p] = pat_ptr; while((i<length) && ((*pat_ptr = buf[i++]) != '\n')) pat_ptr++; *pat_ptr++ = 0; p++; } } /* Now, the patterns have been copied into patt[] */ if(p>max_num) { fprintf(stderr, "%s: maximum number of patterns is %d\n", Progname, max_num); if (!EXITONERROR) { errno = AGREP_ERROR; free_buf(mfp, buf); return -1; } else exit(2); } for(i=1; i<20; i++) *pat_ptr = i; /* boundary safety zone */ /* I might have to keep changing tr s.t. mgrep won't get confused with W_DELIM */ for(i=0; i< MAXSYM; i++) tr[i] = i; if(NOUPPER) { for (i=0; i<MAXSYM; i++) if (isupper(i)) tr[i] = tr[tolower(i)]; /* for(i='A'; i<= 'Z'; i++) tr[i] = i + 'a' - 'A'; */ }/* if(WORDBOUND) { for(i=1; i<MAXSYM; i++) if(!isalnum(i)) tr[i] = W_DELIM; }removed by Udi 11/8/94 - the trick of using W-delim was too buggy.we now do it "by hand" after we find a match */ for(i=0; i< MAXSYM; i++) tr1[i] = tr[i]&Mask; num_pat = p-1; p_size = MAXPAT; for(i=1; i<=num_pat; i++) { p = strlen(patt[i]); if ((patt[i][0] == '^') || (patt[i][0] == '$')) patt[i][0] = '\n'; if ((p > 1) && ((patt[i][p-1] == '^') || (patt[i][p-1] == '$')) && (patt[i][p-2] != '\\')) patt[i][p-1] = '\n'; /* Added by bg, Dec 2nd 1994 */ for (k=0; k<p; k++) { if (patt[i][k] == '\\') { for (j=k; j<p; j++) patt[i][j] = patt[i][j+1]; /* including '\0' */ p--; } } pat_len[i] = p; /* pat_len[i] = (WORDBOUND?(p-2>0?p-2:1):p); changed by Udi 11/8/94 */#ifdef debug printf("prepf(): patt[%d]=%s, pat_len[%d]=%d\n", i, patt[i], i, pat_len[i]);#endif if(p!=0 && p < p_size) p_size = p; /* MIN */ } if(p_size == 0) { fprintf(stderr, "%s: the pattern file is empty\n", Progname); if (!EXITONERROR) { errno = AGREP_ERROR; free_buf(mfp, buf); return -1; } else exit(2); } if(length > 400 && p_size > 2) LONG = 1; if(p_size == 1) SHORT = 1; for(i=0; i<MAXMEMBER1; i++) SHIFT1[i] = p_size - 1 - LONG; for(i=0; i<MAXHASH; i++) { HASH[i] = 0; } for(i=1; i<=num_pat; i++) f_prep(i, patt[i]); accumulate(); memset(pat_indices, '\0', sizeof(int) * (num_pat + 1)); for(i=1; i<=num_pat; i++) f_prep1(i, patt[i]);#if DOTCOMPRESSED /* prepf for compression */ if (-1 == tc_prepf(buf, length)) { free_buf(mfp, buf); return -1; }#endif /*DOTCOMPRESSED*/ free_buf(mfp, buf); acompute_duplicates(aduplicates, aterminals, anum_terminals, tr); return 0;}#if DOTCOMPRESSED/* * Compression equivalent of prepf: called right after prepf. * 1. Read patt and SHIFT1 * 2. Call tcompress on the patterns in patt and put in tc_patt. * 3. Use these patterns to compute tc_SHIFT (ignore WDELIM, LDELIM, case sensitivity, etc.) * 4. Process other variables/functions (pat_spool, tr, tr1, pat_len, accumulate, SHIFT1, f_prep, f_prep1, pat_indices) appropriately. */inttc_prepf(buf, length)unsigned char *buf;int length;{ int i, p=1; uchar *pat_ptr; unsigned Mask = 31; int tc_length; unsigned char tc_buf[MAXPAT * 2]; /* maximum length of the compressed pattern */ static struct timeval initt, finalt; if (length*2 > MAXPATFILE + 2*max_num) { fprintf(stderr, "%s: pattern buffer too large (> %d B)\n", Progname, (MAXPATFILE+2*max_num)/2); return -1; } if (tc_pat_spool != NULL) free(tc_pat_spool); pat_ptr = tc_pat_spool = (unsigned char *)malloc(length*2 + MAXPAT);#if MEASURE_TIMES gettimeofday(&initt, NULL);#endif /*MEASURE_TIMES*/ i=0; p=1; while(i < length) { tc_patt[p] = pat_ptr; while((*pat_ptr = buf[i++]) != '\n') pat_ptr++; *pat_ptr++ = 0; if ((tc_length = quick_tcompress(FREQ_FILE, HASH_FILE, tc_patt[p], strlen(tc_patt[p]), tc_buf, MAXPAT * 2 - 8, TC_EASYSEARCH)) > 0) { memcpy(tc_patt[p], tc_buf, tc_length); tc_patt[p][tc_length] = '\0'; pat_ptr = tc_patt[p] + tc_length + 1; /* character after '\0' */ } p++; } for(i=1; i<20; i++) *pat_ptr = i; /* boundary safety zone */ /* Ignore all other options: it is automatically W_DELIM */ for(i=0; i< MAXSYM; i++) tc_tr[i] = i; for(i=0; i< MAXSYM; i++) tc_tr1[i] = tc_tr[i]&Mask; tc_num_pat = p-1; tc_p_size = MAXPAT; for(i=1; i<=num_pat; i++) { p = strlen(tc_patt[i]); tc_pat_len[i] = p;#ifdef debug
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -