⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 newmgrep.c

📁 Mehldau和Myer的Agrep3版本
💻 C
📖 第 1 页 / 共 5 页
字号:
/* Copyright (c) 1994 Sun Wu, Udi Manber, Burra Gopal.  All Rights Reserved. */

/*

[fix] TG 22.10.97 3.34	in mgrep(): check bounds before memcpy-ing
[chg] TG 21.08.96	now uses ISO_CHAR.H

*/

/* multipattern matcher */

#include <stdio.h>
#include <ctype.h>

#ifdef __EMX__
#include <sys/types.h>
#endif

#include <sys/stat.h>
#include "agrep.h"
#include "codepage.h"
#ifndef _WIN32
#include <sys/time.h>
#else
#include <sys/timeb.h>
#include "config.h"
#endif

extern unsigned char LUT[256];

/*
 * Compile-time limits and constants for the multi-pattern matcher.
 * MAXPAT, MAXLINE and MAXSYM are #undef'd first so that the values below
 * replace any definition that came in through the included headers.
 */

/* NOTE(review): the debug printf in prepf() is guarded by "#ifdef debug";
 * "ddebug" looks like a deliberately misspelled (i.e. disabled) switch -- confirm. */
#define ddebug
#define uchar unsigned char
#undef	MAXPAT
/* Starting value for the shortest-pattern length (p_size) and slack added to the pattern spool. */
#define MAXPAT  256
#undef	MAXLINE
#define MAXLINE 1024
#undef	MAXSYM
/* Number of entries in the byte translation tables tr[] and tr1[]. */
#define MAXSYM  256
/* Number of entries in the SHIFT1[] table. */
#define MAXMEMBER1 32768
/* #define MAXMEMBER1 262144 */ /*2^18 */ 
/* Largest number of pattern bytes prepf() will read from a pattern file. */
#define MAXPATFILE 600000
/* I/O unit: prepf() reads the pattern file in chunks of 2*BLOCKSIZE bytes. */
#define BLOCKSIZE  16384
/* Number of entries in the HASH[] table. */
#define MAXHASH    32768 
/* #define MAXHASH    262144 */
/* Equals MAXHASH-1; presumably the bit mask that reduces a hash to a HASH[] index -- TODO confirm at the use site. */
#define mask5 	   32767
/* Capacity of the per-pattern arrays (patt[], pat_len[], ...); pattern indices start at 1. */
#define max_num    40000
/* Word-delimiter marker; the W_DELIM-based WORDBOUND handling in prepf() is commented out. */
#if	ISO_CHAR_SET
#define W_DELIM	   256
#else
#define W_DELIM	   128
#endif
/* Line-delimiter byte (10 is ASCII newline) that prepf() puts around each pattern when WHOLELINE is set. */
#define L_DELIM    10 
#define Hbits 5 /* how much to shift to perform the hash */

extern int LIMITOUTPUT, LIMITPERFILE;
extern int BYTECOUNT, PRINTOFFSET, PRINTRECORD, CurrentByteOffset;
extern int MULTI_OUTPUT;	/* used by glimpse only if OR, never for AND */
extern int DELIMITER;
extern CHAR D_pattern[MaxDelimit*2];
extern int D_length;
extern CHAR tc_D_pattern[MaxDelimit*2];
extern int tc_D_length;
extern COUNT, FNAME, SILENT, FILENAMEONLY, prev_num_of_matched, num_of_matched;
extern INVERSE, OUTTAIL;
extern WORDBOUND, WHOLELINE, NOUPPER;
extern ParseTree *AParse;
extern int AComplexPattern;
extern unsigned char  CurrentFileName[], Progname[]; 
extern total_line;
extern agrep_initialfd;
extern int EXITONERROR;
extern int PRINTPATTERN;
extern int agrep_inlen;
extern CHAR *agrep_inbuffer;
extern FILE *agrep_finalfp;
extern int agrep_outpointer;
extern int agrep_outlen;
extern CHAR * agrep_outbuffer;
#ifndef __BORLANDC__
extern int errno;
#endif
extern int NEW_FILE, POST_FILTER;

extern int tuncompressible();
extern int quick_tcompress();
extern int quick_tuncompress();
extern int TCOMPRESSED;
extern int EASYSEARCH;
extern char FREQ_FILE[MAX_LINE_LEN], HASH_FILE[MAX_LINE_LEN], STRING_FILE[MAX_LINE_LEN];
extern char PAT_FILE_NAME[MAX_LINE_LEN];

/*
 * Search tables for the uncompressed patterns.  prepf() (re)builds all of
 * them each time a new pattern set is loaded.
 */

/* Shift table; prepf() initialises every entry to p_size - 1 - LONG before
 * calling f_prep() for each pattern (which presumably lowers entries -- TODO confirm). */
uchar SHIFT1[MAXMEMBER1];

/* Set to 1 by prepf() when the pattern input is longer than 400 bytes and the shortest pattern is longer than 2. */
int   LONG  = 0;
/* Set to 1 by prepf() when the shortest pattern is a single character. */
int   SHORT = 0;
/* Length of the shortest non-empty pattern, as computed by prepf(). */
int   p_size= 0;

/* Byte translation table: identity, or upper case folded onto lower case when NOUPPER is set. */
uchar tr[MAXSYM];
/* tr[] masked down to its low 5 bits (tr[i] & 31). */
uchar tr1[MAXSYM];
/* Hash table; cleared to 0 by prepf() before the patterns are processed. */
int   HASH[MAXHASH];
/* NOTE(review): Hash2[] and PatPtr[] are not written in prepf() itself; presumably
 * filled by f_prep()/f_prep1()/accumulate() -- confirm there. */
int   Hash2[max_num];
uchar *PatPtr[max_num];
/* Heap buffer, allocated by prepf(), that holds the text of all patterns back to back. */
uchar *pat_spool = NULL; /* [MAXPATFILE+2*max_num+MAXPAT]; */
/* patt[i], for i = 1..num_pat, points at the NUL-terminated i-th pattern inside pat_spool. */
uchar *patt[max_num];
/* pat_len[i] is the length of patt[i] after prepf() has removed backslash escapes. */
int   pat_len[max_num];
int   pat_indices[max_num]; /* pat_indices[p] gives the actual index in matched_terminals: used only with AParse != 0 */
/* Number of patterns currently loaded; valid pattern indices are 1..num_pat. */
int num_pat;

extern char  amatched_terminals[MAXNUM_PAT]; /* which patterns have been matched in the current line? Used only with AParse != 0, so max_num is not needed */
extern int anum_terminals;
extern int AComplexBoolean;
static void countline();

#ifdef _WIN32
int  eval_tree();         /* asplit.c */
int  fill_buf();          /* bitap.c */
int  monkey1();           /* newmgrep.c */
int  m_short();           /* newmgrep.c */
#endif

#if	DOTCOMPRESSED
/*
 * Equivalent variables for compression search.
 * Each tc_X below is the counterpart of the global X used for uncompressed
 * search; they are compiled in only when DOTCOMPRESSED is non-zero.
 * NOTE(review): presumably filled by tc_prepf(), which prepf() calls after
 * building the uncompressed tables -- confirm in tc_prepf().
 */
uchar tc_SHIFT1[MAXMEMBER1];

int   tc_LONG  = 0;
int   tc_SHORT = 0;
int   tc_p_size= 0;

uchar tc_tr[MAXSYM];
uchar tc_tr1[MAXSYM];
int   tc_HASH[MAXHASH];
int   tc_Hash2[max_num];
uchar *tc_PatPtr[max_num];
uchar *tc_pat_spool = NULL; /* [MAXPATFILE+2*max_num+MAXPAT]; */
uchar *tc_patt[max_num];
int   tc_pat_len[max_num];
int   tc_pat_indices[max_num]; /* pat_indices[p] gives the actual index in matched_terminals: used only with AParse != 0 */
int tc_num_pat;	/* must be the same as num_pat */
#endif	/*DOTCOMPRESSED*/

static void f_prep();
static void f_prep1();
static void accumulate();
#if	DOTCOMPRESSED
static void tc_f_prep();
static void tc_f_prep1();
static void tc_accumulate();
#endif

#ifdef perf_check
	int cshift=0, cshift0=0, chash=0;
#endif

/*
 * General idea behind output processing with delimiters, inverse, compression, etc.
 * CAUTION: In compressed files, we can search ONLY for simple patterns or their ';' (AND) and ',' (OR) combinations.
 * Attempts to search for complex patterns / with errors might lead to spurious matches.
 * 1. Once we find the match, go back and forward to get the delimiters that surround
 *    the matched region.
 * 2. If it is a compressed file, verify that the match is "real" (compressed files
 *    can have pseudo matches hence this filtering step is required).
 * 3. Increment num_of_matched.
 * 4. Process some output options which print stuff before the matched region is
 *    printed.
 * 5. If there is compression, decompress and output the matched region. Otherwise
 *    just output it as is. Remember, from step (1) we know the matched region.
 * 6. If inverse is set, then we must keep track of the end of the last matched region
 *    in the variable lastout. When there is a match, we must print everything from
 *    lastout to the beginning of the current matched region (curtextbegin) and then
 *    update lastout to point to the end of the current matched region (curtextend).
 *    ALSO: if we exit from the main loops, we must output everything from the end
 *    of the last matched region to the end of the input buffer.
 * 7. Delimiter handling in complex patterns is different: there the search is done
 *    for a boolean and of the delimiter pattern and the actual pattern.
 * 8. For convenience and speed, the multipattern matching routines to handle
 *    compressed files have been separated from their (normal) counterparts.
 * 9. One special note on handling complicated boolean patterns: the parse
 *    tree will be the same for both compressed and uncompressed patterns and the
 *    same amatched_terminals array will be used in both. BUT, the pat_spool and
 *    pat_index, etc., will be different as they refer to the individual terminals.
 */

/*
 * prepf: load a set of patterns (one per line) and build the tables used
 * by the multi-pattern matcher.
 *
 * Parameters:
 *   mfp   file descriptor of the pattern file, or -1 to take the patterns
 *         from mbuf instead.
 *   mbuf  pattern text, used only when mfp == -1.  A '\n' sentinel is
 *         stored at mbuf[mlen], so the buffer must have room for at least
 *         mlen+1 bytes.
 *   mlen  number of pattern bytes in mbuf.
 *
 * Side effects: replaces pat_spool and rewrites patt[], pat_len[],
 * pat_indices[], num_pat, p_size, LONG, SHORT, tr[], tr1[], SHIFT1[] and
 * HASH[].
 *
 * Returns 0 on success and -1 on failure.  Argument, stat and size-limit
 * failures always return -1.  For the remaining failures (pattern file
 * longer than MAXPATFILE, too many patterns, out of memory) the function
 * calls exit(2) when EXITONERROR is set; otherwise it sets errno to
 * AGREP_ERROR and returns -1.
 */
int
prepf(mfp, mbuf, mlen)
int mfp, mlen;
unsigned char *mbuf;
{
	int length=0, i, p=1;
	uchar *pat_ptr;
	unsigned Mask = 31;
	int num_read;
	unsigned char *buf;
	struct stat stbuf;
	int j, k;	/* to implement \\ */
	int too_many = 0;	/* set when the input holds more patterns than patt[] can index */

	if ((mfp == -1) && ((mbuf == NULL) || (mlen <= 0))) return -1;

	if (mfp != -1) {
		if (fstat(mfp, &stbuf) == -1) {
			fprintf(stderr, "%s: cannot stat file: %s\n", Progname, PAT_FILE_NAME);
			return -1;
		}
		if (!S_ISREG(stbuf.st_mode)) {
			fprintf(stderr, "%s: pattern file not regular file: %s\n", Progname, PAT_FILE_NAME);
			return -1;
		}
		/* same limit as "size*2 > MAXPATFILE + 2*max_num", written so the multiplication cannot overflow */
		if (stbuf.st_size > (MAXPATFILE + 2*max_num)/2) {
			fprintf(stderr, "%s: pattern file too large (> %d B): %s\n", Progname, (MAXPATFILE+2*max_num)/2, PAT_FILE_NAME);
			return -1;
		}
		alloc_buf(mfp, &buf, MAXPATFILE+2*BLOCKSIZE);
		while((num_read = fill_buf(mfp, buf+length, 2*BLOCKSIZE)) > 0) {
			length = length + num_read;
			if(length > MAXPATFILE) {
				fprintf(stderr, "%s: maximum pattern file size is %d\n", Progname, MAXPATFILE);
				if (!EXITONERROR) {
					errno = AGREP_ERROR;
					free_buf(mfp, buf);
					return -1;
				}
				else exit(2);
			}
		}
	}
	else {
		/* same limit as "mlen*2 > MAXPATFILE + 2*max_num", without the overflow-prone multiplication */
		if (mlen > (MAXPATFILE + 2*max_num)/2) {
			fprintf(stderr, "%s: pattern buffer too large (> %d B)\n", Progname, (MAXPATFILE+2*max_num)/2);
			return -1;
		}
		buf = mbuf;
		length = mlen;
	}

	/*
	 * Allocate the spool only now that the real input length is known
	 * (the file may have changed size since fstat).  Worst case is
	 * WHOLELINE with nothing but empty lines: every input byte then turns
	 * into three spool bytes (L_DELIM, L_DELIM, NUL).  MAXPAT extra bytes
	 * cover the 19-byte safety zone written after the last pattern.
	 */
	free(pat_spool);
	pat_ptr = pat_spool = (unsigned char *)malloc(3*length + MAXPAT);
	if (pat_spool == NULL) {
		fprintf(stderr, "%s: out of memory while loading patterns\n", Progname);
		if (!EXITONERROR) {
			errno = AGREP_ERROR;
			free_buf(mfp, buf);
			return -1;
		}
		else exit(2);
	}

	/* Now all the patterns are in buf */
	buf[length] = '\n';
	i=0; p=1;

	/*
	 * Split buf into NUL-terminated patterns stored back to back in the
	 * spool; patt[1], patt[2], ... point at them.  With WHOLELINE every
	 * pattern is wrapped in L_DELIM.  WORDBOUND is no longer handled here
	 * (the W_DELIM wrapping was removed by Udi, 11/8/94); it is checked
	 * "by hand" after a match is found.
	 */
	while(i < length) {
		if (p >= max_num) {
			/* refuse before storing: patt[] has only max_num slots */
			too_many = 1;
			break;
		}
		patt[p] = pat_ptr;
		if (WHOLELINE) *pat_ptr++ = L_DELIM;
		/* copy up to the newline; the newline itself is overwritten below */
		while((i<length) && ((*pat_ptr = buf[i++]) != '\n')) pat_ptr++;
		if (WHOLELINE) *pat_ptr++ = L_DELIM;
		*pat_ptr++ = 0;
		p++;
	}

	/* Now, the patterns have been copied into patt[] */
	if(too_many) {
		fprintf(stderr, "%s: maximum number of patterns is %d\n", Progname, max_num); 
		if (!EXITONERROR) {
			errno = AGREP_ERROR;
			free_buf(mfp, buf);
			return -1;
		}
		else exit(2);
	}

	for(i=1; i<20; i++) *pat_ptr++ = i;  /* boundary safety zone: 19 non-zero bytes after the last pattern */

	/* I might have to keep changing tr s.t. mgrep won't get confused with W_DELIM */

	for(i=0; i< MAXSYM; i++) tr[i] = i;

	if(NOUPPER) {
		/* case-insensitive search: map every upper-case byte onto its lower-case form */
		for (i=0; i<MAXSYM; i++)

#if ((defined(__EMX__) || defined(_WIN32)) && defined(ISO_CHAR_SET))
			tr[i] = tr[LUT[i]];
#else
			if (isupper(i)) tr[i] = tr[tolower(i)];
#endif
	}

	for(i=0; i< MAXSYM; i++) tr1[i] = tr[i]&Mask;
	num_pat =  p-1;
	p_size  =  MAXPAT;
	for(i=1; i<=num_pat; i++) {
		p = strlen((char *)patt[i]);
		/* a leading or (unescaped) trailing '^' / '$' anchors the pattern at a line boundary */
		if ((patt[i][0] == '^') || (patt[i][0] == '$')) patt[i][0] = '\n';
		if ((p > 1) && ((patt[i][p-1] == '^') || (patt[i][p-1] == '$')) && (patt[i][p-2] != '\\')) patt[i][p-1] = '\n';

		/* Added by bg, Dec 2nd 1994: drop each backslash, keeping the character it escapes */
		for (k=0; k<p; k++) {
			if (patt[i][k] == '\\') {
				for (j=k; j<p; j++)
					patt[i][j] = patt[i][j+1]; /* including '\0' */
				p--;
			}
		}

		pat_len[i] = p;
#ifdef	debug
		printf("prepf(): patt[%d]=%s, pat_len[%d]=%d\n", i, patt[i], i, pat_len[i]);
#endif
		if(p!=0 && p < p_size) p_size = p;	/* MIN over the non-empty patterns */
	}
	/*
	 * NOTE(review): p_size starts at MAXPAT and is only lowered to a
	 * non-zero length, so this test can never be true as written; an input
	 * with no non-empty pattern is therefore not reported here.  Left
	 * unchanged because callers may depend on the current behaviour.
	 */
	if(p_size == 0) {
		fprintf(stderr, "%s: the pattern file is empty\n", Progname);
		if (!EXITONERROR) {
			errno = AGREP_ERROR;
			free_buf(mfp, buf);
			return -1;
		}
		else exit(2);
	}
	if(length > 400 && p_size > 2) LONG = 1;
	if(p_size == 1) SHORT = 1;
	for(i=0; i<MAXMEMBER1; i++) SHIFT1[i] = p_size - 1 - LONG;
	for(i=0; i<MAXHASH; i++) {
		HASH[i] = 0;
	}
	for(i=1; i<=num_pat; i++) f_prep(i, patt[i]);
	accumulate();
	memset(pat_indices, '\0', sizeof(int) * (num_pat + 1));
	for(i=1; i<=num_pat; i++) f_prep1(i, patt[i]);

#if	DOTCOMPRESSED
	/* prepf for compression */
	if (-1 == tc_prepf(buf, length)) {
		free_buf(mfp, buf);
		return -1;
	}
#endif	/*DOTCOMPRESSED*/
	free_buf(mfp, buf);
	return 0;
}

#if	DOTCOMPRESSED
/*
 * Compression equivalent of prepf: called right after prepf.
 * 1. Read patt and SHIFT1
 * 2. Call tcompress on the patterns in patt and put in tc_patt.
 * 3. Use these patterns to compute tc_SHIFT (ignore WDELIM, LDELIM, case sensitivity, etc.)

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -