⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 sgrep.c

📁 Mehldau和Myer的Agrep3版本
💻 C
📖 第 1 页 / 共 5 页
字号:
/* Copyright (c) 1994 Sun Wu, Udi Manber, Burra Gopal.  All Rights Reserved. */

/* #define DEBUG2 */
/* #define DEBUG */

/*	

[chg]	05.10.96	sgrep():
			- when the last buffer is read:
			  no need to look back to the previous CR, 
			  just process until (end) = (buf_end)
			  
[chg]			bm():
			- when there is a hit at the last line of a file,
			  and the file does not end with a CR,
			  the print buffer is now terminated with an artificial
			  CR.
			   
[chg]	04.10.96	bm(): major bugs in the algorithm removed [TG]
[chg]	22.09.96	UL850[].lower_1 -> UL850.lower [TG]
[chg]	21.08.96	header file ISO_CHAR  [TG]
[chg] 	13.08.96	edited by Tom Gries <gries@ibm.net>

*/

#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include "agrep.h"
#include "codepage.h"

/* Character lookup table defined in another translation unit.  char_tr()
   folds TR[] through it on EMX/Win32 builds that define ISO_CHAR_SET. */
extern unsigned char LUT[256];

/* Dimension of the per-character tables char_map[], SHIFT[] and TR[]. */
#undef	MAXSYM
#define MAXSYM  256

/* Dimension of MEMBER[]. */
#define MAXMEMBER 8192
/* Byte type used for text and pattern buffers throughout this file. */
#define	CHARTYPE	unsigned char

#undef	MaxError			/* don't use agrep.h definition */
#define MaxError 20

/* Dimension of the file-scope pat[]; sgrep() rejects a pattern whose
   processed length is >= MAXPATT. */
#define MAXPATT 256

/* Dimension of sgrep()'s pattern buffers; sgrep() also places the input
   at offset 2*MAXLINE in its text buffer. */
#undef	MAXLINE
#define MAXLINE 1024

#undef	MAXNAME
#define MAXNAME 256

#undef	MaxCan				/* don't use agrep.h definition */
#define MaxCan  2048

/* sgrep() requests up to 2*BLOCKSIZE bytes from fill_buf() per iteration. */
#define BLOCKSIZE    16384
#define MAX_SHIFT_2  4096

/* Flag values; sgrep() compares TCOMPRESSED against ON and resets it to OFF. */
#undef	ON
#define ON      1

#undef	OFF
#define OFF	0

#define LOG_ASCII	8
#define LOG_DNA 	3
/* Dimension of MEMBER_1[]. */
#define MAXMEMBER_1	65536
/* Exact search (D == 0): patterns longer than LONG_EXAC are handled by
   m_preprocess()/monkey(), shorter ones by prep_bm()/bm(). */
#define LONG_EXAC	20
/* Approximate, non-DNA search: patterns of at least LONG_APPX characters
   are preprocessed with am_preprocess(), shorter ones with prep()/initmask(). */
#define LONG_APPX	24

/* Word-delimiter code; in the visible part of this file it is referenced
   only from commented-out code in char_tr(). */
#if	ISO_CHAR_SET
#define W_DELIM		256
#else
#define W_DELIM		128
#endif

#ifndef _WIN32
#include <sys/time.h>
#else
#include <sys/timeb.h>
#endif

/* Compression helpers defined elsewhere.  sgrep() calls tuncompressible()
   on the first block read, and quick_tcompress() to build the pattern
   (newpat) and its length (newm) used when that test succeeds. */
extern int tuncompressible();
extern int quick_tcompress();
extern int quick_tuncompress();

/* Record-delimiter state: sgrep() tests DELIMITER and passes D_pattern,
   D_length and OUTTAIL to backward_delimiter(). */
extern int DELIMITER, OUTTAIL;
extern int D_length, tc_D_length;
extern unsigned char D_pattern[MaxDelimit *2], tc_D_pattern[MaxDelimit *2];
extern int LIMITOUTPUT, LIMITPERFILE, INVERSE;
/* Adjusted by sgrep() when it moves the start of the search region
   (WHOLELINE, compressed-file signature). */
extern int CurrentByteOffset;
extern int BYTECOUNT;
extern int PRINTOFFSET;
extern int PRINTRECORD;
/* CONSTANT suppresses the '^' / '$' to newline rewriting in sgrep(). */
extern int CONSTANT, COUNT, FNAME, SILENT, FILENAMEONLY, prev_num_of_matched, num_of_matched;

extern int DNA ;	/* DNA flag is set in checksg when pattern is DNA pattern
			   and p_size > 16  */
/* Flags defined in another translation unit; WHOLELINE is tested by
   char_tr() and sgrep().  The int type is spelled out because implicit
   int is not valid since C99. */
extern int WORDBOUND, WHOLELINE, NOUPPER;
/* Names used in the diagnostics printed by sgrep(). */
extern unsigned char CurrentFileName[],  Progname[]; 
/* Passed to initmask() when sgrep() prepares a short approximate pattern. */
extern unsigned Mask[];
extern unsigned endposition;
extern int agrep_inlen;
extern CHARTYPE *agrep_inbuffer;

extern int agrep_initialfd;
extern FILE *agrep_finalfp;
extern int agrep_outpointer;
extern int agrep_outlen;
extern CHARTYPE * agrep_outbuffer;

extern int NEW_FILE, POST_FILTER;

/* When zero, an over-long pattern makes sgrep() set errno and return -1;
   otherwise sgrep() calls exit(2). */
extern int EXITONERROR;
/* NOTE(review): declaring errno by hand is not portable where errno is a
   macro; including <errno.h> is presumably the right fix -- confirm
   against the supported platforms. */
#ifndef __BORLANDC__
extern int errno;
#endif
/* TCOMPRESSED is reset to OFF by sgrep() when the input is not recognised
   as compressed; EASYSEARCH is loaded from the last signature byte of a
   compressed input. */
extern int TCOMPRESSED;
extern int EASYSEARCH;
/* FREQ_FILE and HASH_FILE are handed to quick_tcompress() by sgrep(). */
extern char FREQ_FILE[MAX_LINE_LEN], HASH_FILE[MAX_LINE_LEN], STRING_FILE[MAX_LINE_LEN];

/* Declarations for routines used in this file; they are only provided
   for _WIN32 builds. */
#ifdef _WIN32
int  fill_buf();          /* bitap.c */
int  a_monkey();          /* sgrep.c */
int  agrep();             /* sgrep.c */
int  bm();                /* sgrep.c */
int  blog();              /* sgrep.c */
int  monkey();            /* sgrep.c */
int  monkey4();           /* sgrep.c */
int  s_output();          /* sgrep.c */
int  verify();            /* sgrep.c */
#endif

#if	MEASURE_TIMES
/* timing variables */
/* sgrep() adds the time spent in quick_tcompress() to INFILTER_ms. */
extern int OUTFILTER_ms;
extern int FILTERALGO_ms;
extern int INFILTER_ms;
#endif	/*MEASURE_TIMES*/

unsigned char BSize;                /* log_c m   */
unsigned char char_map[MAXSYM];

/* data area */

int		shift_1;
CHARTYPE	SHIFT[MAXSYM];
CHARTYPE	MEMBER[MAXMEMBER];
/* File-scope pattern buffer; note that sgrep() and char_tr() each use a
   local or parameter named pat that hides this array. */
CHARTYPE	pat[MAXPATT];
unsigned	Hashmask;
char		MEMBER_1[MAXMEMBER_1];
/* Character translation table; filled by char_tr(). */
CHARTYPE	TR[MAXSYM];

/* Pattern preprocessing routines; sgrep() chooses among them according to
   D, DNA and the pattern length. */
static void initmask();
static void am_preprocess();
static void m_preprocess();
static void prep();
static void prep4();
static void prep_bm();

/*
 * General idea behind output processing with delimiters, inverse, compression, etc.

 * CAUTION: In compressed files, we can search ONLY for simple patterns or their ;,.
 * Attempts to search for complex patterns / with errors might lead to spurious matches.

 * 1. Once we find the match, go back and forward to get the delimiters that surround
 *    the matched region.

 * 2. If it is a compressed file, verify that the match is "real" (compressed files
 *    can have pseudo matches hence this filtering step is required).

 * 3. Increment num_of_matched.

 * 4. Process some output options which print stuff before the matched region is
 *    printed.

 * 5. If there is compression, decompress and output the matched region. Otherwise
 *    just output it as is. Remember, from step (1) we know the matched region.

 * 6. If inverse is set, then we must keep track of the end of the last matched region
 *    in the variable lastout. When there is a match, we must print everything from
 *    lastout to the beginning of the current matched region (curtextbegin) and then
 *    update lastout to point to the end of the current matched region (curtextend).
 *    ALSO: if we exit from the main loops, we must output everything from the end
 *    of the last matched region to the end of the input buffer.

 * 7. Delimiter handling in complex patterns is different: there the search is done
 *    for a boolean and of the delimiter pattern and the actual pattern.

 */

/*
 * mystrchr: find the first occurrence of character c in the NUL-terminated
 * string s, ignoring escaped characters.
 *
 * A backslash and the character following it are both skipped, so neither
 * of them can ever be reported as a match (this includes c == '\\').
 *
 * Returns a pointer to the matching character inside s, or NULL when there
 * is none.  A lone backslash at the very end of the string ends the scan:
 * the terminating NUL is never stepped over.
 */

unsigned char *
mystrchr(s, c)

unsigned char *s;
int c;
{
	unsigned char	*t = s;

	while (*t) {
		if (*t == '\\') {
			t++;			/* move onto the escaped character */
			if (*t == '\0') break;	/* trailing backslash: nothing left to skip */
		}
		else if (c == *t) return t;
		t++;
	}
	return NULL;
}

/*
 * char_tr: initialise the global translation table TR[] and, when WHOLELINE
 * is set, wrap the pattern in newline characters.
 *
 * pat	pattern buffer, modified in place when WHOLELINE is set; the caller
 *	must provide room for *m + 3 bytes in that case.
 * m	in/out pattern length; increased by 2 when WHOLELINE is set.
 *
 * TR[] starts as the identity mapping and is then folded: through LUT[] on
 * EMX/Win32 builds that define ISO_CHAR_SET, otherwise every character for
 * which isupper() holds takes the entry of its tolower() counterpart.  The
 * folding is unconditional (the NOUPPER test is commented out).
 *
 * The earlier WORDBOUND handling, which mapped non-alphanumeric characters
 * to W_DELIM, was removed (bg 11/8/94) as buggy and unnecessary.
 */
void
char_tr(pat, m)

unsigned char *pat;
int *m;
{
	int i;

	for(i=0; i<MAXSYM; i++) TR[i] = i;

	/* if(NOUPPER) [TG] */ {

		for(i=0; i<MAXSYM; i++)

#if ((defined(__EMX__) || defined(_WIN32)) && defined(ISO_CHAR_SET))
			TR[i] = TR[LUT[i]];
#else
			if (isupper(i)) TR[i] = TR[tolower(i)];
#endif
	}

	if(WHOLELINE) {
		/* Shift the pattern up by one byte in place.  The previous
		   version staged it in a MAXPATT-byte stack buffer, which
		   overflowed for longer patterns because the caller only
		   checks the length after this function returns. */
		memmove(pat+1, pat, *m);
		pat[0] = '\n';
		pat[*m+1] = '\n';
		pat[*m+2] = 0;
		*m = *m + 2;
	}
}

int sgrep(in_pat, in_m, fd, D, samepattern)

CHARTYPE *in_pat;  
int fd, in_m, D;
{
	CHARTYPE patbuf[MAXLINE];
	CHARTYPE *pat = patbuf;
	int m = in_m;
	CHARTYPE *text; /* input text stream */
	int offset = 2*MAXLINE;
	int buf_end, num_read, i, start, end, residue = 0;
	int first_time = 1;
	CHARTYPE *oldpat = pat;
	int k, j, oldm = m;
	static CHARTYPE newpat[MAXLINE];	/* holds compressed version */
	static int newm;

#if	MEASURE_TIMES
	static struct timeval initt, finalt;
#endif

	CHARTYPE *tempbuf;
	int	oldCurrentByteOffset;

	strncpy(pat, in_pat, MAXLINE);
	pat[MAXLINE-1] = '\0';

#define PROCESS_PATTERN \
	if (!CONSTANT) {\
		if( (pat[0] == '^') || (pat[0] == '$') ) pat[0] = '\n';\
		if ((m>1) && (pat[m-2] != '\\') && ((pat[m-1] == '^') || (pat[m-1] == '$'))) pat[m-1] = '\n';\
	}\
	/* whether constant or not, interpret the escape character */\
	for (k=0; k<m; k++) {\
		if (pat[k] == '\\') {\
			for (j=k; j<m; j++)\
				pat[j] = pat[j+1]; /* including '\0' */\
			m--;\
		}\
	}\
	char_tr(pat, &m);   /* will change pat, and m if WHOLELINE is ON */\
	if(m >= MAXPATT) {\
		fprintf(stderr, "%s: pattern too long (has > %d chars)\n", Progname, MAXPATT);\
		if (!EXITONERROR) {\
			errno = AGREP_ERROR;\
			return -1;\
		}\
		else exit(2);\
	}\
	if(D == 0) {\
		if(m > LONG_EXAC) m_preprocess(pat);\
		else prep_bm(pat, m);\
	}\
	else if (DNA) prep4(pat, m);\
	else 	if(m >= LONG_APPX) am_preprocess(pat);\
	else {\
		prep(pat, m, D);\
		initmask(pat, Mask, m, 0, &endposition);\
	}

#if	AGREP_POINTER
	if (fd != -1) {
#endif	/*AGREP_POINTER*/
		alloc_buf(fd, &text, 2*BLOCKSIZE+2*MAXLINE+MAXPATT);
		text[offset-1] = '\n';  /* initial case */
		for(i=0; i < MAXLINE; i++) text[i] = 0;   /* security zone */
		start = offset;   
		if(WHOLELINE) {
			start--;
			CurrentByteOffset --;
		}

		while( (num_read = fill_buf(fd, text+offset, 2*BLOCKSIZE)) > 0) 
		{
			buf_end = end = offset + num_read -1 ;
			
			oldCurrentByteOffset = CurrentByteOffset;

			if (first_time) {
				if ((TCOMPRESSED == ON) && tuncompressible(text+offset, num_read)) {
					EASYSEARCH = text[offset+SIGNATURE_LEN-1];
					start += SIGNATURE_LEN;
					CurrentByteOffset += SIGNATURE_LEN;
					if (!EASYSEARCH) {
						fprintf(stderr, "not compressed for easy-search: can miss some matches in: %s\n", CurrentFileName);
					}
#if	MEASURE_TIMES
					gettimeofday(&initt, NULL);
#endif	/*MEASURE_TIMES*/
					if (samepattern || ((newm = quick_tcompress(FREQ_FILE, HASH_FILE, pat, m, newpat, MAXLINE-8, EASYSEARCH)) > 0)) {
						oldm = m;
						oldpat = pat;
						m = newm;
						pat = newpat;
					}
#if	MEASURE_TIMES
					gettimeofday(&finalt, NULL);
					INFILTER_ms +=  (finalt.tv_sec*1000 + finalt.tv_usec/1000) - (initt.tv_sec*1000 + initt.tv_usec/1000);
#endif	/*MEASURE_TIMES*/
				}
				else TCOMPRESSED = OFF;

				PROCESS_PATTERN	/* must be AFTER we know that it is a compressed pattern... */

				/* to make sure the skip loop in bm() won't go out of bound in later iterations:
				   This was the original code. 
				   It will be inefficient to place a copy of the pattern at the end
				   of the buffer. Better put that stop pattern to the end of the
				   _actual_ read block (end)  [TG] 04.10.96 */
				   
				/* for(i=1; i<=m; i++) text[2*BLOCKSIZE+offset+i] = pat[m-1]; [del] [TG] */

				/* Emergency Stop: put one copy of pattern to the end of the buffer
				   to make sure that the skip loop in bm()
				   won't go out of bound in later iterations */
		   
				/* save portion being overwritten.
				   copied from below (memagrep()), but not need here: */
				/* memcpy(tempbuf, text+end+1, m); */
				
				for(i=1; i<=m; i++) text[end+i] = pat[m-1]; /* [new] [TG] */

				first_time = 0;
			}

                        if (!DELIMITER) {
/* [TG] */			if (num_read == 2*BLOCKSIZE) {
                                	while ((text[end]  != '\n') && (end > offset)) end--;
				}
				/* else end = buf_end; no need to look back [TG] */
				
                                text[start-1] = '\n';
                        }
                        else {
                                unsigned char *newbuf = text + end + 1;
                                newbuf = backward_delimiter(newbuf, text+offset, D_pattern, D_length, OUTTAIL);        /* see agrep.c/'d' */
				if (newbuf < text+offset+D_length) newbuf = text + end + 1;
                                end = newbuf - text - 1;
                                memcpy(text+start-D_length, D_pattern, D_length);
                        }

			residue = buf_end - end + 1 ;

			/* SGREP_PROCESS */
			/* No harm in sending a few extra parameters even if they are unused: they are not accessed in monkey*()s */
			if(D==0)  {
				if(m > LONG_EXAC) {
					if (-1 == monkey(pat, m, text+start, text+end, oldpat, oldm)) {
						free_buf(fd, text);
						return -1;
					}
				}
				else {
					if (-1 == bm(pat, m, text+start, text+end, oldpat, oldm)) {
						free_buf(fd, text);
						return -1;
					}
				}
			}
			else {
				if(DNA) {
					if (-1 == monkey4( pat, m, text+start, text+end, D , oldpat, oldm )) {
						free_buf(fd, text);
						return -1;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -