⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 newmgrep.c

📁 harvest是一个下载html网页得机器人
💻 C
📖 第 1 页 / 共 5 页
字号:
/* Copyright (c) 1994 Sun Wu, Udi Manber, Burra Gopal.  All Rights Reserved. *//* multipattern matcher */#include <stdio.h>#include <ctype.h>#include <errno.h>#ifdef ultrix#include <sys/types.h>#endif#include <sys/stat.h>#include "agrep.h"#include <sys/time.h>#define ddebug#define uchar unsigned char#undef	MAXPAT#define MAXPAT  256#undef	MAXLINE#define MAXLINE 1024#undef	MAXSYM#define MAXSYM  256#define MAXMEMBER1 32768/* #define MAXMEMBER1 262144 */ /*2^18 */ #define MAXPATFILE 600000#define BLOCKSIZE  16384#define MAXHASH    32768 /* #define MAXHASH    262144 */#define mask5 	   32767#define max_num    MAX_DASHF_FILES#if	ISO_CHAR_SET#define W_DELIM	   256#else#define W_DELIM	   128#endif#define L_DELIM    10 #define Hbits 5 /* how much to shift to perform the hash */extern char aduplicates[MAXNUM_PAT][MAXNUM_PAT];	/* tells what other patterns are exactly equal to the i-th one */extern char tc_aduplicates[MAXNUM_PAT][MAXNUM_PAT];	/* tells what other patterns are exactly equal to the i-th one */extern	ParseTree aterminals[MAXNUM_PAT];extern int	AComplexBoolean;extern int LIMITOUTPUT, LIMITPERFILE;extern int BYTECOUNT, PRINTOFFSET, PRINTRECORD, CurrentByteOffset;extern int MULTI_OUTPUT;	/* used by glimpse only if OR, never for AND */extern int DELIMITER;extern CHAR D_pattern[MaxDelimit*2];extern int D_length;extern CHAR tc_D_pattern[MaxDelimit*2];extern int tc_D_length;extern COUNT, FNAME, SILENT, FILENAMEONLY, prev_num_of_matched, num_of_matched, PRINTFILETIME;extern INVERSE, OUTTAIL;extern WORDBOUND, WHOLELINE, NOUPPER;extern ParseTree *AParse;extern int AComplexPattern;extern unsigned char  CurrentFileName[], Progname[]; extern long CurrentFileTime;extern total_line;extern agrep_initialfd;extern int EXITONERROR;extern int PRINTPATTERN;extern int agrep_inlen;extern CHAR *agrep_inbuffer;extern FILE *agrep_finalfp;extern int agrep_outpointer;extern int agrep_outlen;extern CHAR * agrep_outbuffer;extern int errno;extern int NEW_FILE, POST_FILTER;extern int tuncompressible();extern int quick_tcompress();extern int quick_tuncompress();extern int TCOMPRESSED;extern int EASYSEARCH;extern char FREQ_FILE[MAX_LINE_LEN], HASH_FILE[MAX_LINE_LEN], STRING_FILE[MAX_LINE_LEN];extern char PAT_FILE_NAME[MAX_LINE_LEN];uchar SHIFT1[MAXMEMBER1];int   LONG  = 0;int   SHORT = 0;int   p_size= 0;uchar tr[MAXSYM];uchar tr1[MAXSYM];int   HASH[MAXHASH];int   Hash2[max_num];uchar *PatPtr[max_num];uchar *pat_spool = NULL; /* [MAXPATFILE+2*max_num+MAXPAT]; */uchar *patt[max_num];int   pat_len[max_num];int   pat_indices[max_num]; /* pat_indices[p] gives the actual index in matched_teriminals: used only with AParse != 0 */int num_pat;extern char  amatched_terminals[MAXNUM_PAT]; /* which patterns have been matched in the current line? Used only with AParse != 0, so max_num is not needed */extern int anum_terminals;extern int AComplexBoolean;static void countline();void acompute_duplicates();#if	DOTCOMPRESSED/* Equivalent variables for compression search */uchar tc_SHIFT1[MAXMEMBER1];int   tc_LONG  = 0;int   tc_SHORT = 0;int   tc_p_size= 0;uchar tc_tr[MAXSYM];uchar tc_tr1[MAXSYM];int   tc_HASH[MAXHASH];int   tc_Hash2[max_num];uchar *tc_PatPtr[max_num];uchar *tc_pat_spool = NULL; /* [MAXPATFILE+2*max_num+MAXPAT]; */uchar *tc_patt[max_num];int   tc_pat_len[max_num];int   tc_pat_indices[max_num]; /* pat_indices[p] gives the actual index in matched_teriminals: used only with AParse != 0 */int tc_num_pat;	/* must be the same as num_pat */#endif	/*DOTCOMPRESSED*/static void f_prep();static void f_prep1();static void accumulate();#if	DOTCOMPRESSEDstatic void tc_f_prep();static void tc_f_prep1();static void tc_accumulate();#endif#ifdef perf_check	int cshift=0, cshift0=0, chash=0;#endif/* * General idea behind output processing with delimiters, inverse, compression, etc. * CAUTION: In compressed files, we can search ONLY for simple patterns or their ;,. * Attempts to search for complex patterns / with errors might lead to spurious matches. * 1. Once we find the match, go back and forward to get the delimiters that surround *    the matched region. * 2. If it is a compressed file, verify that the match is "real" (compressed files *    can have pseudo matches hence this filtering step is required). * 3. Increment num_of_matched. * 4. Process some output options which print stuff before the matched region is *    printed. * 5. If there is compression, decomress and output the matched region. Otherwise *    just output it as is. Remember, from step (1) we know the matched region. * 6. If inverse is set, then we must keep track of the end of the last matched region *    in the variable lastout. When there is a match, we must print everything from *    lastout to the beginning of the current matched region (curtextbegin) and then *    update lastout to point to the end of the current matched region (curtextend). *    ALSO: if we exit from the main loops, we must output everything from the end *    of the last matched region to the end of the input buffer. * 7. Delimiter handling in complex patterns is different: there the search is done *    for a boolean and of the delimiter pattern and the actual pattern. * 8. For convenience and speed, the multipattern matching routines to handle *    compressed files have been separated from their (normal) counterparts. * 9. One special note on handling complicated boolean patterns: the parse *    tree will be the same for both compressed and uncomrpessed patterns and the *    same amatched_terminals array will be used in both. BUT, the pat_spool and *    pat_index, etc., will be different as they refer to the individual terminals. */intprepf(mfp, mbuf, mlen)int mfp, mlen;unsigned char *mbuf;{	int length=0, i, p=1;	uchar *pat_ptr;	unsigned Mask = 31;	int num_read;	unsigned char *buf;	struct stat stbuf;	int j, k;	/* to implement \\ */        if ((mfp == -1) && ((mbuf == NULL) || (mlen <= 0))) return -1;	if (mfp != -1) {		if (fstat(mfp, &stbuf) == -1) {			fprintf(stderr, "%s: cannot stat file: %s\n", Progname, PAT_FILE_NAME);			return -1;		}		if (!S_ISREG(stbuf.st_mode)) {			fprintf(stderr, "%s: pattern file not regular file: %s\n", Progname, PAT_FILE_NAME);			return -1;		}		if (stbuf.st_size*2 > MAXPATFILE + 2*max_num) {			fprintf(stderr, "%s: pattern file too large (> %d B): %s\n", Progname, (MAXPATFILE+2*max_num)/2, PAT_FILE_NAME);			return -1;		}		if (pat_spool != NULL) free(pat_spool);		pat_ptr = pat_spool = (unsigned char *)malloc(stbuf.st_size*2 + MAXPAT);		alloc_buf(mfp, &buf, MAXPATFILE+2*BlockSize);		while((num_read = fill_buf(mfp, buf+length, 2*BlockSize)) > 0) {			length = length + num_read;			if(length > MAXPATFILE) {				fprintf(stderr, "%s: maximum pattern file size is %d\n", Progname, MAXPATFILE);                                if (!EXITONERROR) {                                        errno = AGREP_ERROR;                                        free_buf(mfp, buf);                                        return -1;                                }                                else exit(2);			}		}	}	else {		buf = mbuf;		length = mlen;		if (mlen*2 > MAXPATFILE + 2*max_num) {			fprintf(stderr, "%s: pattern buffer too large (> %d B)\n", Progname, (MAXPATFILE+2*max_num)/2);			return -1;		}		if (pat_spool != NULL) free(pat_spool);		pat_ptr = pat_spool = (unsigned char *)malloc(mlen*2 + MAXPAT);	}	/* Now all the patterns are in buf */	buf[length] = '\n';	i=0; p=1;/* removed by Udi 11/8/94 - we now do WORDBOUND "by hand" 	if(WORDBOUND) {		while(i<length) {			patt[p] = pat_ptr;			*pat_ptr++ = W_DELIM;			while((i<length) && ((*pat_ptr = buf[i++]) != '\n')) pat_ptr++;			*pat_ptr++ = W_DELIM;			*pat_ptr++ = 0;			p++;		}	}	else*/	if(WHOLELINE) {		while(i<length) {			patt[p] = pat_ptr;			*pat_ptr++ = L_DELIM;			while((i<length) && ((*pat_ptr = buf[i++]) != '\n')) pat_ptr++;			*pat_ptr++ = L_DELIM;			*pat_ptr++ = 0;			p++;		}	}	else {		while(i < length) {			patt[p] = pat_ptr;			while((i<length) && ((*pat_ptr = buf[i++]) != '\n')) pat_ptr++;			*pat_ptr++ = 0;			p++;  		}	}	/* Now, the patterns have been copied into patt[] */	if(p>max_num) {		fprintf(stderr, "%s: maximum number of patterns is %d\n", Progname, max_num);                 if (!EXITONERROR) {                        errno = AGREP_ERROR;                        free_buf(mfp, buf);                        return -1;                }                else exit(2);	}	for(i=1; i<20; i++) *pat_ptr = i;  /* boundary safety zone */	/* I might have to keep changing tr s.t. mgrep won't get confused with W_DELIM */	for(i=0; i< MAXSYM; i++) tr[i] = i;	if(NOUPPER) {                for (i=0; i<MAXSYM; i++)                        if (isupper(i)) tr[i] = tr[tolower(i)];		/* for(i='A'; i<= 'Z'; i++) tr[i] = i + 'a' - 'A'; */	}/*	if(WORDBOUND) {		for(i=1; i<MAXSYM; i++) if(!isalnum(i)) tr[i] = W_DELIM;	}removed by Udi 11/8/94 - the trick of using W-delim was too buggy.we now do it "by hand" after we find a match */	for(i=0; i< MAXSYM; i++) tr1[i] = tr[i]&Mask;	num_pat =  p-1;	p_size  =  MAXPAT;	for(i=1; i<=num_pat; i++) {		p = strlen(patt[i]);		if ((patt[i][0] == '^') || (patt[i][0] == '$')) patt[i][0] = '\n';		if ((p > 1) && ((patt[i][p-1] == '^') || (patt[i][p-1] == '$')) && (patt[i][p-2] != '\\')) patt[i][p-1] = '\n';		/* Added by bg, Dec 2nd 1994 */		for (k=0; k<p; k++) {			if (patt[i][k] == '\\') {				for (j=k; j<p; j++)					patt[i][j] = patt[i][j+1]; /* including '\0' */				p--;			}		}		pat_len[i] = p;		/*		pat_len[i] = (WORDBOUND?(p-2>0?p-2:1):p);  changed by Udi 11/8/94		*/#ifdef	debug		printf("prepf(): patt[%d]=%s, pat_len[%d]=%d\n", i, patt[i], i, pat_len[i]);#endif		if(p!=0 && p < p_size) p_size = p;	/* MIN */	}	if(p_size == 0) {		fprintf(stderr, "%s: the pattern file is empty\n", Progname);                if (!EXITONERROR) {                        errno = AGREP_ERROR;                        free_buf(mfp, buf);                        return -1;                }                else exit(2);	}	if(length > 400 && p_size > 2) LONG = 1;	if(p_size == 1) SHORT = 1;	for(i=0; i<MAXMEMBER1; i++) SHIFT1[i] = p_size - 1 - LONG;	for(i=0; i<MAXHASH; i++) {		HASH[i] = 0;	}	for(i=1; i<=num_pat; i++) f_prep(i, patt[i]);	accumulate();	memset(pat_indices, '\0', sizeof(int) * (num_pat + 1));	for(i=1; i<=num_pat; i++) f_prep1(i, patt[i]);#if	DOTCOMPRESSED	/* prepf for compression */	if (-1 == tc_prepf(buf, length)) {		free_buf(mfp, buf);		return -1;	}#endif	/*DOTCOMPRESSED*/        free_buf(mfp, buf);	acompute_duplicates(aduplicates, aterminals, anum_terminals, tr);        return 0;}#if	DOTCOMPRESSED/* * Compression equivalent of prepf: called right after prepf. * 1. Read patt and SHIFT1 * 2. Call tcompress on the patterns in patt and put in tc_patt. * 3. Use these patterns to compute tc_SHIFT (ignore WDELIM, LDELIM, case sensitivity, etc.) * 4. Process other variables/functions (pat_spool, tr, tr1, pat_len, accumulate, SHIFT1, f_prep, f_prep1, pat_indices) appropriately. */inttc_prepf(buf, length)unsigned char *buf;int	length;{	int i, p=1;	uchar *pat_ptr;	unsigned Mask = 31;	int tc_length;	unsigned char tc_buf[MAXPAT * 2];	/* maximum length of the compressed pattern */	static struct timeval initt, finalt;	if (length*2 > MAXPATFILE + 2*max_num) {		fprintf(stderr, "%s: pattern buffer too large (> %d B)\n", Progname, (MAXPATFILE+2*max_num)/2);		return -1;	}	if (tc_pat_spool != NULL) free(tc_pat_spool);	pat_ptr = tc_pat_spool = (unsigned char *)malloc(length*2 + MAXPAT);#if	MEASURE_TIMES	gettimeofday(&initt, NULL);#endif	/*MEASURE_TIMES*/	i=0; p=1;	while(i < length) {		tc_patt[p] = pat_ptr;		while((*pat_ptr = buf[i++]) != '\n') pat_ptr++;		*pat_ptr++ = 0;		if ((tc_length = quick_tcompress(FREQ_FILE, HASH_FILE, tc_patt[p], strlen(tc_patt[p]), tc_buf, MAXPAT * 2 - 8, TC_EASYSEARCH)) > 0) {			memcpy(tc_patt[p], tc_buf, tc_length);			tc_patt[p][tc_length] = '\0';			pat_ptr = tc_patt[p] + tc_length + 1;	/* character after '\0' */		}		p++;  	}	for(i=1; i<20; i++) *pat_ptr = i;  /* boundary safety zone */	/* Ignore all other options: it is automatically W_DELIM */	for(i=0; i< MAXSYM; i++) tc_tr[i] = i;	for(i=0; i< MAXSYM; i++) tc_tr1[i] = tc_tr[i]&Mask;	tc_num_pat =  p-1;	tc_p_size  =  MAXPAT;	for(i=1; i<=num_pat; i++) {		p = strlen(tc_patt[i]);		tc_pat_len[i] = p;#ifdef	debug

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -