⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 sgrep.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 5 页
字号:
/* Copyright (c) 1994 Sun Wu, Udi Manber, Burra Gopal.  All Rights Reserved. */#include <stdio.h>#include <ctype.h>#include "agrep.h"#include <errno.h>#undef	MAXSYM#define MAXSYM  256#define MAXMEMBER 8192#define	CHARTYPE	unsigned char#undef	MaxError /* don't use agrep.h definition */#define MaxError 20#define MAXPATT 256#undef	MAXLINE#define MAXLINE 1024#undef	MAXNAME#define MAXNAME 256#undef	MaxCan	/* don't use agrep.h definition */#define MaxCan  2048#define BLOCKSIZE    16384#define MAX_SHIFT_2  4096#undef	ON#define ON      1#undef	OFF#define OFF	0#define LOG_ASCII 8#define LOG_DNA  3#define MAXMEMBER_1 65536#define LONG_EXAC  20#define LONG_APPX  24#if	ISO_CHAR_SET#define W_DELIM    256#else#define W_DELIM    128#endif#include <sys/time.h>extern int tuncompressible();extern int quick_tcompress();extern int quick_tuncompress();extern int DELIMITER, OUTTAIL;extern int D_length, tc_D_length;extern unsigned char D_pattern[MaxDelimit *2], tc_D_pattern[MaxDelimit *2];extern int LIMITOUTPUT, LIMITPERFILE, INVERSE;extern int CurrentByteOffset;extern int BYTECOUNT;extern int PRINTOFFSET;extern int PRINTRECORD;extern int CONSTANT, COUNT, FNAME, SILENT, FILENAMEONLY, prev_num_of_matched, num_of_matched, PRINTFILETIME;extern int DNA ;  /* DNA flag is set in checksg when pattern is DNA pattern and		 p_size > 16  */extern WORDBOUND, WHOLELINE, NOUPPER;extern unsigned char CurrentFileName[],  Progname[]; extern long CurrentFileTime;extern unsigned Mask[];extern unsigned endposition;extern int agrep_inlen;extern CHARTYPE *agrep_inbuffer;extern int agrep_initialfd;extern FILE *agrep_finalfp;extern int agrep_outpointer;extern int agrep_outlen;extern CHARTYPE * agrep_outbuffer;extern int NEW_FILE, POST_FILTER;extern int EXITONERROR;extern int errno;extern int TCOMPRESSED;extern int EASYSEARCH;extern char FREQ_FILE[MAX_LINE_LEN], HASH_FILE[MAX_LINE_LEN], STRING_FILE[MAX_LINE_LEN];#if	MEASURE_TIMES/* timing variables */extern int OUTFILTER_ms;extern int FILTERALGO_ms;extern int 
INFILTER_ms;#endif	/*MEASURE_TIMES*/unsigned char BSize;                /* log_c m   */unsigned char char_map[MAXSYM];/* data area */int shift_1;CHARTYPE SHIFT[MAXSYM];CHARTYPE MEMBER[MAXMEMBER];CHARTYPE pat[MAXPATT];unsigned Hashmask;char MEMBER_1[MAXMEMBER_1];CHARTYPE TR[MAXSYM];static void initmask();static void am_preprocess();static void m_preprocess();static void prep();static void prep4();static void prep_bm();/* * General idea behind output processing with delimiters, inverse, compression, etc. * CAUTION: In compressed files, we can search ONLY for simple patterns or their ;,. * Attempts to search for complex patterns / with errors might lead to spurious matches. * 1. Once we find the match, go back and forward to get the delimiters that surround *    the matched region. * 2. If it is a compressed file, verify that the match is "real" (compressed files *    can have pseudo matches hence this filtering step is required). * 3. Increment num_of_matched. * 4. Process some output options which print stuff before the matched region is *    printed. * 5. If there is compression, decomress and output the matched region. Otherwise *    just output it as is. Remember, from step (1) we know the matched region. * 6. If inverse is set, then we must keep track of the end of the last matched region *    in the variable lastout. When there is a match, we must print everything from *    lastout to the beginning of the current matched region (curtextbegin) and then *    update lastout to point to the end of the current matched region (curtextend). *    ALSO: if we exit from the main loops, we must output everything from the end *    of the last matched region to the end of the input buffer. * 7. Delimiter handling in complex patterns is different: there the search is done *    for a boolean and of the delimiter pattern and the actual pattern. 
*//* skips over escaped characters */unsigned char *mystrchr(s, c)unsigned char *s;int c;{	unsigned char	*t = s;	while (*t) {		if (*t == '\\') t++;		else if (c == *t) return t;		t ++;	}	return NULL;}voidchar_tr(pat, m)unsigned char *pat;int *m;{	int i;	unsigned char temp[MAXPATT];	for(i=0; i<MAXSYM; i++) TR[i] = i;	if(NOUPPER) {		for(i=0; i<MAXSYM; i++)			if (isupper(i)) TR[i] = TR[tolower(i)];		/* for(i='A'; i<= 'Z'; i++) TR[i] = i + 'a' - 'A'; */	}	/*	if(WORDBOUND) {		for(i=0; i<MAXSYM; i++) {			if(!isalnum(i)) TR[i] = W_DELIM;removed by Udi.			we don't use the trick of making the boundary W_delim anymore.			It's too buggy otherwise and it's not necessary.		}	}	removed by bg 11/8/94	*/	if(WHOLELINE) {		memcpy(temp, pat, *m);		pat[0] = '\n';		memcpy(pat+1, temp, *m);		pat[*m+1] = '\n';		pat[*m+2] = 0;		*m = *m + 2;	}}intsgrep(in_pat, in_m, fd, D, samepattern)CHARTYPE *in_pat;  int fd, in_m, D;{	CHARTYPE patbuf[MAXLINE];	CHARTYPE *pat = patbuf;	int m = in_m;	CHARTYPE *text; /* input text stream */	int offset = 2*Max_record;	int buf_end, num_read, i, start, end, residue = 0;	int first_time = 1;	CHARTYPE *oldpat = pat;	int k, j, oldm = m;	static CHARTYPE newpat[MAXLINE];	/* holds compressed version */	static int newm;#if	MEASURE_TIMES	static struct timeval initt, finalt;#endif	CHARTYPE *tempbuf;	int	oldCurrentByteOffset;	strncpy(pat, in_pat, MAXLINE);	pat[MAXLINE-1] = '\0';#define PROCESS_PATTERN \	if (!CONSTANT) {\		if( (pat[0] == '^') || (pat[0] == '$') ) pat[0] = '\n';\		if ((m>1) && (pat[m-2] != '\\') && ((pat[m-1] == '^') || (pat[m-1] == '$'))) pat[m-1] = '\n';\	}\	/* whether constant or not, interpret the escape character */\	for (k=0; k<m; k++) {\		if (pat[k] == '\\') {\			for (j=k; j<m; j++)\				pat[j] = pat[j+1]; /* including '\0' */\			m--;\		}\	}\	char_tr(pat, &m);   /* will change pat, and m if WHOLELINE is ON */\	if(m >= MAXPATT) {\		fprintf(stderr, "%s: pattern too long (has > %d chars)\n", Progname, MAXPATT);\		if (!EXITONERROR) {\			errno = 
AGREP_ERROR;\			return -1;\		}\		else exit(2);\	}\	if(D == 0) {\		if(m > LONG_EXAC) m_preprocess(pat);\		else prep_bm(pat, m);\	}\	else if (DNA) prep4(pat, m);\	else 	if(m >= LONG_APPX) am_preprocess(pat);\	else {\		prep(pat, m, D);\		initmask(pat, Mask, m, 0, &endposition);\	}#if	AGREP_POINTER	if (fd != -1) {#endif	/*AGREP_POINTER*/		alloc_buf(fd, &text, 2*BlockSize+2*Max_record+MAXPATT);		text[offset-1] = '\n';  /* initial case */		for(i=0; i < Max_record; i++) text[i] = 0;   /* security zone */		start = offset;   		if(WHOLELINE) {			start--;			CurrentByteOffset --;		}		while( (num_read = fill_buf(fd, text+offset, 2*BlockSize)) > 0) 		{			buf_end = end = offset + num_read -1 ;			oldCurrentByteOffset = CurrentByteOffset;			if (first_time) {				if ((TCOMPRESSED == ON) && tuncompressible(text+offset, num_read)) {					EASYSEARCH = text[offset+SIGNATURE_LEN-1];					start += SIGNATURE_LEN;					CurrentByteOffset += SIGNATURE_LEN;					if (!EASYSEARCH) {						fprintf(stderr, "not compressed for easy-search: can miss some matches in: %s\n", CurrentFileName);					}#if	MEASURE_TIMES					gettimeofday(&initt, NULL);#endif	/*MEASURE_TIMES*/					if (samepattern || ((newm = quick_tcompress(FREQ_FILE, HASH_FILE, pat, m, newpat, Max_record-8, EASYSEARCH)) > 0)) {						oldm = m;						oldpat = pat;						m = newm;						pat = newpat;					}#if	MEASURE_TIMES					gettimeofday(&finalt, NULL);					INFILTER_ms +=  (finalt.tv_sec*1000 + finalt.tv_usec/1000) - (initt.tv_sec*1000 + initt.tv_usec/1000);#endif	/*MEASURE_TIMES*/				}				else TCOMPRESSED = OFF;				PROCESS_PATTERN	/* must be AFTER we know that it is a compressed pattern... 
*/				for(i=1; i<=m; i++) text[2*BlockSize+offset+i] = pat[m-1];				/* to make sure the skip loop in bm() won't go out of bound in later iterations */				first_time = 0;			}                        if (!DELIMITER) {                                while ((text[end]  != '\n') && (end > offset)) end--;                                text[start-1] = '\n';                        }                        else {                                unsigned char *newbuf = text + end + 1;                                newbuf = backward_delimiter(newbuf, text+offset, D_pattern, D_length, OUTTAIL);        /* see agrep.c/'d' */				if (newbuf < text+offset+D_length) newbuf = text + end + 1;                                end = newbuf - text - 1;                                memcpy(text+start-D_length, D_pattern, D_length);                        }			residue = buf_end - end + 1 ;			/* SGREP_PROCESS */			/* No harm in sending a few extra parameters even if they are unused: they are not accessed in monkey*()s */			if(D==0)  {				if(m > LONG_EXAC) {					if (-1 == monkey(pat, m, text+start, text+end, oldpat, oldm)) {						free_buf(fd, text);						return -1;					}				}				else {					if (-1 == bm(pat, m, text+start, text+end, oldpat, oldm)) {						free_buf(fd, text);						return -1;					}				}			}			else {				if(DNA) {					if (-1 == monkey4( pat, m, text+start, text+end, D , oldpat, oldm )) {						free_buf(fd, text);						return -1;					}				}				else {					if(m >= LONG_APPX) {						if (-1 == a_monkey(pat, m, text+start, text+end, D, oldpat, oldm)) {							free_buf(fd, text);							return -1;						}					}					else {						if (-1 == agrep(pat, m, text+start, text+end, D, oldpat, oldm)) {							free_buf(fd, text);							return -1;						}					}				}			}			if(FILENAMEONLY && (num_of_matched - prev_num_of_matched) && (NEW_FILE || !POST_FILTER)) {				if (agrep_finalfp != NULL)					fprintf(agrep_finalfp, "%s", CurrentFileName);				else {					int outindex;					for(outindex=0; 
(outindex+agrep_outpointer<agrep_outlen) &&							(CurrentFileName[outindex] != '\0'); outindex++) {						agrep_outbuffer[agrep_outpointer+outindex] = CurrentFileName[outindex];					}					if ((CurrentFileName[outindex] != '\0') || (outindex+agrep_outpointer>=agrep_outlen)) {						OUTPUT_OVERFLOW;						free_buf(fd, text);						return -1;					}					agrep_outpointer += outindex;				}				if (PRINTFILETIME) {					char *s = aprint_file_time(CurrentFileTime);					if (agrep_finalfp != NULL)						fprintf(agrep_finalfp, "%s", s);					else {						int outindex;						for(outindex=0; (outindex+agrep_outpointer<agrep_outlen) &&								(s[outindex] != '\0'); outindex++) {							agrep_outbuffer[agrep_outpointer+outindex] = s[outindex];						}						if ((s[outindex] != '\0') || (outindex+agrep_outpointer>=agrep_outlen)) {							OUTPUT_OVERFLOW;							free_buf(fd, text);							return -1;						}						agrep_outpointer += outindex;					}				}				if (agrep_finalfp != NULL)					fprintf(agrep_finalfp, "\n");				else {					if (agrep_outpointer+1>=agrep_outlen) {						OUTPUT_OVERFLOW;						free_buf(fd, text);						return -1;					}					else agrep_outbuffer[agrep_outpointer++] = '\n';				}				free_buf(fd, text);				NEW_FILE = OFF;				return 0; 			}			CurrentByteOffset = oldCurrentByteOffset + end - start + 1;	/* for a new iteration: avoid complicated calculations below */			start = offset - residue ;			if(start < Max_record) {				start = Max_record; 			}			/* strncpy(text+start, text+end, residue); */			memcpy(text+start, text+end, residue);			start++;			if (((LIMITOUTPUT > 0) && (LIMITOUTPUT <= num_of_matched)) ||			    ((LIMITPERFILE > 0) && (LIMITPERFILE <= num_of_matched - prev_num_of_matched))) {				free_buf(fd, text);				return 0;	/* done */			}		} /* end of while(num_read = ...) 
*/                if (!DELIMITER) {                        text[start-1] = '\n';                        text[start+residue] = '\n';                }                else {                        if (start > D_length) memcpy(text+start-D_length, D_pattern, D_length);                        memcpy(text+start+residue, D_pattern, D_length);                }		end = start + residue - 2;                if(residue > 1) {			/* SGREP_PROCESS */			/* No harm in sending a few extra parameters even if they are unused: they are not accessed in monkey*()s */			if(D==0)  {				if(m > LONG_EXAC) {					if (-1 == monkey(pat, m, text+start, text+end, oldpat, oldm)) {						free_buf(fd, text);						return -1;					}				}				else {					if (-1 == bm(pat, m, text+start, text+end, oldpat, oldm)) {						free_buf(fd, text);						return -1;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -