⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 agrep.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 5 页
字号:
/* * bgopal: (1993-4) added a library interface and removed some bugs: also * selectively modified many routines to work with our text-compression algo. */#include <sys/stat.h>#include "agrep.h"#include "checkfile.h"#include <errno.h>#define PRINT(s)extern char **environ;extern int errno;int pattern_index;	/* index in argv where the pattern is */int glimpse_isserver=0;	/* so that there is no user interaction */int glimpse_call = 0;	/* So that usage message is not printed twice */int glimpse_clientdied=0;/* to quit search if glimpseserver's client dies */int  agrep_initialfd;	/* Where does input come from? File/Memory? */CHAR *agrep_inbuffer;int  agrep_inlen;int  agrep_inpointer;FILE *agrep_finalfp;	/* Where does output go to? File/Memory? */CHAR *agrep_outbuffer;int  agrep_outlen;int  agrep_outpointer;int  execfd;	/* used by exec called within agrep_search, set in agrep_init */int  multifd = -1; /* fd for multipattern search used in ^^ , set in   ^^^^^^^^ */extern char *pat_spool;#if	DOTCOMPRESSEDextern char *tc_pat_spool;#endif	/* DOTCOMPRESSED */char *multibuf=NULL; /* buffer to put the multiple patterns in */int  multilen = 0; /* length of the multibuf: not the #of multi-patterns! */extern int pos_cnt;	/* to re-initialize it to 0 for reg-exp search */unsigned Mask[MAXSYM];unsigned Init1, NO_ERR_MASK, Init[MaxError];unsigned Bit[WORD+1];CHAR buffer[BlockSize+Maxline+1];	/* should not be used anywhere: 10/18/93 */unsigned Next[MaxNext], Next1[MaxNext];unsigned wildmask, endposition, D_endpos; int  LIMITOUTPUT;	/* maximum number of matches we are going to allow */int  LIMITPERFILE;	/* maximum number of matches per file we are going to allow */int  LIMITTOTALFILE;	/* maximum number of files we are going to allow */int  EXITONERROR;	/* return -1 or exit on error? 
*/int  REGEX, FASTREGEX, RE_ERR, FNAME, WHOLELINE, SIMPLEPATTERN;int  COUNT, HEAD, TAIL, LINENUM, INVERSE, I, S, DD, AND, SGREP, JUMP; int  NOOUTPUTZERO;int  Num_Pat, PSIZE, prev_num_of_matched, num_of_matched, files_matched, SILENT, NOPROMPT, BESTMATCH, NOUPPER;int  NOMATCH, TRUNCATE, FIRST_IN_RE, FIRSTOUTPUT;int  WORDBOUND, DELIMITER, D_length, tc_D_length, original_D_length;int  EATFIRST, OUTTAIL;int  BYTECOUNT;int  PRINTOFFSET;int  PRINTRECORD;int  PRINTNONEXISTENTFILE;int  FILEOUT;int  DNA;int  APPROX;int  PAT_FILE;	/* multiple patterns from a given file */char PAT_FILE_NAME[MAX_LINE_LEN];int  PAT_BUFFER; /* multiple patterns from a given buffer */int  CONSTANT;int  RECURSIVE;int  total_line; /* used in mgrep */int  D;int  M;int  TCOMPRESSED;int  EASYSEARCH;	/* 1 used only for compressed files: LITTLE/BIG */int  ALWAYSFILENAME = OFF;int  POST_FILTER = OFF;int  NEW_FILE = OFF;	/* only when post-filter is used */int  PRINTFILENUMBER = OFF;int  PRINTFILETIME = OFF;int  PRINTPATTERN = OFF;int  MULTI_OUTPUT = OFF; /* should mgrep print the matched line multiple times for each matched pattern or just once? 
*//* invisible to the user, used only by glimpse: cannot use -l since it is incompatible with stdin and -A is used for the index search (done next) *//* Stuff to handle complicated boolean patterns */int  AComplexBoolean = 0;ParseTree *AParse = NULL;int anum_terminals = 0;ParseTree aterminals[MAXNUM_PAT];char amatched_terminals[MAXNUM_PAT];char aduplicates[MAXNUM_PAT][MAXNUM_PAT];	/* tells what other patterns are exactly equal to the i-th one */char tc_aduplicates[MAXNUM_PAT][MAXNUM_PAT];	/* tells what other patterns are exactly equal to the i-th one */#if	MEASURE_TIMES/* timing variables */int OUTFILTER_ms;int FILTERALGO_ms;int INFILTER_ms;#endif	/*MEASURE_TIMES*/CHAR **Textfiles = NULL;     /* array of filenames to be searched */int Numfiles = 0;    /* indicates how many files in Textfiles */int copied_from_argv = 0; /* were filenames copied from argv (should I free 'em)? */CHAR old_D_pat[MaxDelimit * 2] = "\n";  /* to hold original D_pattern */CHAR original_old_D_pat[MaxDelimit * 2] = "\n";CHAR Pattern[MAXPAT], OldPattern[MAXPAT];CHAR CurrentFileName[MAX_LINE_LEN];long CurrentFileTime;int SetCurrentFileName = 0;	/* dirty glimpse trick to make filters work: output seems to come from another file */int SetCurrentFileTime = 0;	/* dirty glimpse trick to avoid doing a stat to find the time */int CurrentByteOffset;int SetCurrentByteOffset = 0;CHAR Progname[MAXNAME]; CHAR D_pattern[MaxDelimit * 2] = "\n; "; /* string which delimits records -- defaults to newline */CHAR tc_D_pattern[MaxDelimit * 2] = "\n";CHAR original_D_pattern[MaxDelimit * 2] = "\n; ";char COMP_DIR[MAX_LINE_LEN];char FREQ_FILE[MAX_LINE_LEN], HASH_FILE[MAX_LINE_LEN], STRING_FILE[MAX_LINE_LEN];	/* interfacing with tcompress */int  NOFILENAME,  /* Boolean flag, set for -h option */     FILENAMEONLY;/* Boolean flag, set for -l option */extern int init();int table[WORD][WORD];CHAR *agrep_saved_pattern = NULL;	/* to prevent multiple prepfs for each boolean search: crd@hplb.hpl.hp.com 
*/longaget_file_time(stbuf, name)	struct stat *stbuf;	char	*name;{	long	ret = 0;	struct stat mystbuf;	if (stbuf != NULL) ret = stbuf->st_mtime;	else {		if (my_stat(name, &mystbuf) == -1) ret = 0;		else ret = mystbuf.st_mtime;	}	return ret;}char *aprint_file_time(thetime)	time_t	thetime;{#if	0	char	s[256], s1[16], s2[16], s3[16], s4[16], s5[16];	static char buffer[256];	strcpy(s, ctime(&thetime));	/* of the form: Sun Sep 16 01:03:52 1973\n\0 */	s[strlen(s) - 1] = '\0';	sscanf(s, "%s%s%s%s%s", s1, s2, s3, s4, s5);	sprintf(buffer, ": %s %s %s", s2, s3, s5);	/* ditch Sun 01:03:52 */#else	static char buffer[256];	buffer[0] = ':';	buffer[1] = ' ';	strftime(&buffer[2], 256, "%h %e %Y", gmtime(&thetime));#endif	return &buffer[0];}/* Called when multipattern search and pattern has not changed */voidreinit_value_partial(){	num_of_matched = prev_num_of_matched = 0;	errno = 0;	FIRST_IN_RE = ON;}/* This must be called before every agrep_search to reset agrep globals */voidreinit_value(){        int i, j;	/* Added on 7th Oct 194 */	if (AParse) {		if (AComplexBoolean) destroy_tree(AParse);		AComplexBoolean = 0;		AParse = 0;		PAT_BUFFER = 0;		if (multibuf != NULL) free(multibuf);	/* this was allocated for arbit booleans, not multipattern search */		multibuf = NULL;		multilen = 0;		/* Cannot free multifd here since that is always allocated for multipattern search */	}	for (i=0; i<anum_terminals; i++) {		free(aterminals[i].data.leaf.value);		memset(&aterminals[i], '\0', sizeof(ParseTree));	}	anum_terminals = 0;	for (i=0; i<MAXNUM_PAT; i++) memset(aduplicates[i], '\0', MAXNUM_PAT);	for (i=0; i<MAXNUM_PAT; i++) memset(tc_aduplicates[i], '\0', MAXNUM_PAT);        Bit[WORD] = 1;        for (i = WORD - 1; i > 0  ; i--)  Bit[i] = Bit[i+1] << 1;        for (i=0; i< MAXSYM; i++) Mask[i] = 0;        /* bg: new things added on Mar 13 94 */        Init1 = 0;        NO_ERR_MASK = 0;        memset(Init, '\0', MaxError * sizeof(unsigned));        memset(Next, '\0', MaxNext * sizeof(unsigned));    
    memset(Next1, '\0', MaxNext * sizeof(unsigned));        wildmask = endposition = D_endpos = 0;        for (i=0; i<WORD; i++)                for (j=0; j<WORD; j++)                        table[i][j] = 0;        strcpy(D_pattern, original_D_pattern);        D_length = original_D_length;        strcpy(old_D_pat, original_old_D_pat);	/* Changed on Dec 26th: bg */	FASTREGEX = REGEX = 0;	HEAD = TAIL = ON;	/* were off initially */	RE_ERR = 0;	AND = 0;	M = 0;	pos_cnt = 0;	/* added 31 Jan 95 */	reinit_value_partial();}/* This must be called before every agrep_init to reset agrep options */voidinitial_value(){	SetCurrentFileName = 0;	/* 16/9/94 */	SetCurrentFileTime = 0;	SetCurrentByteOffset = 0;	/* 23/9/94 */	/* courtesy: crd@hplb.hpl.hp.com */	if (agrep_saved_pattern) {		free(agrep_saved_pattern);		agrep_saved_pattern= NULL;	}	/* bg: new stuff on 17/Feb/94 */	if (multifd != -1) close(multifd);	multifd = -1;	if (multibuf != NULL) free(multibuf);	multibuf = NULL;	multilen = 0;	if (pat_spool != NULL) free(pat_spool);	pat_spool = NULL;#if	DOTCOMPRESSED	if (tc_pat_spool != NULL) free(tc_pat_spool);	tc_pat_spool = NULL;#endif	/* DOTCOMPRESSED */	LIMITOUTPUT = 0;	/* means infinity = current semantics */	LIMITPERFILE = 0;	/* means infinity = current semantics */	LIMITTOTALFILE = 0;	/* means infinity = current semantics */	EASYSEARCH = 1;	DNA = APPROX = PAT_FILE = PAT_BUFFER = CONSTANT = total_line = D = TCOMPRESSED = 0;	PAT_FILE_NAME[0] = '\0';	EXITONERROR = NOFILENAME = FILENAMEONLY = FILEOUT = ALWAYSFILENAME = NEW_FILE = POST_FILTER = 0;        original_old_D_pat[0] = old_D_pat[0] = '\n';        original_old_D_pat[1] = old_D_pat[1] = '\0';        original_D_pattern[0] = D_pattern[0] = '\n';        original_D_pattern[1] = D_pattern[1] = ';';        original_D_pattern[2] = D_pattern[2] = ' ';        original_D_pattern[3] = D_pattern[3] = '\0';	strcpy(tc_D_pattern, "\n");	tc_D_length = 1;	/* the functions agrep_init and agrep_search take care of Textfiles and Numfiles */	
agrep_inpointer = 0;	agrep_outpointer = 0;	agrep_outlen = 0;#if	MEASURE_TIMES	OUTFILTER_ms = FILTERALGO_ms = INFILTER_ms = 0;#endif	/*MEASURE_TIMES*/	MULTI_OUTPUT = 0;	PRINTPATTERN = 0;	PRINTFILENUMBER = 0;	PRINTFILETIME = 0;	JUMP = FNAME = BESTMATCH = NOPROMPT = NOUPPER = 0;	RECURSIVE = 0;	COUNT = LINENUM = WHOLELINE = SGREP = 0;	NOOUTPUTZERO = 0;	EATFIRST = INVERSE = TRUNCATE = OUTTAIL = 0; 	NOMATCH = FIRSTOUTPUT = ON;	/* were off initally */	I = DD = S = 1;	/* were off initially */	original_D_length = D_length = 2;	/* was 0 initially */	SILENT = Num_Pat = PSIZE = SIMPLEPATTERN = prev_num_of_matched = num_of_matched = files_matched = 0;	WORDBOUND = DELIMITER = 0;	COMP_DIR[0] = '\0';	FREQ_FILE[0] = '\0';	HASH_FILE[0] = '\0';	STRING_FILE[0] = '\0';	BYTECOUNT = OFF;	PRINTOFFSET = OFF;	PRINTRECORD = ON;	PRINTNONEXISTENTFILE = OFF;	glimpse_clientdied = 0;	/* added 15th Feb 95 */	/* Pattern, OldPattern, execfd, Numfiles are set in agrep_init: so no need to initialize */	reinit_value();}voidcompute_next(M, Next, Next1)int M; unsigned *Next, *Next1;{	int i, j=0, n,  k, temp;	int mid, pp;	int MM, base;	unsigned V[WORD];	base = WORD - M;	temp = Bit[base]; 	Bit[base] = 0;	for (i=0; i<WORD; i++) V[i] = 0;	for (i=1; i<M; i++)	{  		j=0;		while (table[i][j] > 0 && j < 10) {			V[i] = V[i] | Bit[base + table[i][j++]];		}	}	Bit[base]=temp;	if(M <= SHORTREG)	{		k = exponen(M);		pp = 2*k;		for(i=k; i<pp ; i++)		{   			n = i;			Next[i]= (k>>1);			for(j=M; j>=1; j--)			{				if(n & Bit[WORD]) Next[i] = Next[i] | V[j];				n = (n>>1);			}		}      		return;	}	if(M > MAXREG) fprintf(stderr, "%s: regular expression too long\n", Progname);	MM = M;	if(M & 1) M=M+1;	k = exponen(M/2);	pp = 2*k;	mid = MM/2;	for(i=k; i<pp ; i++)	{     		n = i;		Next[i]= (Bit[base]>>1);		for(j=MM; j>mid ; j--)		{			if(n & Bit[WORD]) Next[i] = Next[i] | V[j-mid];			n = (n>>1);		}		n=i-k;		Next1[i-k] = 0;		for(j = 0; j<mid; j++)		{			if(n & Bit[WORD]) Next1[i-k] = Next1[i-k] | V[MM-j];			n = (n>>1);		}	}      	
return;}intexponen(m)int m;{ 	int i, ex;	ex= 1;	for (i=0; i<m; i++) ex <<= 1;	/* was ex *= 2 */	return(ex);}intre1(Text, M, D)int Text, M, D;{	register unsigned i, c, r0, r1, r2, r3, CMask, Newline, Init0, r_NO_ERR; 	register unsigned end;	register unsigned hh, LL=0, k;  /* Lower part */	int  FIRST_TIME=ON, num_read , j=0, base;	unsigned A[MaxRerror+1], B[MaxRerror+1];	unsigned Next[MaxNext], Next1[MaxNext];	CHAR *buffer;	int FIRST_LOOP = 1;	r_NO_ERR = NO_ERR_MASK;	if(M > 30) {		fprintf(stderr, "%s: regular expression too long\n", Progname);		if (!EXITONERROR){			errno = AGREP_ERROR;			return -1;		}		else exit(2);	}	base = WORD - M;	hh = M/2;	for(i=WORD, j=0; j < hh ; i--, j++) LL = LL | Bit[i];	if(FIRST_IN_RE) compute_next(M, Next, Next1); 	/*SUN: try: change to memory allocation */	FIRST_IN_RE = 0;	Newline = '\n';	Init[0] = Bit[base];	if(HEAD) Init[0] = Init[0] | Bit[base+1];	for(i=1; i<= D; i++) Init[i] = Init[i-1] | Next[Init[i-1]>>hh] | Next1[Init[i-1]&LL];	Init1 = Init[0] | 1; 	Init0 = Init[0];

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -