⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 main.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 5 页
字号:
/* Copyright (c) 1994 Sun Wu, Udi Manber, Burra Gopal.  All Rights Reserved. *//* bgopal: (1993-4) redesigned/rewritten using agrep's library interface */#include <sys/param.h>#include <errno.h>#include "glimpse.h"#include "defs.h"#include <fcntl.h>#include "checkfile.h"#include <sys/types.h>#include <sys/stat.h>#include <sys/time.h>#include <sys/file.h>	/* for flock definition */#if	ISO_CHAR_SET#include <locale.h>	/* support for 8bit character set */#endif#define CLIENTSERVER	1#define USE_MSGHDR	0#define USE_UNIXDOMAIN	0#define DEBUG	0#define DEF_SERV_PORT	2001#define MIN_SERV_PORT	1024#define MAX_SERV_PORT	30000#define SERVER_QUEUE_SIZE	10	/* number of requests to buffer up while processing one request = 5 *//* Borrowed from C-Lib */extern char **environ;extern int errno;#if	CLIENTSERVER#include "communicate.c"#endif	/*CLIENTSERVER*//* For client-server protocol */CHAR	*SERV_HOST = NULL;int	SERV_PORT;char	glimpse_reqbuf[MAX_ARGS*MAX_NAME_LEN];extern int glimpse_clientdied;	/* set if signal received about dead socket: need agrep variable so that exec() can return quickly */int	glimpse_reinitialize = 0;/* Borrowed from agrep.c */extern int D_length;		/* global variable in agrep */extern int D;			/* global variable in agrep */extern int pattern_index;/* These are used for byte level index search */extern CHAR CurrentFileName[MAX_LINE_LEN];extern int SetCurrentFileName;extern int CurrentByteOffset;extern int SetCurrentByteOffset;extern long CurrentFileTime;extern int SetCurrentFileTime;extern int execfd;extern int  agrep_initialfd;extern CHAR *agrep_inbuffer;extern int  agrep_inlen;extern int  agrep_inpointer;extern FILE *agrep_finalfp;extern CHAR *agrep_outbuffer;extern int  agrep_outlen;extern int  agrep_outpointer;extern int glimpse_call;	/* prevent agrep from printing out its usage */extern int glimpse_isserver;	/* prevent agrep from asking for user input */int	first_search = 1;	/* intra/interaction in process_query() and glimpse_search() */#if	ISSERVERint	
RemoteFiles = 0;	/* Are the files present locally or remotely? If on, then -NQ is automatically added to all search options for each query */#endif/* Borrowed from index/io.c */extern int InfoAfterFilename;extern int OneFilePerBlock;extern int StructuredIndex;extern unsigned int *dest_index_set;extern unsigned char *dest_index_buf;extern unsigned int *src_index_set;extern unsigned char *src_index_buf;extern unsigned char *merge_index_buf;extern int mask_int[32];extern int indexable_char[256];int test_indexable_char[256];extern int p_table[MAX_PARTITION];extern int GMAX_WORD_SIZE;extern int IndexNumber;		/* used in getword() */extern int InterpretSpecial;	/* used to "not-split" agrep-regexps */extern int UseFilters;		/* defined in build_in.c, used for filtering routines in io.c */extern int ByteLevelIndex;extern int RecordLevelIndex;extern int rdelim_len;extern char rdelim[MAX_LINE_LEN];extern char old_rdelim[MAX_LINE_LEN];extern int file_num;extern int REAL_PARTITION, REAL_INDEX_BUF, MAX_ALL_INDEX, FILEMASK_SIZE;/* Borrowed from get_filename.c */extern int bigbuffer_size;extern char *bigbuffer;extern char *outputbuffer;/* OPTIONS/FLAGS */int	veryfast = 0;int	CONTACT_SERVER = 0;	/* Should client try to call server at all or just process query on its own? */int	NOBYTELEVEL = 0;	/* Some cases where we cannot do byte level fast-search: ALWAYS 0 if !ByteLevelIndex */int	OPTIMIZEBYTELEVEL = 0;	/* Some cases where we don't want to do byte level search since number of files is small */int	GCONSTANT = 0;		/* should pattern be taken as-is or parsed? */int	GLIMITOUTPUT = 0;	/* max no. of output lines: 0=>infinity=default=nolimit */int	GLIMITTOTALFILE = 0;	/* max no. of files to match: 0=>infinity=default=nolimit */int	GLIMITPERFILE = 0;	/* not used in glimpse */int	GBESTMATCH = 0;		/* Should I change -B to -# where # = no. of errors? 
*/int	GRECURSIVE = 0;int	GNOPROMPT = 0;int	GBYTECOUNT = 0;int	GPRINTFILENUMBER = 0;int	GPRINTFILETIME = 0;int	GOUTTAIL = 0;int	GFILENAMEONLY = 0;	/* how to do it if it is an and expression in structured queries */int	GNOFILENAME=0;int	GPRINTNONEXISTENTFILE = 0; /* if filename is not there in index, then at least let user know its name */int	MATCHFILE = 0;int	PRINTATTR = 0;int	PRINTINDEXLINE = 0;int	Pat_as_is=0;int	Only_first=0;		/* Do index search only */int	PRINTAPPXFILEMATCH=0;	/* Print places in file where match occurs: useful with -b only to analyse the index */int	GCOUNT=0;		/* print number of matches rather than actual matches: used only when PRINTAPPX = 1 */int	HINTSFROMUSER=0;	/* The user gives the hints about where we should search (result of adding -EQNgy) */int	WHOLEFILESCOPE=0;	/* used only when foundattr is NOT set: otherwise, scope is whole file anyway */int	foundattr=0;		/* set in split.c -- != 0 only when StructuredIndex AND query is structured */int	foundnot=0;		/* set in split.c -- != 0 only when the not operator (~) is present in the pattern */int	FILENAMESINFILE=0;	/* whether the user is providing an explicit list of filenames to be searched for pattern (if absent, then means all files) */int	BITFIELDFILE=0;		/* Based on contribution From ada@mail2.umu.se Fri Jul 12 01:56 MST 1996; Christer Holgersson, Sen. SysNet Mgr, Umea University/SUNET, Sweden */int	BITFIELDOFFSET=0;int	BITFIELDLENGTH=0;int	BITFIELDENDIAN=0;int	GNumDays = 0;		/* whether the user wants files modified within these many days before creating the index: only >0 makes sense *//* structured queries */CHAR	***attr_vals;		/* matrix of char pointers: row=max #of attributes, col=max possible values */CHAR	**attr_found;		/* did the expression corr. to each value in attr_vals match? */ParseTree *GParse;		/* what kind of expression corr. to attr are we looking for *//* arbitrary booleans */ParseTree terminals[MAXNUM_PAT];	/* parse tree's terminal node pointers pt. 
to elements of this array; also used outside */ char	matched_terminals[MAXNUM_PAT];	/* ...[i] is 1 if i'th terminal matched: used in filter_output and eval_tree */int	num_terminals;		/* number of terminal patterns */int	ComplexBoolean=0;	/* 1 if we need to use parse trees and the eval function *//* index search */CHAR	*pat_list[MAXNUM_PAT];	/* complete words within global pattern */int	pat_lens[MAXNUM_PAT];	/* their lengths */int	pat_attr[MAXNUM_PAT];	/* set of attributes */int	is_mgrep_pat[MAXNUM_PAT];int	mgrep_pat_index[MAXNUM_PAT];int	num_mgrep_pat;CHAR	pat_buf[(MAXNUM_PAT + 2)*MAXPAT];int	pat_ptr = 0;extern char INDEX_DIR[MAX_LINE_LEN];char	*TEMP_DIR = NULL;	/* directory to store glimpse temporary files, usually /tmp unless -T is specified */char	indexnumberbuf[256];	/* to read in first few lines of the index */char	*index_argv[MAX_ARGS];int	index_argc = 0;int	bestmatcherrors=0;	/* set during index search, used later on */int	patindex; int	patbufpos = -1;char	tempfile[MAX_NAME_LEN];char	*filenames_file = NULL;char	*bitfield_file = NULL;/* agrep search */char	*agrep_argv[MAX_ARGS];int 	agrep_argc = 0;CHAR	*FileOpt;		/* the option list after -F */int	fileopt_length;CHAR	GPattern[MAXPAT];int	GM;CHAR	APattern[MAXPAT];int	AM;CHAR	GD_pattern[MAXPAT];int	GD_length;CHAR	**GTextfiles;CHAR	**GTextfilenames;int	*GFileIndex;int	GNumfiles;int	GNumpartitions;CHAR	GProgname[MAXNAME];/* persistent file descriptors */#if	BG_DEBUGFILE *debug; 			/* file descriptor for debugging output */#endif	/*BG_DEBUG*/FILE	*timesfp = NULL;FILE	*timesindexfp = NULL;FILE	*indexfp = NULL;	/* glimpse index */FILE	*partfp = NULL;		/* glimpse partitions */FILE	*minifp = NULL;		/* glimpse turbo */FILE	*nullfp = NULL;		/* to discard output: agrep -s doesn't work properly */int	svstdin = 0, svstdout = 1, svstderr = 2;static int one = 1;		/* to set socket option so that glimpseserver releases socket after death *//* Index manipulation */struct offsets **src_offset_table;struct offsets 
**multi_dest_offset_table[MAXNUM_PAT];unsigned int *multi_dest_index_set[MAXNUM_PAT];extern free_list();struct stat index_stat_buf, file_stat_buf;int timesindexsize = 0;int last_Y_filenumber = 0;/* Direct agrep access for bytelevel-indices */extern int COUNT, INVERSE, TCOMPRESSED, NOFILENAME, POST_FILTER, OUTTAIL, BYTECOUNT, SILENT, NEW_FILE,	LIMITOUTPUT, LIMITPERFILE, LIMITTOTALFILE, PRINTRECORD, DELIMITER, SILENT, FILENAMEONLY, num_of_matched, prev_num_of_matched, FILEOUT;CHAR	matched_region[MAX_REGION_LIMIT*2 + MAXPATT*2];int	RegionLimit=DEFAULT_REGION_LIMIT;/* Returns number of matched records/lines. Uses agrep's options to output stuff nicely; never called with RecordLevelIndex set */intglimpse_search(AM, APattern, GD_length, GD_pattern, realfilename, filename, fileindex, src_offset_table, outfp)	int		AM;	unsigned char	APattern[];	int		GD_length;	unsigned char	GD_pattern[];	char		*realfilename;	char		*filename;	int		fileindex;	struct offsets	*src_offset_table[];	FILE		*outfp;{	FILE		*infp;	char		sig[SIGNATURE_LEN];	struct offsets	**p1, *tp1;	CHAR		*text, *curtextend, *curtextbegin, c;	int		times;	int		num, ret, totalret = 0;	int		prevoffset = 0, begininterval = 0, endinterval = -1;	CHAR		*beginregionptr = 0, *endregionptr = 0;	int		beginpage = 0, endpage = -1;	static int	MAXTIMES, MAXPGTIMES, pagesize;	static int	first_time = 1;	/*	 * If can't open file for read, quit	 * For each offset for that file:	 *	seek to that point	 *	go back until delimiter, go forward until delimiter, output it: MAX_REGION_LIMIT is 16K on either side.	 *	read in units of RegionLimit	 *	before outputting matched record, use options to put prefixes (or use memagrep which does everything?)	 * Algorithm changed: don't read same page in twice.	 */	if (first_time) {		pagesize = DISKBLOCKSIZE;		MAXTIMES = ((MAX_REGION_LIMIT / RegionLimit) > 1) ? (MAX_REGION_LIMIT / RegionLimit) : 1;		MAXPGTIMES = ((MAX_REGION_LIMIT / pagesize) > 1) ? 
(MAX_REGION_LIMIT / pagesize) : 1;		first_time = 0;	}	/* Safety: must end/begin with delim */	memcpy(matched_region, GD_pattern, GD_length);	memcpy(matched_region+MAXPATT+2*MAX_REGION_LIMIT, GD_pattern, GD_length);	text = &matched_region[MAX_REGION_LIMIT+MAXPATT];	if ((infp = my_fopen(filename, "r")) == NULL) return 0;	NEW_FILE = ON;#if	0	/* Cannot search in .CZ files since offset computations will be incorrect */	TCOMPRESSED = ON;	if (!tuncompressible_filename(file_list[i], strlen(file_list[i]))) TCOMPRESSED = OFF;	num_read = fread(sig, 1, SIGNATURE_LEN, infp);	if ((TCOMPRESSED == ON) && tuncompressible(sig, num_read)) {		EASYSEARCH = sig[SIGNATURE_LEN-1];		if (!EASYSEARCH) {			fprintf(stderr, "not compressed for easy-search: can miss some matches in: %s\n", CurrentFileName);	/* not filename!!! */		}	}	else TCOMPRESSED = OFF;#endif	/*0*/	p1 = &src_offset_table[fileindex];	while (*p1 != NULL) {		if ( (begininterval <= (*p1)->offset) && (endinterval > (*p1)->offset) ) {	/* already covered this area */#if	DEBUG			printf("ignoring %d in [%d,%d]\n", (*p1)->offset, begininterval, endinterval);#endif	/*DEBUG*/			tp1 = *p1;			*p1 = (*p1)->next;			my_free(tp1, sizeof(struct offsets));			continue;		}		TCOMPRESSED = OFF;#if	1		if ( (beginpage <= (*p1)->offset) && (endpage >= (*p1)->offset) && (text + ((*p1)->offset - prevoffset) + GD_length < endregionptr)) {			/* beginregionptr = curtextend - GD_length;	/* prevent next curtextbegin to go behind previous curtextend (!) 
*/			text += ((*p1)->offset - prevoffset);			prevoffset = (*p1)->offset;			if (!((curtextend = forward_delimiter(text, endregionptr, GD_pattern, GD_length, 1)) < endregionptr))				goto fresh_read;			if (!((curtextbegin = backward_delimiter(text, beginregionptr, GD_pattern, GD_length, 0)) > beginregionptr))				goto fresh_read;		}		else { /* NOT within an area already read: must read another page: if record overlapps page, might read page twice: no time to fix */		fresh_read:			prevoffset = (*p1)->offset;			text = &matched_region[MAX_REGION_LIMIT+MAXPATT];	/* middle: points to occurrence of pattern */			endpage = beginpage = ((*p1)->offset / pagesize) * pagesize;			/* endpage = (((*p1)->offset + pagesize) / pagesize) * pagesize */			endregionptr = beginregionptr = text - ((*p1)->offset - beginpage);	/* overlay physical place starting from this logical point */			/* endregionptr = text + (endpage - (*p1)->offset); */			curtextbegin = curtextend = text;			times = 0;			while (times < MAXPGTIMES) {				fseek(infp, endpage, 0);				num = (&matched_region[MAX_REGION_LIMIT*2+MAXPATT] - endregionptr < pagesize) ? (&matched_region[MAX_REGION_LIMIT*2+MAXPATT] - endregionptr) : pagesize;				if ((num = fread(endregionptr, 1, num, infp)) <= 0) break;				endpage += num;				endregionptr += num;				if (endregionptr <= text) {					curtextend = text;	/* error in value of offset: file was modified and offsets no longer true: your RISK! 
*/					break;				}				if (((curtextend = forward_delimiter(text, endregionptr, GD_pattern, GD_length, 1)) < endregionptr) ||				    (endregionptr >= &matched_region[MAX_REGION_LIMIT*2 + MAXPATT])) break;				times ++;			}			times = 0;			while (times < MAXPGTIMES) {	/* I have already read the initial page since endpage is beginpage initially */				if ((curtextbegin = backward_delimiter(text, beginregionptr, GD_pattern, GD_length, 0)) > beginregionptr) break;				if (beginpage > 0) {					if (beginregionptr - pagesize < &matched_region[MAXPATT]) {						if ((num = beginregionptr - &matched_region[MAXPATT]) <= 0) break;					}					else num = pagesize;					beginpage -= num;					beginregionptr -= num;				}				else break;				times ++;				fseek(infp, beginpage, 0);				fread(beginregionptr, 1, num, infp);			}		}#else	/*1*/		/* Find forward delimiter (including delimiter) */		times = 0;		fseek(infp, (*p1)->offset, 0);		while (times < MAXTIMES) {			if ((num = fread(text+RegionLimit*times, 1, RegionLimit, infp)) > 0)				curtextend = forward_delimiter(text, text+RegionLimit*times+num, GD_pattern, GD_length, 1);			if ((curtextend < text+RegionLimit*times+num) || (num < RegionLimit)) break;			times ++;		}		/* Find backward delimiter (including delimiter) */		times = 0;		while (times < MAXTIMES) {			num = ((*p1)->offset - RegionLimit*(times+1)) > 0 ? 
((*p1)->offset - RegionLimit*(times+1)) : 0;			fseek(infp, num, 0);			if (num > 0) {				fread(text-RegionLimit*(times+1), 1, RegionLimit, infp);				curtextbegin = backward_delimiter(text, text-RegionLimit*(times+1), GD_pattern, GD_length, 0);			}			else {				fread(text-RegionLimit*times-(*p1)->offset, 1, (*p1)->offset, infp);				curtextbegin = backward_delimiter(text, text-RegionLimit*times-(*p1)->offset, GD_pattern, GD_length, 0);			}			if ((num <= 0) || (curtextbegin > text-RegionLimit*(times+1))) break;			times ++;		}#endif	/*1*/		/* set interval and delete the entry */		begininterval = (*p1)->offset - (text - curtextbegin);		endinterval = (*p1)->offset + (curtextend - text); 		if (strncmp(curtextbegin, GD_pattern, GD_length)) {			/* always pass enclosing delimiters to agrep; since we have seen text before curtextbegin + we have space, we can overwrite */			memcpy(curtextbegin - GD_length, GD_pattern, GD_length);			curtextbegin -= GD_length;		}#if	DEBUG		c = *curtextend;		*curtextend = '\0';		printf("%s [%d < %d < %d], text = %d: %s\n", CurrentFileName, begininterval, (*p1)->offset, endinterval, text, curtextbegin);		*curtextend = c;#endif	/*DEBUG*/		tp1 = *p1;		*p1 = (*p1)->next;		my_free(tp1, sizeof(struct offsets));		if (curtextend <= curtextbegin) continue;	/* error in offsets/delims */		/*		 * Don't call memagrep since that is heavy weight. Call exec		 * directly after doing agrep_search()'s preprocessing here.		 * PS: can add agrep variable not to do delim search if called from here		 * since that prevents unnecessarily scanning the buffer for the 2nd time.		 
*/		CurrentByteOffset = begininterval+1;		SetCurrentByteOffset = 1;		first_search = 1;		if (first_search) {			if ((ret = memagrep_search(AM, APattern, curtextend-curtextbegin, curtextbegin, 0, outfp)) > 0)				totalret ++; /* += ret */ 			else if ((ret < 0) && (errno == AGREP_ERROR)) {				fclose(infp);				return -1;			}			first_search = 0;		}		else {	/* All agrep globals are properly set: has a bug because agrep's globals aren't properly reinitialized without agrep_search :-( */			agrep_finalfp = (FILE *)outfp;			agrep_outlen = 0;			agrep_outbuffer = NULL;			agrep_outpointer = 0;			execfd = agrep_initialfd = -1;			agrep_inbuffer = curtextbegin;			agrep_inlen = curtextend - curtextbegin;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -