⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 io.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 3 页
字号:
/* Copyright (c) 1994 Sun Wu, Udi Manber, Burra Gopal.  All Rights Reserved. */
/* ./glimpse/index/io.c */
#include "glimpse.h"
#include <stdio.h>
#include <sys/stat.h>
#include <errno.h>

extern char INDEX_DIR[MAX_LINE_LEN];
extern int memory_usage;

#include "utils.c"

/* Run-time tunables, initialised from the compile-time defaults. */
int	REAL_INDEX_BUF = DEF_REAL_INDEX_BUF,
	MAX_ALL_INDEX = DEF_MAX_ALL_INDEX,
	FILEMASK_SIZE = DEF_FILEMASK_SIZE,
	REAL_PARTITION = DEF_REAL_PARTITION;

/* --------------------------------------------------------------------
escapesinglequote()
Makes a file name safe to embed between two single quotes on a shell
command line.  Called before passing any argument to the system() routine
in the glimpse or glimpseindex source code; works only if the new name is
going to be passed to the shell within two ''s.

Every ' in "original" is written to "new" as the four characters '\''
(close the quoted string, add a backslash-escaped quote, reopen the quoted
string).  All other characters are copied unchanged and the result is
NUL-terminated.

"new" must have room for up to 4 * strlen(original) + 1 bytes: nothing is
bounds-checked here.  Returns "new" so the call can be used directly as a
printf argument.
----------------------------------------------------------------------*/
char *escapesinglequote(original, new)
	char	*original, *new;
{
	char	*oldnew = new;

	while (*original != '\0') {
		if (*original == '\'') {
			*new ++ = '\'';	/* close the quoted string the caller opened */
			*new ++ = '\\';	/* add escape character */
			*new ++ = '\'';	/* the quote from the original, now escaped */
		}
		/* ordinary character, or the ' that reopens the quoted string */
		*new ++ = *original ++;
	}
	*new = '\0';
	return oldnew;
}

/* --------------------------------------------------------------------
get_array_of_lines()
input: an input filename, address of the table, maximum number of entries
of the table, and a overflow handling flag.
output: a set of strings in the table.

The first line of the file holds a decimal line count; it is only used as
a sanity check (it must lie in 0..MaxNum24bPartition).  Every following
non-empty line is copied, without its trailing newline, into a freshly
my_malloc'ed string that is appended to the table with LIST_ADD.

when overflow_ok is ON, the function returns after the table is filled
(and returns 0 if the file cannot be opened, the header is out of range or
memory runs out); otherwise the function will exit(2) in those cases.
Note that the overflow test fires only once max_entry + 1 lines have been
stored, so that is the value returned on overflow.
In normal return, the function returns the number of entries read.
----------------------------------------------------------------------*/
int get_array_of_lines(inputfile, table, max_entry, overflow_ok)
char *inputfile;
char **table[];
int  max_entry;  /* max number of entries in the table */
int  overflow_ok;   /* flag for handling overflow */
{
	int  tx=0;    /* index for table */
	FILE *file_in;
	char buffer[MAX_NAME_BUF];
	char *np;
	int  line_length;
	int  alloc_length;
	int  num_lines = 0;	/* stays 0 if the header is missing or not a number */

	if((file_in = fopen(inputfile, "r")) == NULL) {
		if (overflow_ok) return 0;
		fprintf(stderr, "can't open for reading: %s\n", inputfile);
		exit(2);
	}

	/* Header line: never parse the buffer unless fgets really filled it */
	buffer[0] = '\0';
	if (fgets(buffer, MAX_NAME_BUF, file_in) != NULL)
		sscanf(buffer, "%d", &num_lines);
	if ((num_lines < 0) || (num_lines > MaxNum24bPartition)) {
		fclose(file_in);
		if (overflow_ok) return 0;
		fprintf(stderr, "Error in reading: %s\n", inputfile);
		exit(2);
	}

	while(fgets(buffer, MAX_NAME_BUF, file_in)) {
		line_length = strlen(buffer);
		alloc_length = line_length + 2;	/* same allocation size as always */
		/* discard the '\n', but only if there is one: the last line of
		   the file (or an over-long line) may come without it */
		if ((line_length > 0) && (buffer[line_length-1] == '\n'))
			buffer[--line_length] = '\0';
		if (line_length == 0) continue;

		np = (char *) my_malloc(sizeof(char) * alloc_length);
		if(np == NULL) {
		    int	i=0;

		    fclose(file_in);	/* closed exactly once on this path */
		    /* give back everything stored so far */
		    for (i=0; i<tx; i++) {
			if (LIST_GET(table, i) != NULL) {
#if	BG_DEBUG
				memory_usage -= (strlen(LIST_GET(table, i)) + 2);
#endif	/*BG_DEBUG*/
				my_free(LIST_GET(table, i), 0);
				LIST_SUREGET(table, i) = NULL;
			}
		    }
		    if (overflow_ok) return 0;
		    fprintf(stderr, "malloc failure in get_array_of_lines\n");
		    exit(2);
		}
		LIST_ADD(table, tx, np, char*);
		tx ++;
		/* table[tx++] = (unsigned char *)np; */
		strcpy(np, buffer);
		if(tx > max_entry) {
		    fclose(file_in);	/* closed exactly once on this path */
		    if(overflow_ok) return(tx);
		    fprintf(stderr, "overflow in get_array_of_lines()\n");
		    exit(2);
		}
	}
	fclose(file_in);
	return(tx);   /* return number of lines read */
}

/* --------------------------------------------------------------------
get_table():
input: an input filename, address of the table, maximum number of entries
of the table, and a overflow handling flag.
output: a set of integers in the table.

The file is a sequence of 4-byte big-endian integers.  A trailing partial
integer (fewer than 4 bytes before EOF) is silently ignored.

when overflow_ok is ON, the function returns after the table is filled
(and returns 0 if the file cannot be opened); otherwise the function will
exit(2) if overflow occurs or the file cannot be opened.
At most max_entry entries are ever stored: table[0..max_entry-1].
In normal return, the function returns the number of entries read.
----------------------------------------------------------------------*/
int get_table(inputfile, table, max_entry, overflow_ok)
char *inputfile;
int  table[];
int  max_entry;
int  overflow_ok;
{
	unsigned int val;
	int  b0, b1, b2, b3;	/* the four bytes of one entry, most significant first */
	FILE *file_in;
	int  tx=0;           /* number of entries read */

	if((file_in = fopen(inputfile, "r")) == NULL) {
		if (overflow_ok) return 0;
		fprintf(stderr, "can't open %s for reading\n", inputfile);
		exit(2);
	}
	while((b0 = getc(file_in)) != EOF) {
		if ((b1 = getc(file_in)) == EOF) break;
		if ((b2 = getc(file_in)) == EOF) break;
		if ((b3 = getc(file_in)) == EOF) break;

		/* Check for room BEFORE storing, so table[max_entry] is never written */
		if(tx >= max_entry) {
			if(!overflow_ok) {
			    fprintf(stderr, "in get_table: table overflow\n");
			    exit(2);
			}
			break;
		}
		/* assemble in unsigned arithmetic: shifting a byte >= 0x80 left
		   by 24 in a signed int would overflow */
		val = ((unsigned int)b0 << 24) |
		      ((unsigned int)b1 << 16) |
		      ((unsigned int)b2 << 8) |
		      (unsigned int)b3;
		table[tx++] = (int)val;
	}
	fclose(file_in);
	return(tx);
}

/* --------------------------------------------------------------------
get_index_type():
Reads the header of the index file "s".  The header is scanned as a word,
then "%<number>", then "%<number>" immediately followed by a word; these
go to a local buffer, *num, *attr and delim respectively.  *dashn is set
to ON if the first word contains the string "1234567890".

All outputs are reset (0 / empty string) first, so they are well defined
even when the file is missing or short.  Returns *num (0 if the file
cannot be opened).

NOTE(review): both %s conversions are unbounded; a header word of
MAX_LINE_LEN or more characters, or a delimiter longer than the caller's
"delim" buffer, would overflow.  The index is assumed to be written by
glimpseindex itself - confirm before exposing this to untrusted files.
----------------------------------------------------------------------*/
int get_index_type(s, dashn, num, attr, delim)
char s[];
int *dashn, *num, *attr;
char delim[];
{
	FILE *fp = fopen(s, "r");
	char buf[MAX_LINE_LEN];

	*dashn = *num = *attr = 0;
	*delim = '\0';
	if (fp == NULL) return 0;

	buf[0] = '\0';	/* fscanf may match nothing: keep strstr() below well defined */
	fscanf(fp, "%s\n%%%d\n%%%d%s\n", buf, num, attr, delim);
	/* printf("get_index_type(): %s %d %d %s\n", buf, num, attr, delim); */
	fclose(fp);
	if (strstr(buf, "1234567890")) *dashn = ON;
	return *num;
}

/* --------------------------------------------------------------------
get_block_numbers():
Takes the 4-byte big-endian value at srcbuf, passes it through
decode32b() to get a position in partfp, seeks there and reads one line
(at most REAL_INDEX_BUF - MAX_WORD_BUF - 2 characters) into destbuf.

The offset is read from srcbuf first so that you can use it with
srcbuf=destbuf.  If nothing can be read, destbuf holds "\n" followed by
three NUL bytes.  A failed seek is reported once per process on stderr
and the read is attempted anyway.  Always returns 0.
----------------------------------------------------------------------*/
int get_block_numbers(srcbuf, destbuf, partfp)
	unsigned char *srcbuf, *destbuf;
	FILE *partfp;
{
	int	offset, pat_size;
	static int printederror = 0;

	/* Does not do caching of blocks seen so far: done in OS hopefully */
	/* unsigned arithmetic: srcbuf[0] << 24 would overflow a signed int */
	offset = (int)(((unsigned int)srcbuf[0] << 24) |
		       ((unsigned int)srcbuf[1] << 16) |
		       ((unsigned int)srcbuf[2] << 8) |
		       (unsigned int)srcbuf[3]);
	pat_size = decode32b(offset);
	if (fseek(partfp, (long)pat_size, SEEK_SET) != 0) {
		if (!printederror) {
			fprintf(stderr, "Warning! Error in the format of the index!\n");
			printederror = 1;
		}
	}

	/* Start from the "empty" answer in case fgets leaves destbuf alone */
	destbuf[0] = '\n';
	destbuf[1] = '\0';
	destbuf[2] = '\0';
	destbuf[3] = '\0';
	if (fgets((char *)destbuf, REAL_INDEX_BUF - MAX_WORD_BUF - 1, partfp) == NULL) {
		/* on a read error the buffer contents are indeterminate: reset */
		destbuf[0] = '\n';
		destbuf[1] = '\0';
		destbuf[2] = '\0';
		destbuf[3] = '\0';
	}
	return 0;
}

/* Filter table: filled by read_filters(), consulted by apply_filter() */
int num_filter=0;			/* number of valid entries below */
int filter_len[MAX_FILTER];		/* > 0: literal pattern, matched with strstr; otherwise -filter_len is handed to memagrep_search */
CHAR *filter[MAX_FILTER];		/* the (converted) patterns */
CHAR *filter_command[MAX_FILTER];	/* the shell command to run for each pattern */
struct stat filstbuf;			/* stat of the filter file, filled by read_filters() */

/* --------------------------------------------------------------------
read_filters():
Resets the filter table and, if dofilter is non-zero, reloads it from
"<index_dir>/FILTER_FILE".  A missing or unreadable file just leaves the
table empty.

Each line of the file is   pattern  command   where the pattern is either
a run of non-blank characters or is enclosed in single quotes (inside
which a backslash protects the following character).  Lines without a
pattern or without a command, and patterns that convert2agrepregexp()
rejects (returns 0 for), are skipped.  At most MAX_FILTER entries are
read.  Always returns 0.

NOTE(review): strings stored by an earlier call are not freed here, since
this file cannot tell who else may own those pointers.
----------------------------------------------------------------------*/
int read_filters(index_dir, dofilter)
char	*index_dir;
int	dofilter;
{
    int len;
    int patlen;
    int patpos;
    int commandpos;
    FILE *filterfile;
    char filterbuf[MAX_LINE_LEN];
    char tempbuf[MAX_LINE_LEN];
    char s[MAX_LINE_LEN];
    char *pattern_copy;
    char *command_copy;

    num_filter = 0;
    memset(filter, '\0', sizeof(CHAR *) * MAX_FILTER);
    memset(filter_command, '\0', sizeof(CHAR *) * MAX_FILTER);
    memset(filter_len, '\0', sizeof(int) * MAX_FILTER);
    if (!dofilter) return 0;

    /* "<dir>/<file>" plus the NUL must fit in s */
    if (strlen(index_dir) + strlen(FILTER_FILE) + 2 > sizeof(s)) return 0;
    sprintf(s, "%s/%s", index_dir, FILTER_FILE);

    filterfile = fopen(s, "r");
    if(filterfile == NULL) {
	/* fprintf(stderr, "can't open filter file %s\n", s); -- no need */
	return 0;
    }
    if (fstat(fileno(filterfile), &filstbuf) == -1) {
	fclose(filterfile);	/* do not leak the stream on this path */
	return 0;
    }

    while((num_filter < MAX_FILTER) && fgets(filterbuf, MAX_LINE_LEN, filterfile)) {
	/* strip the newline if there is one; len is the length of what is left */
	len = strlen(filterbuf);
	if ((len > 0) && (filterbuf[len-1] == '\n')) filterbuf[--len] = '\0';

	commandpos = 0;
	while ((commandpos < len) && ((filterbuf[commandpos] == ' ') || (filterbuf[commandpos] == '\t'))) commandpos ++;	/* leading spaces */
	if (commandpos >= len) continue;	/* blank line */

	if (filterbuf[commandpos] == '\'') {
		/* quoted pattern: runs up to the next unescaped quote */
		commandpos ++;
		patpos = commandpos;
		patlen = 0;
		while (commandpos < len) {
			if (filterbuf[commandpos] == '\\') {
				/* keep the backslash and whatever it protects */
				commandpos += 2;
				patlen += 2;
			}
			else if (filterbuf[commandpos] != '\'') {
				commandpos ++;
				patlen ++;
			}
			else break;
		}
		/* no closing quote, or empty pattern */
		if ((commandpos >= len) || (patlen <= 0)) continue;
		commandpos ++;	/* step over the closing quote */
	}
	else {
		/* bare pattern: runs up to the next blank */
		patpos = commandpos;
		patlen = 0;
		while ((commandpos < len) && (filterbuf[commandpos] != ' ') && (filterbuf[commandpos] != '\t')) {
			commandpos ++;
			patlen ++;
		}
	}

	/* The command is the rest of the line.  An entry without one is
	   useless (the shell would end up exec'ing the input file itself) */
	while ((commandpos < len) && ((filterbuf[commandpos] == ' ') || (filterbuf[commandpos] == '\t'))) commandpos ++;
	if (commandpos >= len) continue;

	memcpy(tempbuf, &filterbuf[patpos], patlen);
	tempbuf[patlen] = '\0';
	if ((filter_len[num_filter] = convert2agrepregexp(tempbuf, patlen)) == 0) continue;	/* inplace conversion */

	pattern_copy = strdup(tempbuf);
	command_copy = strdup(&filterbuf[commandpos]);
	if ((pattern_copy == NULL) || (command_copy == NULL)) {
		/* never register a half-built entry: apply_filter() would
		   dereference the NULL pointer */
		free(pattern_copy);
		free(command_copy);
		filter_len[num_filter] = 0;
		fprintf(stderr, "malloc failure in read_filters\n");
		break;
	}
	filter[num_filter] = (unsigned char *) pattern_copy;
	filter_command[num_filter] = (unsigned char *) command_copy;
	num_filter ++;
    }
    fclose(filterfile);
    return 0;
}

/* Buffer sizes used while running filters */
#define IO_FILTER_NAME_BUF	(MAX_LINE_LEN + 4)		/* a file name plus room for one ".o" suffix */
#define IO_FILTER_ESC_BUF	(4 * IO_FILTER_NAME_BUF + 1)	/* worst case for escapesinglequote(): every character is a ' */
#define IO_FILTER_CMD_BUF	(MAX_LINE_LEN + 2 * IO_FILTER_ESC_BUF + 32)

/*
 * Builds  exec <command> '<inpath>' > '<outpath>'<trailer>  with both paths
 * escaped for single quotes, and hands it to system().
 * Returns 1 if the command was run, 0 if it was refused because a path or
 * the whole command line would not fit in the local buffers.
 * "command" is inserted verbatim: it comes from the filter file (or is
 * SYSTEM_CAT) and is trusted to be a shell command.
 */
static int apply_filter_shell(command, inpath, outpath, trailer)
	char	*command, *inpath, *outpath, *trailer;
{
	char	es1[IO_FILTER_ESC_BUF], es2[IO_FILTER_ESC_BUF];
	char	s[IO_FILTER_CMD_BUF];

	if ((strlen(inpath) >= IO_FILTER_NAME_BUF) || (strlen(outpath) >= IO_FILTER_NAME_BUF)) return 0;
	escapesinglequote(inpath, es1);
	escapesinglequote(outpath, es2);
	/* 16 covers the fixed text of the format below and the NUL */
	if (strlen(command) + strlen(es1) + strlen(es2) + strlen(trailer) + 16 > sizeof(s)) return 0;
	sprintf(s, "exec %s '%s' > '%s'%s", command, es1, es2, trailer);
	system(s);
	return 1;
}

/*
 * Returns 1 if "path" can be opened and holds at least one byte.
 * Otherwise removes "path" and returns 0.
 */
static int apply_filter_output_ok(path)
	char	*path;
{
	FILE	*dummyin;
	char	dummybuf[4];

	dummyin = my_fopen(path, "r");
	if ((dummyin == NULL) || (fread(dummybuf, 1, 1, dummyin) < 1)) {
		if (dummyin != NULL) fclose(dummyin);
		unlink(path);
		return 0;
	}
	fclose(dummyin);
	return 1;
}

/* --------------------------------------------------------------------
apply_filter():
Runs, in table order, the command of every filter whose pattern matches
the name of the input file; each command reads the output of the previous
successful one, so filters chain.

returns 1 if filter application was successful and the output (>1B) is in
outname, 2 if some pattern matched but there is no output, 0 otherwise:
sep 15-18 '94.  Unless the result is 1, the input is simply copied to
outname with SYSTEM_CAT (nothing is copied when the function returns early
because there are no filters, a name is too long, or /dev/null cannot be
opened).

inname is in; outname is in-out.  When the result is 1 the name written
back is either the original outname or outname + ".o", so the caller's
buffer needs 2 spare bytes.

memagrep is initialized in partition.c for calls from dir.c, and it is
already done by the time we call this function from main.c
----------------------------------------------------------------------*/
int apply_filter(inname, outname)
	char	*inname, *outname;	/* outname is in-out, inname is in */
{
	int	i;
	char	name[IO_FILTER_NAME_BUF];	/* '\n' + name + terminator, as memagrep wants it */
	int	name_len = strlen(inname);
	FILE	*dummyout;
	char	prevoutname[IO_FILTER_NAME_BUF];
	char	newoutname[IO_FILTER_NAME_BUF];
	char	tempoutname[IO_FILTER_NAME_BUF];
	char	tempinname[MAX_LINE_LEN];
	int	ret = 0;
	int	unlink_prevoutname = 0;	/* set once prevoutname is a temporary of ours */

	if (num_filter <= 0) return 0;
	/* names that cannot fit in the buffers below are left unfiltered */
	if ((name_len >= MAX_LINE_LEN) || (strlen(outname) >= MAX_LINE_LEN)) return 0;
	if ((dummyout = fopen("/dev/null", "w")) == NULL) return 0;

	/* ready for memgrep */
	name[0] = '\n';
	/* NOTE(review): assumes special_get_name() keeps its result within
	   MAX_LINE_LEN bytes - confirm against its definition */
	special_get_name(inname, name_len, tempinname);
	name_len = strlen(tempinname);
	strcpy(name+1, tempinname);
	strcpy(prevoutname, tempinname);
	strcpy(newoutname, outname);

	/* Current properly filtered output is always in prevoutname */
	for(i=0; i<num_filter; i++) {
		char	*suffix = NULL;	/* where a literal pattern matched, if it did */
		int	matched;

		if (filter_len[i] > 0) {
			/* literal pattern: plain substring search */
			name[name_len + 1] = '\0';
			suffix = strstr(name+1, (char *)filter[i]);	/* Chris Dalton */
			matched = (suffix != NULL);
		}
		else {	/* must call memagrep */
			name[name_len + 1] = '\n';	/* memagrep wants names to end with '\n': '\0' is not necessary */
			/* printf("i=%d filterlen=%d filter=%s inlen=%d input=%s\n", i, -filter_len[i], filter[i], len_current_dir_buf, current_dir_buf); */
			/* ".*" matches everything: no need to search */
			matched = ((filter_len[i] == -2) && (filter[i][0] == '.') && (filter[i][1] == '*')) ||
				  (memagrep_search(-filter_len[i], filter[i], name_len + 2, name, 0, dummyout) > 0);
		}
		if (!matched) continue;
		if (ret == 0) ret = 2;

		/* yes, it matched: now apply the command and get the output */
		/* printf("filtering %s\n", name); */
		if (!apply_filter_shell((char *)filter_command[i], prevoutname, newoutname, "")) continue;
		if (!apply_filter_output_ok(newoutname)) continue;

		/* Filter was successful: output exists and has atleast 1 byte in it */
		if (unlink_prevoutname) {
			/* the previous stage was a temporary: drop it and reuse its name */
			unlink(prevoutname);
			strcpy(tempoutname, prevoutname);
			strcpy(prevoutname, newoutname);
			strcpy(newoutname, tempoutname);
		}
		else {
			/* first success: the input file itself must be left alone */
			strcpy(prevoutname, newoutname);
			sprintf(newoutname, "%s.o", prevoutname);
		}
		ret = 1;
		unlink_prevoutname = 1;

		/* if the matched text was a proper suffix of the name, */
		/* remove the suffix just processed before examining the */
		/* name again. Chris Dalton */
		/* And I don't know what the equivalent thing is with */
		/* memagrep_search: since it doesn't return a pointer to */
		/* the place where the match occured. Burra Gopal */
		if ((suffix != NULL) && (strcmp((char *)filter[i], suffix) == 0)) {
			name_len -= strlen(suffix);
			*suffix= '\0';
		}
		if (strlen(newoutname) >= MAX_LINE_LEN - 1) break;
	}

	if (ret == 1) strcpy(outname, prevoutname);
	else {	/* dummy filter that copies input to output: caller can use tempinname but this has easy interface */
		apply_filter_shell(SYSTEM_CAT, tempinname, outname, "\n");
	}
	fclose(dummyout);
	return ret;
}

/* Use a modified wais stoplist to do this with simple strcmp's in a for loop */
/* Currently a stub: "word" is ignored and 0 is always returned */
int static_stop_list(word)
	char	*word;
{
	return 0;
}

/* This is the stuff that used to be present in the old build_in.c */

/* Some variables used throughout */
FILE *TIMEFILE;		/* file descriptor for sorting .glimpse_filenames by time */
#if	BG_DEBUG
FILE  *LOGFILE; 	/* file descriptor for LOG output */
#endif	/*BG_DEBUG*/
FILE  *STATFILE;	/* file descriptor for statistical data about indexed files */
FILE  *MESSAGEFILE;	/* file descriptor for important messages meant for the user */
char  INDEX_DIR[MAX_LINE_LEN];
char  sync_path[MAX_LINE_LEN];
struct stat istbuf;
struct stat excstbuf;
struct stat incstbuf;
int ICurrentFileOffset;
int NextICurrentFileOffset;

/* Some options used throughout */
int GenerateHash = OFF;
int KeepFilenames = OFF;
int OneFilePerBlock = OFF;
int total_size = 0;
int total_deleted = 0;
int MAXWORDSPERFILE = 0;
int NUMERICWORDPERCENT = DEF_NUMERIC_WORD_PERCENT;
int AddToIndex = OFF;
int DeleteFromIndex = OFF;
int PurgeIndex = ON;
int FastIndex = OFF;
int BuildDictionary = OFF;
int BuildDictionaryExisting = OFF;
int CompressAfterBuild = OFF;
int IncludeHigherPriority = OFF;
int FilenamesOnStdin = OFF;
int ExtractInfo = OFF;
int InfoAfterFilename = OFF;
int FirstWordOfInfoIsKey = OFF;
int UseFilters = OFF;
int ByteLevelIndex = OFF;
int RecordLevelIndex = OFF;	/* When we want a -o like index but want to do booleans on a per-record basis directly from index: robint@zedcor.com */
				/* This type of index doesn't make sense with attributes since they span > 1 record; hence StructuredIndex == -2 => this = ON */
int StoreByteOffset = OFF;	/* In RecordLevelIndex, store record # for each word or byte offset of the record: record # is the default (12/12/96) */
char rdelim[MAX_LINE_LEN];
char old_rdelim[MAX_LINE_LEN];
int rdelim_len = 0;
/* int IndexUnderscore = OFF; */
int IndexableFile = OFF;
int MAX_INDEX_PERCENT = DEF_MAX_INDEX_PERCENT;
int MAX_PER_MB = DEF_MAX_PER_MB;
int I_THRESHOLD = DEF_I_THRESHOLD;
int BigHashTable = OFF;
int IndexEverything = OFF;
int HashTableSize = MAX_64K_HASH;
int BuildTurbo = OFF;
int SortByTime = OFF;
int AddedMaxWordsMessage = OFF;
int AddedMixedWordsMessage = OFF;
int  icount=0; /* count the number of my_malloc for indices structure */
int  hash_icount=0; /* to see how much was added to the current hash table */
int  save_icount=0; /* to see how much was added to the index by the current file */
int  numeric_icount=0; /* to see how many numeric words were there in the current file */
int mask_int[32] = MASK_INT;
int p_table[MAX_PARTITION];
int memory_usage = 0;
char *my_malloc(len)    int len;{    char *s;    static int i=100;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -