⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 get_index.c

📁 harvest 是一个下载 HTML 网页的机器人
💻 C
📖 第 1 页 / 共 4 页
字号:
		fprintf(stderr, "%s: run out of file descriptors!\n", GProgname);		return -1;	}	errno = 0;	if ((ret = fileagrep(index_argc, index_argv, 0, f_in)) < 0) {		fprintf(stderr, "%s: error in searching index\n", HARVEST_PREFIX);		fclose(f_in);		return -1;	}	fflush(f_in);	fclose(f_in);	f_in = NULL;	index_argv[patbufpos] = NULL;	/* For index-search with memgrep and get-filenames */	dummypat[0] = '\0';	if ((dummylen = memagrep_init(index_argc, index_argv, MAX_PAT, dummypat)) <= 0) {		fclose(f_in);		return -1;	}	/* Interpret the result */	if((f_in = fopen(infile, "r")) == NULL) {		fprintf(stderr, "%s: can't open for reading: %s/%s\n", GProgname, INDEX_DIR, infile);		return -1;	}	if (OneFilePerBlock) {	    for (patnum=0; patnum<num_mgrep_pat; patnum ++) {		for(i=0; i<round(OneFilePerBlock, 8*sizeof(int)); i++) {			multi_dest_index_set[patnum][i] = 0;		}		if (ByteLevelIndex) for(i=0; i<OneFilePerBlock; i++) {			free_list(&multi_dest_offset_table[patnum][i]);			/* multi_dest_offset_table[patnum][i] = NULL; bg, 28/9/1995 */		}		multi_dest_index_set[patnum][REAL_PARTITION - 1] = 0;		multi_dest_index_set[patnum][REAL_PARTITION - 2] = 0;	    }	}	else {	    for (patnum=0; patnum<num_mgrep_pat; patnum ++)		for(i=0; i<MAX_PARTITION; i++) {                	multi_dest_index_set[patnum][i] = 0;		}	}	dest_index_buf[0] = '\n';	/* memagrep needs buffer to begin with '\n' */	memset(allindexmark, '\0', num_mgrep_pat);	min = (index_tab[REAL_PARTITION - 1] == 1) ? 0 : index_tab[REAL_PARTITION - 2];	while(fgets(dest_index_buf+1, REAL_INDEX_BUF, f_in)) {		patnum=0;		sscanf(&dest_index_buf[1], "%d-", &patnum);#if	BG_DEBUG		fprintf(debug, "patnum=%d len=%d pat=%s attr=%d index-line: %s\n", patnum, pat_lens[mgrep_pat_index[patnum-1]], pat_list[mgrep_pat_index[patnum-1]], pat_attr[mgrep_pat_index[patnum-1]], dest_index_buf+1);#endif	/*BG_DEBUG*/		if ((patnum < 1) || (patnum > num_mgrep_pat)) continue;	/* error! 
*/		setptr = multi_dest_index_set[patnum - 1];		offsetptr = multi_dest_offset_table[patnum - 1];		for(k=0; dest_index_buf[k] != ' '; k++);		dest_index_buf[k] = '\n';		if (!allindexmark[patnum - 1])			allindexmark[patnum - 1] = (char)get_set(&dest_index_buf[k], setptr, offsetptr, pat_lens[mgrep_pat_index[patnum-1]],								pat_list[mgrep_pat_index[patnum-1]], pat_attr[mgrep_pat_index[patnum-1]], outfile, partfp,								&setptr[REAL_PARTITION - 2], min);		/* To test the maximum disparity to stop unions within above */		if (!allindexmark[patnum-1]) min = setptr[REAL_PARTITION - 2];		for (patnum=0; patnum<num_mgrep_pat; patnum ++) {			if ((multi_dest_index_set[patnum][REAL_PARTITION - 2] < min) && (multi_dest_index_set[patnum][REAL_PARTITION - 1] != 1))				min = multi_dest_index_set[patnum][REAL_PARTITION - 2];		}		min += (index_tab[REAL_PARTITION - 1] == 1) ? 0 : index_tab[REAL_PARTITION - 2];	}#if	0	for (patnum=0; patnum<num_mgrep_pat; patnum++)		printf("%d=%d,%d\n", patnum, multi_dest_index_set[patnum][REAL_PARTITION - 1], multi_dest_index_set[patnum][REAL_PARTITION - 2]);#endif	/*0*/	for (patnum=0; patnum<num_mgrep_pat; patnum++)		sorted[patnum] = patnum;	if (ByteLevelIndex && !NOBYTELEVEL && (RecordLevelIndex || !(Only_first && !PRINTAPPXFILEMATCH))) {		max = 0;		for (patnum=1; patnum<num_mgrep_pat; patnum++) {		    if (multi_dest_index_set[patnum][REAL_PARTITION - 2] > multi_dest_index_set[max][REAL_PARTITION - 2])			max = patnum;		}		/* Sort them according to the lengths of the lists in increasing order: min first */		for (patnum=0; patnum<num_mgrep_pat; patnum++) {		    min = patnum;		    for (j=patnum+1; j<num_mgrep_pat; j++)			if (multi_dest_index_set[sorted[j]][REAL_PARTITION - 2] < multi_dest_index_set[sorted[min]][REAL_PARTITION - 2])			    min = j;		    if (min != patnum) {			temp = sorted[patnum];			sorted[patnum] = sorted[min];			sorted[min] = temp;		    }		}#if	USEFREQUENCIES		if (!RecordLevelIndex && (multi_dest_index_set[sorted[max]][REAL_PARTITION - 
2] > MAX_DISPARITY * multi_dest_index_set[sorted[0]][REAL_PARTITION - 2])) {		    NOBYTELEVEL = 1;		    /* printf("4 "); */		    for (iii=0; iii<OneFilePerBlock; iii++) {			for (jjj=0; jjj<num_mgrep_pat; jjj++)			    free_list(&multi_dest_offset_table[jjj][iii]);			free_list(&offset_tab[iii]);		    }		}#endif	}	else if (!RecordLevelIndex && NOBYTELEVEL) {	    for (iii=0; iii<OneFilePerBlock; iii++) {		for (jjj=0; jjj<num_mgrep_pat; jjj++)		    free_list(&multi_dest_offset_table[jjj][iii]);		free_list(&offset_tab[iii]);	    }	}	/* Take intersection if parse=ANDPAT or 0 (one terminal pattern), union if OR_EXP; Take care of universal sets in offset_tab[REAL_PARTITION - 1] */	for (patnum=0; patnum<num_mgrep_pat; patnum++) {		if (OneFilePerBlock) {		    if (parse & OR_EXP) {			if (allindexmark[sorted[patnum]]) {			ret_is_1:			    index_tab[REAL_PARTITION - 1] = 1;			    for(i=0; i<round(OneFilePerBlock, 8*sizeof(int)) - 1; i++) {				index_tab[i] = 0xffffffff;			    }			    index_tab[i] = 0;			    for (j=0; j<8*sizeof(int); j++) {				if (i*8*sizeof(int) + j >= OneFilePerBlock) break;				index_tab[i] |= mask_int[j];			    }			    if (ByteLevelIndex && !RecordLevelIndex && !NOBYTELEVEL && !(Only_first && !PRINTAPPXFILEMATCH))	/* collect as many offsets as possible with RecordLevelIndex: free offset_tables at the end of process_query() */			      for (i=0; i<OneFilePerBlock; i++) {			        for (patnum=0;patnum<num_mgrep_pat;patnum++)				  free_list(&multi_dest_offset_table[sorted[patnum]][i]);				free_list(&offset_tab[i]);			      }			    if (ByteLevelIndex && !RecordLevelIndex) NOBYTELEVEL = 1;			    fclose(f_in);			    return 0;			}			index_tab[REAL_PARTITION - 1] = 0;			for (i=0; i<round(OneFilePerBlock, 8*sizeof(int)); i++) index_tab[i] |= multi_dest_index_set[sorted[patnum]][i];			if (ByteLevelIndex && !NOBYTELEVEL && (RecordLevelIndex || !(Only_first && !PRINTAPPXFILEMATCH))) {			    for (i=0; i<OneFilePerBlock; i++) {				sorted_union(&offset_tab[i], 
&multi_dest_offset_table[sorted[patnum]][i], &index_tab[REAL_PARTITION - 2], multi_dest_index_set[sorted[patnum]][REAL_PARTITION - 2], 0);				if (!RecordLevelIndex && NOBYTELEVEL) {	/* collect as many offsets as possible with RecordLevelIndex: free offset_tables at the end of process_query() */				    for (iii=0; iii<OneFilePerBlock; iii++) {					for (jjj=0; jjj<num_mgrep_pat; jjj++)					    free_list(&multi_dest_offset_table[jjj][iii]);					free_list(&offset_tab[iii]);				    }				    break;				}			    }			}		    }		    else {			if (((index_tab[REAL_PARTITION - 1] == 1) || first_time) && (allindexmark[sorted[patnum]])) {			both_are_1:			    if (first_time) {				index_tab[REAL_PARTITION - 1] = 1;				for(i=0; i<round(OneFilePerBlock, 8*sizeof(int)) - 1; i++) {				    index_tab[i] = 0xffffffff;				}				index_tab[i] = 0;				for (j=0; j<8*sizeof(int); j++) {				    if (i*8*sizeof(int) + j >= OneFilePerBlock) break;				    index_tab[i] |= mask_int[j];				}			    }			    first_time = 0;			    if (ByteLevelIndex && !RecordLevelIndex && !NOBYTELEVEL && !(Only_first && !PRINTAPPXFILEMATCH))	/* collect as many offsets as possible with RecordLevelIndex: free offset_tables at the end of process_query() */			      for (i=0; i<OneFilePerBlock; i++) {			        for (patnum=0;patnum<num_mgrep_pat;patnum++)			 	  free_list(&multi_dest_offset_table[sorted[patnum]][i]);			 	free_list(&offset_tab[i]);			      }			    if (ByteLevelIndex && !RecordLevelIndex) NOBYTELEVEL = 1;			    /*			    fclose(f_in);			    return 0;			    */			}			else if ((index_tab[REAL_PARTITION - 1] == 1) || first_time) {			    first_time = 0;			    index_tab[REAL_PARTITION - 1] = 0;			    for (i=0; i<round(OneFilePerBlock, 8*sizeof(int)); i++) index_tab[i] = multi_dest_index_set[sorted[patnum]][i];			    if (ByteLevelIndex && !NOBYTELEVEL && (RecordLevelIndex || !(Only_first && !PRINTAPPXFILEMATCH))) {				for (i=0; i<OneFilePerBlock; i++) {				    free_list(&offset_tab[i]);				    offset_tab[i] = 
multi_dest_offset_table[sorted[patnum]][i];				    multi_dest_offset_table[sorted[patnum]][i] = NULL;				}			    }			}			else if (allindexmark[sorted[patnum]]) {			    if (ByteLevelIndex && !RecordLevelIndex && !NOBYTELEVEL && !(Only_first && !PRINTAPPXFILEMATCH))	/* collect as many offsets as possible with RecordLevelIndex: free offset_tables at the end of process_query() */			      for (i=0; i<OneFilePerBlock; i++) free_list(&multi_dest_offset_table[sorted[patnum]][i]);			}			else {			    for (i=0; i<round(OneFilePerBlock, 8*sizeof(int)); i++) index_tab[i] &= multi_dest_index_set[sorted[patnum]][i];			    if (ByteLevelIndex && !NOBYTELEVEL && (RecordLevelIndex || !(Only_first && !PRINTAPPXFILEMATCH))) {				if (first_time || WHOLEFILESCOPE) {				    first_time = 0;				    for (i=0; i<OneFilePerBlock; i++) {					sorted_union(&offset_tab[i], &multi_dest_offset_table[sorted[patnum]][i], &index_tab[REAL_PARTITION - 2], multi_dest_index_set[sorted[patnum]][REAL_PARTITION - 2], 0);					if (!RecordLevelIndex && NOBYTELEVEL) {					    for (iii=0; iii<OneFilePerBlock; iii++) {						for (jjj=0; jjj<num_mgrep_pat; jjj++)						    free_list(&multi_dest_offset_table[jjj][iii]);						free_list(&offset_tab[iii]);					    }					    break;					}				    }				}				else {				    for (i=0; i<OneFilePerBlock; i++) {					if ((index_tab[block2index(i)] & mask_int[i % (8*sizeof(int))]))					    sorted_intersection(i, &offset_tab[i], &multi_dest_offset_table[sorted[patnum]][i], &index_tab[REAL_PARTITION - 2]);					else free_list(&multi_dest_offset_table[sorted[patnum]][i]);					/*					if (index_tab[REAL_PARTITION - 2] < MIN_OCCURRENCES) {					    if (!NOBYTELEVEL) {						    for (iii=0; iii<OneFilePerBlock; iii++) {							for (jjj=0; jjj<num_mgrep_pat; jjj++)							    free_list(&multi_dest_offset_table[jjj][iii]);							free_list(&offset_tab[iii]);						    }					    }					    NOBYTELEVEL = 1;					    OPTIMIZEBYTELEVEL = 1;					    break;					}					*/				    }				}			    }			}		    }		}		
else {		    if (parse & OR_EXP) {			for (patnum=0; patnum<num_mgrep_pat; patnum++)				for(i=0; i<MAX_PARTITION; i++) index_tab[i] |= multi_dest_index_set[patnum][i];		    }		    else {			for (patnum=0; patnum<num_mgrep_pat; patnum++)				for(i=0; i<MAX_PARTITION; i++) index_tab[i] &= multi_dest_index_set[patnum][i];		    }		}	}#if	BG_DEBUG	fprintf(debug, "get_index(): the following partitions are ON\n");	for(i=0; i<((OneFilePerBlock > 0) ? round(OneFilePerBlock, 8*sizeof(int)) : MAX_PARTITION); i++) {	      if(index_tab[i]) fprintf(debug, "%d,%x\n", i, index_tab[i]);	}#endif	/*BG_DEBUG*/	fclose(f_in);	return 0;}/* All borrowed from main.c and are needed for searching the index */extern	CHAR	*pat_list[MAXNUM_PAT];  /* complete words within global pattern */extern	int	pat_lens[MAXNUM_PAT];   /* their lengths */extern	int	pat_attr[MAXNUM_PAT];   /* set of attributes */extern	int	num_pat;extern	CHAR	pat_buf[(MAXNUM_PAT + 2)*MAXPAT];extern	int	pat_ptr;extern	int	is_mgrep_pat[MAXNUM_PAT];extern	int	mgrep_pat_index[MAXNUM_PAT];extern	int	num_mgrep_pat;extern	unsigned int	*src_index_set;extern	struct offsets **src_offset_table;extern	char	tempfile[];extern	int	patindex;extern	int	patbufpos;extern	ParseTree terminals[MAXNUM_PAT];extern	int	GBESTMATCH;		/* Should I change -B to -# where # = no. of errors? */extern	int	bestmatcherrors;	/* set during index search, used later on */extern	FILE	*partfp;		/* glimpse partitions */extern	FILE	*nullfp;		/* to discard output: agrep -s doesn't work properly */extern	int	ComplexBoolean;extern	int	num_terminals;#if	0extern  struct token *hash_table[MAX_64K_HASH];#else	/*0*/extern	int	mini_array_len;#endif	/*0*/extern  int	WORDBOUND, NOUPPER, D, LINENUM;intveryfastsearch(argc, argv, num_pat, pat_list, pat_lens, minifp)	int	argc;	char	*argv[];	int	num_pat;	CHAR	*pat_list[MAXNUM_PAT];	int	pat_lens[MAXNUM_PAT];	FILE	*minifp;{	/*	 * Figure out from options if very fast search is possible.	 
*/	if (minifp == NULL) return 0;	if (!OneFilePerBlock) return 0;	/* you did not build index for speed anyway */	if (!(WORDBOUND && NOUPPER && (D<=0))) return 0;	if (LINENUM) return 0;	return 1;	/* if ((num_mgrep_pat == num_pat) || ((1 == num_pat) && (1 == checksg(pat_list[0], D, 0)))) return 1; */	/* either all >= 2 patterns are mgrep-able (simple) or there is just one simple pattern: i.e., "cast" can be used! */	/* return 0; */}intmini_agrep(inword, inlen, outfp)	CHAR	*inword;	int	inlen;	FILE	*outfp;{	static struct stat st;	static	int statted = 0;	unsigned char	s[MAX_LINE_LEN], word[MAX_NAME_LEN];	long	beginoffset, endoffset, curroffset;	unsigned char c;	int	j, num = 0, cmp, len;	if (!statted) {		sprintf((char*)s, "%s/%s", INDEX_DIR, INDEX_FILE);		if (stat(s, &st) == -1) {			fprintf(stderr, "Can't stat file: %s\n", s);			exit(2);		}		statted = 1;	}	j = 0;	while (*inword) {		if (*inword == '\\') {			inword++;			continue;		}		if (isupper(*(unsigned char *)inword)) word[j] = tolower(*(unsigned char *)inword);		else word[j] = *inword;		j++;		inword ++;	}	word[j] = '\0';	len = j;	if (!get_mini(word, len, &beginoffset, &endoffset, 0, mini_array_len, minifp)) return 0;	if (endoffset == -1) endoffset = st.st_size;	if (endoffset <= beginoffset) return 0;	/* We must find all occurrences of the word (in all attributes) so can't quit when we find the first match */	fseek(indexfp, beginoffset, 0);	curroffset = ftell(indexfp);	/* = beginoffset */	while ((curroffset < endoffset) && (fgets(s, MAX_LINE_LEN, indexfp) != NULL)) {		j = 0;		while ((j < MAX_LINE_LEN) && (s[j] != WORD_END_MARK) && (s[j] != ALL_INDEX_MARK) && (s[j] != '\0') && (s[j] != '\n')) j++;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -