⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 get_index.c

📁 harvest是一个下载html网页得机器人
💻 C
📖 第 1 页 / 共 4 页
字号:
		if ((j >= MAX_LINE_LEN) || (s[j] == '\0') || (s[j] == '\n')) {			curroffset = ftell(indexfp);			continue;		}		/* else it is WORD_END_MARK or ALL_INDEX_MARK */		c = s[j];		s[j] = '\0';		cmp = strcmp(word, s);#if	WORD_SORTED		if (cmp < 0) break;	/* since index is sorted by word */		else#endif	/* WORD_SORTED */		if (cmp != 0) {	/* not IDENTICALLY EQUAL */			s[j] = c;			curroffset = ftell(indexfp);			continue;		}		s[j] = c;		fputs(s, outfp);		num++;		curroffset = ftell(indexfp);	}	return num;}/* Returns the number of times a successful search was conducted: unused info at present. */fillup_target(result_index_set, result_offset_table, parse)	unsigned int	*result_index_set;	struct offsets **result_offset_table;	long	parse;{	int	i=0;	FILE	*tmpfp;	int	dummylen = 0;	char	dummypat[MAX_PAT];	int	successes = 0, ret;	int	first_time = 1;	extern int	veryfast;	int	prev_INVERSE = INVERSE;	veryfast = veryfastsearch(index_argc, index_argv, num_pat, pat_list, pat_lens, minifp);	while (i < num_pat)  {		if (!veryfast) {		if (is_mgrep_pat[i] && (num_mgrep_pat > 1)) {	/* do later */			i++;			continue;		}		strcpy(index_argv[patindex], pat_list[i]);	/* i-th pattern in its right position */		}		/* printf("pat_list[%d] = %s\n", i, pat_list[i]); */		if ((tmpfp = fopen(tempfile, "w")) == NULL) {			fprintf(stderr, "%s: cannot open for writing: %s, errno=%d\n", GProgname, tempfile, errno);			return(-1);		}		errno = 0;/* do we need to check is_mgrep_pat[i] here? */		if (veryfast && is_mgrep_pat[i]) {			ret = mini_agrep(pat_list[i], pat_lens[i], tmpfp);		}		/* If this is the glimpse server, since the process doesn't die, most of its data pages might still remain in memory */		else if ((ret = fileagrep(index_argc, index_argv, 0, tmpfp)) < 0) {			/* reinitialization here takes care of agrep_argv changes AFTER split_pattern */			fprintf(stderr, "%s: error in searching index\n", HARVEST_PREFIX);			fclose(tmpfp);			return(-1);		}		/* Now, the output of index search is in tempfile: need to use files here since index is too large */		fflush(tmpfp);		fclose(tmpfp);		tmpfp = NULL;		/* Keep track of the maximum number of errors: will never enter veryfast */		if (GBESTMATCH) {			if (errno > bestmatcherrors)				bestmatcherrors = errno;		}		/* At this point, all index-search options are properly set due to the above fileagrep */		INVERSE = prev_INVERSE;		if (-1 ==get_index(tempfile, result_index_set, result_offset_table, pat_list[i], pat_lens[i], pat_attr[i], index_argv, index_argc, nullfp, partfp, parse, first_time))			return(-1);		successes ++;		first_time = 0;		i++;	}	fflush(stderr);	if (veryfast) return successes;	/* For index-search with memgrep in mgrep_get_index, and get-filenames */	dummypat[0] = '\0';	if ((dummylen = memagrep_init(index_argc, index_argv, MAX_PAT, dummypat)) <= 0) return(-1);	if (num_mgrep_pat > 1) {		CHAR *old_buf = (CHAR *)index_argv[patbufpos];	/* avoid my_free and re-my_malloc */		index_argv[patbufpos] = (char*)pat_buf;	/* this contains all the patterns with the right -m and -M options */#if	BG_DEBUG		fprintf(debug, "pat_buf = %s\n", pat_buf);#endif	/*BG_DEBUG*/		strcpy(index_argv[patindex], "-z");	/* no-op: patterns are in patbufpos; also avoid shift-left of index_argv */		if (-1 == mgrep_get_index(tempfile, result_index_set, result_offset_table,				pat_list, pat_lens, pat_attr, mgrep_pat_index, num_mgrep_pat, patbufpos,				index_argv, index_argc, nullfp, partfp, parse, first_time)) {			index_argv[patbufpos] = (char *)old_buf;	/* else will my_free array! */			fprintf(stderr, "%s: error in searching index\n", HARVEST_PREFIX);			return(-1);		}		successes ++;		first_time = 0;		index_argv[patbufpos] = (char *)old_buf;	}	return successes;}/* * Now, I search the index by doing an in-order traversal of the boolean parse tree starting at GParse. * The results at each node are stored in src_offset_table and src_index_set. Before the right child is * evaluated, results of the left child are stored in curr_offset_table and curr_index_set (accumulators) * and are unioned/intersected/noted with the right child's results (which get stored in src_...) and * passed on above. The accumulators are allocated at each internal node and freed after evaluation. * Left to right evaluation is good since number of curr_offset_tables that exist simultaneously depends * entirely on the maximum depth of a right branch (REAL_PARTITION is small so it won't make a difference). */intsearch_index(tree)	ParseTree *tree;{	int	prev_INVERSE;	int	i, j, iii;	int	first_time = 0;	/* since it is used AFTER left child has been computed */	unsigned int	*curr_index_set = NULL;	struct offsets **curr_offset_table = NULL;	if (ComplexBoolean) {	/* recursive */		if (tree == NULL) return -1;		if (tree->type == LEAF) {			/* always AND pat of individual words at each term: initialize accordingly */			if (OneFilePerBlock) {				for(i=0; i<round(OneFilePerBlock, 8*sizeof(int)); i++) src_index_set[i] = 0xffffffff;			}			else for(i=0; i<MAX_PARTITION; i++) src_index_set[i] = 1;			src_index_set[REAL_PARTITION - 1] = 0;			src_index_set[REAL_PARTITION - 2] = 0;			if (split_terminal(tree->terminalindex, tree->terminalindex+1) <= 0) return -1;			prev_INVERSE = INVERSE;	/* agrep's global to implement NOT */			if (tree->op & NOTPAT) INVERSE = 1;			if (fillup_target(src_index_set, src_offset_table, AND_EXP) <= 0) return -1;			INVERSE = prev_INVERSE;			return 1;		}		else if (tree->type == INTERNAL) {			/* Search the left node and see if the right node can be searched */			if (search_index(tree->data.internal.left) <= 0) return -1;			if (OneFilePerBlock && ((tree->op & OPMASK) == ORPAT) && (src_index_set[REAL_PARTITION - 1] == 1)) goto quit;	/* nothing to do */			if ((tree->data.internal.right == NULL) || (tree->data.internal.right->type == 0)) return -1;	/* uninitialized: see main.c */			curr_index_set = (unsigned int *)my_malloc(sizeof(int)*REAL_PARTITION);			memset(curr_index_set, '\0', sizeof(int)*REAL_PARTITION);			/* Save previous src_index_set and src_offset_table in fresh accumulators */			if (OneFilePerBlock) {				memcpy(curr_index_set, src_index_set, sizeof(int)*REAL_PARTITION);				curr_index_set[REAL_PARTITION - 1] = src_index_set[REAL_PARTITION - 1];				src_index_set[REAL_PARTITION - 1] = 0;				curr_index_set[REAL_PARTITION - 2] = src_index_set[REAL_PARTITION - 2];				src_index_set[REAL_PARTITION - 2] = 0;			}			else memcpy(curr_index_set, src_index_set, MAX_PARTITION * sizeof(int));			if (ByteLevelIndex && !NOBYTELEVEL && (RecordLevelIndex || !(Only_first && !PRINTAPPXFILEMATCH))) {				if ((curr_offset_table = (struct offsets **)my_malloc(sizeof(struct offsets *) * OneFilePerBlock)) == NULL) {					fprintf(stderr, "%s: malloc failure at: %s:%d\n", GProgname, __FILE__, __LINE__);					my_free(curr_index_set, REAL_PARTITION*sizeof(int));					return -1;				}				memcpy(curr_offset_table, src_offset_table, OneFilePerBlock * sizeof(struct offsets *));				memset(src_offset_table, '\0', sizeof(struct offsets *) * OneFilePerBlock);			}			/* Now evaluate the right node which automatically put the results in src_index_set/src_offset_table */			if (search_index(tree->data.internal.right) <= 0) {				if (curr_offset_table != NULL) free(curr_offset_table);				my_free(curr_index_set, REAL_PARTITION*sizeof(int));				return -1;			}			/*			 * Alpha substitution of the code in get_index():			 * index_tab <- src_index_set			 * dest_index_table <- curr_index_set			 * offset_tab <- src_offset_table			 * dest_offset_table <- curr_offset_table			 * ret <- src_index_set[REAL_PARTITION - 1] for ORPAT, curr_index_set for ANDPAT			 * frequency = src_index_set[REAL_PARTITION - 2] in both ORPAT and ANDPAT			 * first_time <- 0			 * return 0 <- goto quit			 * Slight difference since we want the results to go to src rather than curr.			 */			if (OneFilePerBlock) {			    if ((tree->op & OPMASK) == ORPAT) {				if (src_index_set[REAL_PARTITION - 1] == 1) {	/* curr..[..] can never be 1 since we would have quit above itself */				ret_is_1:				    src_index_set[REAL_PARTITION - 1] = 1;				    for(i=0; i<round(OneFilePerBlock, 8*sizeof(int)) - 1; i++) {					src_index_set[i] = 0xffffffff;				    }				    src_index_set[i] = 0;				    for (j=0; j<8*sizeof(int); j++) {					if (i*8*sizeof(int) + j >= OneFilePerBlock) break;					src_index_set[i] |= mask_int[j];				    }				    if (ByteLevelIndex && !NOBYTELEVEL && !(Only_first && !PRINTAPPXFILEMATCH)) for (i=0; i<OneFilePerBlock; i++) {					free_list(&curr_offset_table[i]);					if (!RecordLevelIndex) free_list(&src_offset_table[i]);	/* collect as many offsets as possible with RecordLevelIndex: free src_offset_table at the end of process_query() */				    }				    if (ByteLevelIndex && !RecordLevelIndex) NOBYTELEVEL = 1;				    goto quit;				}				src_index_set[REAL_PARTITION - 1] = 0;				for (i=0; i<round(OneFilePerBlock, 8*sizeof(int)); i++) src_index_set[i] |= curr_index_set[i];				if (ByteLevelIndex && !NOBYTELEVEL && (RecordLevelIndex || !(Only_first && !PRINTAPPXFILEMATCH))) {				    for (i=0; i<OneFilePerBlock; i++) {					sorted_union(&src_offset_table[i], &curr_offset_table[i], &src_index_set[REAL_PARTITION - 2], curr_index_set[REAL_PARTITION - 2], 0);					if (!RecordLevelIndex && NOBYTELEVEL) {	/* collect as many offsets as possible with RecordLevelIndex: free offset_tables at the end of process_query() */					    for (iii=0; iii<OneFilePerBlock; iii++) {						free_list(&src_offset_table[iii]);						free_list(&curr_offset_table[iii]);					    }					    break;					}				    }				}			    }			    else {				if (((src_index_set[REAL_PARTITION - 1] == 1) || first_time) && (curr_index_set[REAL_PARTITION - 1] == 1)) {				both_are_1:				    if (first_time) {					src_index_set[REAL_PARTITION - 1] = 1;					for(i=0; i<round(OneFilePerBlock, 8*sizeof(int)) - 1; i++) {					    src_index_set[i] = 0xffffffff;					}					src_index_set[i] = 0;					for (j=0; j<8*sizeof(int); j++) {					    if (i*8*sizeof(int) + j >= OneFilePerBlock) break;					    src_index_set[i] |= mask_int[j];					}				    }				    first_time = 0;				    if (ByteLevelIndex && !NOBYTELEVEL && !(Only_first && !PRINTAPPXFILEMATCH)) for (i=0; i<OneFilePerBlock; i++) {	/* collect as many offsets as possible with RecordLevelIndex: free offset_tables at the end of process_query() */					free_list(&curr_offset_table[i]);					if (!RecordLevelIndex) free_list(&src_offset_table[i]);	/* collect as many offsets as possible with RecordLevelIndex: free src_offset_table at the end of process_query() */				    }				    if (ByteLevelIndex && !RecordLevelIndex) NOBYTELEVEL = 1;				    /*				    goto quit;				    */				}				else if ((src_index_set[REAL_PARTITION - 1] == 1) || first_time) {				    first_time = 0;				    src_index_set[REAL_PARTITION - 1] = 0;				    for (i=0; i<round(OneFilePerBlock, 8*sizeof(int)); i++) src_index_set[i] = curr_index_set[i];				    if (ByteLevelIndex && !NOBYTELEVEL && (RecordLevelIndex || !(Only_first && !PRINTAPPXFILEMATCH))) {					for (i=0; i<OneFilePerBlock; i++) {					    free_list(&src_offset_table[i]);					    src_offset_table[i] = curr_offset_table[i];					    curr_offset_table[i] = NULL;					}				    }				}				else if (curr_index_set[REAL_PARTITION - 1] == 1) {				    if (ByteLevelIndex && !NOBYTELEVEL && !(Only_first && !PRINTAPPXFILEMATCH))	/* collect as many offsets as possible with RecordLevelIndex: free offset_tables at the end of process_query() */				      for (i=0; i<OneFilePerBlock; i++) free_list(&curr_offset_table[i]);				}				else {				    for (i=0; i<round(OneFilePerBlock, 8*sizeof(int)); i++) src_index_set[i] &= curr_index_set[i];				    if (ByteLevelIndex && !NOBYTELEVEL && (RecordLevelIndex || !(Only_first && !PRINTAPPXFILEMATCH))) {					if (first_time || WHOLEFILESCOPE) {					    first_time = 0;					    for (i=0; i<OneFilePerBlock; i++) {						sorted_union(&src_offset_table[i], &curr_offset_table[i], &src_index_set[REAL_PARTITION - 2], curr_index_set[REAL_PARTITION - 2], 0);						if (!RecordLevelIndex && NOBYTELEVEL) {	/* collect as many offsets as possible with RecordLevelIndex: free offset_tables at the end of process_query() */						    for (iii=0; iii<OneFilePerBlock; iii++) {							free_list(&src_offset_table[iii]);							free_list(&curr_offset_table[iii]);						    }						    break;						}					    }					}					else {					    for (i=0; i<OneFilePerBlock; i++) {						if ((src_index_set[block2index(i)] & mask_int[i % (8*sizeof(int))]))						    sorted_intersection(i, &src_offset_table[i], &curr_offset_table[i], &src_index_set[REAL_PARTITION - 2]);						else free_list(&curr_offset_table[i]);						/*						if (src_index_set[REAL_PARTITION - 2] < MIN_OCCURRENCES) {						    if (!NOBYTELEVEL) {							    for (iii=0; iii<OneFilePerBlock; iii++) {								free_list(&src_offset_table[iii]);								free_list(&curr_offset_table[iii]);							    }						    }						    NOBYTELEVEL = 1;						    OPTIMIZEBYTELEVEL = 1;						    break;						}						*/					    }					}				    }				}			    }			}			else {			    if ((tree->op & OPMASK) == ORPAT)				for(i=0; i<MAX_PARTITION; i++) src_index_set[i] |= curr_index_set[i];			    else				for(i=0; i<MAX_PARTITION; i++) src_index_set[i] &= curr_index_set[i];			}		quit:			if (curr_offset_table != NULL) free(curr_offset_table);			/* Now if it is a NOT expression, complement the indices stuff knowing that it cannot be ByteLevelIndex */			if (tree->op & NOTPAT) {				if (ByteLevelIndex) {					/* Can't recover the discarded offsets */					fprintf(stderr, "%s: can't handle NOT of AND/OR terms with ByteLevelIndex: please simplify the query\n", HARVEST_PREFIX);					my_free(curr_index_set, REAL_PARTITION*sizeof(int));					return -1;				}				if (OneFilePerBlock)					for (i=0; i<round(OneFilePerBlock, 8*sizeof(int)); i++) src_index_set[i] = ~src_index_set[i];				else					for(i=0; i<MAX_PARTITION; i++) src_index_set[i] = (src_index_set[i] ? 0 : 1);			}		}		else return -1;	}	else {	/* iterative just as it is now: initialize */		if ((long)tree & OR_EXP) {			if (OneFilePerBlock) for(i=0; i<round(OneFilePerBlock, 8*sizeof(int)); i++) src_index_set[i] = 0;			else for(i=0; i<MAX_PARTITION; i++) src_index_set[i] = 0;		}		else {			if (OneFilePerBlock) for(i=0; i<round(OneFilePerBlock, 8*sizeof(int)); i++) src_index_set[i] = 0xffffffff;			else for(i=0; i<MAX_PARTITION; i++) src_index_set[i] = 1;		}		src_index_set[REAL_PARTITION - 1] = 0;		src_index_set[REAL_PARTITION - 2] = 0;		if (split_terminal(0, num_terminals) <= 0) return -1;		prev_INVERSE = INVERSE;	/* agrep's global to implement NOT */		INVERSE = 0;		if (fillup_target(src_index_set, src_offset_table, (long)tree) <= 0) return -1;		INVERSE = prev_INVERSE;	}	return 1;}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -