📄 get_index.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 4 页
字号:
12 3 4 下一页
/* Copyright (c) 1994 Burra Gopal, Udi Manber.  All Rights Reserved. */
#include "glimpse.h"
#include "defs.h"
#include <errno.h>	/* declares errno portably (it may be a macro); replaces "extern int errno;" */
#if	BG_DEBUG
extern	FILE	*debug;
#endif	/*BG_DEBUG*/
extern	char	INDEX_DIR[MAX_LINE_LEN];
extern	int	Only_first;
extern	int	PRINTAPPXFILEMATCH;
extern	int	OneFilePerBlock;
extern	int	StructuredIndex;
extern	int	WHOLEFILESCOPE;
extern	unsigned int *dest_index_set;
extern	unsigned char *dest_index_buf;
extern	int	mask_int[32];
extern	int	ByteLevelIndex;
extern  int	RecordLevelIndex;
extern  int	rdelim_len;
extern  char	rdelim[MAX_LINE_LEN];
extern  char	old_rdelim[MAX_LINE_LEN];
extern	int	NOBYTELEVEL;
extern	int	OPTIMIZEBYTELEVEL;
extern	int	RegionLimit;
extern	int	PRINTINDEXLINE;
extern	struct offsets **src_offset_table;
extern	unsigned int	*multi_dest_index_set[MAXNUM_PAT];
extern	struct offsets **multi_dest_offset_table[MAXNUM_PAT];
extern	char	*index_argv[MAX_ARGS];
extern	int	index_argc;
extern	CHAR	GProgname[MAXNAME];
extern	FILE	*indexfp, *minifp;
extern int REAL_PARTITION, REAL_INDEX_BUF, MAX_ALL_INDEX, FILEMASK_SIZE;
extern int p_table[MAX_PARTITION];
extern int GNumpartitions;
extern	int	INVERSE;	/* agrep's global: need here to implement ~ in index-search */
extern int	last_Y_filenumber;

#define USEFREQUENCIES	0	/* set to one if we want to stop collecting offsets sometimes since words "look" like they are in the stop list... */

/*
 * Releases every node of the offset list whose head pointer is *p1.
 * Each node is returned through my_free(); on return *p1 is NULL.
 */
free_list(p1)
	struct offsets 	**p1;
{
	struct offsets	*tp1;

	while (*p1 != NULL) {
		tp1 = *p1;
		*p1 = (*p1)->next;
		my_free(tp1, sizeof(struct offsets));
	}
}

/*
 * Unions offset lists list2 with list1 sorted in increasing order (deletes elements from list2)
 * => changes both list1 and list2: f += #elems added
 *
 * list1, list2: head pointers of the two lists; nodes are unlinked from list2 one at a time
 *               and either spliced into list1 or, when list1 already holds that offset, freed.
 * f:            in/out element count for list1. It is written only on the normal exits, never
 *               on the paths that set NOBYTELEVEL.
 * pf:           only consulted by the USEFREQUENCIES heuristics.
 * cf:           when the tail of list2 is appended wholesale and cf > 0, *f becomes the
 *               ORIGINAL *f plus cf; otherwise *f becomes the count accumulated so far.
 *
 * When NOBYTELEVEL is already set for a non record-level index, both lists are simply freed.
 */
sorted_union(list1, list2, f, pf, cf)
	struct offsets **list1, **list2;
	int	*f, pf, cf;
{
	register struct offsets **p1 = list1, *p2;
	register int	count = *f;	/* don't update *f if setting NOBYTELEVEL */

	if (!RecordLevelIndex && NOBYTELEVEL) {	/* cannot come here! */
		free_list(list1);
		free_list(list2);
		return;
	}

#if	USEFREQUENCIES
	if (!RecordLevelIndex && ( ((pf > MIN_OCCURRENCES)  && (count > MAX_UNION * pf)) || (count > MAX_ABSOLUTE) ||
	     ((count > MIN_OCCURRENCES) && (pf > MAX_UNION * count)) || (pf > MAX_ABSOLUTE) )) {
		/* enough if we check the second condition at the beginning since it won't surely be satisfied after this when count ++ */
		NOBYTELEVEL = 1;
		return;
	}
#endif

	while (*list2 != NULL) {
		/* extract 1st element, update list2 */
		p2 = *list2;
		*list2 = (*list2)->next;
		p2->next = NULL;

		/* find position to insert p2, and do so */
		p1 = list1;
		while (((*p1) != NULL) && ((*p1)->offset < p2->offset)) p1 = &(*p1)->next;

		if (*p1 == NULL) {	/* end of list1: append list2 to it and return */
			*p1 = p2;
			p2->next = *list2;
			*list2 = NULL;
			if (cf > 0)  count = *f + cf;
#if	USEFREQUENCIES
			if (!RecordLevelIndex && ( ((pf > MIN_OCCURRENCES) && (count > MAX_UNION * pf)) || (count > MAX_ABSOLUTE))) {
				NOBYTELEVEL = 1;
				return;
			}
#endif
			*f = count;
			return;
		}
		else if (p2->offset == (*p1)->offset) my_free(p2, sizeof(struct offsets));	/* duplicate offset: drop it */
		else {
			p2->next = *p1;
			*p1 = p2;
			count ++;
#if	USEFREQUENCIES
			if (!RecordLevelIndex && ( ((pf > MIN_OCCURRENCES)  && (count > MAX_UNION * pf)) || (count > MAX_ABSOLUTE) )) {
				NOBYTELEVEL = 1;
				return;
			}
#endif
			/* update list1: the next search resumes just after the node that was inserted */
			list1 = &(*p1)->next;
		}
	}
	*f = count;
}

/*
 * Intersects offset lists list2 with list1 sorted in increasing order (deletes elements from list2)
 * => changes both list1 and list2
 *
 * A list1 node survives when its offset equals the offset of a list2 node or, for a non
 * record-level index, lies within RegionLimit of one. Every list1 node that is removed
 * decrements *f. list2 is always completely freed. filenum is not used by this routine.
 *
 * When NOBYTELEVEL is already set for a non record-level index, both lists are simply freed.
 */
sorted_intersection(filenum, list1, list2, f)
	struct offsets **list1, **list2;
	int	*f;
{
	register struct offsets **p1 = list1, *p2, *tp1;
	register int diff;
	struct offsets *tp;	/* only used by the disabled debugging dumps below */

	if (!RecordLevelIndex && NOBYTELEVEL) {	/* cannot come here! */
		free_list(list1);
		free_list(list2);
		return;
	}

	/*
	 * The done flags of list1 are not cleared here: NOT NECESSARY SINCE done
	 * INITIALIZED TO 0 ON CREATION AND MADE 0 BELOW.
	 */

#if	0
printf("sorted_intersection BEGIN: list1=\n\t");
tp = *list1;
while (tp != NULL) {
	printf("%d ", tp->offset);
	tp = tp->next;
}
printf("\n");
printf("list2=\n\t");
tp = *list2;
while (tp != NULL) {
	printf("%d ", tp->offset);
	tp = tp->next;
}
printf("\n");
#endif

	/* find position to intersect list2, and do so: REMEMBER: list1 is in increasing order, and so is list2 !!! */
	p1 = list1;
	while ( ((*p1) != NULL) && (*list2 != NULL) ) {
		diff = (*list2)->offset - (*p1)->offset;
		if ( (diff == 0) || (!RecordLevelIndex && (diff >= -RegionLimit) && (diff <= RegionLimit)) ) {
			(*p1)->done = 1; /* p1 is in */
			p1 = &(*p1)->next;
			/* Can't increment p2 here since it might keep others after p1 also in */
		}
		else if (diff < 0) {
			/* head of list2 lies before p1: discard it */
			p2 = *list2;
			*list2 = (*list2)->next;
			my_free(p2, sizeof(struct offsets));
			/* p1 can intersect with list2's next */
		}
		else {
			/*
			 * p1 lies before the head of list2 and did not match: remove it.
			 * The former "(*p1)->done && 0" guard here ALWAYS YIELDED 0 FROM 25/08/1996
			 * (bgopal@cs.arizona.edu), so that unreachable branch has been dropped.
			 */
			tp1 = *p1;
			*p1 = (*p1)->next;
			my_free(tp1, sizeof(struct offsets));
			(*f) --;
			/* list2 can intersect with p1's next */
		}
	}

	/* whatever is left of list2 is no longer needed */
	while (*list2 != NULL) {
		p2 = *list2;
		*list2 = (*list2)->next;
		my_free(p2, sizeof(struct offsets));
	}

	/* second pass: drop every list1 node that was never marked, unmark the survivors */
	p1 = list1;
	while (*p1 != NULL) {
		if ((*p1)->done == 0) {
			tp1 = *p1;
			*p1 = (*p1)->next;
			my_free(tp1, sizeof(struct offsets));
			(*f) --;
		}
		else {
			(*p1)->done = 0;	/* for the next round! */
			p1 = &(*p1)->next;
		}
	}

#if	0
printf("sorted_intersection END: list1=\n\t");
tp = *list1;
while (tp != NULL) {
	printf("%d ", tp->offset);
	tp = tp->next;
}
printf("\n");
printf("list2=\n\t");
tp = *list2;
while (tp != NULL) {
	printf("%d ", tp->offset);
	tp = tp->next;
}
printf("\n");
#endif
}

/*
 * Removes (and frees) every node of the list headed at *p1 whose sign field is 0;
 * the remaining nodes keep their relative order.
 */
purge_offsets(p1)
	struct offsets **p1;
{
	struct offsets *tp1;

	while (*p1 != NULL) {
		if ((*p1)->sign == 0) {
			tp1 = *p1;
			(*p1) = (*p1)->next;
			my_free(tp1, sizeof(struct offsets));
		}
		else p1 = &(*p1)->next;
	}
}

/* Returns 1 if it is a Universal set, 0 otherwise. Constraint: WORD_END_MARK/ALL_INDEX_MARK must occur at or after buffer[0] */
get_set(buffer, set, offset_table, patlen, pattern, patattr, outfile, partfp, frequency, prevfreq)
	unsigned char	*buffer;
	unsigned int	*set;
	struct offsets **offset_table;
	int	patlen;
	char	*pattern;
	int	patattr;
	FILE	*outfile;
	FILE	*partfp;
	int	*frequency, prevfreq;
{
	int	bdx2, j;
	int	ret;
	int	x=0, y=0, diff, even_words=1, prevy;
	int	indexattr = 0;
	struct offsets *o, *tailo, *heado;
	int	delim = encode8b(0);
	int	curfreq = 0;
	unsigned char c;

	/* buffer[0] is '\n', search must start from buffer[1] */
	bdx2 = 1;
	if (OneFilePerBlock)
		while((bdx2<REAL_INDEX_BUF+1) && (buffer[bdx2] != WORD_END_MARK) && (buffer[bdx2] != ALL_INDEX_MARK)) bdx2++;
	else while((bdx2<REAL_INDEX_BUF+1) && (buffer[bdx2] != WORD_END_MARK)) bdx2++;
	if (bdx2 >= REAL_INDEX_BUF+1) return 0;

	if (StructuredIndex) {
		if (StructuredIndex < MaxNum8bPartition - 1) {
			indexattr = decode8b(buffer[bdx2+1]);
		}
		else {
			indexattr = decode16b((buffer[bdx2+1] << 8) | (buffer[bdx2 + 2]));
		}
		/* printf("i=%d p=%d\n", indexattr, patattr); */
		if ((patattr > 0) && (indexattr != patattr)) {
#if	BG_DEBUG
			fprintf(debug, "indexattr=%d DOES NOT MATCH patattr=%d\n", indexattr, patattr);
#endif	/*BG_DEBUG*/
			return 0;
		}
	}

	if (PRINTINDEXLINE) {
		/* temporarily terminate the word at the mark so it can be printed as a string */
		c = buffer[bdx2];
		buffer[bdx2] = '\0';
		printf("%s %d", &buffer[1], indexattr);
		buffer[bdx2] = c;
		if (c == ALL_INDEX_MARK) printf(" ! ");
		else printf(" : ");
	}

	if (OneFilePerBlock && (buffer[bdx2] == ALL_INDEX_MARK)) {
		/* A intersection Univ-set = A: so src_index_set won't change; A union Univ-set = Univ-set: so src_index_set = all 1s */
#if	BG_DEBUG
		buffer[bdx2] = '\0';
		fprintf(debug, "All indices search for %s\n", buffer + 1);
		buffer[bdx2] = ALL_INDEX_MARK;
#endif	/*BG_DEBUG*/
		set[REAL_PARTITION - 1] = 1;
		for(bdx2=0; bdx2<round(OneFilePerBlock, 8*sizeof(int)) - 1; bdx2++) {
			set[bdx2] = 0xffffffff;
		}
		/* last word: set only the bits that correspond to indices below OneFilePerBlock */
		set[bdx2] = 0;
		for (j=0; j<8*sizeof(int); j++) {
			if (bdx2*8*sizeof(int) + j >= OneFilePerBlock) break;
			set[bdx2] |= mask_int[j];
		}
		set[REAL_PARTITION - 1] = 1;
		if (ByteLevelIndex && !RecordLevelIndex) NOBYTELEVEL = 1;	/* With RecordLevelIndex, I want NOBYTELEVEL to be unused (i.e., !NOBYTELEVEL is always true) */
		return 1;
	}
	else if (!OneFilePerBlock) {	/* check only if index+partitions are NOT split */
#if	BG_DEBUG
		buffer[bdx2] = '\0';
		fprintf(debug, "memagrep-line: %s\t\tpattern: %s\n", buffer, pattern);
#endif	/*BG_DEBUG*/
		/* ignore if pattern with all its options matches block number sequence: bg+udi: Feb/16/93 */
		buffer[bdx2] = '\n';	/* memagrep needs buffer to end with '\n' */
		if ((ret = memagrep_search(patlen, pattern, bdx2+1, buffer, 0, outfile)) <= 0) return 0;
		else buffer[bdx2] = WORD_END_MARK;
	}

	if ((StructuredIndex > 0) && (StructuredIndex < MaxNum8bPartition - 1)) bdx2 ++;
	else if (StructuredIndex > 0) bdx2 += 2;
	bdx2++;	/* bdx2 now points to the first byte of the offset */

	even_words = 1;
	/* Code identical to that in merge_in() in glimpseindex */
	if (OneFilePerBlock) {
	    get_block_numbers(&buffer[bdx2], &buffer[bdx2], partfp);
	    while((bdx2<REAL_INDEX_BUF) && (buffer[bdx2] != '\n') && (buffer[bdx2] != '\0')) {
		/* First get the file name */
		x = 0;
		if (ByteLevelIndex) {
		    if (OneFilePerBlock <= MaxNum8bPartition) {
			x = decode8b(buffer[bdx2]);
			bdx2 ++;
		    }
		    else if (OneFilePerBlock <= MaxNum16bPartition) {
			x = (buffer[bdx2] << 8) | buffer[bdx2+1];
			x = decode16b(x);
			bdx2 += 2;
		    }
		    else {
			x = (buffer[bdx2] << 16) | (buffer[bdx2+1] << 8) | buffer[bdx2+2];
			x = decode24b(x);
			bdx2 += 3;
		    }
		}
		else if (OneFilePerBlock <= MaxNum8bPartition) {
		    x = decode8b(buffer[bdx2]);
		    bdx2 ++;
		}
		else if (OneFilePerBlock <= MaxNum12bPartition) {
		    if (even_words) {
			x = ((buffer[bdx2+1] & 0x0000000f) << 8) | buffer[bdx2];
			x = decode12b(x);
			bdx2 += 2;
			even_words = 0;
		    }
		    else {	/* odd number of words so far */
			x = ((buffer[bdx2-1] & 0x000000f0) << 4) | buffer[bdx2];
			x = decode12b(x);
			bdx2 ++;
			even_words = 1;
		    }
		}
12 3 4 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -