⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 get_index.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 4 页
字号:
		else if (OneFilePerBlock <= MaxNum16bPartition) {		    x = (buffer[bdx2] << 8) | buffer[bdx2+1];		    x = decode16b(x);		    bdx2 += 2;		}		else {		    x = (buffer[bdx2] << 16) | (buffer[bdx2+1] << 8) | buffer[bdx2+2];		    x = decode24b(x);		    bdx2 += 3;		}		if ((last_Y_filenumber > 0) && (x >= last_Y_filenumber)) continue;		set[block2index(x)] |= block2mask(x);		if (PRINTINDEXLINE) {			printf("%d [", x);		}		prevy = 0;		if (ByteLevelIndex) {		    heado = tailo = NULL;		    curfreq = 0;		    while ((bdx2<REAL_INDEX_BUF) && (buffer[bdx2] != '\n') && (buffer[bdx2] != '\0')) {			y = decode8b(buffer[bdx2]);			if ((y & 0x000000c0) == 0) {	/* one byte offset */			    diff = y&0x0000003f;			    y = prevy + diff;			    bdx2 ++;			}			else if ((y & 0x000000c0) == 0x40) {	/* two byte offset */			    diff = decode8b(buffer[bdx2+1]);			    y = prevy + (((y & 0x0000003f) * MaxNum8bPartition) + diff);			    bdx2 += 2;			}			else if ((y & 0x000000c0) == 0x80) {	/* three byte offset */			    diff = decode16b((buffer[bdx2+1] << 8) | buffer[bdx2+2]);			    y = prevy + (((y & 0x0000003f) * MaxNum16bPartition) + diff);			    bdx2 += 3;			}			else {	/* four byte offset */			    diff = decode24b((buffer[bdx2+1] << 16) | (buffer[bdx2+2] << 8) | buffer[bdx2+3]);			    y = prevy + (((y & 0x0000003f) * MaxNum24bPartition) + diff);			    bdx2 += 4;			}			prevy = y;			if (PRINTINDEXLINE) {			    printf(" %d", y);			}			curfreq ++;			if(RecordLevelIndex ||			   (!(Only_first && !PRINTAPPXFILEMATCH) && !NOBYTELEVEL &&	/* below borrowed from sorted_union */#if	USEFREQUENCIES			    !(((prevfreq>MIN_OCCURRENCES)&&(curfreq+*frequency > MAX_UNION*prevfreq)) || (curfreq+*frequency > MAX_ABSOLUTE))#else			    1#endif			    			    )			  ) {			    /* These o's will be in sorted order. Just collect all of them and merge with &offset_table[x]. 
*/			    o = (struct offsets *)my_malloc(sizeof(struct offsets));			    o->offset = y;			    o->next = NULL;			    o->sign = o->done = 0;			    if (heado == NULL) {				heado = o;				tailo = o;			    }			    else {				tailo->next = o;				tailo = o;			    }			}			else if (!RecordLevelIndex) {			    if (heado != NULL) free_list(&heado);			    /* printf("1 "); */			    NOBYTELEVEL = 1;	/* can't return since have to or the bitmasks */			}			if ((bdx2<REAL_INDEX_BUF) && (buffer[bdx2] == delim)) {	/* look at offsets corr. to a new file now */				bdx2 ++;				break;			}		    }		    if (heado == NULL) *frequency += curfreq;		    else if (RecordLevelIndex || (!(Only_first && !PRINTAPPXFILEMATCH) && !NOBYTELEVEL)) {			sorted_union(&offset_table[x], &heado, frequency, prevfreq, curfreq);	/* this will free heado's elements and ++ *frequency */			if (!RecordLevelIndex && NOBYTELEVEL) *frequency += curfreq;	/* can't return since have to or the bitmasks */			if (heado != NULL) free_list(&heado);		    }		}		if (PRINTINDEXLINE) {		    printf("] ");		}	    }	}	else {	    while((bdx2<MAX_INDEX_BUF) && (buffer[bdx2] != '\n') && (buffer[bdx2] != '\0') && (buffer[bdx2] < MAX_PARTITION)) {		if ((last_Y_filenumber > 0) && (p_table[buffer[bdx2]] >= last_Y_filenumber)) {			bdx2 ++;			continue;		}		if (PRINTINDEXLINE) {		    for (j=p_table[buffer[bdx2]]; j<p_table[buffer[bdx2] + 1]; j++)			if ((last_Y_filenumber > 0) && (j >= last_Y_filenumber)) break;			else printf("%d [] ", j);		}		set[buffer[bdx2]] = 1;		bdx2++;	    }	}	if (PRINTINDEXLINE) {	    printf("\n");	}	return 0;}/* * This is a very simple function: it gets the list of matched lines from the index, * and sets the block numbers corr. to files that need to be searched in "index_tab". * It also sets the file-offsets that have to be searched in "offset_tab" (byte-level). 
*/
/*
 * get_index(): scan the index-search output file line by line and merge the
 * resulting set of blocks/partitions into index_tab.
 *
 * Parameters:
 *   infile      - path of the file to read; NULL or "" yields -1.
 *   index_tab   - accumulated result set, updated in place. With
 *                 OneFilePerBlock it is treated as a bit vector of
 *                 round(OneFilePerBlock, 8*sizeof(int)) words, otherwise as
 *                 MAX_PARTITION per-partition flags. Slot REAL_PARTITION - 1
 *                 is read and written as a 0/1 marker (per the comment in the
 *                 body, the "universal set" marker); slot REAL_PARTITION - 2
 *                 is handed to get_set/sorted_union/sorted_intersection
 *                 (presumably a running frequency count -- TODO confirm).
 *   offset_tab  - per-file offset lists that accompany index_tab; merged,
 *                 replaced or freed here depending on the flags below.
 *   pattern, patlen, patattr, outfile, partfp
 *               - forwarded unchanged to get_set().
 *   index_argv, index_argc
 *               - not referenced in this function body.
 *   parse       - if (parse & OR_EXP) the new set is OR-ed into index_tab,
 *                 otherwise it is AND-ed.
 *   first_time  - nonzero makes the AND path treat index_tab as not yet
 *                 initialised; only the local copy is modified.
 *
 * Returns -1 when infile is NULL/empty or cannot be opened, 0 otherwise.
 *
 * Globals touched: dest_index_set, dest_index_buf, the offset lists reached
 * through multi_dest_offset_table[0], and NOBYTELEVEL (may be set to 1).
 */
get_index(infile, index_tab, offset_tab, pattern, patlen, patattr, index_argv, index_argc, outfile, partfp, parse, first_time)
char *infile;
unsigned int  *index_tab;
struct offsets **offset_tab;
char *pattern;
int patlen;
int patattr;
char *index_argv[];
int index_argc;
FILE *outfile;
FILE *partfp;
int parse;
int first_time;
{
	int  i=0, j, iii;
	FILE *f_in;
	struct offsets **offsetptr = multi_dest_offset_table[0];	/* cannot be NULL if ByteLevelIndex: main.c takes care of that */
	int ret=0;

	/* OR-ing anything into a set already marked universal cannot change it: nothing to do. */
	if (OneFilePerBlock && (parse & OR_EXP) && (index_tab[REAL_PARTITION - 1] == 1)) return 0;
	if (((infile == NULL) || !strcmp(infile, "")) /* || (index_tab == NULL) || (offset_tab == NULL) || (pattern == NULL)*/) return -1;
	if((f_in = fopen(infile, "r")) == NULL) {
		fprintf(stderr, "%s: can't open for reading: %s/%s\n", GProgname, INDEX_DIR, infile);
		return -1;
	}

	/* Start from an empty scratch set for this pattern. */
	if (OneFilePerBlock)
	    for(i=0; i<round(OneFilePerBlock, 8*sizeof(int)); i++) {
                dest_index_set[i] = 0;
	    }
	else for(i=0; i<MAX_PARTITION; i++) {
                dest_index_set[i] = 0;
	    }
	dest_index_buf[0] = '\n';	/* memagrep needs buffer to begin with '\n' */
	dest_index_set[REAL_PARTITION - 2] = 0;
	/* Each line is read after the leading '\n' and handed to get_set(); a nonzero result stops the scan. */
	while(fgets(dest_index_buf+1, REAL_INDEX_BUF-1, f_in)) {
#if	BG_DEBUG
		fprintf(debug, "index-line: %s", dest_index_buf+1);
#endif	/*BG_DEBUG*/
		if ((ret = get_set(&dest_index_buf[0], dest_index_set, offsetptr, patlen, pattern, patattr, outfile, partfp, &dest_index_set[REAL_PARTITION - 2], index_tab[REAL_PARTITION - 2])) != 0)
			break;	/* all index mark touched */
	}

	/* If byte-level tracking is off at this point (NOBYTELEVEL set), drop every offset list collected so far. */
	if (!RecordLevelIndex && NOBYTELEVEL) {
	    for (iii=0; iii<OneFilePerBlock; iii++) {
		free_list(&offset_tab[iii]);
		free_list(&offsetptr[iii]);
	    }
	}

	/* INVERSE: complement the scratch set before merging it. */
	if (INVERSE) {
	    if (OneFilePerBlock) {
		if (ByteLevelIndex && !RecordLevelIndex) NOBYTELEVEL = 1;	/* can't collect all offsets where pattern DOES NOT occur! */
		/* Flip every full word, then flip only the bits below OneFilePerBlock in the last word (i is left at that word). */
		for (i=0; i<round(OneFilePerBlock, 8*sizeof(int)) - 1; i++) dest_index_set[i] = ~dest_index_set[i];
		for (j=0; j<8*sizeof(int); j++) {
		    if (i*8*sizeof(int) + j >= OneFilePerBlock) break;
		    if (dest_index_set[i] & mask_int[j]) dest_index_set[i] &= ~mask_int[j];
		    else dest_index_set[i] |= mask_int[j];
		}
	    }
	    else {
		for(i=0; i<MAX_PARTITION; i++) {
		    if (i>=GNumpartitions-1) break;	/* STUPID: get_table returns 1 + part_num, where part_num was no. of partitions glimpseindex found */
		    /* Entries 0 and '\n' are never toggled. */
		    if ((i == 0) || (i == '\n')) continue;
		    if (dest_index_set[i]) dest_index_set[i] = 0;
		    else dest_index_set[i] = 1;
		}
	    }
	}

	/* Take intersection if parse=ANDPAT or 0 (one terminal pattern), union if OR_EXP; Take care of universal sets in index_tab[REAL_PARTITION - 1] */
	if (OneFilePerBlock) {
	    if (parse & OR_EXP) {
		if (ret) {
		/* NOTE(review): no goto inside this function targets this label. */
		ret_is_1:
		    /* get_set reported nonzero: mark index_tab universal and turn on every bit below OneFilePerBlock. */
		    index_tab[REAL_PARTITION - 1] = 1;
		    for(i=0; i<round(OneFilePerBlock, 8*sizeof(int)) - 1; i++) {
			index_tab[i] = 0xffffffff;
		    }
		    index_tab[i] = 0;
		    for (j=0; j<8*sizeof(int); j++) {
			if (i*8*sizeof(int) + j >= OneFilePerBlock) break;
			index_tab[i] |= mask_int[j];
		    }
		    if (ByteLevelIndex && !RecordLevelIndex && !NOBYTELEVEL && !(Only_first && !PRINTAPPXFILEMATCH)) for (i=0; i<OneFilePerBlock; i++) {	/* collect as many offsets as possible with RecordLevelIndex: free offset_tables at the end of process_query() */
			free_list(&offsetptr[i]);
			free_list(&offset_tab[i]);
		    }
		    if (ByteLevelIndex && !RecordLevelIndex) NOBYTELEVEL = 1;
		    fclose(f_in);
		    return 0;
		}
		/* Ordinary union: OR the bit vectors and merge the per-file offset lists. */
		index_tab[REAL_PARTITION - 1] = 0;
		for (i=0; i<round(OneFilePerBlock, 8*sizeof(int)); i++) index_tab[i] |= dest_index_set[i];
		if (ByteLevelIndex && !NOBYTELEVEL && (RecordLevelIndex || !(Only_first && !PRINTAPPXFILEMATCH))) {
		    for (i=0; i<OneFilePerBlock; i++) {
			sorted_union(&offset_tab[i], &offsetptr[i], &index_tab[REAL_PARTITION - 2], dest_index_set[REAL_PARTITION - 2], 0);
			/* NOBYTELEVEL is re-tested after each merge (presumably sorted_union can set it -- TODO confirm); if set, free everything and stop. */
			if (!RecordLevelIndex && NOBYTELEVEL) {	/* collect as many offsets as possible with RecordLevelIndex: free offset_tables at the end of process_query() */
			    for (iii=0; iii<OneFilePerBlock; iii++) {
				free_list(&offset_tab[iii]);
				free_list(&offsetptr[iii]);
			    }
			    break;
			}
		    }
		}
	    }
	    else {
		/* AND path. Case 1: index_tab is universal (or untouched) and the new set is universal too. */
		if (((index_tab[REAL_PARTITION - 1] == 1) || first_time) && (ret)) {
		/* NOTE(review): no goto inside this function targets this label. */
		both_are_1:
		    if (first_time) {
			index_tab[REAL_PARTITION - 1] = 1;
			for(i=0; i<round(OneFilePerBlock, 8*sizeof(int)) - 1; i++) {
			    index_tab[i] = 0xffffffff;
			}
			index_tab[i] = 0;
			for (j=0; j<8*sizeof(int); j++) {
			    if (i*8*sizeof(int) + j >= OneFilePerBlock) break;
			    index_tab[i] |= mask_int[j];
			}
		    }
		    first_time = 0;
		    if (ByteLevelIndex && !RecordLevelIndex && !NOBYTELEVEL && !(Only_first && !PRINTAPPXFILEMATCH)) for (i=0; i<OneFilePerBlock; i++) {	/* collect as many offsets as possible with RecordLevelIndex: free offset_tables at the end of process_query() */
			free_list(&offsetptr[i]);
			free_list(&offset_tab[i]);
		    }
		    if (ByteLevelIndex && !RecordLevelIndex) NOBYTELEVEL = 1;
		    /*
		    fclose(f_in);
		    return 0;
		    */
		}
		/* Case 2: index_tab is universal (or untouched) but the new set is not: the new set becomes the result. */
		else if ((index_tab[REAL_PARTITION - 1] == 1) || first_time) {
		    first_time = 0;
		    index_tab[REAL_PARTITION - 1] = 0;
		    for (i=0; i<round(OneFilePerBlock, 8*sizeof(int)); i++) index_tab[i] = dest_index_set[i];
		    if (ByteLevelIndex && !NOBYTELEVEL && (RecordLevelIndex || !(Only_first && !PRINTAPPXFILEMATCH))) {
			/* Hand the freshly collected lists over to offset_tab; offsetptr no longer refers to them. */
			for (i=0; i<OneFilePerBlock; i++) {
			    free_list(&offset_tab[i]);
			    offset_tab[i] = offsetptr[i];
			    offsetptr[i] = NULL;
			}
		    }
		}
		/* Case 3: only the new set is universal: index_tab stays as it is, just discard the new offsets. */
		else if (ret) {
		    if (ByteLevelIndex && !RecordLevelIndex && !NOBYTELEVEL && !(Only_first && !PRINTAPPXFILEMATCH)) for (i=0; i<OneFilePerBlock; i++) free_list(&offsetptr[i]);	/* collect as many offsets as possible with RecordLevelIndex: free offset_tables at the end of process_query() */
		}
		/* Case 4: neither is universal: AND the bit vectors, then combine the offset lists. */
		else {
		    for (i=0; i<round(OneFilePerBlock, 8*sizeof(int)); i++) index_tab[i] &= dest_index_set[i];
		    if (ByteLevelIndex && !NOBYTELEVEL && (RecordLevelIndex || !(Only_first && !PRINTAPPXFILEMATCH))) {
			/* With WHOLEFILESCOPE (or first_time) the offsets are unioned even though the block sets were intersected. */
			if (first_time || WHOLEFILESCOPE) {
			    first_time = 0;
			    for (i=0; i<OneFilePerBlock; i++) {
				sorted_union(&offset_tab[i], &offsetptr[i], &index_tab[REAL_PARTITION - 2], dest_index_set[REAL_PARTITION - 2], 0);
				if (!RecordLevelIndex && NOBYTELEVEL) {	/* collect as many offsets as possible with RecordLevelIndex: free offset_tables at the end of process_query() */
				    for (iii=0; iii<OneFilePerBlock; iii++) {
					free_list(&offset_tab[iii]);
					free_list(&offsetptr[iii]);
				    }
				    break;
				}
			    }
			}
			else {
			    /* Intersect offsets only for files whose bit survived the AND; drop the new offsets for the rest. */
			    for (i=0; i<OneFilePerBlock; i++) {
				if ((index_tab[block2index(i)] & mask_int[i % (8*sizeof(int))]))
				    sorted_intersection(i, &offset_tab[i], &offsetptr[i], &index_tab[REAL_PARTITION - 2]);
				else free_list(&offsetptr[i]);
				/*
				if (index_tab[REAL_PARTITION - 2] < MIN_OCCURRENCES) {
				    if (!NOBYTELEVEL) {
					    for (iii=0; iii<OneFilePerBlock; iii++) {
						free_list(&offset_tab[iii]);
						free_list(&offsetptr[iii]);
					    }
				    }
				    NOBYTELEVEL = 1;
				    OPTIMIZEBYTELEVEL = 1;
				    break;
				}
				*/
			    }
			}
		    }
		}
	    }
	}
	else {
	    /* Partition mode: plain element-wise OR / AND over all MAX_PARTITION entries. */
	    if (parse & OR_EXP)
		for(i=0; i<MAX_PARTITION; i++) index_tab[i] |= dest_index_set[i];
	    else
		for(i=0; i<MAX_PARTITION; i++) index_tab[i] &= dest_index_set[i];
	}

#if	BG_DEBUG
	fprintf(debug, "get_index(): the following partitions are ON\n");
	for(i=0; i<((OneFilePerBlock > 0) ? round(OneFilePerBlock, 8*sizeof(int)) : MAX_PARTITION); i++) {
	      if(index_tab[i]) fprintf(debug, "%d,%x\n", i, index_tab[i]);
	}
#endif	/*BG_DEBUG*/
	fclose(f_in);
	return 0;
}

/*
 * Same as above, but uses mgrep to search the index for many patterns at one go,
 * and interprets the output obtained from the -M and -P options (set in main.c).
*/mgrep_get_index(infile, index_tab, offset_tab, pat_list, pat_lens, pat_attr, mgrep_pat_index, num_mgrep_pat, patbufpos, index_argv, index_argc, outfile, partfp, parse, first_time)char *infile;unsigned int  *index_tab;struct offsets **offset_tab;char *pat_list[];int pat_lens[];int pat_attr[];int mgrep_pat_index[];int num_mgrep_pat;int patbufpos;char *index_argv[];int index_argc;FILE *outfile;FILE *partfp;int parse;int first_time;{	int  i=0, j, temp, iii, jjj;	FILE *f_in;	int ret;	int x=0, y=0, even_words=1;	int patnum;	unsigned int *setptr;	struct offsets **offsetptr;	CHAR dummypat[MAX_PAT];	int  dummylen=0;	char  allindexmark[MAXNUM_PAT];	int k;	int sorted[MAXNUM_PAT], min, max; 	if (OneFilePerBlock && (parse & OR_EXP) && (index_tab[REAL_PARTITION - 1] == 1)) return 0;	/* Do the mgrep() */	if ((f_in = fopen(infile, "w")) == NULL) {

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -