⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 build_in.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 5 页
字号:
    printf("%d\n", system(s));#endif#if	0    printf("merged\n");    sprintf(s, "exec %s -10 '%s/%s'\n", SYSTEM_HEAD, escapesinglequote(INDEX_DIR, es1), INDEX_FILE);    system(s);#endif	/*0*/}/* --------------------------------------------------------------------build_hash():input: a set of filenames in name_list[], a partition table p_table[]output: a hash table hash_table[].-----------------------------------------------------------------------*/build_hash(){    int	fd;                          /* opened file number */    int  i, pn;                  /* pn: current partition */    int  num_read;    char word[256];    struct stat stbuf;    int offset;    int toread;    unsigned char *buffer;	/* running pointer for getword = place where reads begin */    unsigned char *bx;		/* running pointer for read-loop, initially buffer */    unsigned char *buffer_end;	/* place where getword should stop */    unsigned char *buffer_begin;/* constant pointer to beginning */    unsigned char *next_record;	/* pointer that tells where the current record ends: if buffer (returned by getword) is >= this, increment ICurrentFileOffset */    unsigned char *last_record;	/* pointer that tells where the last record ends: may or may not be > buffer_end, but surely <= bx the last byte read */    int residue;	/* extra variable to store buffer_begin + BLOCK_SIZE - buffer_end */    int tried_once = 0;    int attribute;    int ret;    char outname[MAX_LINE_LEN];    char *unlinkname = NULL;    int pid = getpid();    if (StructuredIndex) region_initialize();    init_hash_table();#ifdef debug    printf("entering build_hash(), part_num=%d\n", part_num);#endif    tried_once = 0;try_again_1:    buffer_begin = buffer = (unsigned char *) my_malloc(sizeof(char)* BLOCK_SIZE + 10);	/* always read in units of BLOCK_SIZE or less */    if(buffer == NULL) {	fprintf(stderr, "not enough memory in build_hash\n");	if (tried_once) return;	traverse1();	init_hash_table();	tried_once = 1;	goto try_again_1;    }    bx = 
buffer;    if (OneFilePerBlock) {	for(i=0; i<file_num; i++) {	    unlinkname = NULL;	    if ((disable_list != NULL) && (i<old_file_num) && (disable_list[block2index(i)] & mask_int[i%(8*sizeof(int))])) continue;	    if (LIST_GET(name_list, i) == NULL) continue;	    if ((ret = tuncompress_file(LIST_GET(name_list, i), outname, TC_EASYSEARCH | TC_OVERWRITE | TC_NOPROMPT)) > 0) {	/* do not remove old .TZ file */		if (StructuredIndex && (-1 == region_create(outname))) {		    fprintf(stderr, "permission denied or non-existent file: %s\n", LIST_GET(name_list, i));		    remove_filename(i, -1);		    continue;		}		if (((fd = my_open(outname, O_RDONLY, 0)) == -1) ) {		    fprintf(stderr, "permission denied or non-existent file: %s\n", LIST_GET(name_list, i));		    remove_filename(i, -1);		    if (StructuredIndex) region_destroy();	/* cannot happen! */		    unlink(outname);		    continue;		}		unlinkname = outname;		goto index_file1;	    }	    /* Try to apply the filter */	    sprintf(outname, "%s/.glimpse_apply.%d", INDEX_DIR, pid);	    if ((ret = apply_filter(LIST_GET(name_list, i), outname)) == 1) {		/* Some pattern matched AND some filter was successful */		if (StructuredIndex && (-1 == region_create(outname))) {		    fprintf(stderr, "permission denied or non-existent file: %s\n", LIST_GET(name_list, i));		    remove_filename(i, -1);		    continue;		}		if (((fd = my_open(outname, O_RDONLY)) == -1) ) {	/* error: shouldn't have returned 1! */		    fprintf(stderr, "permission denied or non-existent file: %s\n", LIST_GET(name_list, i));		    remove_filename(i, -1);		    if (StructuredIndex) region_destroy();	/* cannot happen! 
*/		    unlink(outname);		    continue;		}		unlinkname = outname;		goto index_file1;	    }	    else if (ret == 2) {		/* Some pattern matched but no filter was successful */		if (filetype(LIST_GET(name_list, i), 0, NULL, NULL)) {	/* try to index input file if it satisfies filetype */		    remove_filename(i, -1);		    unlink(outname);		    continue;		}		unlinkname = outname;	    }	    if (StructuredIndex && (-1 == region_create(LIST_GET(name_list, i)))) {		fprintf(stderr, "permission denied or non-existent file: %s\n", LIST_GET(name_list, i));		remove_filename(i, -1);		continue;	    }            if (((fd = my_open(LIST_GET(name_list, i), O_RDONLY, 0)) == -1) ) {		fprintf(stderr, "permission denied or non-existent file: %s\n", LIST_GET(name_list, i));		remove_filename(i, -1);		if (StructuredIndex) region_destroy();	/* cannot happen! */		if (unlinkname != NULL) unlink(unlinkname);		continue;            }	index_file1:#ifdef SW_DEBUG	    if (AddToIndex || FastIndex) printf("adding words of %s in %d\n", LIST_GET(name_list,i), i);	    printf("%s\n", LIST_GET(name_list, i));#endif	    /* my_stat(LIST_GET(name_list, i), &stbuf); Chris Dalton */	    fstat(fd, &stbuf);#ifdef	SW_DEBUG	    printf("filesize: %d\n", stbuf.st_size);#endif#ifdef	UDI_DEBUG	    printf("%s  ", LIST_GET(name_list, i));	    printf("size: %d  ", stbuf.st_size);#endif	    /* buffer always points to a BLOCK_SIZE block of allocated memory */	    buffer = buffer_begin;	    residue = 0;	    if (RecordLevelIndex) {		if (!StoreByteOffset) NextICurrentFileOffset = ICurrentFileOffset = 1;		else NextICurrentFileOffset = ICurrentFileOffset = 0;	    }	    for (offset = 0; offset < stbuf.st_size; offset += BLOCK_SIZE) {		offset -= residue;		if (!RecordLevelIndex) NextICurrentFileOffset = ICurrentFileOffset = offset;		toread = offset + BLOCK_SIZE >= stbuf.st_size ? 
stbuf.st_size - offset : BLOCK_SIZE;		lseek(fd, offset, SEEK_SET);		bx= buffer;		num_read = 0;		while ((toread > 0) && ((num_read = read(fd, bx, toread)) < toread)) {		    if (num_read <= 0) {			buffer = bx;			fprintf(stderr, "read error on file %s at offset %d\n", LIST_GET(name_list, i), offset);			goto break_break1;	/* C doesn't have break; break; */		    }		    bx += num_read;		    toread -= num_read;		}		if (num_read >= toread) {			bx += num_read;			toread -= num_read;		}		buffer_end = bx;		residue = 0;		if (buffer_end == buffer_begin + BLOCK_SIZE) {			if (RecordLevelIndex) {				buffer_end = backward_delimiter(buffer_end /* NOT bx */, buffer, rdelim, rdelim_len, 0);			}			else {				while ((INDEXABLE(*(buffer_end-1))) && (buffer_end > buffer_begin + MAX_WORD_SIZE)) buffer_end --;			}			residue = buffer_begin + BLOCK_SIZE - buffer_end;			/* if (residue > 0) printf("residue = %d in %s at %d\n", residue, LIST_GET(name_list, i), offset); */		}		if (RecordLevelIndex) {			next_record = forward_delimiter(buffer, buffer_end, rdelim, rdelim_len, 0);		}		bx = buffer; 		PrintedLongWordWarning = 0;		while ((buffer=(unsigned char *) getword(LIST_GET(name_list, i), word, buffer, buffer_end, &attribute, &next_record)) < buffer_end) {		    if (RecordLevelIndex) {			if (buffer >= next_record) {			    next_record = forward_delimiter(buffer, buffer_end, rdelim, rdelim_len, 0);			    if (StoreByteOffset) ICurrentFileOffset += next_record - buffer;			    else ICurrentFileOffset ++;			}		    }		    /* printf("%s\n", word); */		    if(word[0] == '\0') continue;		    if(icount - hash_icount >= I_THRESHOLD) {#if	BG_DEBUG			fprintf(LOGFILE, "reached I_THRESHOLD at %d\n", icount - hash_icount);#endif	/*BG_DEBUG*/			traverse1();			init_hash_table();			hash_icount = icount;		    }		    insert_h(word, i, attribute);		}		if (word[0] != '\0') {		    /* printf("%s\n", word); */		    if(icount - hash_icount >= I_THRESHOLD) {#if	BG_DEBUG			fprintf(LOGFILE, "reached I_THRESHOLD at %d\n", icount - 
hash_icount);#endif	/*BG_DEBUG*/			traverse1();			init_hash_table();			hash_icount = icount;		    }		    insert_h(word, i, attribute);		}		if (RecordLevelIndex) {			if (buffer >= next_record) {			    /* next_record = forward_delimiter(buffer, buffer_end, rdelim, rdelim_len, 0); */			    ICurrentFileOffset ++;			}		}		buffer = buffer_begin;		next_record = buffer;	    }	break_break1:            close(fd);	    if (unlinkname != NULL) unlink(unlinkname);#ifdef	UDI_DEBUG	    printf("add to index: %d\n",icount-save_icount);#endif	    if ((MAXWORDSPERFILE > 0) && (icount-save_icount > MAXWORDSPERFILE)) {		fprintf(MESSAGEFILE, "%d words are contributed by %s\n",			icount-save_icount, LIST_GET(name_list, i));		AddedMaxWordsMessage = ON;	    }	    if (IndexNumber && NUMERICWORDPERCENT && (numeric_icount * 100 > (icount - save_icount) * NUMERICWORDPERCENT) && (icount - save_icount > MIN_WORDS)) {		fprintf(MESSAGEFILE, "NUMBERS occur in %d%% of %d words contributed by %s\n", (numeric_icount * 100)/(icount - save_icount), icount - save_icount, LIST_GET(name_list, i));		AddedMixedWordsMessage = ON;	    }	    numeric_icount=0;	    save_icount=icount;	    if (StructuredIndex) region_destroy();        }	traverse1();	init_hash_table();	hash_icount = icount;	my_free(buffer_begin, BLOCK_SIZE + 10);	return;    }    for(pn=1; pn < part_num; pn++)	/* partition # 0 is not accessed */    {	if (pn == '\n') continue;	/* There cannot be a partition # '\n' or 0: see partition.c */	for(i=p_table[pn]; i<p_table[pn+1]; i++) {	    unlinkname = NULL;	    if ((disable_list != NULL) && (i<old_file_num) && (disable_list[block2index(i)] & mask_int[i%(8*sizeof(int))])) continue;	    if (LIST_GET(name_list, i) == NULL) continue;	    if (BuildDictionaryExisting) {		if (((fd = my_open(LIST_GET(name_list, i), O_RDONLY, 0)) == -1) ) {		    fprintf(stderr, "permission denied or non-existent file: %s\n", LIST_GET(name_list, i));		    remove_filename(i, -1);		    continue;		}		if (!CompressAfterBuild) 
unlinkname = LIST_GET(name_list, i);	/* not needed anymore */		goto index_file2;	    }	    if ((ret = tuncompress_file(LIST_GET(name_list, i), outname, TC_EASYSEARCH | TC_OVERWRITE | TC_NOPROMPT)) > 0) {	/* do not remove old .TZ file */		if (StructuredIndex && (-1 == region_create(outname))) {		    fprintf(stderr, "permission denied or non-existent file: %s\n", LIST_GET(name_list, i));		    remove_filename(i, -1);		    continue;		}		if (((fd = my_open(outname, O_RDONLY, 0)) == -1) ) {		    fprintf(stderr, "permission denied or non-existent file: %s\n", LIST_GET(name_list, i));		    remove_filename(i, -1);		    if (StructuredIndex) region_destroy();	/* cannot happen! */		    unlink(outname);		    continue;		}		if (BuildDictionary && CompressAfterBuild) strcpy(LIST_GET(name_list, i), outname); /* name of clear file will be smaller, so enough space */		else unlinkname = outname;		goto index_file2;	    }	    /* Try to apply the filter */	    sprintf(outname, "%s/.glimpse_apply.%d", INDEX_DIR, pid);	    if ((ret = apply_filter(LIST_GET(name_list, i), outname)) == 1) {		/* Some pattern matched AND some filter was successful */		if (StructuredIndex && (-1 == region_create(outname))) {		    fprintf(stderr, "permission denied or non-existent file: %s\n", LIST_GET(name_list, i));		    remove_filename(i, -1);		    continue;		}		if (((fd = my_open(outname, O_RDONLY)) == -1) ) {	/* error: shouldn't have returned 1! */		    fprintf(stderr, "permission denied or non-existent file: %s\n", LIST_GET(name_list, i));		    remove_filename(i, -1);		    if (StructuredIndex) region_destroy();	/* cannot happen! 
*/		    unlink(outname);		    continue;		}		unlinkname = outname;		goto index_file2;	    }	    else if (ret == 2) {		/* Some pattern matched but no filter was successful */		if (filetype(LIST_GET(name_list, i), 0, NULL, NULL)) {	/* try to index input file if it satisfies filetype */		    remove_filename(i, -1);		    unlink(outname);		    continue;		}		unlinkname = outname;	    }	    if (StructuredIndex && (-1 == region_create(LIST_GET(name_list, i)))) {		fprintf(stderr, "permission denied or non-existent file: %s\n", LIST_GET(name_list, i));		remove_filename(i, -1);		continue;	    }            if (((fd = my_open(LIST_GET(name_list, i), O_RDONLY)) == -1) ) {		fprintf(stderr, "permission denied or non-existent file: %s\n", LIST_GET(name_list, i));		remove_filename(i, -1);		if (StructuredIndex) region_destroy();	/* cannot happen! */		if (unlinkname != NULL) unlink(unlinkname);		continue;            }	index_file2:#ifdef SW_DEBUG	    if (AddToIndex || FastIndex) printf("adding words of %s in %d\n", LIST_GET(name_list, i), pn);	    printf("%s\n", LIST_GET(name_list, i));#endif	    /* my_stat(LIST_GET(name_list, i), &stbuf); Chris Dalton */	    fstat(fd, &stbuf);#ifdef	SW_DEBUG	    printf("filesize: %d\n", stbuf.st_size);#endif#ifdef	UDI_DEBUG	    printf("%s  ", LIST_GET(name_list, i));	    printf("size: %d  ", stbuf.st_size);#endif	    /* buffer always points to a BLOCK_SIZE block of allocated memory */	    buffer = buffer_begin;	    residue = 0;	    if (RecordLevelIndex) {		if (!StoreByteOffset) NextICurrentFileOffset = ICurrentFileOffset = 1;		else NextICurrentFileOffset = ICurrentFileOffset = 0;	    }	    for (offset = 0; offset < stbuf.st_size; offset += BLOCK_SIZE) {		offset -= residue;		if (!RecordLevelIndex) NextICurrentFileOffset = ICurrentFileOffset = offset;		toread = offset + BLOCK_SIZE >= stbuf.st_size ? 
stbuf.st_size - offset : BLOCK_SIZE;		lseek(fd, offset, SEEK_SET);		bx= buffer;		num_read = 0;		while ((toread > 0) && ((num_read = read(fd, bx, toread)) < toread)) {		    if (num_read <= 0) {			buffer = bx;			fprintf(stderr, "read error on file %s at offset %d\n", LIST_GET(name_list, i), offset);			goto break_break2;	/* C doesn't have break; break; */		    }		    bx += num_read;		    toread -= num_read;		}		if (num_read >= toread) {			bx += num_read;			toread -= num_read;		}		buffer_end = bx;		residue = 0;		if (buffer_end == buffer_begin + BLOCK_SIZE) {			if (RecordLevelIndex) {				buffer_end = backward_delimiter(buffer_end /* NOT bx */, buffer, rdelim, rdelim_len, 0);			}			else {				while ((INDEXABLE(*(buffer_end-1))) && (buffer_end > buffer_begin + MAX_WORD_SIZE)) buffer_end --;			}			residue = buffer_begin + BLOCK_SIZE - buffer_end;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -