📄 build_in.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 5 页
字号:
			/* if (residue > 0) printf("residue = %d in %s at %d\n", residue, LIST_GET(name_list, i), offset); */		}		if (RecordLevelIndex) {			next_record = forward_delimiter(buffer, buffer_end, rdelim, rdelim_len, 0);		}		bx = buffer; 		PrintedLongWordWarning = 0;		while ((buffer=(unsigned char *) getword(LIST_GET(name_list, i), word, buffer, buffer_end, &attribute, &next_record)) < buffer_end) {		    if (RecordLevelIndex) {			if (buffer >= next_record) {			    next_record = forward_delimiter(buffer, buffer_end, rdelim, rdelim_len, 0);			    if (StoreByteOffset) ICurrentFileOffset += next_record - buffer;			    else ICurrentFileOffset ++;			}		    }		    /* printf("%s\n", word); */		    if(word[0] == '\0') continue;		    if(icount - hash_icount >= I_THRESHOLD) {#if	BG_DEBUG			fprintf(LOGFILE, "reached I_THRESHOLD at %d\n", icount - hash_icount);#endif	/*BG_DEBUG*/			traverse1();			init_hash_table();			hash_icount = icount;		    }		    insert_h(word, pn, attribute);		}		if (word[0] != '\0') {		    /* printf("%s\n", word); */		    if(icount - hash_icount >= I_THRESHOLD) {#if	BG_DEBUG			fprintf(LOGFILE, "reached I_THRESHOLD at %d\n", icount - hash_icount);#endif	/*BG_DEBUG*/			traverse1();			init_hash_table();			hash_icount = icount;		    }		    insert_h(word, pn, attribute);		}		if (RecordLevelIndex) {			if (buffer >= next_record) {			    /* next_record = forward_delimiter(buffer, buffer_end, rdelim, rdelim_len, 0); */			    ICurrentFileOffset ++;			}		}		buffer = buffer_begin;		next_record = buffer;	    }	break_break2:            close(fd);	    if (unlinkname != NULL) unlink(unlinkname);#ifdef	UDI_DEBUG	    printf("add to index: %d\n",icount-save_icount);#endif	    if ((MAXWORDSPERFILE > 0) && (icount-save_icount > MAXWORDSPERFILE)) {		fprintf(MESSAGEFILE, "%d words are contributed by %s\n",			icount-save_icount, LIST_GET(name_list, i));		AddedMaxWordsMessage = ON;	    }	    if (IndexNumber && NUMERICWORDPERCENT && (numeric_icount * 100 > (icount - save_icount) * 
NUMERICWORDPERCENT) && (icount - save_icount > MIN_WORDS)) {		fprintf(MESSAGEFILE, "NUMBERS occur in %d%% of %d words contributed by %s\n", (numeric_icount * 100)/(icount - save_icount), icount - save_icount, LIST_GET(name_list, i));		AddedMixedWordsMessage = ON;	    }	    numeric_icount=0;	    save_icount=icount;	    if (StructuredIndex) region_destroy();        }    }    traverse1();    init_hash_table();    hash_icount = icount;    my_free(buffer_begin, BLOCK_SIZE + 10);}init_hash_table(){    int i;    for(i=0; i<HashTableSize; i++) hash_table[i] = (struct token *)NULL;    return;}/* ------------------------------------------------------------------------input: a word (a string), a hash table (each entry points to a list of       tokens. (a token is a structure containing 'word' and a pointer       to a list of indices)).function: insert the word to appropriate position in the table.          if the inserted word is already in the data structure, then          update the list of indices corresponding to that 'word'.          
otherwise create a new token.THERE ARE NO STATE CHANGES UNLESS WE ARE SURE THAT MALLOCS WON'T FAIL: BG---------------------------------------------------------------------------*/voidinsert_h(word, pn, attribute)char *word;int  pn;int attribute;{    int hash_value=0;    struct token *tp;    struct token *tp_bak;    struct indices *iip;    int  wordlen = strlen(word);    int j;    static int tried_once = 0;    /* all words with same attribute at same place in hash table */    hash_value = hashword(word, wordlen);    tp_bak = tp = hash_table[hash_value];    while(tp != NULL) {        if((strcmp(word, tp->word) == 0) && (tp->attribute == attribute)) {             insert_index(tp, pn);	     tried_once = 0;             return;	/* already in there */        }	tp_bak = tp;        tp = tp->next_t;    }    /* this is a new word, insert it */    if((tp = (struct token *) tokenalloc(sizeof(struct token))) == NULL) {	tp_bak = NULL;	if (tried_once == 1) {		fprintf(stderr, "not enough memory in insert_h1 at icount=%d. skipping...\n", icount);		tried_once = 0;		return;	/* ignore word altogether */	}	traverse1();	init_hash_table();	tried_once = 1;	/* memory allocation failed in malloc#1 */	insert_h(word, pn, attribute);	/* next call can't fail here since traverse() calls *allfree() */        return;    }    if((tp->word = (char *) wordalloc(sizeof(char) * (wordlen+1))) == NULL) {	tp_bak = NULL;	if (tried_once == 2) {		fprintf(stderr, "not enough memory in insert_h2 at icount=%d. 
skipping...\n", icount);		tokenfree(tp, sizeof(struct token));		tried_once = 0;		return;	/* ignore word altogether */	}	tokenfree(tp, sizeof(struct token));	traverse1();	init_hash_table();	tried_once = 2;	/* memory allocation failed in malloc#2 */	insert_h(word, pn, attribute);	/* next call can't fail here or above since traverse() calls *allfree() */        return;    }    strcpy(tp->word, word);    tp->attribute = attribute;    /* the index list has a first index */    if((iip = (struct indices *) indicesalloc(sizeof(struct indices))) == NULL) {	tp_bak = NULL;	if (tried_once == 3) {		fprintf(stderr, "not enough memory in insert_h3 at icount=%d. skipping...\n", icount);		wordfree(tp->word, wordlen + 1);		tokenfree(tp, sizeof(struct token));		tried_once = 0;		return;	/* ignore word altogether */	}	wordfree(tp->word, wordlen + 1);	tokenfree(tp, sizeof(struct token));	traverse1();	init_hash_table();	tried_once = 3;	/* memory allocation failed in malloc#3 */	insert_h(word, pn, attribute);	/* next call can't fail here or above or above-above since traverse() calls *allfree() */        return;    }    icount++;    if (IndexNumber && NUMERICWORDPERCENT) {	int	i=0;	while(word[i] != '\0') {	    if (!isalpha(((unsigned char *)word)[i])) break;	    i++;	}	if (word[i] != '\0') numeric_icount ++;    }#ifdef SW_DEBUG    if((icount & 01777) == 0) printf("icount = %d\n", icount);#endif    if (!CountWords) {	iip->index[0] = pn;	iip->offset[0] = ICurrentFileOffset;    }    for (j=1; j<INDEX_SET_SIZE; j++)	iip->index[j] = INDEX_ELEM_FREE;    /* assign both head and tail */    iip->next_i = NULL;    tp->ip = iip;    tp->lastip = iip;    if(tp_bak == NULL) hash_table[hash_value] = tp;    else  tp_bak->next_t = tp;    tp->next_t = NULL;    tp->totalcount = 1;    tried_once = 0;	/* now sure that there has been no memory allocation failure while inserting this word */    return;}/* -------------------------------------------------------------------insert_index(): insert an index, i.e., 
pn, into an indices structure.The indices structure is a linked list where the 'first' one is alwaysthe active indices structure. When the active one is filled with 8 indicesan indicies structure is created and becomes the active one.tp points to the token structure. so, tp->ip is always the activeindices structure.THERE ARE NO STATE CHANGES UNLESS WE ARE SURE THAT MALLOCS WON'T FAIL: BG------------------------------------------------------------------- */voidinsert_index(tp, pn)struct token *tp;               /* insert a index into a indices structure */int pn;{    struct indices *iip, *temp;    struct indices *ip = (ByteLevelIndex ? tp->lastip : tp->ip);    static int tried_once = 0;    int j;    if (CountWords) {	/* I am not interested in maintaining where a word occurs: only the number of times it occurs */	ip->offset[0] ++;	return;    }    /* Check for stop-list */    if (OneFilePerBlock && !ByteLevelIndex && (file_num > MaxNum8bPartition) && (tp->totalcount > (file_num * MAX_INDEX_PERCENT / 100))) return;    if (ByteLevelIndex && (tp->totalcount > ( (((total_size>>20) > 0) && ((total_size>>20)*MAX_PER_MB < MAX_ALL_INDEX)) ? 
((total_size>>20) * MAX_PER_MB) : MAX_ALL_INDEX) )) return;    if (ByteLevelIndex) {	for (j=INDEX_SET_SIZE; j>0; j--) {	    if(ip->index[j-1] == INDEX_ELEM_FREE) continue;	    if ((ip->index[j-1] == pn) && (ip->offset[j-1] == ICurrentFileOffset)) return;	/* in identical position */	    else break;	}    }    else {	for (j=INDEX_SET_SIZE; j>0; j--) {	    if(ip->index[j-1] == INDEX_ELEM_FREE) continue;	    if (ip->index[j-1] == pn) return;	/* current word is not the first appearance in partition pn */	    else break;	}    }    /* ip->index[j] is the place to insert new pn provided j < INDEX_SET_SIZE */    if(j < INDEX_SET_SIZE) {	ip->offset[j] = ICurrentFileOffset;        ip->index[j] = pn;        return;    }    if((iip = (struct indices *) indicesalloc(sizeof(struct indices)))==NULL) {	if (tried_once == 1) {		fprintf(stderr, "not enough memory in insert_index at icount=%d. skipping...\n", icount);		tried_once = 0;		return;	/* ignore index altogether */	}	traverse1();	init_hash_table();	tried_once = 1;	/* memory allocation failed in malloc#1 */	insert_index(tp, pn);        return;    }    icount++;    if (ByteLevelIndex) {	/* insert at the end */	tp->lastip->next_i = iip;	iip->next_i = NULL;	tp->lastip = iip;    }    else {	iip->next_i = tp->ip;	tp->ip = iip;    }    iip->offset[0] = ICurrentFileOffset;    iip->index[0] = pn;    for (j=1; j<INDEX_SET_SIZE; j++)	iip->index[j] = INDEX_ELEM_FREE;    tp->totalcount ++;    if ( (OneFilePerBlock && !ByteLevelIndex && (file_num > MaxNum8bPartition) && (tp->totalcount > (file_num * MAX_INDEX_PERCENT / 100))) ||	    (ByteLevelIndex && (tp->totalcount > ( (((total_size>>20) > 0) && ((total_size>>20)*MAX_PER_MB < MAX_ALL_INDEX)) ? 
((total_size>>20) * MAX_PER_MB) : MAX_ALL_INDEX) )) ) {	for (iip=tp->ip; iip != NULL; temp = iip, iip = iip->next_i, indicesfree(temp, sizeof(struct indices)));	tp->ip = NULL;	/* never need to insert anything else here */    }/*printf("returning from insert_index()\n");fflush(stderr);*/    tried_once = 0;    return;}/* Scan the indexed "word" from an index line: see io.c/merge_splits() */scanword(word, buffer, buffer_end, attr)unsigned char *word, *buffer, *buffer_end;unsigned int *attr;{    int i = MAX_WORD_SIZE;    while ((i-- != 0) && (buffer <= buffer_end) && (*buffer != ALL_INDEX_MARK) && (*buffer != WORD_END_MARK) && (*buffer != '\n') && (*buffer != '\0'))	*word ++ = *buffer ++;    *word = '\0';    *attr = encode16b(0);    if (StructuredIndex) {	if ((*buffer == ALL_INDEX_MARK) || (*buffer == WORD_END_MARK)) {	    buffer ++;	    *attr = ((*buffer) << 8) | (*(buffer + 1));	}    }}/* Globals used in merge, and also in glimpse's main.c */extern unsigned int *src_index_set;extern unsigned int *dest_index_set;extern unsigned char *src_index_buf;extern unsigned char *dest_index_buf;extern unsigned char *merge_index_buf;/* merge index file f1 and f2, then put the result in index file f3 */merge_in(f1, f2, f3)FILE *f1, *f2, *f3;{    int src_mark, dest_mark;    int	src_num, dest_num;    int src_end_pt, dest_end_pt;    int cmp=0; /* the result of strcmp */    int bdx, bdx1, bdx2, merge_len, i, j;    int TAIL1=0;    char word1[MAX_WORD_SIZE+6];	/* used only for strcmp() */    char word2[MAX_WORD_SIZE+6];	/* used only for strcmp() */    unsigned int attr1, attr2;    int x=0, y=0, even_words = 1;    /* LOOK OUT FOR: [memset, fgets, endpt-forloop, scanword] 4-tuples: invariant */#if	debugprintf("in merge_in()\n"); fflush(stdout);#endif    memset(dest_index_buf, '\0', REAL_INDEX_BUF);    if (fgets(dest_index_buf, REAL_INDEX_BUF, f2) == NULL) dest_index_buf[0] = '\0';    else dest_index_buf[REAL_INDEX_BUF - 1] = '\0';    dest_end_pt = strlen(dest_index_buf);    
scanword(word2, dest_index_buf, dest_index_buf+dest_end_pt, &attr2);#ifdef debug    printf("word2 = %s\n", word2);#endif    memset(src_index_buf, '\0', REAL_INDEX_BUF);    while(fgets(src_index_buf, REAL_INDEX_BUF, f1)) {	src_index_buf[REAL_INDEX_BUF - 1] = '\0';	src_end_pt = strlen(src_index_buf);    	scanword(word1, src_index_buf, src_index_buf+src_end_pt, &attr1);#ifdef debug	printf("word1 = %s\n", word1);#endif	while (((cmp = strncmp(word1, word2, MAX_WORD_SIZE+4)) > 0) || (StructuredIndex && (cmp == 0) && (attr1 > attr2))) {	    fputs(dest_index_buf, f3);	    memset(dest_index_buf, '\0', dest_end_pt+2);	    if(fgets(dest_index_buf, REAL_INDEX_BUF, f2) == NULL) {		dest_index_buf[REAL_INDEX_BUF - 1] = '\0';		TAIL1 = ON;		break;	    }	    dest_index_buf[REAL_INDEX_BUF - 1] = '\0';	    dest_end_pt = strlen(dest_index_buf);    	    scanword(word2, dest_index_buf, dest_index_buf+dest_end_pt, &attr2);	}        if(TAIL1 == ON) break;        if ((cmp == 0) && (attr1 == attr2)) { /* we need to join the index of word1 and word2 */#ifdef debug	    printf("joining src_index_buf and dest_index_buf\n");
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -