📄 io.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 3 页
字号:
    if ((s = malloc(len)) != NULL) memory_usage += len;    else fprintf(stderr, "malloc failed after memory_usage = %x Bytes\n", memory_usage);    /* Don't exit since might do traverse here: exit in glimpse though */#if	BG_DEBUG    printf("m:%x ", memory_usage);    i--;    if (i==0) {	printf("\n");	i = 100;    }#endif	/*BG_DEBUG*/    return s;}my_free(ptr, size)	void *ptr;	int size;{    if (ptr) free(ptr);    memory_usage -= size;#if	BG_DEBUG    printf("f:%x ", memory_usage);#endif	/*BG_DEBUG*/}int file_num = 0;int old_file_num = 0;	/* upto what file number should disable list be accessed: < file_num if incremental indexing */int new_file_num = -1;	/* after purging index, how many files are left: for save_data_structures() */int  bp=0;                          /* buffer pointer */unsigned char word[MAX_WORD_BUF];int FirstTraverse1 = ON;struct  indices *ip;/* Globals used in merge, and also in glimpse's get_index.c */unsigned int *src_index_set = NULL;unsigned int *dest_index_set = NULL;unsigned char *src_index_buf = NULL;unsigned char *dest_index_buf = NULL;unsigned char *merge_index_buf = NULL;/* * Routines for zonal memory allocation for glimpseindex and very fast search in glimpse. */int next_free_token = 0;struct token *free_token = NULL; /*[I_THRESHOLD/AVG_OCCURRENCES]; */int next_free_indices = 0;struct indices *free_indices = NULL; /*[I_THRESHOLD]; */int next_free_word = 0;char *free_word = NULL; /*[I_THRESHOLD/AVG_OCCURRENCES * AVG_WORD_LEN]; */extern int usemalloc;/* * The beauty of this allocation scheme is that "free" does not need to be implemented! 
*/tokenallfree(){	next_free_token = 0;}tokenfree(e, len)struct token *e;int len;{	if (usemalloc) my_free(e, sizeof(struct token));}struct token *tokenalloc(len)int	len;{	struct token *e;	if (usemalloc) (e) = (struct token *)my_malloc(sizeof(struct token));	else {		if (free_token == NULL) free_token = (struct token *)my_malloc(sizeof(struct token) * I_THRESHOLD / INDICES_PER_TOKEN);		if (free_token == NULL) {fprintf(stderr, "malloc failure in tokenalloc()\n"); exit(2);}		else (e) = ((next_free_token >= I_THRESHOLD / INDICES_PER_TOKEN) ? (NULL) : (&(free_token[next_free_token ++])));	}	return e;}indicesallfree(){	next_free_indices = 0;}indicesfree(e, len)struct indices *e;int len;{	if (usemalloc) my_free(e, sizeof(struct indices));}struct indices *indicesalloc(len)int	len;{	struct indices *e;	if (usemalloc) (e) = (struct indices *)my_malloc(sizeof(struct indices));	else {		if (free_indices == NULL) free_indices = (struct indices *)my_malloc(sizeof(struct indices) * I_THRESHOLD);		if (free_indices == NULL) {fprintf(stderr, "malloc failure in indicesalloc()\n"); exit(2);}		else (e) = ((next_free_indices >= I_THRESHOLD) ? (NULL) : (&(free_indices[next_free_indices ++])));	}	return e;}/* For words in a token structure */wordallfree(){	next_free_word = 0;}wordfree(s, len)char *s;int len;{	if (usemalloc) my_free(s, len);}char *wordalloc(len)int	len;{	char *s;	if (usemalloc) (s) = (char *)my_malloc(len);	else {		if (free_word == NULL) free_word = (char *)my_malloc(AVG_WORD_LEN * I_THRESHOLD/INDICES_PER_TOKEN);		if (free_word == NULL) {fprintf(stderr, "malloc failure in wordalloc()\n"); exit(2); }		else (s) = ((next_free_word + len + 2 >= AVG_WORD_LEN * I_THRESHOLD/INDICES_PER_TOKEN) ? (NULL) : (&(free_word[next_free_word])));		if (s != NULL) next_free_word += (len);		/* 2 for 1 char word with '\0' */	}	return s;}struct mini *mini_array = NULL;int mini_array_len = 0;#if	WORD_SORTED/* * Routines that operate on the index using the mini-index. 
 *
 * The index is a list of words+delim+attr+offset+\n sorted
 * by the word (using strcmp).
 *
 * The mini-index keeps track of the offsets in the index
 * where every WORDS_PER_REGION-th word in the index occurs.
 * There is no direct way for glimpse to seek into the mini
 * file for the exact offset of this word since unlike hash
 * values words are of variable length.
 *
 * This is small enough to be kept in memory and searched
 * directly with full word case insensitive string compares
 * with binary search. For 256000 words in index there will be
 * 256000/128 = 2000 words in mini-index that will occupy
 * 2000*32 (avgword + off + delim/attr + sizeof(struct mini)),
 * which is less than 16 pages (can always be resident in mem).
 *
 * We just need to string search log_2(2000) + 128 words of
 * length 12B each in the worst case ===> VERY FAST. This is
 * not the best possible but space is the limit. If we hash the
 * whole index/regions in the index, we need TOO MUCH memory.
 */

/*
 * Binary search mini_array[beginindex..endindex); return 1 if success, 0 if failure.
* Sets begin and end offsets for direct search; initially beginindex=0, endindex=mini_array_len */intget_mini(word, len, beginoffset, endoffset, beginindex, endindex, minifp)	unsigned char *word;	int	len;	long	*beginoffset, *endoffset;	int	beginindex, endindex;	FILE	*minifp;{	int	cmp, midindex;	if ((mini_array == NULL) || (mini_array_len <= 0)) return 0;	midindex = beginindex + (endindex - beginindex)/2;	cmp = strcmp(word, mini_array[midindex].word);	if (cmp < 0) {	/* word DEFINITELY BEFORE midindex (but still at or after beginindex) */		if (beginindex >= midindex) {	/* range of search is just ONE element in array */			*beginoffset = mini_array[midindex].offset;			if (midindex + 1 < mini_array_len) {				*endoffset = mini_array[midindex + 1].offset;			}			else *endoffset = -1;	/* go till end of file */			return 1;		}		else return get_mini(word, len, beginoffset, endoffset, beginindex, midindex);	}	else {	/* word DEFINITELY AT OR AFTER midindex (but still before endindex) */		if ((cmp == 0) || (endindex <= midindex + 1)) {	/* range of search is just ONE element in array */			*beginoffset = mini_array[midindex].offset;			if (midindex + 1 < mini_array_len) {				*endoffset = mini_array[midindex + 1].offset;			}			else *endoffset = -1;	/* go till end of file */			return 1;		}		else return get_mini(word, len, beginoffset, endoffset, midindex, endindex);	}}/* Returns: #of words in mini_array if success or already read, -1 if failure */intread_mini(indexfp, minifp)	FILE	*indexfp, *minifp;	/* indexfp pointing right to first line of word+... 
*/{	unsigned char	s[MAX_LINE_LEN], word[MAX_NAME_LEN];	int	wordnum = 0, wordlen;	long	offset;	struct stat st;	if ((mini_array != NULL) && (mini_array_len > 0)) return mini_array_len;	if (minifp == NULL) return 0;	if (fstat(fileno(minifp), &st) == -1) {		fprintf(stderr, "Can't stat: %s\n", s);		return -1;	}	rewind(minifp);	fscanf(minifp, "%d\n", &mini_array_len);	if ((mini_array_len <= 0) || (mini_array_len > (st.st_size / 4 /* \n, space, 1char offset, 1char word */))) {		fprintf(stderr, "Error in format of: %s\n", s);		return -1;	}	mini_array = (struct mini *)my_malloc(sizeof(struct mini) * mini_array_len);	memset(mini_array, '\0', sizeof(struct mini) * mini_array_len);	while ((wordnum < mini_array_len) && (fscanf(minifp, "%s %ld\n", word, &offset) != EOF)) {		wordlen = strlen((char *)word);		mini_array[wordnum].word = (char *)my_malloc(wordlen + 2);		strcpy((char *)mini_array[wordnum].word, (char *)word);		mini_array[wordnum].offset = offset;		wordnum ++;	}	return mini_array_len;}dump_mini(indexfile)	char	*indexfile;{	unsigned char	s[MAX_LINE_LEN], word[MAX_NAME_LEN];	FILE	*indexfp;	FILE	*minifp;	int	wordnum = 0, j, attr_num;	long	offset;	/* offset if offset of beginning of word */	char	temp_rdelim[MAX_LINE_LEN];	temp_rdelim[0] = '\0';  /* Initialize just in case. 
10/25/99 --GV */	if ((indexfp = fopen(indexfile, "r")) == NULL) {		fprintf(stderr, "Can't open for reading: %s\n", indexfile);		return;	}	sprintf(s, "%s/%s.tmp", INDEX_DIR, MINI_FILE);	if ((minifp = fopen(s, "w")) == NULL) {		fprintf(stderr, "Can't open for writing: %s\n", s);		fclose(indexfp);		return;	}	fgets(s, 256, indexfp);	/* indexnumbers */	fgets(s, 256, indexfp);	/* onefileperblock */	fscanf(indexfp, "%%%d%s\n", &attr_num, temp_delim);	/* structured index */	offset = ftell(indexfp);	while (fgets(s, MAX_LINE_LEN, indexfp) != NULL) {		if ((wordnum % WORDS_PER_REGION) == 0) {			j = 0;			while ((j < MAX_LINE_LEN) && (s[j] != WORD_END_MARK) && (s[j] != ALL_INDEX_MARK) && (s[j] != '\n')) j++;			if ((j >= MAX_LINE_LEN) || (s[j] == '\n')) {				wordnum ++;				offset = ftell(indexfp);				continue;			}			/* else it is WORD_END_MARK or ALL_INDEX_MARK */			s[j] = '\0';			strcpy((char *)word, (char *)s);			if (fprintf(minifp, "%s %ld\n", word, offset) == EOF) {				fprintf(stderr, "Error: write failed at %s:%d\n", __FILE__, __LINE__);				break;			}			mini_array_len ++;		}		wordnum ++;		offset = ftell(indexfp);	}	fclose(indexfp);	fflush(minifp);	fclose(minifp);	/*	 * Add amount of space needed for mini_array at the beginning	 */	sprintf(s, "%s/%s", INDEX_DIR, MINI_FILE);	if ((minifp = fopen(s, "w")) == NULL) {		fprintf(stderr, "Can't open for writing: %s\n", s);		goto end;	}	sprintf(s, "%s/%s.tmp", INDEX_DIR, MINI_FILE);	if ((indexfp = fopen(s, "r")) == NULL) {		fprintf(stderr, "Can't open for reading: %s\n", s);		fclose(minifp);		goto end;	}	fprintf(minifp, "%d\n", mini_array_len);	while (fgets(s, MAX_LINE_LEN, indexfp) != NULL) {		fputs(s, minifp);	}	fflush(minifp);	fclose(minifp);end:	sprintf(s, "%s/%s.tmp", INDEX_DIR, MINI_FILE);	unlink(s);	return;}#else	/* WORD_SORTED */intget_mini(word, len, beginoffset, endoffset, beginindex, endindex, minifp)	unsigned char *word;	int	len;	long	*beginoffset, *endoffset;	int	beginindex, endindex;	FILE	*minifp;{	int	index;	unsigned 
char array[sizeof(int)];	extern int	glimpse_isserver;	/* in agrep/agrep.c */	index = hash64k(word, len);	if ((mini_array == NULL) || (mini_array_len <= 0) || !glimpse_isserver) {		if (minifp == NULL) return 0;		fseek(minifp, (long)(index*sizeof(int)), 0);		if (fread((void *)array, sizeof(int), 1, minifp) != 1) return 0;		*beginoffset = decode32b((array[0] << 24) | (array[1] << 16) | (array[2] << 8) | array[3]);		if (fread((void *)array, sizeof(int), 1, minifp) != 1)			*endoffset = -1;		else *endoffset = decode32b((array[0] << 24) | (array[1] << 16) | (array[2] << 8) | array[3]);		return 1;	}	*beginoffset = mini_array[index].offset;	if (index + 1 < endindex)		*endoffset = mini_array[index + 1].offset;	else *endoffset = -1;	return 1;}/* Returns: #of words in mini_array if success or already read, -1 if failure */intread_mini(indexfp, minifp)	FILE	*indexfp, *minifp;	/* indexfp pointing right to first line of word+... */{	unsigned char	s[MAX_LINE_LEN], array[sizeof(int)];	int	offset, hash_value;	if ((mini_array != NULL) && (mini_array_len > 0)) return mini_array_len;	if (minifp == NULL) return 0;	rewind(minifp);	mini_array_len = MINI_ARRAY_LEN;	mini_array = (struct mini *)my_malloc(sizeof(struct mini) * mini_array_len);	memset(mini_array, '\0', sizeof(struct mini) * mini_array_len);	hash_value = 0;	/* line# I am going to scan */	offset = 0;	while ((hash_value < MINI_ARRAY_LEN) && (fread((void *)array, sizeof(int), 1, minifp) == 1)) {		offset = (array[0] << 24) | (array[1] << 16) | (array[2] << 8) | array[3];		mini_array[hash_value++].offset = decode32b(offset);	}	for (; hash_value<MINI_ARRAY_LEN; hash_value++)		mini_array[hash_value].offset = -1;	/* end of index file */	return mini_array_len;}/* * 1. Find hash64k values of each word. Then fprintf it before the word and put it *    in another file. Sort it and put that as the real index. * 2. 
Then in the new index, dump offsets after stripping the hash value out, and *    dump the offset at the hash_value-th line into the mini file. * 3. The only problem is that the offsets obtained from the index into the parti- *    tions won't be in increasing order, but who cares? get_block_numbers() works! * 4. In merge_splits(), we have to re-sort everything by word for add-to-index *    and fast-index to work properly. */dump_mini(indexfile)	char	*indexfile;{	unsigned char	s[MAX_LINE_LEN], *t, word[MAX_NAME_LEN], c;	unsigned char	indexnumber[MAX_LINE_LEN], onefileperblock[MAX_LINE_LEN];	int	attr_num, linelen;	FILE	*indexfp;	FILE	*newindexfp;	FILE	*minifp;	long	offset;	/* offset if offset of beginning of word */	int	eoffset, j, hash_value, prev_hash_value;	/* NOT shorts!! */	int	rc;	/* return code from system(3) */	char	es1[MAX_LINE_LEN], es2[MAX_LINE_LEN], es3[MAX_LINE_LEN], temp_rdelim[MAX_LINE_LEN];	temp_rdelim[0] = '\0';  /* Initialize in case not read. 10/25/99 --GV */	/*	 * First change the sorting order of the index file.	 
*/	if ((indexfp = fopen(indexfile, "r")) == NULL) {		fprintf(stderr, "Can't open for reading: %s\n", indexfile);		exit(2);	}	sprintf(s, "%s.tmp", indexfile);	if ((newindexfp = fopen(s, "w")) == NULL) {		fprintf(stderr, "Can't open for writing: %s\n", s);		fclose(indexfp);		exit(2);	}	/* Must store since sort -n can screw it up */	fgets(indexnumber, 256, indexfp);	fgets(onefileperblock, 256, indexfp);	if ( !fscanf(indexfp, "%%%d\n", &attr_num) )		fscanf(indexfp, "%%%d%s\n", &attr_num, temp_rdelim);	while (fgets(s, MAX_LINE_LEN, indexfp) != NULL) {		j = 0;		linelen = strlen(s);		while ((j < linelen) && (s[j] != WORD_END_MARK) && (s[j] != ALL_INDEX_MARK) && (s[j] != '\n') && (s[j] != '\0')) j++;		if ((j >= linelen) || (s[j] == '\n') || (s[j] == '\0')) {			continue;		}		/* else it is WORD_END_MARK or ALL_INDEX_MARK */		c = s[j];		s[j] = '\0';		hash_value = hash64k(s, j);		s[j] = c;		fprintf(newindexfp, "%d ", hash_value);		if (fputs(s, newindexfp) == EOF) {			fprintf(stderr, "Error: write failed at %s:%d\n", __FILE__, __LINE__);			exit(2);		}	}	fclose(indexfp);	fflush(newindexfp);	fclose(newindexfp);#if	SFS_COMPAT	unlink(indexfile);#else	sprintf(s, "exec %s '%s'", SYSTEM_RM, escapesinglequote(indexfile, es1));	system(s);#endif#if	DONTUSESORT_T_OPTION || SFS_COMPAT	sprintf(s, "exec %s -n '%s.tmp' > '%s'\n", SYSTEM_SORT, escapesinglequote(indexfile, es1), escapesinglequote(indexfile, es2));#else	sprintf(s, "exec %s -n -T '%s' '%s.tmp' > '%s'\n", SYSTEM_SORT, escapesinglequote(INDEX_DIR, es1), escapesinglequote(indexfile, es2), escapesinglequote(indexfile, es3));#endif	rc = system(s);	if (rc >> 8) {		fprintf (stderr, "'sort' command:\n");		fprintf (stderr, "    %s\n", s);		fprintf (stderr, "failed with exit status %d\n", rc>>8);		exit(2);	}#if	SFS_COMPAT	sprintf(s, "%s.tmp", indexfile);	unlink(s);#else	sprintf(s, "exec %s '%s.tmp'", SYSTEM_RM, escapesinglequote(indexfile, es1));	system(s);#endif	system(sync_path);	/* sync() has a BUG */	/*	 * Now dump the mini-file's 
offsets and create the stripped index file	 */	if ((indexfp = fopen(indexfile, "r")) == NULL) {		fprintf(stderr, "Can't open for reading: %s\n", indexfile);		exit(2);	}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -