📄 hash.c

📁 harvest 是一个下载 HTML 网页的机器人
💻 C
📖 第 1 页 / 共 2 页
字号:
上一页 12
	fclose(hashfp);	return 1;}/* * VERY particular to the format of the hash-table file: * -- does an fscanf+2atoi's+strlen all in one scan. * Returns 0 if you are in padded are, -1 on EOF, else ~. */intmyhashread(fp, pint1, pint2, str, plen)	FILE	*fp;	int	*pint1;	int	*pint2;	char	*str;	int	*plen;{	int	numread;	int	int1, int2;	int	c;	if((int1 = getc(fp)) == '\n') return 0;	/* padded area */	if(int1 != 0) return -1;		/* formatting error! */	if ((int1 = getc(fp)) == EOF) return -1;	if ((int2 = getc(fp)) == EOF) return -1;	*pint1 = (int1 << 8) | int2;		/* hashindex */	if ((int1 = getc(fp)) == EOF) return -1;	if ((int2 = getc(fp)) == EOF) return -1;	*pint2 = (int1 << 8) | int2;		/* wordindex */	numread = 5;	*plen = 0;				/* wordname */	while((c = getc(fp)) != EOF) {		if ( (c == '\0') || (c == '\n') ){			ungetc(c, fp);			str[*plen] = '\0';			return numread;		}		str[(*plen)++] = c;		numread ++;		if (numread >= MAX_NAME_LEN) {			str[*plen - 1] = '\0';			return numread;		}	}	return -1;}inttbuild_hash(hash_table, hashfp, bytestoread)	hash_entry	*hash_table[HASH_TABLE_SIZE];	FILE		*hashfp;	int		bytestoread;{	int	hashindex;	int	wordindex;	int	numread = 0;	int	ret;	int	len;	char	*word;	char	dummybuf[MAX_WORD_BUF];	hash_entry *e;	if (bytestoread == -1) {	/* read until end of file */		while (1)		{			if (usemalloc) word = dummybuf;			else {				if (free_str == NULL) free_str = (char *)malloc(AVG_WORD_LEN * DEF_MAX_WORDS);				if (free_str == NULL) break;				word = &free_str[next_free_str];			}			if ((ret = myhashread(hashfp, &hashindex, &wordindex, word, &len)) == 0) continue;			if (ret == -1) break;			if ((hashindex >= HASH_TABLE_SIZE) || (hashindex < 0)) continue;	/* ignore */			hashalloc(e);			if (usemalloc) {				if ((word = (char *)malloc(len + 2)) == NULL) break;				strcpy(word, dummybuf);			}			else next_free_str += len + 2;			e->word = word;			e->val.attribute.freq = 0;	/* just exists in compress's dict: not found in text-file yet! 
*/			e->val.attribute.index = wordindex;			e->next = hash_table[hashindex];			hash_table[hashindex] = e;#if	0			printf("word=%s index=%d\n", word, wordindex);#endif	/*0*/		}	}	else {	/* read only a specified number of bytes */		while (bytestoread > numread)		{			if (usemalloc) word = dummybuf;			else {				if (free_str == NULL) free_str = (char *)malloc(AVG_WORD_LEN * DEF_MAX_WORDS);				if (free_str == NULL) break;				word = &free_str[next_free_str];			}			if ((ret = myhashread(hashfp, &hashindex, &wordindex, word, &len)) <= 0) break;			if ((hashindex >= HASH_TABLE_SIZE) || (hashindex < 0)) continue;	/* ignore */			hashalloc(e);			if (usemalloc) {				if ((word = (char *)malloc(len + 2)) == NULL) break;				strcpy(word, dummybuf);			}			else next_free_str += len + 2;			e->word = word;			e->val.attribute.freq = 0;	/* just exists in compress's dict: not found in text-file yet! */			e->val.attribute.index = wordindex;			e->next = hash_table[hashindex];			hash_table[hashindex] = e;			wordindex ++;			numread += ret;#if	0			printf("%d %d %s\n", hashindex, wordindex, word);#endif	/*0*/		}	}	return (wordindex + 1);	/* the highest indexed word + 1 */}/* * Interprets srcbuf as a series of words separated by newlines and looks * for a complete occurrence of words in patbuf in it. If there IS an occurrence, * it builds the hash-table for THAT page. The hashfp must start at the * beginning on each call. */intbuild_partial_hash(hash_table, hashfp, srcbuf, srclen, patbuf, patlen, blocksize, loaded_hash_table)	hash_entry *hash_table[HASH_TABLE_SIZE];	FILE	*hashfp;	unsigned char	*srcbuf;	int	srclen;	unsigned char	*patbuf;	int	patlen;	int	blocksize;	char	loaded_hash_table[HASH_FILE_BLOCKS];{	unsigned char	*srcpos;	unsigned char	*srcinit, *srcend, dest[MAX_NAME_LEN];	int	blockindex = 0;	int	i, initlen, endlen;	unsigned char	*strings[MAX_NAME_LEN];	/* maximum pattern length */	int	numstrings = 0;	int	inword = 0;	/*	 * Find all the relevant strings in the pattern.	 
*/	i = 0;	while(i<patlen) {		if (isalnum(patbuf[i])) {			if (!inword) {				strings[numstrings++] = &dest[i];				inword = 1;			}			if (isupper(patbuf[i])) dest[i] = tolower(patbuf[i]);			else dest[i] = patbuf[i];		}		else {			dest[i] = '\0';	/* ignore that character */			inword = 0;		}		i++;	}#if	0	for (i=0; i<numstrings; i++) printf("word%d=%s\n", i, strings[i]);	getchar();#endif	/*0*/	srcpos = srcbuf;	while (srcpos < (srcbuf + srclen)) {		srcinit = srcpos;		initlen = strlen((char *)srcinit);		srcend = srcinit + initlen + 1;		endlen = strlen((char *)srcend);#if	0		printf("%s -- %s\n", srcinit, srcend);#endif	/*0*/		for (i=0; i<numstrings; i++)			if ((strcmp((char *)strings[i], (char *)srcinit) >= 0) && (strcmp((char *)strings[i], (char *)srcend) <= 0)) goto include_page;		blockindex++;		srcpos += (initlen + endlen + 2);		continue;	include_page:	/* Include it if any of the patterns fit within this range */		if (loaded_hash_table[blockindex++]) continue;#if	0		printf("build_partial_hash: hashing words in page# %d\n", blockindex);#endif	/*0*/		loaded_hash_table[blockindex - 1] = 1;		fseek(hashfp, (blockindex-1)*blocksize, 0);		tbuild_hash(hash_table, hashfp, blocksize);		srcpos += (initlen + endlen + 2);	}	return 0;}pad_hash_file(filename, FILEBLOCKSIZE)	unsigned char *filename;	int FILEBLOCKSIZE;{	FILE	*outfp, *infp, *indexfp;	int	offset = 0, len;	unsigned char buf[MAX_NAME_LEN];	int	pid = getpid();	int	i;	unsigned char	word[MAX_NAME_LEN];	unsigned char	prev_word[MAX_NAME_LEN];	unsigned int	hashindex, wordindex;	char		es1[MAX_LINE_LEN], es2[MAX_LINE_LEN];	if ((infp = fopen((char *)filename, "r")) == NULL) {		fprintf(stderr, "cannot open for reading: %s\n", filename);		exit(2);	}	sprintf(buf, "%s.index", filename);	if ((indexfp = fopen(buf, "w")) == NULL) {		fprintf(stderr, "cannot open for writing: %s\n", buf);		fclose(infp);		exit(2);	}	sprintf(buf, "%s.%d", filename, pid);	if ((outfp = fopen(buf, "w")) == NULL) {		fprintf(stderr, "cannot open for writing: %s\n", 
buf);		fclose(infp);		fclose(indexfp);		exit(2);	}	if ((FILEBLOCKSIZE % MIN_BLOCKSIZE) != 0) {		fprintf(stderr, "invalid block size %d: changing to %d\n", FILEBLOCKSIZE, MIN_BLOCKSIZE);		FILEBLOCKSIZE = MIN_BLOCKSIZE;	}	fprintf(indexfp, "%d\n", FILEBLOCKSIZE);	if ((char*)buf != fgets(buf, MAX_NAME_LEN, infp)) goto end_of_input;	len = strlen((char *)buf);	sscanf(buf, "%d %d %s\n", &hashindex, &wordindex, word);	putc(0, outfp);	putc((hashindex & 0xff00)>>8, outfp);	putc((hashindex & 0x00ff), outfp);	putc((wordindex & 0xff00)>>8, outfp);	putc((wordindex & 0x00ff), outfp);	fprintf(outfp, "%s", word);	buf[len-1] = '\0';			/* fgets gives you the newline too */        for (i=0; i< len; i++) if (isupper(buf[i])) buf[i] = tolower(buf[i]);	for (i=len-2; i>=0; i--) if (buf[i] == ' ') { i++; break; }	if (i < 0) i = 0;	strcpy((char *)prev_word, (char *)&buf[i]);	fprintf(indexfp, "%s", &buf[i]);	/* the first word */	putc(0, indexfp);			/* null terminated */	offset += strlen((char *)word)+5;	 while(fgets(buf, MAX_NAME_LEN, infp) == (char *)buf) {		len = strlen((char *)buf);		if (offset + len > FILEBLOCKSIZE) {			/* Put the last char of the prev. 
page */			fprintf(indexfp, "%s", prev_word);			putc(0, indexfp);	/* null terminated */							for (i=0; i<FILEBLOCKSIZE-offset; i++)	/* fill up with so many newlines until the next block size */				putc('\n', outfp);			sscanf(buf, "%d %d %s\n", &hashindex, &wordindex, word);			putc(0, outfp);			putc((hashindex & 0xff00)>>8, outfp);			putc((hashindex & 0x00ff), outfp);			putc((wordindex & 0xff00)>>8, outfp);			putc((wordindex & 0x00ff), outfp);			fprintf(outfp, "%s", word);                        buf[len-1] = '\0';			/* fgets gives you the newline too */                        for (i=0; i< len; i++) if (isupper(buf[i])) buf[i] = tolower(buf[i]);                        for (i=len-2; i>=0; i--) if (buf[i] == ' ') { i++; break; }                        if (i < 0) i = 0;                        strcpy((char *)prev_word, (char *)&buf[i]);			fprintf(indexfp, "%s", &buf[i]);	/* store the first word at each page */			putc(0, indexfp);			/* null terminated */			offset = 0;		}		else {			sscanf(buf, "%d %d %s\n", &hashindex, &wordindex, word);			putc(0, outfp);			putc((hashindex & 0xff00)>>8, outfp);			putc((hashindex & 0x00ff), outfp);			putc((wordindex & 0xff00)>>8, outfp);			putc((wordindex & 0x00ff), outfp);			fprintf(outfp, "%s", word);                        buf[len-1] = '\0';			/* fgets gives you the newline too */                        for (i=0; i<len; i++) if (isupper(buf[i])) buf[i] = tolower(buf[i]);                        for (i=len-2; i>=0; i--) if (buf[i] == ' ') { i++; break; }                        if (i < 0) i = 0;                        strcpy((char *)prev_word, (char *)&buf[i]);		}		offset += strlen((char *)word)+5;	}	fprintf(indexfp, "%s", prev_word);	putc(0, indexfp);			/* null terminated */end_of_input:	fclose(infp);	fflush(outfp);	fclose(outfp);	fflush(indexfp);	fclose(indexfp);	sprintf(buf, "exec %s '%s.%d' '%s'\n", SYSTEM_MV, tescapesinglequote(filename, es1), pid, tescapesinglequote(filename, es2));	system(buf);}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -