📄 hash.c

📁 harvest是一个下载html网页得机器人
💻 C
📖 第 1 页 / 共 2 页
字号:
12 下一页
/* Copyright (c) 1994 Burra Gopal, Udi Manber.  All Rights Reserved. *//* * hash.c:	Hash table manipulation routines. Can be used to compute *		the dictionary as well as compress files. */#include "defs.h"int next_free_hash = 0;hash_entry *free_hash = NULL; /*[DEF_MAX_WORDS]; */int next_free_str = 0;char *free_str = NULL; /*[DEF_MAX_WORDS * AVG_WORD_LEN]; */extern int usemalloc;/* -----------------------------------------------------------------input: a word (a string of ascii character terminated by NULL)output: a hash_value of the input word.hash function: if the word has length <= 4        the hash value is just a concatenation of the last four bits        of the characters.        if the word has length > 4, then after the above operation,        the hash value is updated by adding each remaining character.        (and AND with the 16-bits mask).---------------------------------------------------------------- */intthash64k(word, len)unsigned char *word;int len;{    unsigned int hash_value=0;    unsigned int mask_4=017;    unsigned int mask_16=0177777;    int i;    if(len<=4) {        for(i=0; i<len; i++) {            hash_value = (hash_value << 4) | (word[i]&mask_4);            /* hash_value = hash_value  & mask_16; */        }    }    else {        for(i=0; i<4; i++) {            hash_value = (hash_value << 4) | (word[i]&mask_4);            /* hash_value = hash_value & mask_16;  */        }        for(i=4; i<len; i++)            hash_value = mask_16 & (hash_value + word[i]);    }    return(hash_value & mask_16);}hash_entry *get_hash(hash_table, word, len, i)	hash_entry *hash_table[HASH_TABLE_SIZE];	unsigned char	*word;	int	len;	int	*i;{	hash_entry *e;	*i = thash64k(word, len);	e = hash_table[*i];	while(e != NULL) {		if (!strcmp(e->word, (char *)word)) break;		else e = e->next;	}	return e;}/* * Assigns either the freq or the offset to the hash-entry. The kind of * information in the entry depends on the caller. Advice: different * hash-tables must be used to store information gathered during * the build operation and the compress operation by the appropriate * module. This can be specified by passing -1's for offset/freq resply. */hash_entry *insert_hash(hash_table, word, len, freq, offset)	hash_entry *hash_table[HASH_TABLE_SIZE];	unsigned char	*word;	int	len, freq, offset;{	int	i;	hash_entry *e;	e = get_hash(hash_table, word, len, &i);	if (e == NULL) {		hashalloc(e);		stralloc(e->word, len + 2);		strcpy(e->word, (char *)word);		e->val.offset = 0;		e->next = hash_table[i];		hash_table[i] = e;	}	if ((offset == -1) && (freq != -1)) {		e->val.attribute.freq += freq;		/* e->val.attribute.index has to be accessed from outside this function */	}	else if ((offset != -1) && (freq == -1)) {		e->val.offset = offset;		/* used in building the string table from the dictionary */	}	else {		fprintf(stderr, "error in accessing hash-table [frequencies/offsets]. skipping...\n");		return (NULL);	}#if	0	printf("%d %x\n", i, e);#endif	/*0*/	return e;}/* * HASHFILE format: the hash-file is a sequence of "'\0' hash-index word-index word-name" * The '\0' is there to indicate that this is not a padded line. Padded lines simply have * a '\n' as the first character (words don't have '\0' or '\n'). The hash and word indices * are 2 unsigned short integers in binary, MSB first. The word name therefore starts from the * 5th character and continues until a '\0' or '\n' is encountered. The total size of the * hash-table is therefore (|avgwordlen|+5)*numwords = appx 12 * 50000 = .6MB. * Note that there can be multiple lines with the same hash-index. *//* used when computing compress's dictionary */intdump_hash(hash_table, HASHFILE)	hash_entry *hash_table[HASH_TABLE_SIZE];	unsigned char	*HASHFILE;{	int	i;	FILE	*hashfp;	int	wordindex;	hash_entry *e, *t;	if ((hashfp = fopen((char *)HASHFILE, "w")) == NULL) {		fprintf(stderr, "cannot open for writing: %s\n", HASHFILE);		return 0;	}	/* We have a guarantee that the wordindex + 1 cannot exceed MAX_WORDS */	wordindex = 0;	for(i=0; i<HASH_TABLE_SIZE; i++) {		e = hash_table[i];		while (e != NULL) {			fprintf(hashfp, "%d %d %s\n", i, wordindex, e->word);			t = e->next;			strfree(e->word);			hashfree(e);			e = t;			wordindex ++;		}	}	fclose(hashfp);	return wordindex;}/* * These are routines that operate on hash-tables of 4K size (used in tbuild.c) *//* crazy hash function that operates on 4K hashtables */thash4k(word, len)	char 	*word;	int	len;{    unsigned int hash_value=0;    unsigned int mask_3=07;    unsigned int mask_12=07777;    int i;#if	0    /* discard prefix = the directory name */    if (len<=1) return 0;    i = len-1;    while(word[i] != '/') i--;    if ((i > 0) && (word[i] == '/')) {	word = &word[i+1];	len = strlen(word);    }#endif	/*0*/    if(len<=4) {	for(i=0; i<len; i++) {       	    hash_value = (hash_value << 3) | (word[i]&mask_3);	}    }    else {	for(i=0; i<4; i++) {       	    hash_value = (hash_value << 3) | (word[i]&mask_3);	}	for(i=4; i<len; i++) 	    hash_value = mask_12 & (hash_value + word[i]);    }    return(hash_value & mask_12);}hash_entry *get_small_hash(hash_table, word, len, i)	hash_entry *hash_table[SMALL_HASH_TABLE_SIZE];	unsigned char	*word;	int	len;	int	*i;{	hash_entry *e;	*i = thash4k(word, len);	e = hash_table[*i];	while(e != NULL) {		if (!strcmp(e->word, (char *)word)) break;		else e = e->next;	}	return e;}hash_entry *insert_small_hash(hash_table, word, len, freq, offset)	hash_entry *hash_table[SMALL_HASH_TABLE_SIZE];	unsigned char	*word;	int	len, freq, offset;{	int	i;	hash_entry *e;	e = get_small_hash(hash_table, word, len, &i);	if (e == NULL) {		hashalloc(e);		stralloc(e->word, len + 2);		strcpy(e->word, (char *)word);		e->val.offset = 0;		e->next = hash_table[i];		hash_table[i] = e;	}	if ((offset == -1) && (freq != -1)) {		e->val.attribute.freq += freq;		/* e->val.attribute.index has to be accessed from outside this function */	}	else if ((offset != -1) && (freq == -1)) {		e->val.offset = offset;		/* used in building the string table from the dictionary */	}	else {		fprintf(stderr, "error in accessing hash-table [frequencies/offsets]. skipping...\n");		return (NULL);	}#if	0	printf("%d %x\n", i, e);#endif	/*0*/	return e;}intdump_small_hash(hash_table, HASHFILE)	hash_entry *hash_table[SMALL_HASH_TABLE_SIZE];	unsigned char	*HASHFILE;{	int	i;	FILE	*hashfp;	int	wordindex;	hash_entry *e, *t;	if ((hashfp = fopen((char *)HASHFILE, "w")) == NULL) {		fprintf(stderr, "cannot open for writing: %s\n", HASHFILE);		return 0;	}	/* We have a guarantee that the wordindex + 1 cannot exceed MAX_WORDS */	wordindex = 0;	for(i=0; i<SMALL_HASH_TABLE_SIZE; i++) {		e = hash_table[i];		while (e != NULL) {			fprintf(hashfp, "%d %d %s\n", thash64k(e->word, strlen(e->word)), wordindex, e->word);	/* must look like I used 64K table */			t = e->next;			strfree(e->word);			hashfree(e);			e = t;			wordindex ++;		}	}	fclose(hashfp);	return wordindex;}/* * These are again routines that operate on big (64k) hash-tables *//* used only during debugging to see if output = input */intdump_hash_debug(hash_table, HASHFILE)	hash_entry *hash_table[HASH_TABLE_SIZE];	unsigned char	*HASHFILE;{	int	i;	FILE	*hashfp;	hash_entry *e;	if ((hashfp = fopen((char *)HASHFILE, "w")) == NULL) {		fprintf(stderr, "cannot open for writing: %s\n", HASHFILE);		return 0;	}	/* We have a guarantee that the wordindex + 1 cannot exceed MAX_WORDS */	for(i=0; i<HASH_TABLE_SIZE; i++) {		e = hash_table[i];		while (e != NULL) {			fprintf(hashfp, "%d %d %d %s\n", i, e->val.attribute.freq, e->val.attribute.index, e->word);			e = e->next;		}	}
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -