📄 hash.c
字号:
/* Copyright (c) 1994 Burra Gopal, Udi Manber. All Rights Reserved. *//* * hash.c: Hash table manipulation routines. Can be used to compute * the dictionary as well as compress files. */#include "defs.h"int next_free_hash = 0;hash_entry *free_hash = NULL; /*[DEF_MAX_WORDS]; */int next_free_str = 0;char *free_str = NULL; /*[DEF_MAX_WORDS * AVG_WORD_LEN]; */extern int usemalloc;/* -----------------------------------------------------------------input: a word (a string of ascii character terminated by NULL)output: a hash_value of the input word.hash function: if the word has length <= 4 the hash value is just a concatenation of the last four bits of the characters. if the word has length > 4, then after the above operation, the hash value is updated by adding each remaining character. (and AND with the 16-bits mask).---------------------------------------------------------------- */intthash64k(word, len)unsigned char *word;int len;{ unsigned int hash_value=0; unsigned int mask_4=017; unsigned int mask_16=0177777; int i; if(len<=4) { for(i=0; i<len; i++) { hash_value = (hash_value << 4) | (word[i]&mask_4); /* hash_value = hash_value & mask_16; */ } } else { for(i=0; i<4; i++) { hash_value = (hash_value << 4) | (word[i]&mask_4); /* hash_value = hash_value & mask_16; */ } for(i=4; i<len; i++) hash_value = mask_16 & (hash_value + word[i]); } return(hash_value & mask_16);}hash_entry *get_hash(hash_table, word, len, i) hash_entry *hash_table[HASH_TABLE_SIZE]; unsigned char *word; int len; int *i;{ hash_entry *e; *i = thash64k(word, len); e = hash_table[*i]; while(e != NULL) { if (!strcmp(e->word, (char *)word)) break; else e = e->next; } return e;}/* * Assigns either the freq or the offset to the hash-entry. The kind of * information in the entry depends on the caller. Advice: different * hash-tables must be used to store information gathered during * the build operation and the compress operation by the appropriate * module. This can be specified by passing -1's for offset/freq resply. */hash_entry *insert_hash(hash_table, word, len, freq, offset) hash_entry *hash_table[HASH_TABLE_SIZE]; unsigned char *word; int len, freq, offset;{ int i; hash_entry *e; e = get_hash(hash_table, word, len, &i); if (e == NULL) { hashalloc(e); stralloc(e->word, len + 2); strcpy(e->word, (char *)word); e->val.offset = 0; e->next = hash_table[i]; hash_table[i] = e; } if ((offset == -1) && (freq != -1)) { e->val.attribute.freq += freq; /* e->val.attribute.index has to be accessed from outside this function */ } else if ((offset != -1) && (freq == -1)) { e->val.offset = offset; /* used in building the string table from the dictionary */ } else { fprintf(stderr, "error in accessing hash-table [frequencies/offsets]. skipping...\n"); return (NULL); }#if 0 printf("%d %x\n", i, e);#endif /*0*/ return e;}/* * HASHFILE format: the hash-file is a sequence of "'\0' hash-index word-index word-name" * The '\0' is there to indicate that this is not a padded line. Padded lines simply have * a '\n' as the first character (words don't have '\0' or '\n'). The hash and word indices * are 2 unsigned short integers in binary, MSB first. The word name therefore starts from the * 5th character and continues until a '\0' or '\n' is encountered. The total size of the * hash-table is therefore (|avgwordlen|+5)*numwords = appx 12 * 50000 = .6MB. * Note that there can be multiple lines with the same hash-index. *//* used when computing compress's dictionary */intdump_hash(hash_table, HASHFILE) hash_entry *hash_table[HASH_TABLE_SIZE]; unsigned char *HASHFILE;{ int i; FILE *hashfp; int wordindex; hash_entry *e, *t; if ((hashfp = fopen((char *)HASHFILE, "w")) == NULL) { fprintf(stderr, "cannot open for writing: %s\n", HASHFILE); return 0; } /* We have a guarantee that the wordindex + 1 cannot exceed MAX_WORDS */ wordindex = 0; for(i=0; i<HASH_TABLE_SIZE; i++) { e = hash_table[i]; while (e != NULL) { fprintf(hashfp, "%d %d %s\n", i, wordindex, e->word); t = e->next; strfree(e->word); hashfree(e); e = t; wordindex ++; } } fclose(hashfp); return wordindex;}/* * These are routines that operate on hash-tables of 4K size (used in tbuild.c) *//* crazy hash function that operates on 4K hashtables */thash4k(word, len) char *word; int len;{ unsigned int hash_value=0; unsigned int mask_3=07; unsigned int mask_12=07777; int i;#if 0 /* discard prefix = the directory name */ if (len<=1) return 0; i = len-1; while(word[i] != '/') i--; if ((i > 0) && (word[i] == '/')) { word = &word[i+1]; len = strlen(word); }#endif /*0*/ if(len<=4) { for(i=0; i<len; i++) { hash_value = (hash_value << 3) | (word[i]&mask_3); } } else { for(i=0; i<4; i++) { hash_value = (hash_value << 3) | (word[i]&mask_3); } for(i=4; i<len; i++) hash_value = mask_12 & (hash_value + word[i]); } return(hash_value & mask_12);}hash_entry *get_small_hash(hash_table, word, len, i) hash_entry *hash_table[SMALL_HASH_TABLE_SIZE]; unsigned char *word; int len; int *i;{ hash_entry *e; *i = thash4k(word, len); e = hash_table[*i]; while(e != NULL) { if (!strcmp(e->word, (char *)word)) break; else e = e->next; } return e;}hash_entry *insert_small_hash(hash_table, word, len, freq, offset) hash_entry *hash_table[SMALL_HASH_TABLE_SIZE]; unsigned char *word; int len, freq, offset;{ int i; hash_entry *e; e = get_small_hash(hash_table, word, len, &i); if (e == NULL) { hashalloc(e); stralloc(e->word, len + 2); strcpy(e->word, (char *)word); e->val.offset = 0; e->next = hash_table[i]; hash_table[i] = e; } if ((offset == -1) && (freq != -1)) { e->val.attribute.freq += freq; /* e->val.attribute.index has to be accessed from outside this function */ } else if ((offset != -1) && (freq == -1)) { e->val.offset = offset; /* used in building the string table from the dictionary */ } else { fprintf(stderr, "error in accessing hash-table [frequencies/offsets]. skipping...\n"); return (NULL); }#if 0 printf("%d %x\n", i, e);#endif /*0*/ return e;}intdump_small_hash(hash_table, HASHFILE) hash_entry *hash_table[SMALL_HASH_TABLE_SIZE]; unsigned char *HASHFILE;{ int i; FILE *hashfp; int wordindex; hash_entry *e, *t; if ((hashfp = fopen((char *)HASHFILE, "w")) == NULL) { fprintf(stderr, "cannot open for writing: %s\n", HASHFILE); return 0; } /* We have a guarantee that the wordindex + 1 cannot exceed MAX_WORDS */ wordindex = 0; for(i=0; i<SMALL_HASH_TABLE_SIZE; i++) { e = hash_table[i]; while (e != NULL) { fprintf(hashfp, "%d %d %s\n", thash64k(e->word, strlen(e->word)), wordindex, e->word); /* must look like I used 64K table */ t = e->next; strfree(e->word); hashfree(e); e = t; wordindex ++; } } fclose(hashfp); return wordindex;}/* * These are again routines that operate on big (64k) hash-tables *//* used only during debugging to see if output = input */intdump_hash_debug(hash_table, HASHFILE) hash_entry *hash_table[HASH_TABLE_SIZE]; unsigned char *HASHFILE;{ int i; FILE *hashfp; hash_entry *e; if ((hashfp = fopen((char *)HASHFILE, "w")) == NULL) { fprintf(stderr, "cannot open for writing: %s\n", HASHFILE); return 0; } /* We have a guarantee that the wordindex + 1 cannot exceed MAX_WORDS */ for(i=0; i<HASH_TABLE_SIZE; i++) { e = hash_table[i]; while (e != NULL) { fprintf(hashfp, "%d %d %d %s\n", i, e->val.attribute.freq, e->val.attribute.index, e->word); e = e->next; } }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -