📄 huffman.c
字号:
/* * huffman - Encode/Decode files using Huffman encoding. * http://huffman.sourceforge.net * Copyright (C) 2003 Douglas Ryan Richardson */#include <stdio.h>#include <stdlib.h>#include <string.h>#include <assert.h>#include "huffman.h"#ifdef WIN32#include <winsock2.h>#include <malloc.h>#define alloca _alloca#else#include <netinet/in.h>#endiftypedef struct huffman_node_tag{ unsigned char isLeaf; unsigned long count; struct huffman_node_tag *parent; union { struct { struct huffman_node_tag *zero, *one; }; unsigned char symbol; };} huffman_node;typedef struct huffman_code_tag{ /* The length of this code in bits. */ unsigned long numbits; /* The bits that make up this code. The first bit is at position 0 in bits[0]. The second bit is at position 1 in bits[0]. The eighth bit is at position 7 in bits[0]. The ninth bit is at position 0 in bits[1]. */ unsigned char *bits;} huffman_code;static unsigned longnumbytes_from_numbits(unsigned long numbits){ return numbits / 8 + (numbits % 8 ? 1 : 0);}/* * get_bit returns the ith bit in the bits array * in the 0th position of the return value. */static unsigned charget_bit(unsigned char* bits, unsigned long i){ return (bits[i / 8] >> i % 8) & 1;}static voidreverse_bits(unsigned char* bits, unsigned long numbits){ unsigned long numbytes = numbytes_from_numbits(numbits); unsigned char *tmp = (unsigned char*)alloca(numbytes); unsigned long curbit; long curbyte = 0; memset(tmp, 0, numbytes); for(curbit = 0; curbit < numbits; ++curbit) { unsigned int bitpos = curbit % 8; if(curbit > 0 && curbit % 8 == 0) ++curbyte; tmp[curbyte] |= (get_bit(bits, numbits - curbit - 1) << bitpos); } memcpy(bits, tmp, numbytes);}/* * new_code builds a huffman_code from a leaf in * a Huffman tree. */static huffman_code*new_code(const huffman_node* leaf){ /* Build the huffman code by walking up to * the root node and then reversing the bits, * since the Huffman code is calculated by * walking down the tree. */ unsigned long numbits = 0; unsigned char* bits = NULL; huffman_code *p; while(leaf && leaf->parent) { huffman_node *parent = leaf->parent; unsigned char cur_bit = (unsigned char)(numbits % 8); unsigned long cur_byte = numbits / 8; /* If we need another byte to hold the code, then allocate it. */ if(cur_bit == 0) { size_t newSize = cur_byte + 1; bits = (unsigned char*)realloc(bits, newSize); bits[newSize - 1] = 0; /* Initialize the new byte. */ } /* If a one must be added then or it in. If a zero * must be added then do nothing, since the byte * was initialized to zero. */ if(leaf == parent->one) bits[cur_byte] |= 1 << cur_bit; ++numbits; leaf = parent; } if(bits) reverse_bits(bits, numbits); p = (huffman_code*)malloc(sizeof(huffman_code)); p->numbits = numbits; p->bits = bits; return p;}#define MAX_SYMBOLS 256typedef huffman_node* SymbolFrequencies[MAX_SYMBOLS];typedef huffman_code* SymbolEncoder[MAX_SYMBOLS];static huffman_node*new_leaf_node(unsigned char symbol){ huffman_node *p = (huffman_node*)malloc(sizeof(huffman_node)); p->isLeaf = 1; p->symbol = symbol; p->count = 0; p->parent = 0; return p;}static huffman_node*new_nonleaf_node(unsigned long count, huffman_node *zero, huffman_node *one){ huffman_node *p = (huffman_node*)malloc(sizeof(huffman_node)); p->isLeaf = 0; p->count = count; p->zero = zero; p->one = one; p->parent = 0; return p;}static voidfree_huffman_tree(huffman_node *subtree){ if(subtree == NULL) return; if(!subtree->isLeaf) { free_huffman_tree(subtree->zero); free_huffman_tree(subtree->one); } free(subtree);}static voidfree_code(huffman_code* p){ free(p->bits); free(p);}static voidfree_encoder(SymbolEncoder *pSE){ unsigned long i; for(i = 0; i < MAX_SYMBOLS; ++i) { huffman_code *p = (*pSE)[i]; if(p) free_code(p); } free(pSE);}static voidinit_frequencies(SymbolFrequencies *pSF){ memset(*pSF, 0, sizeof(SymbolFrequencies));#if 0 unsigned int i; for(i = 0; i < MAX_SYMBOLS; ++i) { unsigned char uc = (unsigned char)i; (*pSF)[i] = new_leaf_node(uc); }#endif}typedef struct buf_cache_tag{ unsigned char *cache; unsigned int cache_len; unsigned int cache_cur; unsigned char **pbufout; unsigned int *pbufoutlen;} buf_cache;static int init_cache(buf_cache* pc, unsigned int cache_size, unsigned char **pbufout, unsigned int *pbufoutlen){ assert(pc && pbufout && pbufoutlen); if(!pbufout || !pbufoutlen) return 1; pc->cache = (unsigned char*)malloc(cache_size); pc->cache_len = cache_size; pc->cache_cur = 0; pc->pbufout = pbufout; *pbufout = NULL; pc->pbufoutlen = pbufoutlen; *pbufoutlen = 0; return pc->cache ? 0 : 1;}static void free_cache(buf_cache* pc){ assert(pc); if(pc->cache) { free(pc->cache); pc->cache = NULL; }}static int flush_cache(buf_cache* pc){ assert(pc); if(pc->cache_cur > 0) { unsigned int newlen = pc->cache_cur + *pc->pbufoutlen; unsigned char* tmp = realloc(*pc->pbufout, newlen); if(!tmp) return 1; memcpy(tmp + *pc->pbufoutlen, pc->cache, pc->cache_cur); *pc->pbufout = tmp; *pc->pbufoutlen = newlen; pc->cache_cur = 0; } return 0;}static int write_cache(buf_cache* pc, const void *to_write, unsigned int to_write_len){ unsigned char* tmp; assert(pc && to_write); assert(pc->cache_len >= pc->cache_cur); /* If trying to write more than the cache will hold * flush the cache and allocate enough space immediately, * that is, don't use the cache. */ if(to_write_len > pc->cache_len - pc->cache_cur) { unsigned int newlen; flush_cache(pc); newlen = *pc->pbufoutlen + to_write_len; tmp = realloc(*pc->pbufout, newlen); if(!tmp) return 1; memcpy(tmp + *pc->pbufoutlen, to_write, to_write_len); *pc->pbufout = tmp; *pc->pbufoutlen = newlen; } else { /* Write the data to the cache. */ memcpy(pc->cache + pc->cache_cur, to_write, to_write_len); pc->cache_cur += to_write_len; } return 0;}static unsigned intget_symbol_frequencies(SymbolFrequencies *pSF, FILE *in){ int c; unsigned int total_count = 0; /* Set all frequencies to 0. */ init_frequencies(pSF); /* Count the frequency of each symbol in the input file. */ while((c = fgetc(in)) != EOF) { unsigned char uc = c; if(!(*pSF)[uc]) (*pSF)[uc] = new_leaf_node(uc); ++(*pSF)[uc]->count; ++total_count; } return total_count;}static unsigned intget_symbol_frequencies_from_memory(SymbolFrequencies *pSF, const unsigned char *bufin, unsigned int bufinlen){ unsigned int i; unsigned int total_count = 0; /* Set all frequencies to 0. */ init_frequencies(pSF); /* Count the frequency of each symbol in the input file. */ for(i = 0; i < bufinlen; ++i) { unsigned char uc = bufin[i]; if(!(*pSF)[uc]) (*pSF)[uc] = new_leaf_node(uc); ++(*pSF)[uc]->count; ++total_count; } return total_count;}/* * When used by qsort, SFComp sorts the array so that * the symbol with the lowest frequency is first. Any * NULL entries will be sorted to the end of the list. */static intSFComp(const void *p1, const void *p2){ const huffman_node *hn1 = *(const huffman_node**)p1; const huffman_node *hn2 = *(const huffman_node**)p2; /* Sort all NULLs to the end. */ if(hn1 == NULL && hn2 == NULL) return 0; if(hn1 == NULL) return 1; if(hn2 == NULL) return -1; if(hn1->count > hn2->count) return 1; else if(hn1->count < hn2->count) return -1; return 0;}#if 0static voidprint_freqs(SymbolFrequencies * pSF){ size_t i; for(i = 0; i < MAX_SYMBOLS; ++i) { if((*pSF)[i]) printf("%d, %ld\n", (*pSF)[i]->symbol, (*pSF)[i]->count); else printf("NULL\n"); }}#endif/* * build_symbol_encoder builds a SymbolEncoder by walking * down to the leaves of the Huffman tree and then, * for each leaf, determines its code. */static voidbuild_symbol_encoder(huffman_node *subtree, SymbolEncoder *pSF){ if(subtree == NULL) return; if(subtree->isLeaf) (*pSF)[subtree->symbol] = new_code(subtree); else { build_symbol_encoder(subtree->zero, pSF); build_symbol_encoder(subtree->one, pSF); }}/* * calculate_huffman_codes turns pSF into an array * with a single entry that is the root of the * huffman tree. The return value is a SymbolEncoder, * which is an array of huffman codes index by symbol value. */static SymbolEncoder*calculate_huffman_codes(SymbolFrequencies * pSF){ unsigned int i = 0; unsigned int n = 0; huffman_node *m1 = NULL, *m2 = NULL; SymbolEncoder *pSE = NULL; #if 0 printf("BEFORE SORT\n"); print_freqs(pSF);#endif /* Sort the symbol frequency array by ascending frequency. */ qsort((*pSF), MAX_SYMBOLS, sizeof((*pSF)[0]), SFComp);#if 0 printf("AFTER SORT\n"); print_freqs(pSF);#endif /* Get the number of symbols. */ for(n = 0; n < MAX_SYMBOLS && (*pSF)[n]; ++n) ; /* * Construct a Huffman tree. This code is based * on the algorithm given in Managing Gigabytes * by Ian Witten et al, 2nd edition, page 34. * Note that this implementation uses a simple * count instead of probability. */ for(i = 0; i < n - 1; ++i) { /* Set m1 and m2 to the two subsets of least probability. */ m1 = (*pSF)[0]; m2 = (*pSF)[1]; /* Replace m1 and m2 with a set {m1, m2} whose probability * is the sum of that of m1 and m2. */ (*pSF)[0] = m1->parent = m2->parent = new_nonleaf_node(m1->count + m2->count, m1, m2); (*pSF)[1] = NULL; /* Put newSet into the correct count position in pSF. */ qsort((*pSF), n, sizeof((*pSF)[0]), SFComp); } /* Build the SymbolEncoder array from the tree. */ pSE = (SymbolEncoder*)malloc(sizeof(SymbolEncoder)); memset(pSE, 0, sizeof(SymbolEncoder)); build_symbol_encoder((*pSF)[0], pSE); return pSE;}/* * Write the huffman code table. The format is: * 4 byte code count in network byte order. * 4 byte number of bytes encoded * (if you decode the data, you should get this number of bytes) * code1 * ... * codeN, where N is the count read at the begginning of the file. * Each codeI has the following format: * 1 byte symbol, 1 byte code bit length, code bytes. * Each entry has numbytes_from_numbits code bytes. * The last byte of each code may have extra bits, if the number of * bits in the code is not a multiple of 8. */static intwrite_code_table(FILE* out, SymbolEncoder *se, unsigned int symbol_count){ unsigned long i, count = 0; /* Determine the number of entries in se. */ for(i = 0; i < MAX_SYMBOLS; ++i) { if((*se)[i]) ++count; } /* Write the number of entries in network byte order. */ i = htonl(count); if(fwrite(&i, sizeof(i), 1, out) != 1) return 1; /* Write the number of bytes that will be encoded. */ symbol_count = htonl(symbol_count); if(fwrite(&symbol_count, sizeof(symbol_count), 1, out) != 1) return 1; /* Write the entries. */ for(i = 0; i < MAX_SYMBOLS; ++i) { huffman_code *p = (*se)[i]; if(p) { unsigned int numbytes; /* Write the 1 byte symbol. */ fputc((unsigned char)i, out); /* Write the 1 byte code bit length. */ fputc(p->numbits, out); /* Write the code bytes. */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -