📄 huffmancodec.java
字号:
package it.unimi.dsi.mg4j.compression;/* * MG4J: Managing Gigabytes for Java** Copyright (C) 2005-2007 Sebastiano Vigna ** This library is free software; you can redistribute it and/or modify it* under the terms of the GNU Lesser General Public License as published by the Free* Software Foundation; either version 2.1 of the License, or (at your option)* any later version.** This library is distributed in the hope that it will be useful, but* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License* for more details.** You should have received a copy of the GNU Lesser General Public License* along with this program; if not, write to the Free Software* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.**/import java.io.Serializable;import java.util.Arrays;import cern.colt.Sorting;import cern.colt.bitvector.BitVector;import cern.colt.function.IntComparator;/** An implementation of Huffman optimal prefix-free coding. * * <p>A Huffman coder is built starting from an array of frequencies corresponding to each * symbol. Frequency 0 symbols are allowed, but they will degrade the resulting code. * * <p>Instances of this class compute a <em>canonical</em> Huffman code * (Eugene S. Schwartz and Bruce Kallick, “Generating a Canonical Prefix Encoding”, <i>Commun. ACM</i> 7(3), pages 166−169, 1964), which can * by {@linkplain CanonicalFast64CodeWordDecoder quickly decoded using table lookups}. * The construction uses the most efficient one-pass in-place codelength computation procedure * described by Alistair Moffat and Jyrki Katajainen in “In-Place Calculation of Minimum-Redundancy Codes”, * <i>Algorithms and Data Structures, 4th International Workshop</i>, * number 955 in Lecture Notes in Computer Science, pages 393−402, Springer-Verlag, 1995. * * <p>We note by passing that this coded uses a {@link CanonicalFast64CodeWordDecoder}, which does not support codelengths above 64. * However, since the worst case for codelengths is given by Fibonacci numbers, and frequencies are to be provided as integers, * no codeword longer than the base-[(5<sup>1/2</sup> + 1)/2] logarithm of 5<sup>1/2</sup> · 2<sup>31</sup> (less than 47) will ever be generated. * @deprecated Moved to <code>dsiutils</code>. */@Deprecatedpublic class HuffmanCodec implements PrefixCodec, Serializable { private static final boolean DEBUG = false; private static final boolean ASSERTS = false; private static final long serialVersionUID = 2L; /** The number of symbols of this coder. */ public final int size; /** The codewords for this coder. */ private final BitVector[] codeWord; /** A cached singleton instance of the coder of this codec. */ private final Fast64CodeWordCoder coder; /** A cached singleton instance of the decoder of this codec. */ private final CanonicalFast64CodeWordDecoder decoder; /** Creates a new Huffman codec using the given vector of frequencies. * * @param frequency a vector of nonnnegative frequencies. */ public HuffmanCodec( final int[] frequency ) { size = frequency.length; if ( size == 0 || size == 1 ) { codeWord = new BitVector[ size ]; if ( size == 1 ) codeWord[ 0 ] = new BitVector( 0 ); coder = new Fast64CodeWordCoder( codeWord, new long[ size ] ); decoder = new CanonicalFast64CodeWordDecoder( new int[ size ], new int[ size ] ); return; } final long[] a = new long[ size ]; for( int i = size; i-- != 0; ) a[ i ] = frequency[ i ]; // Sort frequencies (this is the only n log n step). Arrays.sort( a ); // The following lines are from Moffat & Katajainen sample code. Please refer to their paper. // First pass, left to right, setting parent pointers. a[ 0 ] += a[ 1 ]; int root = 0; int leaf = 2; for ( int next = 1; next < size - 1; next++ ) { // Select first item for a pairing. if ( leaf >= size || a[ root ] < a[ leaf ] ) { a[ next ] = a[ root ]; a[ root++ ] = next; } else a[ next ] = a[ leaf++ ]; // Add on the second item. if ( leaf >= size || ( root < next && a[ root ] < a[ leaf ] ) ) { a[ next ] += a[ root ]; a[ root++ ] = next; } else a[ next ] += a[ leaf++ ]; } // Second pass, right to left, setting internal depths. a[ size - 2 ] = 0; for ( int next = size - 3; next >= 0; next-- ) a[ next ] = a[ (int)a[ next ] ] + 1; // Third pass, right to left, setting leaf depths. int available = 1, used = 0, depth = 0; root = size - 2; int next = size - 1; while ( available > 0 ) { while ( root >= 0 && a[ root ] == depth ) { used++; root--; } while ( available > used ) { a[ next-- ] = depth; available--; } available = 2 * used; depth++; used = 0; } // Reverse the order of symbol lengths, and store them into an int array. final int[] length = new int[ size ]; for( int i = size; i-- != 0; ) length[ i ] = (int)a[ size - 1 - i ]; // Sort symbols indices by decreasing frequencies (so symbols correspond to lengths). final int[] symbol = new int[ size ]; for( int i = size; i-- != 0; ) symbol[ i ] = i; Sorting.quickSort( symbol, 0, size, new IntComparator() { public int compare( int x, int y ) { return frequency[ y ] - frequency[ x ]; } }); // Assign codewords (just for the coder--the decoder needs just the lengths). int s = symbol[ 0 ]; int l = length[ 0 ]; long value = 0; BitVector v; codeWord = new BitVector[ size ]; final long[] longCodeWord = new long[ size ]; codeWord[ s ] = new BitVector( l ); for( int i = 1; i < size; i++ ) { s = symbol[ i ]; if ( length[ i ] == l ) value++; else { value++; value <<= length[ i ] - l; if ( ASSERTS ) assert length[ i ] > l; l = length[ i ]; } v = new BitVector( l ); for( int j = l; j-- != 0; ) if ( ( 1L << j & value ) != 0 ) v.set( l - 1 - j ); codeWord[ s ] = v; longCodeWord[ s ] = value; } coder = new Fast64CodeWordCoder( codeWord, longCodeWord ); decoder = new CanonicalFast64CodeWordDecoder( length, symbol ); if ( DEBUG ) { final BitVector[] codeWord = codeWords(); System.err.println( "Codes: " ); for( int i = 0; i < size; i++ ) System.err.println( i + " (" + codeWord[ i ].size() + " bits): " + codeWord[ i ] ); long totFreq = 0; for( int i = size; i-- != 0; ) totFreq += frequency[ i ]; long totBits = 0; for( int i = size; i-- != 0; ) totBits += frequency[ i ] * codeWord[ i ].size(); System.err.println( "Compression: " + totBits + " / " + totFreq * Character.SIZE + " = " + (double)totBits/(totFreq * Character.SIZE) ); }} public CodeWordCoder coder() { return coder; } public Decoder decoder() { return decoder; } public int size() { return size; } public BitVector[] codeWords() { return coder.codeWords(); } @Deprecated public PrefixCoder getCoder() { return coder(); } @Deprecated public Decoder getDecoder() { return decoder(); }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -