📄 deflate.c
字号:
/* deflate.c -- compress data using the deflation algorithm
* Copyright (C) 1992-1993 Jean-loup Gailly
* This is free software; you can redistribute it and/or modify it under the
* terms of the GNU General Public License, see the file COPYING.
*/
/*
* PURPOSE
*
* Identify new text as repetitions of old text within a fixed-
* length sliding window trailing behind the new text.
*
* DISCUSSION
*
* The "deflation" process depends on being able to identify portions
* of the input text which are identical to earlier input (within a
* sliding window trailing behind the input currently being processed).
*
* The most straightforward technique turns out to be the fastest for
* most input files: try all possible matches and select the longest.
* The key feature of this algorithm is that insertions into the string
* dictionary are very simple and thus fast, and deletions are avoided
* completely. Insertions are performed at each input character, whereas
* string matches are performed only when the previous match ends. So it
* is preferable to spend more time in matches to allow very fast string
* insertions and avoid deletions. The matching algorithm for small
* strings is inspired from that of Rabin & Karp. A brute force approach
* is used to find longer strings when a small match has been found.
* A similar algorithm is used in comic (by Jan-Mark Wams) and freeze
* (by Leonid Broukhis).
* A previous version of this file used a more sophisticated algorithm
* (by Fiala and Greene) which is guaranteed to run in linear amortized
* time, but has a larger average cost, uses more memory and is patented.
* However the F&G algorithm may be faster for some highly redundant
* files if the parameter max_chain_length (described below) is too large.
*
* ACKNOWLEDGEMENTS
*
* The idea of lazy evaluation of matches is due to Jan-Mark Wams, and
* I found it in 'freeze' written by Leonid Broukhis.
* Thanks to many info-zippers for bug reports and testing.
*
* REFERENCES
*
* APPNOTE.TXT documentation file in PKZIP 1.93a distribution.
*
* A description of the Rabin and Karp algorithm is given in the book
* "Algorithms" by R. Sedgewick, Addison-Wesley, p252.
*
* Fiala,E.R., and Greene,D.H.
* Data Compression with Finite Windows, Comm.ACM, 32,4 (1989) 490-595
*
* INTERFACE
*
* void lm_init (int pack_level, ush *flags)
* Initialize the "longest match" routines for a new file
*
* ulg deflate (void)
* Processes a new input file and return its compressed length. Sets
* the compressed length, crc, deflate flags and internal file
* attributes.
*/
#include <stdio.h>
#include "tailor.h"
#include "gzip.h"
#include "lzw.h" /* just for consistency checking */
#ifdef RCSID
static char rcsid[] = "$Id: deflate.c,v 0.15 1993/06/24 10:53:53 jloup Exp $";
#endif
/* ===========================================================================
* Configuration parameters
*/
/* Compile with MEDIUM_MEM to reduce the memory requirements or
* with SMALL_MEM to use as little memory as possible. Use BIG_MEM if the
* entire input file can be held in memory (not possible on 16 bit systems).
* Warning: defining these symbols affects HASH_BITS (see below) and thus
* affects the compression ratio. The compressed output
* is still correct, and might even be smaller in some cases.
*/
#ifdef SMALL_MEM
# define HASH_BITS 13 /* Number of bits used to hash strings */
#endif
#ifdef MEDIUM_MEM
# define HASH_BITS 14
#endif
#ifndef HASH_BITS
# define HASH_BITS 15
/* For portability to 16 bit machines, do not use values above 15. */
#endif
/* To save space (see unlzw.c), we overlay prev+head with tab_prefix and
* window with tab_suffix. Check that we can do this:
*/
#if (WSIZE<<1) > (1<<BITS)
error: cannot overlay window with tab_suffix and prev with tab_prefix0
#endif
#if HASH_BITS > BITS-1
error: cannot overlay head with tab_prefix1
#endif
#define HASH_SIZE (unsigned)(1<<HASH_BITS)
#define HASH_MASK (HASH_SIZE-1)
#define WMASK (WSIZE-1)
/* HASH_SIZE and WSIZE must be powers of two */
#define NIL 0
/* Tail of hash chains */
#define FAST 4
#define SLOW 2
/* speed options for the general purpose bit flag */
#ifndef TOO_FAR
# define TOO_FAR 4096
#endif
/* Matches of length 3 are discarded if their distance exceeds TOO_FAR */
/* ===========================================================================
* Local data used by the "longest match" routines.
*/
typedef ush Pos;
typedef unsigned IPos;
/* A Pos is an index in the character window. We use short instead of int to
* save space in the various tables. IPos is used only for parameter passing.
*/
/* DECLARE(uch, window, 2L*WSIZE); */
/* Sliding window. Input bytes are read into the second half of the window,
* and move to the first half later to keep a dictionary of at least WSIZE
* bytes. With this organization, matches are limited to a distance of
* WSIZE-MAX_MATCH bytes, but this ensures that IO is always
* performed with a length multiple of the block size. Also, it limits
* the window size to 64K, which is quite useful on MSDOS.
* To do: limit the window size to WSIZE+BSZ if SMALL_MEM (the code would
* be less efficient).
*/
/* DECLARE(Pos, prev, WSIZE); */
/* Link to older string with same hash index. To limit the size of this
* array to 64K, this link is maintained only for the last 32K strings.
* An index in this array is thus a window index modulo 32K.
*/
/* DECLARE(Pos, head, 1<<HASH_BITS); */
/* Heads of the hash chains or NIL. */
ulg window_size = (ulg)2*WSIZE;
/* window size, 2*WSIZE except for MMAP or BIG_MEM, where it is the
* input file length plus MIN_LOOKAHEAD.
*/
long block_start;
/* window position at the beginning of the current output block. Gets
* negative when the window is moved backwards.
*/
local unsigned ins_h; /* hash index of string to be inserted */
#define H_SHIFT ((HASH_BITS+MIN_MATCH-1)/MIN_MATCH)
/* Number of bits by which ins_h and del_h must be shifted at each
* input step. It must be such that after MIN_MATCH steps, the oldest
* byte no longer takes part in the hash key, that is:
* H_SHIFT * MIN_MATCH >= HASH_BITS
*/
unsigned int near prev_length;
/* Length of the best match at previous step. Matches not greater than this
* are discarded. This is used in the lazy match evaluation.
*/
unsigned near strstart; /* start of string to insert */
unsigned near match_start; /* start of matching string */
local int eofile; /* flag set at end of input file */
local unsigned lookahead; /* number of valid bytes ahead in window */
unsigned near max_chain_length;
/* To speed up deflation, hash chains are never searched beyond this length.
* A higher limit improves compression ratio but degrades the speed.
*/
local unsigned int max_lazy_match;
/* Attempt to find a better match only when the current match is strictly
* smaller than this value. This mechanism is used only for compression
* levels >= 4.
*/
#define max_insert_length max_lazy_match
/* Insert new strings in the hash table only if the match length
* is not greater than this length. This saves time but degrades compression.
* max_insert_length is used only for compression levels <= 3.
*/
local int compr_level;
/* compression level (1..9) */
unsigned near good_match;
/* Use a faster search when the previous match is longer than this */
/* Values for max_lazy_match, good_match and max_chain_length, depending on
* the desired pack level (0..9). The values given below have been tuned to
* exclude worst case performance for pathological files. Better values may be
* found for specific files.
*/
typedef struct config {
ush good_length; /* reduce lazy search above this match length */
ush max_lazy; /* do not perform lazy search above this match length */
ush nice_length; /* quit search above this match length */
ush max_chain;
} config;
#ifdef FULL_SEARCH
# define nice_match MAX_MATCH
#else
int near nice_match; /* Stop searching when current match exceeds this */
#endif
local config configuration_table[10] = {
/* good lazy nice chain */
/* 0 */ {0, 0, 0, 0}, /* store only */
/* 1 */ {4, 4, 8, 4}, /* maximum speed, no lazy matches */
/* 2 */ {4, 5, 16, 8},
/* 3 */ {4, 6, 32, 32},
/* 4 */ {4, 4, 16, 16}, /* lazy matches */
/* 5 */ {8, 16, 32, 32},
/* 6 */ {8, 16, 128, 128},
/* 7 */ {8, 32, 128, 256},
/* 8 */ {32, 128, 258, 1024},
/* 9 */ {32, 258, 258, 4096}}; /* maximum compression */
/* Note: the deflate() code requires max_lazy >= MIN_MATCH and max_chain >= 4
* For deflate_fast() (levels <= 3) good is ignored and lazy has a different
* meaning.
*/
#define EQUAL 0
/* result of memcmp for equal strings */
/* ===========================================================================
* Prototypes for local functions.
*/
local void fill_window OF((void));
local ulg deflate_fast OF((void));
int longest_match OF((IPos cur_match));
#ifdef ASMV
void match_init OF((void)); /* asm code initialization */
#endif
#ifdef DEBUG
local void check_match OF((IPos start, IPos match, int length));
#endif
/* ===========================================================================
* Update a hash value with the given input byte
* IN assertion: all calls to to UPDATE_HASH are made with consecutive
* input characters, so that a running hash key can be computed from the
* previous key instead of complete recalculation each time.
*/
#define UPDATE_HASH(h,c) (h = (((h)<<H_SHIFT) ^ (c)) & HASH_MASK)
/* ===========================================================================
* Insert string s in the dictionary and set match_head to the previous head
* of the hash chain (the most recent string with same hash key). Return
* the previous length of the hash chain.
* IN assertion: all calls to to INSERT_STRING are made with consecutive
* input characters and the first MIN_MATCH bytes of s are valid
* (except for the last MIN_MATCH-1 bytes of the input file).
*/
#define INSERT_STRING(s, match_head) \
(UPDATE_HASH(ins_h, window[(s) + MIN_MATCH-1]), \
prev[(s) & WMASK] = match_head = head[ins_h], \
head[ins_h] = (s))
/* ===========================================================================
* Initialize the "longest match" routines for a new file
*/
void lm_init (pack_level, flags)
int pack_level; /* 0: store, 1: best speed, 9: best compression */
ush *flags; /* general purpose bit flag */
{
register unsigned j;
if (pack_level < 1 || pack_level > 9) error("bad pack level");
compr_level = pack_level;
/* Initialize the hash table. */
#if defined(MAXSEG_64K) && HASH_BITS == 15
for (j = 0; j < HASH_SIZE; j++) head[j] = NIL;
#else
memzero((char*)head, HASH_SIZE*sizeof(*head));
#endif
/* prev will be initialized on the fly */
/* Set the default configuration parameters:
*/
max_lazy_match = configuration_table[pack_level].max_lazy;
good_match = configuration_table[pack_level].good_length;
#ifndef FULL_SEARCH
nice_match = configuration_table[pack_level].nice_length;
#endif
max_chain_length = configuration_table[pack_level].max_chain;
if (pack_level == 1) {
*flags |= FAST;
} else if (pack_level == 9) {
*flags |= SLOW;
}
/* ??? reduce max_chain_length for binary files */
strstart = 0;
block_start = 0L;
#ifdef ASMV
match_init(); /* initialize the asm code */
#endif
lookahead = read_buf((char*)window,
sizeof(int) <= 2 ? (unsigned)WSIZE : 2*WSIZE);
if (lookahead == 0 || lookahead == (unsigned)EOF) {
eofile = 1, lookahead = 0;
return;
}
eofile = 0;
/* Make sure that we always have enough lookahead. This is important
* if input comes from a device such as a tty.
*/
while (lookahead < MIN_LOOKAHEAD && !eofile) fill_window();
ins_h = 0;
for (j=0; j<MIN_MATCH-1; j++) UPDATE_HASH(ins_h, window[j]);
/* If lookahead < MIN_MATCH, ins_h is garbage, but this is
* not important since only literal bytes will be emitted.
*/
}
/* ===========================================================================
* Set match_start to the longest match starting at the given string and
* return its length. Matches shorter or equal to prev_length are discarded,
* in which case the result is equal to prev_length and match_start is
* garbage.
* IN assertions: cur_match is the head of the hash chain for the current
* string (strstart) and its distance is <= MAX_DIST, and prev_length >= 1
*/
#ifndef ASMV
/* For MSDOS, OS/2 and 386 Unix, an optimized version is in match.asm or
* match.s. The code is functionally equivalent, so you can use the C version
* if desired.
*/
int longest_match(cur_match)
IPos cur_match; /* current match */
{
unsigned chain_length = max_chain_length; /* max hash chain length */
register uch *scan = window + strstart; /* current string */
register uch *match; /* matched string */
register int len; /* length of current match */
int best_len = prev_length; /* best match length so far */
IPos limit = strstart > (IPos)MAX_DIST ? strstart - (IPos)MAX_DIST : NIL;
/* Stop when cur_match becomes <= limit. To simplify the code,
* we prevent matches with the string of window index 0.
*/
/* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16.
* It is easy to get rid of this optimization if necessary.
*/
#if HASH_BITS < 8 || MAX_MATCH != 258
error: Code too clever
#endif
#ifdef UNALIGNED_OK
/* Compare two bytes at a time. Note: this is not always beneficial.
* Try with and without -DUNALIGNED_OK to check.
*/
register uch *strend = window + strstart + MAX_MATCH - 1;
register ush scan_start = *(ush*)scan;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -