📄 puff.c
字号:
*
* - The maximum number of bits in a code is 15, so the possible lengths for
* any code are 1..15.
*
* - The fact that a length of zero is not permitted for a code has an
* interesting consequence. Normally if only one symbol is used for a given
* code, then in fact that code could be represented with zero bits. However
* in deflate, that code has to be at least one bit. So for example, if
* only a single distance base symbol appears in a block, then it will be
* represented by a single code of length one, in particular one 0 bit. This
* is an incomplete code, since if a 1 bit is received, it has no meaning,
* and should result in an error. So incomplete distance codes of one symbol
* should be permitted, and the receipt of invalid codes should be handled.
*
* - It is also possible to have a single literal/length code, but that code
* must be the end-of-block code, since every dynamic block has one. This
* is not the most efficient way to create an empty block (an empty fixed
* block is fewer bits), but it is allowed by the format. So incomplete
* literal/length codes of one symbol should also be permitted.
*
* - If there are only literal codes and no lengths, then there are no distance
* codes. This is represented by one distance code with zero bits.
*
* - The list of up to 286 length/literal lengths and up to 30 distance lengths
* are themselves compressed using Huffman codes and run-length encoding. In
* the list of code lengths, a 0 symbol means no code, a 1..15 symbol means
* that length, and the symbols 16, 17, and 18 are run-length instructions.
* Each of 16, 17, and 18 are follwed by extra bits to define the length of
* the run. 16 copies the last length 3 to 6 times. 17 represents 3 to 10
* zero lengths, and 18 represents 11 to 138 zero lengths. Unused symbols
* are common, hence the special coding for zero lengths.
*
* - The symbols for 0..18 are Huffman coded, and so that code must be
* described first. This is simply a sequence of up to 19 three-bit values
* representing no code (0) or the code length for that symbol (1..7).
*
* - A dynamic block starts with three fixed-size counts from which is computed
* the number of literal/length code lengths, the number of distance code
* lengths, and the number of code length code lengths (ok, you come up with
* a better name!) in the code descriptions. For the literal/length and
* distance codes, lengths after those provided are considered zero, i.e. no
* code. The code length code lengths are received in a permuted order (see
* the order[] array below) to make a short code length code length list more
* likely. As it turns out, very short and very long codes are less likely
* to be seen in a dynamic code description, hence what may appear initially
* to be a peculiar ordering.
*
* - Given the number of literal/length code lengths (nlen) and distance code
* lengths (ndist), then they are treated as one long list of nlen + ndist
* code lengths. Therefore run-length coding can and often does cross the
* boundary between the two sets of lengths.
*
* - So to summarize, the code description at the start of a dynamic block is
* three counts for the number of code lengths for the literal/length codes,
* the distance codes, and the code length codes. This is followed by the
* code length code lengths, three bits each. This is used to construct the
* code length code which is used to read the remainder of the lengths. Then
* the literal/length code lengths and distance lengths are read as a single
* set of lengths using the code length codes. Codes are constructed from
* the resulting two sets of lengths, and then finally you can start
* decoding actual compressed data in the block.
*
* - For reference, a "typical" size for the code description in a dynamic
* block is around 80 bytes.
*/
local int dynamic(struct state *s)
{
int nlen, ndist, ncode; /* number of lengths in descriptor */
int index; /* index of lengths[] */
int err; /* construct() return value */
short lengths[MAXCODES]; /* descriptor code lengths */
short lencnt[MAXBITS+1], lensym[MAXLCODES]; /* lencode memory */
short distcnt[MAXBITS+1], distsym[MAXDCODES]; /* distcode memory */
struct huffman lencode = {lencnt, lensym}; /* length code */
struct huffman distcode = {distcnt, distsym}; /* distance code */
static const short order[19] = /* permutation of code length codes */
{16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15};
/* get number of lengths in each table, check lengths */
nlen = bits(s, 5) + 257;
ndist = bits(s, 5) + 1;
ncode = bits(s, 4) + 4;
if (nlen > MAXLCODES || ndist > MAXDCODES)
return -3; /* bad counts */
/* read code length code lengths (really), missing lengths are zero */
for (index = 0; index < ncode; index++)
lengths[order[index]] = bits(s, 3);
for (; index < 19; index++)
lengths[order[index]] = 0;
/* build huffman table for code lengths codes (use lencode temporarily) */
err = construct(&lencode, lengths, 19);
if (err != 0) return -4; /* require complete code set here */
/* read length/literal and distance code length tables */
index = 0;
while (index < nlen + ndist) {
int symbol; /* decoded value */
int len; /* last length to repeat */
symbol = decode(s, &lencode);
if (symbol < 16) /* length in 0..15 */
lengths[index++] = symbol;
else { /* repeat instruction */
len = 0; /* assume repeating zeros */
if (symbol == 16) { /* repeat last length 3..6 times */
if (index == 0) return -5; /* no last length! */
len = lengths[index - 1]; /* last length */
symbol = 3 + bits(s, 2);
}
else if (symbol == 17) /* repeat zero 3..10 times */
symbol = 3 + bits(s, 3);
else /* == 18, repeat zero 11..138 times */
symbol = 11 + bits(s, 7);
if (index + symbol > nlen + ndist)
return -6; /* too many lengths! */
while (symbol--) /* repeat last or zero symbol times */
lengths[index++] = len;
}
}
/* build huffman table for literal/length codes */
err = construct(&lencode, lengths, nlen);
if (err < 0 || (err > 0 && nlen - lencode.count[0] != 1))
return -7; /* only allow incomplete codes if just one code */
/* build huffman table for distance codes */
err = construct(&distcode, lengths + nlen, ndist);
if (err < 0 || (err > 0 && ndist - distcode.count[0] != 1))
return -8; /* only allow incomplete codes if just one code */
/* decode data until end-of-block code */
return codes(s, &lencode, &distcode);
}
/*
* Inflate source to dest. On return, destlen and sourcelen are updated to the
* size of the uncompressed data and the size of the deflate data respectively.
* On success, the return value of puff() is zero. If there is an error in the
* source data, i.e. it is not in the deflate format, then a negative value is
* returned. If there is not enough input available or there is not enough
* output space, then a positive error is returned. In that case, destlen and
* sourcelen are not updated to facilitate retrying from the beginning with the
* provision of more input data or more output space. In the case of invalid
* inflate data (a negative error), the dest and source pointers are updated to
* facilitate the debugging of deflators.
*
* puff() also has a mode to determine the size of the uncompressed output with
* no output written. For this dest must be (unsigned char *)0. In this case,
* the input value of *destlen is ignored, and on return *destlen is set to the
* size of the uncompressed output.
*
* The return codes are:
*
* 2: available inflate data did not terminate
* 1: output space exhausted before completing inflate
* 0: successful inflate
* -1: invalid block type (type == 3)
* -2: stored block length did not match one's complement
* -3: dynamic block code description: too many length or distance codes
* -4: dynamic block code description: code lengths codes incomplete
* -5: dynamic block code description: repeat lengths with no first length
* -6: dynamic block code description: repeat more than specified lengths
* -7: dynamic block code description: invalid literal/length code lengths
* -8: dynamic block code description: invalid distance code lengths
* -9: invalid literal/length or distance code in fixed or dynamic block
* -10: distance is too far back in fixed or dynamic block
*
* Format notes:
*
* - Three bits are read for each block to determine the kind of block and
* whether or not it is the last block. Then the block is decoded and the
* process repeated if it was not the last block.
*
* - The leftover bits in the last byte of the deflate data after the last
* block (if it was a fixed or dynamic block) are undefined and have no
* expected values to check.
*/
int puff(unsigned char *dest, /* pointer to destination pointer */
unsigned long *destlen, /* amount of output space */
unsigned char *source, /* pointer to source data pointer */
unsigned long *sourcelen) /* amount of input available */
{
struct state s; /* input/output state */
int last, type; /* block information */
int err; /* return value */
/* initialize output state */
s.out = dest;
s.outlen = *destlen; /* ignored if dest is NIL */
s.outcnt = 0;
/* initialize input state */
s.in = source;
s.inlen = *sourcelen;
s.incnt = 0;
s.bitbuf = 0;
s.bitcnt = 0;
/* return if bits() or decode() tries to read past available input */
if (setjmp(s.env) != 0) /* if came back here via longjmp() */
err = 2; /* then skip do-loop, return error */
else {
/* process blocks until last block or error */
do {
last = bits(&s, 1); /* one if last block */
type = bits(&s, 2); /* block type 0..3 */
err = type == 0 ? stored(&s) :
(type == 1 ? fixed(&s) :
(type == 2 ? dynamic(&s) :
-1)); /* type == 3, invalid */
if (err != 0) break; /* return with error */
} while (!last);
}
/* update the lengths and return */
if (err <= 0) {
*destlen = s.outcnt;
*sourcelen = s.incnt;
}
return err;
}
#ifdef TEST
/* Example of how to use puff() */
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
local unsigned char *yank(char *name, unsigned long *len)
{
unsigned long size;
unsigned char *buf;
FILE *in;
struct stat s;
*len = 0;
if (stat(name, &s)) return NULL;
if ((s.st_mode & S_IFMT) != S_IFREG) return NULL;
size = (unsigned long)(s.st_size);
if (size == 0 || (off_t)size != s.st_size) return NULL;
in = fopen(name, "r");
if (in == NULL) return NULL;
buf = malloc(size);
if (buf != NULL && fread(buf, 1, size, in) != size) {
free(buf);
buf = NULL;
}
fclose(in);
*len = size;
return buf;
}
int main(int argc, char **argv)
{
int ret;
unsigned char *source;
unsigned long len, sourcelen, destlen;
if (argc < 2) return 2;
source = yank(argv[1], &len);
if (source == NULL) return 2;
sourcelen = len;
ret = puff(NIL, &destlen, source, &sourcelen);
if (ret)
printf("puff() failed with return code %d\n", ret);
else {
printf("puff() succeeded uncompressing %lu bytes\n", destlen);
if (sourcelen < len) printf("%lu compressed bytes unused\n",
len - sourcelen);
}
free(source);
return ret;
}
#endif
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -