📄 decompress_bunzip2.c
字号:
/* Huffman decode value to get nextSym (with bounds checking) */ if (i > hufGroup->maxLen) return RETVAL_DATA_ERROR; j = (j >> (hufGroup->maxLen - i)) - base[i]; if ((unsigned)j >= MAX_SYMBOLS) return RETVAL_DATA_ERROR; nextSym = hufGroup->permute[j]; /* We have now decoded the symbol, which indicates either a new literal byte, or a repeated run of the most recent literal byte. First, check if nextSym indicates a repeated run, and if so loop collecting how many times to repeat the last literal. */ if ((unsigned)nextSym <= SYMBOL_RUNB) { /* RUNA or RUNB */ /* If this is the start of a new run, zero out counter */ if (!runPos) { runPos = 1; t = 0; } /* Neat trick that saves 1 symbol: instead of or-ing 0 or 1 at each bit position, add 1 or 2 instead. For example, 1011 is 1<<0 + 1<<1 + 2<<2. 1010 is 2<<0 + 2<<1 + 1<<2. You can make any bit pattern that way using 1 less symbol than the basic or 0/1 method (except all bits 0, which would use no symbols, but a run of length 0 doesn't mean anything in this context). Thus space is saved. */ t += (runPos << nextSym); /* +runPos if RUNA; +2*runPos if RUNB */ if (runPos < dbufSize) runPos <<= 1; goto end_of_huffman_loop; } /* When we hit the first non-run symbol after a run, we now know how many times to repeat the last literal, so append that many copies to our buffer of decoded symbols (dbuf) now. (The last literal used is the one at the head of the mtfSymbol array.) */ if (runPos) { runPos = 0; if (dbufCount + t >= dbufSize) return RETVAL_DATA_ERROR; uc = symToByte[mtfSymbol[0]]; byteCount[uc] += t; while (t--) dbuf[dbufCount++] = uc; } /* Is this the terminating symbol? */ if (nextSym > symTotal) break; /* At this point, nextSym indicates a new literal character. Subtract one to get the position in the MTF array at which this literal is currently to be found. (Note that the result can't be -1 or 0, because 0 and 1 are RUNA and RUNB. But another instance of the first symbol in the mtf array, position 0, would have been handled as part of a run above. Therefore 1 unused mtf position minus 2 non-literal nextSym values equals -1.) */ if (dbufCount >= dbufSize) return RETVAL_DATA_ERROR; i = nextSym - 1; uc = mtfSymbol[i]; /* Adjust the MTF array. Since we typically expect to move only a * small number of symbols, and are bound by 256 in any case, using * memmove here would typically be bigger and slower due to function * call overhead and other assorted setup costs. */ do { mtfSymbol[i] = mtfSymbol[i-1]; } while (--i); mtfSymbol[0] = uc; uc = symToByte[uc]; /* We have our literal byte. Save it into dbuf. */ byteCount[uc]++; dbuf[dbufCount++] = (unsigned)uc; /* Skip group initialization if we're not done with this group. Done * this way to avoid compiler warning. */ end_of_huffman_loop: if (symCount--) goto continue_this_group; } /* At this point, we've read all the Huffman-coded symbols (and repeated runs) for this block from the input stream, and decoded them into the intermediate buffer. There are dbufCount many decoded bytes in dbuf[]. Now undo the Burrows-Wheeler transform on dbuf. See http://dogma.net/markn/articles/bwt/bwt.htm */ /* Turn byteCount into cumulative occurrence counts of 0 to n-1. */ j = 0; for (i = 0; i < 256; i++) { k = j + byteCount[i]; byteCount[i] = j; j = k; } /* Figure out what order dbuf would be in if we sorted it. */ for (i = 0; i < dbufCount; i++) { uc = (unsigned char)(dbuf[i] & 0xff); dbuf[byteCount[uc]] |= (i << 8); byteCount[uc]++; } /* Decode first byte by hand to initialize "previous" byte. Note that it doesn't get output, and if the first three characters are identical it doesn't qualify as a run (hence writeRunCountdown=5). */ if (dbufCount) { if (origPtr >= dbufCount) return RETVAL_DATA_ERROR; bd->writePos = dbuf[origPtr]; bd->writeCurrent = (unsigned char)(bd->writePos & 0xff); bd->writePos >>= 8; bd->writeRunCountdown = 5; } bd->writeCount = dbufCount; return RETVAL_OK;}/* Undo burrows-wheeler transform on intermediate buffer to produce output. If start_bunzip was initialized with out_fd=-1, then up to len bytes of data are written to outbuf. Return value is number of bytes written or error (all errors are negative numbers). If out_fd!=-1, outbuf and len are ignored, data is written to out_fd and return is RETVAL_OK or error.*/int read_bunzip(bunzip_data *bd, char *outbuf, int len){ const unsigned *dbuf; int pos, current, previous, gotcount; /* If last read was short due to end of file, return last block now */ if (bd->writeCount < 0) return bd->writeCount; gotcount = 0; dbuf = bd->dbuf; pos = bd->writePos; current = bd->writeCurrent; /* We will always have pending decoded data to write into the output buffer unless this is the very first call (in which case we haven't Huffman-decoded a block into the intermediate buffer yet). */ if (bd->writeCopies) { /* Inside the loop, writeCopies means extra copies (beyond 1) */ --bd->writeCopies; /* Loop outputting bytes */ for (;;) { /* If the output buffer is full, snapshot state and return */ if (gotcount >= len) { bd->writePos =pos; bd->writeCurrent = current; bd->writeCopies++; return len; } /* Write next byte into output buffer, updating CRC */ outbuf[gotcount++] = current; bd->writeCRC = (bd->writeCRC << 8) ^ bd->crc32Table[(bd->writeCRC >> 24) ^ current]; /* Loop now if we're outputting multiple copies of this byte */ if (bd->writeCopies) { --bd->writeCopies; continue; } decode_next_byte: if (!bd->writeCount--) break; /* Follow sequence vector to undo Burrows-Wheeler transform */ previous = current; pos = dbuf[pos]; current = pos & 0xff; pos >>= 8; /* After 3 consecutive copies of the same byte, the 4th is a repeat count. We count down from 4 instead * of counting up because testing for non-zero is faster */ if (--bd->writeRunCountdown) { if (current != previous) bd->writeRunCountdown = 4; } else { /* We have a repeated run, this byte indicates the count */ bd->writeCopies = current; current = previous; bd->writeRunCountdown = 5; /* Sometimes there are just 3 bytes (run length 0) */ if (!bd->writeCopies) goto decode_next_byte; /* Subtract the 1 copy we'd output anyway to get extras */ --bd->writeCopies; } } /* Decompression of this block completed successfully */ bd->writeCRC = ~bd->writeCRC; bd->totalCRC = ((bd->totalCRC << 1) | (bd->totalCRC >> 31)) ^ bd->writeCRC; /* If this block had a CRC error, force file level CRC error. */ if (bd->writeCRC != bd->headerCRC) { bd->totalCRC = bd->headerCRC+1; return RETVAL_LAST_BLOCK; } } /* Refill the intermediate buffer by Huffman-decoding next block of input */ /* (previous is just a convenient unused temp variable here) */ previous = get_next_block(bd); if (previous) { bd->writeCount = previous; return (previous != RETVAL_LAST_BLOCK) ? previous : gotcount; } bd->writeCRC = ~0; pos = bd->writePos; current = bd->writeCurrent; goto decode_next_byte;}/* Allocate the structure, read file header. If in_fd==-1, inbuf must contain a complete bunzip file (len bytes long). If in_fd!=-1, inbuf and len are ignored, and data is read from file handle into temporary buffer. *//* Because bunzip2 is used for help text unpacking, and because bb_show_usage() should work for NOFORK applets too, we must be extremely careful to not leak any allocations! */int start_bunzip(bunzip_data **bdp, int in_fd, const unsigned char *inbuf, int len){ bunzip_data *bd; unsigned i; enum { BZh0 = ('B' << 24) + ('Z' << 16) + ('h' << 8) + '0' }; /* Figure out how much data to allocate */ i = sizeof(bunzip_data); if (in_fd != -1) i += IOBUF_SIZE; /* Allocate bunzip_data. Most fields initialize to zero. */ bd = *bdp = xzalloc(i); /* Setup input buffer */ bd->in_fd = in_fd; if (-1 == in_fd) { /* in this case, bd->inbuf is read-only */ bd->inbuf = (void*)inbuf; /* cast away const-ness */ bd->inbufCount = len; } else bd->inbuf = (unsigned char *)(bd + 1); /* Init the CRC32 table (big endian) */ crc32_filltable(bd->crc32Table, 1); /* Setup for I/O error handling via longjmp */ i = setjmp(bd->jmpbuf); if (i) return i; /* Ensure that file starts with "BZh['1'-'9']." */ i = get_bits(bd, 32); if ((unsigned)(i - BZh0 - 1) >= 9) return RETVAL_NOT_BZIP_DATA; /* Fourth byte (ascii '1'-'9'), indicates block size in units of 100k of uncompressed data. Allocate intermediate buffer for block. */ bd->dbufSize = 100000 * (i - BZh0); /* Cannot use xmalloc - may leak bd in NOFORK case! */ bd->dbuf = malloc_or_warn(bd->dbufSize * sizeof(int)); if (!bd->dbuf) { free(bd); xfunc_die(); } return RETVAL_OK;}void dealloc_bunzip(bunzip_data *bd){ free(bd->dbuf); free(bd);}/* Decompress src_fd to dst_fd. Stops at end of bzip data, not end of file. */USE_DESKTOP(long long) intunpack_bz2_stream(int src_fd, int dst_fd){ USE_DESKTOP(long long total_written = 0;) char *outbuf; bunzip_data *bd; int i; outbuf = xmalloc(IOBUF_SIZE); i = start_bunzip(&bd, src_fd, NULL, 0); if (!i) { for (;;) { i = read_bunzip(bd, outbuf, IOBUF_SIZE); if (i <= 0) break; if (i != safe_write(dst_fd, outbuf, i)) { i = RETVAL_UNEXPECTED_OUTPUT_EOF; break; } USE_DESKTOP(total_written += i;) } } /* Check CRC and release memory */ if (i == RETVAL_LAST_BLOCK) { if (bd->headerCRC != bd->totalCRC) { bb_error_msg("data integrity error when decompressing"); } else { i = RETVAL_OK; } } else if (i == RETVAL_UNEXPECTED_OUTPUT_EOF) { bb_error_msg("compressed file ends unexpectedly"); } else { bb_error_msg("decompression failed"); } dealloc_bunzip(bd); free(outbuf); return i ? i : USE_DESKTOP(total_written) + 0;}#ifdef TESTINGstatic char *const bunzip_errors[] = { NULL, "Bad file checksum", "Not bzip data", "Unexpected input EOF", "Unexpected output EOF", "Data error", "Out of memory", "Obsolete (pre 0.9.5) bzip format not supported"};/* Dumb little test thing, decompress stdin to stdout */int main(int argc, char **argv){ int i = unpack_bz2_stream(0, 1); char c; if (i < 0) fprintf(stderr,"%s\n", bunzip_errors[-i]); else if (read(0, &c, 1)) fprintf(stderr,"Trailing garbage ignored\n"); return -i;}#endif
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -