📄 arith.c

📁 This is a little console mode utility program which is able to (de-)compress single files with a s
💻 C
📖 第 1 页 / 共 2 页
字号:
12 下一页
/************************** Start of ARITH.C *************************
 *
 * This is the arithmetic coding module used in Chapter 5.
 * Compile with BITIO.C, ERRHAND.C, and either MAIN-C.C or MAIN-E.C
 */
#include <stdio.h>
#include <stdlib.h>
#include "errhand.h"
#include "bitio.h"

/*
 * The SYMBOL structure is what is used to define a symbol in
 * arithmetic coding terms.  A symbol is defined as a range between
 * 0 and 1.  Since we are using integer math, instead of using 0 and 1
 * as our end points, we have an integer scale.  The low_count and
 * high_count define where the symbol falls in the range.
 */

typedef struct {
    unsigned short int low_count;
    unsigned short int high_count;
    unsigned short int scale;
} SYMBOL;

/*
 * Internal function prototypes, with or without ANSI prototypes.
 */

#ifdef __STDC__

void build_model( FILE *input, FILE *output );
void scale_counts( unsigned long counts[], unsigned char scaled_counts[] );
void build_totals( unsigned char scaled_counts[] );
void count_bytes( FILE *input, unsigned long counts[] );
void output_counts( FILE *output, unsigned char scaled_counts[] );
void input_counts( FILE *stream );
void convert_int_to_symbol( int symbol, SYMBOL *s );
void get_symbol_scale( SYMBOL *s );
int convert_symbol_to_int( int count, SYMBOL *s );
void initialize_arithmetic_encoder( void );
void encode_symbol( BIT_FILE *stream, SYMBOL *s );
void flush_arithmetic_encoder( BIT_FILE *stream );
short int get_current_count( SYMBOL *s );
void initialize_arithmetic_decoder( BIT_FILE *stream );
void remove_symbol_from_stream( BIT_FILE *stream, SYMBOL *s );

#else

void build_model();
void scale_counts();
void build_totals();
void count_bytes();
void output_counts();
void input_counts();
void convert_int_to_symbol();
void get_symbol_scale();
int convert_symbol_to_int();
void initialize_arithmetic_encoder();
void encode_symbol();
void flush_arithmetic_encoder();
short int get_current_count();
void initialize_arithmetic_decoder();
void remove_symbol_from_stream();

#endif

#define END_OF_STREAM 256
short int totals[ 258 ];            /* The cumulative totals                */

char *CompressionName = "Fixed order 0 model with arithmetic coding";
char *Usage           = "in-file out-file\n\n";

/*
 * This compress file routine is a fairly orthodox compress routine.
 * It first gathers statistics, and initializes the arithmetic
 * encoder.  It then encodes all the characters in the file, followed
 * by the EOF character.  The output stream is then flushed, and we exit.
 * Note that an extra two bytes are output.  When decoding an arithmetic
 * stream, we have to read in extra bits.  The decoding process takes
 * place in the msb of the low and high range ints, so when we are
 * decoding our last bit we will still have to have at least 15 junk
 * bits loaded into the registers.  The extra two bytes account for
 * that.
 */
void CompressFile( input, output, argc, argv )
FILE *input;
BIT_FILE *output;
int argc;
char *argv[];
{
    int c;
    SYMBOL s;

    build_model( input, output->file );
    initialize_arithmetic_encoder();

    while ( ( c = getc( input ) ) != EOF ) {
	convert_int_to_symbol( c, &s );
	encode_symbol( output, &s );
    }
    convert_int_to_symbol( END_OF_STREAM, &s );
    encode_symbol( output, &s );
    flush_arithmetic_encoder( output );
    OutputBits( output, 0L, 16 );
    while ( argc-- > 0 ) {
        printf( "Unused argument: %s\n", *argv );
        argv++;
    }
}

/*
 * This expand routine is also very conventional.  It reads in the
 * model, initializes the decoder, then sits in a loop reading in
 * characters.  When we decode an END_OF_STREAM, it means we can close
 * up the files and exit.  Note decoding a single character is a three
 * step process:  first we determine what the scale is for the current
 * symbol by looking at the difference between the high an low values.
 * We then see where the current input values fall in that range.
 * Finally, we look in our totals array to find out what symbol is
 * a match.  After that is done, we still have to remove that symbol
 * from the decoder.  Lots of work.
 */

void ExpandFile( input, output, argc, argv )
BIT_FILE *input;
FILE *output;
int argc;
char *argv[];
{
    SYMBOL s;
    int c;
    int count;

    input_counts( input->file );
    initialize_arithmetic_decoder( input );
    for ( ; ; ) {
	get_symbol_scale( &s );
	count = get_current_count( &s );
	c = convert_symbol_to_int( count, &s );
	if ( c == END_OF_STREAM )
            break;
	remove_symbol_from_stream( input, &s );
        putc( (char) c, output );
    }
    while ( argc-- > 0 ) {
        printf( "Unused argument: %s\n", *argv );
        argv++;
    }
}

/*
 * This is the routine that is called to scan the input file, scale
 * the counts, build the totals array, the output the scaled counts
 * to the output file.
 */

void build_model( input, output )
FILE *input;
FILE *output;
{
    unsigned long counts[ 256 ];
    unsigned char scaled_counts[ 256 ];

    count_bytes( input, counts );
    scale_counts( counts, scaled_counts );
    output_counts( output, scaled_counts );
    build_totals( scaled_counts );
}

/*
 * This routine runs through the file and counts the appearances of each
 * character.
 */
#ifndef SEEK_SET
#define SEEK_SET 0
#endif

void count_bytes( input, counts )
FILE *input;
unsigned long counts[];
{
    long input_marker;
    int i;
    int c;

    for ( i = 0 ; i < 256 ; i++ )
        counts[ i ] = 0;
    input_marker = ftell( input );
    while ( ( c = getc( input )) != EOF )
	counts[ c ]++;
    fseek( input, input_marker, SEEK_SET );
}

/*
 * This routine is called to scale the counts down.  There are two types
 * of scaling that must be done.  First, the counts need to be scaled
 * down so that the individual counts fit into a single unsigned char.
 * Then, the counts need to be rescaled so that the total of all counts
 * is less than 16384.
 */
void scale_counts( counts, scaled_counts )
unsigned long counts[];
unsigned char scaled_counts[];
{
    int i;
    unsigned long max_count;
    unsigned int total;
    unsigned long scale;

/*
 * The first section of code makes sure each count fits into a single byte.
 */
    max_count = 0;
    for ( i = 0 ; i < 256 ; i++ )
       if ( counts[ i ] > max_count )
	   max_count = counts[ i ];
    scale = max_count / 256;
    scale = scale + 1;
    for ( i = 0 ; i < 256 ; i++ ) {
        scaled_counts[ i ] = (unsigned char ) ( counts[ i ] / scale );
        if ( scaled_counts[ i ] == 0 && counts[ i ] != 0 )
            scaled_counts[ i ] = 1;
    }
/*
 * This next section makes sure the total is less than 16384.  I initialize
 * the total to 1 instead of 0 because there will be an additional 1 added
 * in for the END_OF_STREAM symbol;
 */
    total = 1;
    for ( i = 0 ; i < 256 ; i++ )
        total += scaled_counts[ i ];
    if ( total > ( 32767 - 256 ) )
        scale = 4;
    else if ( total > 16383 )
        scale = 2;
    else
        return;
    for ( i = 0 ; i < 256 ; i++ )
        scaled_counts[ i ] /= scale;
}

/*
 * This routine is used by both the encoder and decoder to build the
 * table of cumulative totals.  The counts for the characters in the
 * file are in the counts array, and we know that there will be a single
 * instance of the EOF symbol.
 */
void build_totals( scaled_counts )
unsigned char scaled_counts[];
{
    int i;

    totals[ 0 ] = 0;
    for ( i = 0 ; i < END_OF_STREAM ; i++ )
        totals[ i + 1 ] = totals[ i ] + scaled_counts[ i ];
    totals[ END_OF_STREAM + 1 ] = totals[ END_OF_STREAM ] + 1;
}

/*
 * In order for the compressor to build the same model, I have to store
 * the symbol counts in the compressed file so the expander can read
 * them in.  In order to save space, I don't save all 256 symbols
 * unconditionally.  The format used to store counts looks like this:
 *
 *  start, stop, counts, start, stop, counts, ... 0
 *
 * This means that I store runs of counts, until all the non-zero
 * counts have been stored.  At this time the list is terminated by
 * storing a start value of 0.  Note that at least 1 run of counts has
 * to be stored, so even if the first start value is 0, I read it in.
 * It also means that even in an empty file that has no counts, I have
 * to pass at least one count.
 *
 * In order to efficiently use this format, I have to identify runs of
 * non-zero counts.  Because of the format used, I don't want to stop a
 * run because of just one or two zeros in the count stream.  So I have
 * to sit in a loop looking for strings of three or more zero values in
 * a row.
 *
 * This is simple in concept, but it ends up being one of the most
 * complicated routines in the whole program.  A routine that just
 * writes out 256 values without attempting to optimize would be much
 * simpler, but would hurt compression quite a bit on small files.
 *
 */
void output_counts( output, scaled_counts )
FILE *output;
unsigned char scaled_counts[];
{
    int first;
    int last;
    int next;
    int i;

    first = 0;
    while ( first < 255 && scaled_counts[ first ] == 0 )
	    first++;
/*
 * Each time I hit the start of the loop, I assume that first is the
 * number for a run of non-zero values.  The rest of the loop is
 * concerned with finding the value for last, which is the end of the
 * run, and the value of next, which is the start of the next run.
 * At the end of the loop, I assign next to first, so it starts in on
 * the next run.
 */
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -