📄 arith.c
字号:
/************************** Start of ARITH.C *************************
*
* This is the arithmetic coding module used in Chapter 5.
* Compile with BITIO.C, ERRHAND.C, and either MAIN-C.C or MAIN-E.C
*/
#include <stdio.h>
#include <stdlib.h>
#include "errhand.h"
#include "bitio.h"
/*
* The SYMBOL structure is what is used to define a symbol in
* arithmetic coding terms. A symbol is defined as a range between
* 0 and 1. Since we are using integer math, instead of using 0 and 1
* as our end points, we have an integer scale. The low_count and
* high_count define where the symbol falls in the range.
*/
typedef struct {
unsigned short int low_count;
unsigned short int high_count;
unsigned short int scale;
} SYMBOL;
/*
* Internal function prototypes, with or without ANSI prototypes.
*/
#ifdef __STDC__
void build_model( FILE *input, FILE *output );
void scale_counts( unsigned long counts[], unsigned char scaled_counts[] );
void build_totals( unsigned char scaled_counts[] );
void count_bytes( FILE *input, unsigned long counts[] );
void output_counts( FILE *output, unsigned char scaled_counts[] );
void input_counts( FILE *stream );
void convert_int_to_symbol( int symbol, SYMBOL *s );
void get_symbol_scale( SYMBOL *s );
int convert_symbol_to_int( int count, SYMBOL *s );
void initialize_arithmetic_encoder( void );
void encode_symbol( BIT_FILE *stream, SYMBOL *s );
void flush_arithmetic_encoder( BIT_FILE *stream );
short int get_current_count( SYMBOL *s );
void initialize_arithmetic_decoder( BIT_FILE *stream );
void remove_symbol_from_stream( BIT_FILE *stream, SYMBOL *s );
#else
void build_model();
void scale_counts();
void build_totals();
void count_bytes();
void output_counts();
void input_counts();
void convert_int_to_symbol();
void get_symbol_scale();
int convert_symbol_to_int();
void initialize_arithmetic_encoder();
void encode_symbol();
void flush_arithmetic_encoder();
short int get_current_count();
void initialize_arithmetic_decoder();
void remove_symbol_from_stream();
#endif
#define END_OF_STREAM 256
short int totals[ 258 ]; /* The cumulative totals */
char *CompressionName = "Fixed order 0 model with arithmetic coding";
char *Usage = "in-file out-file\n\n";
/*
* This compress file routine is a fairly orthodox compress routine.
* It first gathers statistics, and initializes the arithmetic
* encoder. It then encodes all the characters in the file, followed
* by the EOF character. The output stream is then flushed, and we exit.
* Note that an extra two bytes are output. When decoding an arithmetic
* stream, we have to read in extra bits. The decoding process takes
* place in the msb of the low and high range ints, so when we are
* decoding our last bit we will still have to have at least 15 junk
* bits loaded into the registers. The extra two bytes account for
* that.
*/
void CompressFile( input, output, argc, argv )
FILE *input;
BIT_FILE *output;
int argc;
char *argv[];
{
int c;
SYMBOL s;
build_model( input, output->file );
initialize_arithmetic_encoder();
while ( ( c = getc( input ) ) != EOF ) {
convert_int_to_symbol( c, &s );
encode_symbol( output, &s );
}
convert_int_to_symbol( END_OF_STREAM, &s );
encode_symbol( output, &s );
flush_arithmetic_encoder( output );
OutputBits( output, 0L, 16 );
while ( argc-- > 0 ) {
printf( "Unused argument: %s\n", *argv );
argv++;
}
}
/*
* This expand routine is also very conventional. It reads in the
* model, initializes the decoder, then sits in a loop reading in
* characters. When we decode an END_OF_STREAM, it means we can close
* up the files and exit. Note decoding a single character is a three
* step process: first we determine what the scale is for the current
* symbol by looking at the difference between the high an low values.
* We then see where the current input values fall in that range.
* Finally, we look in our totals array to find out what symbol is
* a match. After that is done, we still have to remove that symbol
* from the decoder. Lots of work.
*/
void ExpandFile( input, output, argc, argv )
BIT_FILE *input;
FILE *output;
int argc;
char *argv[];
{
SYMBOL s;
int c;
int count;
input_counts( input->file );
initialize_arithmetic_decoder( input );
for ( ; ; ) {
get_symbol_scale( &s );
count = get_current_count( &s );
c = convert_symbol_to_int( count, &s );
if ( c == END_OF_STREAM )
break;
remove_symbol_from_stream( input, &s );
putc( (char) c, output );
}
while ( argc-- > 0 ) {
printf( "Unused argument: %s\n", *argv );
argv++;
}
}
/*
* This is the routine that is called to scan the input file, scale
* the counts, build the totals array, the output the scaled counts
* to the output file.
*/
void build_model( input, output )
FILE *input;
FILE *output;
{
unsigned long counts[ 256 ];
unsigned char scaled_counts[ 256 ];
count_bytes( input, counts );
scale_counts( counts, scaled_counts );
output_counts( output, scaled_counts );
build_totals( scaled_counts );
}
/*
* This routine runs through the file and counts the appearances of each
* character.
*/
#ifndef SEEK_SET
#define SEEK_SET 0
#endif
void count_bytes( input, counts )
FILE *input;
unsigned long counts[];
{
long input_marker;
int i;
int c;
for ( i = 0 ; i < 256 ; i++ )
counts[ i ] = 0;
input_marker = ftell( input );
while ( ( c = getc( input )) != EOF )
counts[ c ]++;
fseek( input, input_marker, SEEK_SET );
}
/*
* This routine is called to scale the counts down. There are two types
* of scaling that must be done. First, the counts need to be scaled
* down so that the individual counts fit into a single unsigned char.
* Then, the counts need to be rescaled so that the total of all counts
* is less than 16384.
*/
void scale_counts( counts, scaled_counts )
unsigned long counts[];
unsigned char scaled_counts[];
{
int i;
unsigned long max_count;
unsigned int total;
unsigned long scale;
/*
* The first section of code makes sure each count fits into a single byte.
*/
max_count = 0;
for ( i = 0 ; i < 256 ; i++ )
if ( counts[ i ] > max_count )
max_count = counts[ i ];
scale = max_count / 256;
scale = scale + 1;
for ( i = 0 ; i < 256 ; i++ ) {
scaled_counts[ i ] = (unsigned char ) ( counts[ i ] / scale );
if ( scaled_counts[ i ] == 0 && counts[ i ] != 0 )
scaled_counts[ i ] = 1;
}
/*
* This next section makes sure the total is less than 16384. I initialize
* the total to 1 instead of 0 because there will be an additional 1 added
* in for the END_OF_STREAM symbol;
*/
total = 1;
for ( i = 0 ; i < 256 ; i++ )
total += scaled_counts[ i ];
if ( total > ( 32767 - 256 ) )
scale = 4;
else if ( total > 16383 )
scale = 2;
else
return;
for ( i = 0 ; i < 256 ; i++ )
scaled_counts[ i ] /= scale;
}
/*
* This routine is used by both the encoder and decoder to build the
* table of cumulative totals. The counts for the characters in the
* file are in the counts array, and we know that there will be a single
* instance of the EOF symbol.
*/
void build_totals( scaled_counts )
unsigned char scaled_counts[];
{
int i;
totals[ 0 ] = 0;
for ( i = 0 ; i < END_OF_STREAM ; i++ )
totals[ i + 1 ] = totals[ i ] + scaled_counts[ i ];
totals[ END_OF_STREAM + 1 ] = totals[ END_OF_STREAM ] + 1;
}
/*
* In order for the compressor to build the same model, I have to store
* the symbol counts in the compressed file so the expander can read
* them in. In order to save space, I don't save all 256 symbols
* unconditionally. The format used to store counts looks like this:
*
* start, stop, counts, start, stop, counts, ... 0
*
* This means that I store runs of counts, until all the non-zero
* counts have been stored. At this time the list is terminated by
* storing a start value of 0. Note that at least 1 run of counts has
* to be stored, so even if the first start value is 0, I read it in.
* It also means that even in an empty file that has no counts, I have
* to pass at least one count.
*
* In order to efficiently use this format, I have to identify runs of
* non-zero counts. Because of the format used, I don't want to stop a
* run because of just one or two zeros in the count stream. So I have
* to sit in a loop looking for strings of three or more zero values in
* a row.
*
* This is simple in concept, but it ends up being one of the most
* complicated routines in the whole program. A routine that just
* writes out 256 values without attempting to optimize would be much
* simpler, but would hurt compression quite a bit on small files.
*
*/
void output_counts( output, scaled_counts )
FILE *output;
unsigned char scaled_counts[];
{
int first;
int last;
int next;
int i;
first = 0;
while ( first < 255 && scaled_counts[ first ] == 0 )
first++;
/*
* Each time I hit the start of the loop, I assume that first is the
* number for a run of non-zero values. The rest of the loop is
* concerned with finding the value for last, which is the end of the
* run, and the value of next, which is the start of the next run.
* At the end of the loop, I assign next to first, so it starts in on
* the next run.
*/
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -