📄 substringparser.hpp
字号:
/// @file substringparser.hpp
/// A flexible parser which can creates a symbol dictionary of substrings
///
/// @remarks
/// This is a general purpose class which is designed to be used with any
/// compressor that wants to parse files and count occurrences of certain
/// characters/words/Substrings. Parameters can be set to regulate how
/// symbols are extracted. Several utility functions are provided so that
/// this class can by default do much of the work in parsing files and
/// pruning dictionaries.
///
/// @author mouser
/// @date 2003.07.29
//---------------------------------------------------------------------------
//---------------------------------------------------------------------------
// recursive header protection
#ifndef Octane_SubstringParserH
#define Octane_SubstringParserH
//---------------------------------------------------------------------------
//---------------------------------------------------------------------------
// application includes
#include "../parser.hpp"
#include "../../bitio/bitio.hpp"
// system includes
#include <vector>
#include <set>
#include <assert.h>
using namespace std;
//---------------------------------------------------------------------------
//---------------------------------------------------------------------------
// Different types of parsing modes (greedy is fastest and usually not
// significantly worse than LFF (longest substring first). Deep is super
// slow and not noticeably better.
#define Dictionary_ParseMode_GREEDY 0
#define Dictionary_ParseMode_LFF 1
#define Dictionary_ParseMode_DEEP 2
//---------------------------------------------------------------------------
//---------------------------------------------------------------------------
// What kind of precision should we use for the weights? int should be fine.
typedef unsigned int TSubStrParserWeight;
#define TSubStrParserWeight_MAX UINTMAX
// Biggest substring we will ever parser
#define SubStrHuff_BiggestSubStringLen 100
//---------------------------------------------------------------------------
//---------------------------------------------------------------------------
/// The SubString Symbol class represents the primitive symbols used by the
/// SubString parser. Each symbol can hold an arbitrary string of character,
/// and is annotated with a 'weight' which stores the frequency of that
/// substring during the construction of the symbol table. Symbols also
/// contain a numerical index which allows each symbol to report its
/// symbol id (position in the symbol vector), which is how parsers interact
/// with other components of statistical coding (by symbol id #).
class SubstringSymbol
{
protected:
/// Frequency of the symbol, as calculated during a 'training' phase
TSubStrParserWeight weight;
/// Value represented by the symbol (i.e. the character, word, or Substring).
std::string value;
/// Each symbol keeps track of its position in symbol vector.
int symbolvectorpos;
public:
// Constructor and destructor
SubstringSymbol () {weight=0;value="";}
SubstringSymbol (const std::string &invalue, TSubStrParserWeight inweight=0) {weight=inweight; value=invalue;}
~SubstringSymbol () {;}
public:
/// The comparison operator used to order the priority queue.
bool operator>( const SubstringSymbol * &a ) const {return weight > (a->get_weight());}
public:
/// Returns the current weight (frequency) of the symbol.
TSubStrParserWeight get_weight() const {return weight;}
/// Increment weight/frequency of symbol.
void increment_weight(int increment) {weight+=increment;}
/// Set weight of symbol.
void set_weight(TSubStrParserWeight val) {weight=val;}
/// Get pointer to string value of symbol.
std::string* get_valuep() {return &value;}
/// Get string value of symbol (i.e. the text of the symbol).
std::string& get_value() {return value;}
/// Get length of symbol string.
int get_valuelen() {return (int)(value.length());}
/// Set value of symbol string.
void set_value(const std::string &invalue) {value=invalue;}
/// Return the symbol id # (the position of the symbol in the symbol vector).
int get_symbolvectorpos() {return symbolvectorpos;};
/// Set the symbol id #.
void set_symbolvectorpos(int inpos) {symbolvectorpos=inpos;};
public:
/// Returns true if this is a "primitive" protected symbol and should not be pruned.
/// used for things like insisting that the ascii characters are never pruned from symbol set even though they are not used during training.
bool dontprune() {if (value.length()<=1) return true; else return false;}
/// Returns the actual memory used by this symbol (uses string.capacity).
unsigned int get_memoryused() const {return (unsigned int)(sizeof(this)+value.capacity());}
/// The parser may evaluate the quality of its symbol set by estimating the "cost" of the symbolset
/// even in the absence of any specified coding and compression strategy;
/// it calculates the cost of each symbol as the inverse of frequency, lower is better.
double get_cost() {return 1.0/(1.0+weight);};
};
/// Greater operator for comparing weights of symbol pointers.
/// We need to make a special Greater<> classes used during sorting, since
/// our collection stl elements are built from pointers, and the default sort
/// will be done on pointer addresses instead of weights if we don't.
template< class T >
class TSubstringSymbolWeightGreater
{
public:
bool operator() (const T& pLeft, const T& pRight) const
{ return pLeft->get_weight()>pRight->get_weight(); }
};
/// Greater operator for comparing strings of symbol pointers alphabetically.
/// We need to make a special Greater<> classes used during sorting, since
/// our collection stl elements are built from pointers, and the default sort
/// will be done on pointer addresses instead of weights if we don't.
template< class T >
class TSubstringSymbolStringGreater
{
public:
bool operator() (const T& pLeft, const T& pRight) const
{ return pLeft->get_value()<pRight->get_value(); }
};
//---------------------------------------------------------------------------
//---------------------------------------------------------------------------
/// The SubstringParser class is a flexible parser, which uses arbitrarily
/// long substrings of text as symbols.
/// It can be configured to build a symbol set using only whole words, or with
/// substrings that are not affected by word boundary (max symbol length can be set).
/// Various input parsing strategies can be used, from greedy (fastest) to
/// longest first, to near optimal (slowest).
/// Various heuristics for pruning the symbol set can be applied, which can be
/// used to get the symbol set down to a desired size.
/// @ingroup Parsers
class SubstringParser : public OctaneParser
{
protected:
/// Typedef for stl set of symbols (used for fast alphabetical lookup)
typedef std::set< SubstringSymbol* , TSubstringSymbolStringGreater<SubstringSymbol*> > TSymbolSetTYPE;
/// Typedef for stl vector of symbols (used for fast lookup by id#)
typedef std::vector< SubstringSymbol* > TSymbolVectorTYPE;
/// STL set of symbols (used for fast alphabetical lookup)
TSymbolSetTYPE symbolset;
/// STL vector of symbols (used for fast lookup by id#)
TSymbolVectorTYPE symbolvector;
/// STL set iterator - this is a non-local variable to eliminate cost of building it on stack on every local function
TSymbolSetTYPE::iterator symbolset_pos;
/// Stl vector iterator - this is a non-local variable to eliminate cost of building it on stack on every local function
TSymbolVectorTYPE::iterator symbolvector_pos;
/// Current window buffer for parsing, filled as we read input characters
char inputbufferstr[SubStrHuff_BiggestSubStringLen] ;
/// Length of input window buffer
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -