📄 substringparser.hpp

📁 Octane v1.01.20 The Open Compression Toolkit for C++ . The Open Compression Toolkit is a set of mo
💻 HPP
📖 第 1 页 / 共 2 页
字号:
12 下一页
/// @file substringparser.hpp
/// A flexible parser which can creates a symbol dictionary of substrings
///
/// @remarks
///  This is a general purpose class which is designed to be used with any
///   compressor that wants to parse files and count occurrences of certain
///   characters/words/Substrings.  Parameters can be set to regulate how
///   symbols are extracted.  Several utility functions are provided so that
///   this class can by default do much of the work in parsing files and
///   pruning dictionaries.
///
/// @author mouser
/// @date   2003.07.29
//---------------------------------------------------------------------------

//---------------------------------------------------------------------------
// recursive header protection
#ifndef Octane_SubstringParserH
#define Octane_SubstringParserH
//---------------------------------------------------------------------------

//---------------------------------------------------------------------------
// application includes
#include "../parser.hpp"
#include "../../bitio/bitio.hpp"
// system includes
#include <vector>
#include <set>
#include <assert.h>
using namespace std;
//---------------------------------------------------------------------------




//---------------------------------------------------------------------------
// Different types of parsing modes (greedy is fastest and usually not
//  significantly worse than LFF (longest substring first).  Deep is super
//  slow and not noticeably better.
#define Dictionary_ParseMode_GREEDY		0
#define Dictionary_ParseMode_LFF		1
#define Dictionary_ParseMode_DEEP		2
//---------------------------------------------------------------------------


//---------------------------------------------------------------------------
// What kind of precision should we use for the weights? int should be fine.
typedef unsigned int TSubStrParserWeight;
#define TSubStrParserWeight_MAX	UINTMAX

// Biggest substring we will ever parser
#define SubStrHuff_BiggestSubStringLen 100
//---------------------------------------------------------------------------





//---------------------------------------------------------------------------
/// The SubString Symbol class represents the primitive symbols used by the
/// SubString parser.  Each symbol can hold an arbitrary string of character,
/// and is annotated with a 'weight' which stores the frequency of that
/// substring during the construction of the symbol table.  Symbols also
/// contain a numerical index which allows each symbol to report its
/// symbol id (position in the symbol vector), which is how parsers interact
/// with other components of statistical coding (by symbol id #).
class SubstringSymbol
{
protected:
	/// Frequency of the symbol, as calculated during a 'training' phase
	TSubStrParserWeight weight;
	/// Value represented by the symbol (i.e. the character, word, or Substring).
	std::string value;
	/// Each symbol keeps track of its position in symbol vector.
	int symbolvectorpos;
public:
	// Constructor and destructor
	SubstringSymbol () {weight=0;value="";}
	SubstringSymbol (const std::string &invalue, TSubStrParserWeight inweight=0) {weight=inweight; value=invalue;}
	~SubstringSymbol () {;}
public:
	/// The comparison operator used to order the priority queue.
	bool operator>( const SubstringSymbol * &a ) const {return weight > (a->get_weight());}
public:
	/// Returns the current weight (frequency) of the symbol.
	TSubStrParserWeight get_weight() const {return weight;}
	/// Increment weight/frequency of symbol.
	void increment_weight(int increment) {weight+=increment;}
	/// Set weight of symbol.
	void set_weight(TSubStrParserWeight val) {weight=val;}
	/// Get pointer to string value of symbol.
	std::string* get_valuep() {return &value;}
	/// Get string value of symbol (i.e. the text of the symbol).
	std::string& get_value() {return value;}
	/// Get length of symbol string.
	int get_valuelen() {return (int)(value.length());}
	/// Set value of symbol string.
	void set_value(const std::string &invalue) {value=invalue;}
	/// Return the symbol id # (the position of the symbol in the symbol vector).
	int get_symbolvectorpos() {return symbolvectorpos;};
	/// Set the symbol id #.
	void set_symbolvectorpos(int inpos) {symbolvectorpos=inpos;};
public:
	/// Returns true if this is a "primitive" protected symbol and should not be pruned.
	/// used for things like insisting that the ascii characters are never pruned from symbol set even though they are not used during training.
	bool dontprune() {if (value.length()<=1) return true; else return false;}
	/// Returns the actual memory used by this symbol (uses string.capacity).
	unsigned int get_memoryused() const {return (unsigned int)(sizeof(this)+value.capacity());}
	/// The parser may evaluate the quality of its symbol set by estimating the "cost" of the symbolset
	/// even in the absence of any specified coding and compression strategy;
	/// it calculates the cost of each symbol as the inverse of frequency, lower is better.
	double get_cost() {return 1.0/(1.0+weight);};
};


/// Greater operator for comparing weights of symbol pointers.
/// We need to make a special Greater<> classes used during sorting, since
///  our collection stl elements are built from pointers, and the default sort
///  will be done on pointer addresses instead of weights if we don't.
template< class T >
class TSubstringSymbolWeightGreater
	{
	public:
		bool operator() (const T& pLeft, const T& pRight) const
			{ return pLeft->get_weight()>pRight->get_weight(); }
	};


/// Greater operator for comparing strings of symbol pointers alphabetically.
/// We need to make a special Greater<> classes used during sorting, since
///  our collection stl elements are built from pointers, and the default sort
///  will be done on pointer addresses instead of weights if we don't.
template< class T >
class TSubstringSymbolStringGreater
	{
	public:
		bool operator() (const T& pLeft, const T& pRight) const
			{ return pLeft->get_value()<pRight->get_value(); }
	};
//---------------------------------------------------------------------------



//---------------------------------------------------------------------------
/// The SubstringParser class is a flexible parser, which uses arbitrarily
///  long substrings of text as symbols.
/// It can be configured to build a symbol set using only whole words, or with
///  substrings that are not affected by word boundary (max symbol length can be set).
/// Various input parsing strategies can be used, from greedy (fastest) to
///  longest first, to near optimal (slowest).
/// Various heuristics for pruning the symbol set can be applied, which can be
///  used to get the symbol set down to a desired size.
/// @ingroup Parsers
class SubstringParser : public OctaneParser
{
protected:
	/// Typedef for stl set of symbols (used for fast alphabetical lookup)
	typedef std::set< SubstringSymbol* , TSubstringSymbolStringGreater<SubstringSymbol*> > TSymbolSetTYPE;
	/// Typedef for stl vector of symbols (used for fast lookup by id#)
	typedef std::vector< SubstringSymbol* > TSymbolVectorTYPE;
	/// STL set of symbols (used for fast alphabetical lookup)
	TSymbolSetTYPE symbolset;
	/// STL vector of symbols (used for fast lookup by id#)
	TSymbolVectorTYPE symbolvector;
	/// STL set iterator - this is a non-local variable to eliminate cost of building it on stack on every local function
	TSymbolSetTYPE::iterator symbolset_pos;
	/// Stl vector iterator - this is a non-local variable to eliminate cost of building it on stack on every local function
	TSymbolVectorTYPE::iterator symbolvector_pos;
	/// Current window buffer for parsing, filled as we read input characters
	char inputbufferstr[SubStrHuff_BiggestSubStringLen] ;
	/// Length of input window buffer
12 下一页
💿 文件大小 170 K
👤 上传用户 boobyb
📂 所属分类压缩解压
🏷️ 相关标签

#Compression #Toolkit #Open #The
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -