📄 substringparser.cpp
字号:
/// @file substringparser.cpp
/// A flexible parser which can creates a symbol dictionary of substrings
///
/// @remarks
/// This is a general purpose class which is designed to be used with any
/// compressor that wants to parse files and count occurrences of certain
/// characters/words/Substrings. Parameters can be set to regulate how
/// symbols are extracted. Several utility functions are provided so that
/// this class can by default do much of the work in parsing files and
/// pruning dictionaries.
///
/// @author mouser
/// @date 2003.07.29
///
/// @version 2004.05.19 mouser [-] fixed bug in smartlookup mode of SubStringParser that could cause crashing when incoming stream matched last symbol in set.<br>
//---------------------------------------------------------------------------
//---------------------------------------------------------------------------
// Application includes
#include "substringparser.hpp"
// System includes
#include <iostream>
#include <iomanip>
#include <string>
//---------------------------------------------------------------------------
//---------------------------------------------------------------------------
SubstringParser::SubstringParser()
{
// constructor
// initialize default dictionary-building parameters
SetDefaultParameters();
// initialize our current symbol count and end-of-stream-symbol
symbolcount=0;
endofstreamsymbolnum=-1;
// initialize input buffer pos
inputbufferlen=0;
senteos=false;
}
SubstringParser::~SubstringParser()
{
// destructor frees the SubstringSymbols in symbolset
FreeData();
}
//---------------------------------------------------------------------------
//---------------------------------------------------------------------------
// OCTANE PUBLIC API - AUXILIARY FUNCTIONS
bool SubstringParser::SaveState(bitwriter &to,bool fortempdecompressiononly)
{
// save state
// return true on success
bool bretv;
bretv=SaveParameters(to);
if (bretv)
bretv=SaveSymbols(to);
return bretv;
}
bool SubstringParser::LoadState(bitreader &from)
{
// load state
// return true on success
bool bretv;
bretv=LoadParameters(from);
if (bretv)
bretv=LoadSymbols(from);
return bretv;
}
unsigned int SubstringParser::GetMemoryUsed()
{
// return memory used by this data structure
unsigned int memoryused=sizeof(this);
// this returns the size of the symbolset, as it would take up on disk if saved in straightforward way
// memory usage will be almost identical.
memoryused+=sizeof(symbolset);
// write entries in compact binary form
for (symbolset_pos=symbolset.begin();symbolset_pos!=symbolset.end();++symbolset_pos)
{
memoryused+=sizeof(*symbolset_pos);
memoryused+=(*symbolset_pos)->get_memoryused();
}
// for vector
for (symbolvector_pos=symbolvector.begin();symbolvector_pos!=symbolvector.end();++symbolvector_pos)
memoryused+=sizeof(*symbolvector_pos);
return memoryused;
}
unsigned int SubstringParser::GetDiskspaceUsed(bool fortempdecompressiononly)
{
// this returns the size of the symbolset, as it would take up on disk if saved in straightforward way
// memory usage will be almost identical.
unsigned int memoryused=0;
// byte for file type (see WriteSymbolSet routine)
memoryused+=1;
// write entries in compact binary form
for (symbolset_pos=symbolset.begin();symbolset_pos!=symbolset.end();++symbolset_pos)
{
// length of the string
memoryused+=1;
// write string
memoryused+=(unsigned int)(((*symbolset_pos)->get_valuep())->length());
// weight of symbol
memoryused+=sizeof(unsigned short int);
}
// end-of-header
memoryused+=1;
return memoryused;
}
void SubstringParser::ShowDebuggingInfo()
{
// show some debugging info
using namespace std;
string bitset_string;
string value;
// size of tree is
std::cout << " Memory used by SubStringParser: " << GetMemoryUsed() << " bytes ("<< (unsigned int)(GetSetSymbolCount())<<" elements).\n";
// show the dictionary sorted alphabetically
if (symbolset.size()>0)
{
cout << "Symbol Set - Sorted Alphabetically ("<<(unsigned int)(symbolset.size())<<" entries)\n";
cout << " " << setiosflags(ios::left) << setw(Parameter_MaxSubstringSize) << "symboltext" << " " << setw(10) << "frequency\n";
cout << " " << setiosflags(ios::left) << setw(Parameter_MaxSubstringSize) << "----------" << " " << setw(10) << "---------\n";
for (symbolset_pos=symbolset.begin();symbolset_pos!=symbolset.end();++symbolset_pos)
{
value=(*symbolset_pos)->get_value();
NiceifyHighAsciiString(value);
cout << " " << setiosflags(ios::left) << setw(Parameter_MaxSubstringSize) << value << " " << setw(10) << (*symbolset_pos)->get_weight() << "\n";
}
cout << "\n";
}
}
//---------------------------------------------------------------------------
//---------------------------------------------------------------------------
// OCTANE PUBLIC API - AUXILIARY FUNCTIONS
string SubstringParser::GetParametersInformation()
{
// show user available variables and their current values
// ATTN: this has not been converted from cout to string
string returnstring;
cout << setiosflags(ios::left) << setw(17) << "maxbuildsymbols" << setiosflags(ios::left) << " " << setw(10) << Parameter_MaxSymbols_DuringBuild << " " << "parsing dictionary pruned to not exceed this size" << endl;
cout << setiosflags(ios::left) << setw(17) << "maxsymbols" << setiosflags(ios::left) << " " << setw(10) << Parameter_MaxSymbols_Final << " " << "final dictionary pruned to reach this size" << endl;
cout << setiosflags(ios::left) << setw(17) << "maxSubstring" << setiosflags(ios::left) << " " << setw(10) << Parameter_MaxSubstringSize << " " << "max size of Substrings in dictionary" << endl;
cout << setiosflags(ios::left) << setw(17) << "onlywords" << setiosflags(ios::left) << " " << setw(10) << Parameter_OnlyCodeWholeWords << " " << "only add whole word symbols? (bool)" << endl;
cout << setiosflags(ios::left) << setw(17) << "spanwords" << setiosflags(ios::left) << " " << setw(10) << Parameter_SpanWordBoundaries << " " << "encode symbols that span words? (bool)" << endl;
cout << setiosflags(ios::left) << setw(17) << "crends" << setiosflags(ios::left) << " " << setw(10) << Parameter_CountCRsAsEOTs << " " << "count CRs as end-of-stream symbols? (bool)" << endl;
cout << setiosflags(ios::left) << setw(17) << "minweight" << setiosflags(ios::left) << " " << setw(10) << Parameter_PruneMinimumWeight << " " << "minimum threshold weight for final pruning" << endl;
cout << setiosflags(ios::left) << setw(17) << "smartlookup" << setiosflags(ios::left) << " " << setw(10) << Parameter_UseSmartLookup << " " << "when true, use lower_bound stl dictionary search" << endl;
cout << setiosflags(ios::left) << setw(17) << "parsemode" << setiosflags(ios::left) << " " << setw(10) << Parameter_ParseMode << " " << "parse heuristic (0=greedy 1=lff 2=deep&slow)" << endl;
cout << setiosflags(ios::left) << setw(17) << "recalculations" << setiosflags(ios::left) << " " << setw(10) << Parameter_PruneReCalculations << " " << "recalculate symbol costs during pruning?" << endl;
return returnstring;
}
bool SubstringParser::SetParameter(const std::string ¶metername,const std::string ¶metervalue)
{
// generic function for user-interactive modification of parameters
// (see SetDefaultDictionaryBuildingParameters implementation for descriptions)
// return true if we know this variable and use it
bool bretv=false;
if (parametername=="maxbuildsymbols")
bretv=ParseParameter(parametervalue,Parameter_MaxSymbols_DuringBuild);
else if (parametername=="maxsymbols")
bretv=ParseParameter(parametervalue,Parameter_MaxSymbols_Final);
else if (parametername=="maxSubstring")
bretv=ParseParameter(parametervalue,Parameter_MaxSubstringSize);
else if (parametername=="onlywords")
bretv=ParseParameter(parametervalue,Parameter_OnlyCodeWholeWords);
else if (parametername=="spanwords")
bretv=ParseParameter(parametervalue,Parameter_SpanWordBoundaries);
else if (parametername=="crends")
bretv=ParseParameter(parametervalue,Parameter_CountCRsAsEOTs);
else if (parametername=="minweight")
bretv=ParseParameter(parametervalue,Parameter_PruneMinimumWeight);
else if (parametername=="smartlookup")
bretv=ParseParameter(parametervalue,Parameter_UseSmartLookup);
else if (parametername=="parsemode")
bretv=ParseParameter(parametervalue,Parameter_ParseMode);
else if (parametername=="recalculations")
bretv=ParseParameter(parametervalue,Parameter_PruneReCalculations);
return bretv;
}
void SubstringParser::SetDefaultParameters()
{
// encoding parameters that govern how dictionary is built
// the dictionary will never be allowed to get bigger than this size (smaller will also increase building speed)
// when this size is exceeded, we will eliminate all symbols with frequencies of smaller than some small #
Parameter_MaxSymbols_DuringBuild=200000;
// after the entire training file is parsed, we will repeat pruning steps to reduce it to a *maximum* of this many entries
// the actual symbols kept may be smaller depending on other pruning parameters
Parameter_MaxSymbols_Final=2000;
// how many characters is largest (sub)string stored in dictionary
Parameter_MaxSubstringSize=12;
// Parameter_MaxSubstringSize=3;
// Parameter_MaxSubstringSize=1;
// only store characters and whole words?
// Parameter_OnlyCodeWholeWords=false;
Parameter_OnlyCodeWholeWords=false;
// store Substrings that span words (i.e. they contain multiple words)
Parameter_SpanWordBoundaries=false;
// minimum count for a word or Substring in order to even consider leaving it in the dictionary
Parameter_PruneMinimumWeight=10;
// when encoding single lines, each line must end with an end-of-stream (EOS) symbol
// but if we train on a file, which is large and has only one EOS at end of file
// then the freq. of EOS will be very low (1), so here we can say to count a pretend EOS
// whenever we see a CR.
//Parameter_CountCRsAsEOTs=false;
Parameter_CountCRsAsEOTs=true;
// we can either use a smart search using stl find_lower_bound() to find longest Substring or we can search starting at biggest string backwards
Parameter_UseSmartLookup=true;
// we can either use a greedy search to find longest Substring starting with first char or we can use smarter but slower heuristics
Parameter_ParseMode=Dictionary_ParseMode_GREEDY;
// should we recompute symbol costs during pruning? (this actually employs the dictionary set and calculates actual usage counts and then reprunes)
Parameter_PruneReCalculations=true;
}
//---------------------------------------------------------------------------
//---------------------------------------------------------------------------
// PARSER PUBLIC API - PREPARATIONS FOR PARSING
bool SubstringParser::RewindAnyBufferedInput(bitreader &from)
{
// let go of any buffered stream - this is an odd function that can be called be compressor if it wants to hand
// off the input stream to a new parser or otherwise access the input bitstream from after last symbol parsed.
// it is necessary because some parsers can read-ahead in input buffer, and so must rewind the bitstream.
// we have to implement this since we do a read-ahead. note this wouldn't be called in normal circumstances,
// just if compressors wants us to stop prematurely and hand off the job to someone else.
// return true on success
// the number of characters we have read ahead in buffer is inputbufferlen
from.seek_bit(from.tell_bit()-(inputbufferlen*8));
return true;
}
void SubstringParser::SynchronizeStateForNewStream()
{
// synchronize state for a new stream
// this MUST be called before beginning a new parsing stream
// reset input buffer and end-of-stream sent flag
inputbufferlen=0;
senteos=false;
}
bool SubstringParser::CreateSymbolSetUsingStream(bitreader &from)
{
// scan a file and add its "tokens" into our symbol set
string valuestring;
string slidingwindowstring;
unsigned char c;
bool slidingwindowfull=false;
string eotstring=string("");
// clear any existing symbols
FreeData_Symbols();
// create primitive characters
AddPrimitiveCharacterSubstringSymbols();
// save the current bitreader and bitreader position for reparsing during pruning
current_bitreaderp = &from;
start_bitreaderposition = from.tell_bit();
// now parse the file
while (!from.empty())
{
// grab a character
c=from.get_byte();
// add the character to our window
slidingwindowstring+=c;
// trim lefthand side if it is too big
if (!slidingwindowfull)
{
// we keep a sliding window which is slightly bigger than the maxSubstring, to account for delimiter boundaries
if ((int)(slidingwindowstring.length())>Parameter_MaxSubstringSize+5)
{
// set the flag saying from now on, we are full
slidingwindowfull=true;
}
}
if (slidingwindowfull)
{
// trim leftmost character
slidingwindowstring.erase(0,1);
}
// now add the appropriate Substrings of slidingwindowstring to our dictionary SymbolSet
AddSubstringsFromSlidingWindowString(slidingwindowstring);
// when encoding single lines, each line must end with an end-of-stream (EOS) symbol
// but if we train on a file, which is large and has only one EOS at end of file
// then the freq. of EOS will be very low (1), so here we can say to count a pretend EOS
// whenever we see a CR
if (c==13 && Parameter_CountCRsAsEOTs)
{
// increment frequency for EOS
UpdateValueStringInSymbolSet(eotstring,1);
}
if (symbolset.size()>Parameter_MaxSymbols_DuringBuild)
{
// periodically we may have to prune away lowest frequency symbols in order to keep dictionary manageable
// how many should we remove - we don't want to remove just one since that would result in many repetitive calls here
// so we just pick some fraction to remove
PruneSymbolSet_ReduceSizeTo(((unsigned int)(symbolset.size())/5)+1);
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -