📄 substringparser.cpp
字号:
}
// now final pruning
PruneSymbolSet();
// and now build the vector for fast random access
BuildSymbolVector();
// return success
return true;
}
//---------------------------------------------------------------------------
//---------------------------------------------------------------------------
// Parser API
bool SubstringParser::ParseNextSymbolFromInput(bitreader &from, int &symbolnum)
{
    // Pull the next symbol out of the input stream.
    //   from      - reader supplying raw input bytes
    //   symbolnum - receives the symbol's INDEX in the symbol vector
    // Returns true when a symbol was produced. Once the input is exhausted the
    // first call still returns true (the lookup runs on an empty buffer, which is
    // how the EOS symbol gets emitted); every later call returns false.
    //
    // inputbufferstr[] and inputbufferlen are member state that survives between
    // calls: bytes not consumed by the previous symbol are still queued here.
    const int windowcapacity = Parameter_MaxSubstringSize;

    // remember which reader we are working on
    current_bitreaderp = &from;

    // top the queue up to the window capacity, or as far as the reader allows
    for (; inputbufferlen < windowcapacity && !from.empty(); ++inputbufferlen)
    {
        const unsigned char nextbyte = from.get_byte();
        inputbufferstr[inputbufferlen] = nextbyte;
    }

    // nothing queued at all: either EOS has already gone out, or it goes out now
    if (inputbufferlen == 0)
    {
        if (senteos)
            return false;
        // fall through exactly once so the end-of-stream symbol is returned
        senteos = true;
    }

    // choose the dictionary symbol to encode for the front of the queue ...
    SubstringSymbol *const chosensymbol = FindNextSymbolToEncode(inputbufferstr, inputbufferlen);
    symbolnum = chosensymbol->get_symbolvectorpos();

    // ... then let the helper consume it from the queue; it reports the new queue length
    inputbufferlen = SwallowSymbolFromInputQueueStr(chosensymbol, inputbufferstr, inputbufferlen, false);
    return true;
}
bool SubstringParser::WriteSymbolText(bitwriter &to, int symbolnum,bool &isendofstreamsymbol)
{
    // Emit the text of the symbol stored at index symbolnum of the symbol vector.
    //   to                  - writer receiving the symbol's bytes
    //   symbolnum           - index into symbolvector (not range-checked here)
    //   isendofstreamsymbol - set to true iff symbolnum is the EOS symbol number
    // Always returns true.
    SubstringSymbol *const entry = symbolvector[symbolnum];

    // write exactly length() bytes of the symbol's value
    const int textlength = static_cast<int>(entry->get_valuep()->length());
    to.write(entry->get_valuep()->c_str(), textlength);

    // tell the caller whether this was the end-of-stream marker
    isendofstreamsymbol = (symbolnum == endofstreamsymbolnum);

    return true;
}
//---------------------------------------------------------------------------
//---------------------------------------------------------------------------
// Internal functions for freeing data structure
void SubstringParser::FreeData()
{
    // Release the parser's data structures. The only work done here is to
    // delegate to FreeData_Symbols(), which deletes the SubstringSymbol nodes.
    FreeData_Symbols();
}
void SubstringParser::FreeData_Symbols()
{
    // Destroy every symbol and empty both containers.
    // The symbol pointers are deleted through the set only, so a symbol that is
    // referenced from both the set and the vector is freed exactly once.
    symbolset_pos = symbolset.begin();
    while (symbolset_pos != symbolset.end())
    {
        // free the node this entry points at
        delete *symbolset_pos;
        ++symbolset_pos;
    }

    // drop the (now dangling) pointers from the set
    symbolset.clear();

    // rebuilding the vector from the empty set leaves the vector empty too
    BuildSymbolVector();
}
//---------------------------------------------------------------------------
//---------------------------------------------------------------------------
// Internal Helper functions for saving and loading state
bool SubstringParser::SaveParameters(bitwriter &to)
{
    // Serialise the parameter settings to the writer.
    // The order below is the on-disk order and must match LoadParameters().
    // Always returns true.

    // dictionary size limits
    to.put(Parameter_MaxSymbols_DuringBuild);
    to.put(Parameter_MaxSymbols_Final);
    to.put(Parameter_MaxSubstringSize);

    // substring selection options
    to.put(Parameter_OnlyCodeWholeWords);
    to.put(Parameter_SpanWordBoundaries);
    to.put(Parameter_PruneMinimumWeight);
    to.put(Parameter_CountCRsAsEOTs);

    // lookup / parsing / pruning behaviour
    to.put(Parameter_UseSmartLookup);
    to.put(Parameter_ParseMode);
    to.put(Parameter_PruneReCalculations);

    return true;
}
bool SubstringParser::LoadParameters(bitreader &from)
{
    // Restore the parameter settings from the reader.
    // The order below is the on-disk order and must match SaveParameters().
    // Always returns true.

    // dictionary size limits
    from.get(Parameter_MaxSymbols_DuringBuild);
    from.get(Parameter_MaxSymbols_Final);
    from.get(Parameter_MaxSubstringSize);

    // substring selection options
    from.get(Parameter_OnlyCodeWholeWords);
    from.get(Parameter_SpanWordBoundaries);
    from.get(Parameter_PruneMinimumWeight);
    from.get(Parameter_CountCRsAsEOTs);

    // lookup / parsing / pruning behaviour
    from.get(Parameter_UseSmartLookup);
    from.get(Parameter_ParseMode);
    from.get(Parameter_PruneReCalculations);

    return true;
}
bool SubstringParser::SaveSymbols(bitwriter &to)
{
    // Write the whole symbol set to the writer in compact binary form,
    // followed by an end-of-table marker. Always returns true.

    // one record per symbol, walking the set with the member iterator
    symbolset_pos = symbolset.begin();
    while (symbolset_pos != symbolset.end())
    {
        WriteSubstringSymbol(*symbolset_pos, to);
        ++symbolset_pos;
    }

    // Terminate the table so this header can be used as a prefix of another file.
    // A string length of 255 means "end of table"; real strings are never
    // allowed to be longer than 254 characters.
    unsigned char endoftablemarker = 255;
    to.put(endoftablemarker);

    return true;
}
bool SubstringParser::LoadSymbols(bitreader &from)
{
    // Load a previously saved dictionary from the reader.
    // Always returns true.

    // Keep reading symbol records until the reader runs dry or
    // ReadSubstringSymbol() reports that it hit the end of the table.
    while (!from.empty() && ReadSubstringSymbol(from))
    {
        // each successful call has already added one symbol
    }

    // build the vector used for fast random access to the symbols
    BuildSymbolVector();

    return true;
}
//---------------------------------------------------------------------------
//---------------------------------------------------------------------------
// Internal Parsing
bool SubstringParser::WriteSubstringSymbol(SubstringSymbol *SubstringSymbolp,bitwriter &to)
{
    // SubstringSymbol writer - derived classes will take this over if necessary.
    //   SubstringSymbolp - the symbol to serialise
    //   to               - writer receiving the record
    // Record layout: 1-byte string length, the string bytes, then the weight
    // as an unsigned int.
    // Returns true on success; false if the symbol pointer is null or the
    // symbol's string is too long to be represented in the table.
    //
    // NOTE: this serialises the symbol passed in as SubstringSymbolp. It must not
    // read the member iterator symbolset_pos, which is only valid while some
    // caller happens to be iterating the set.
    if (!SubstringSymbolp)
        return false;

    // The length is stored in a single byte and the value 255 is reserved as the
    // end-of-table marker, so anything longer than 254 characters cannot be
    // written without corrupting the table.
    if (SubstringSymbolp->get_valuep()->length() > 254)
        return false;

    // write length of the string, as an unsigned char
    unsigned char valuelength = static_cast<unsigned char>(SubstringSymbolp->get_valuep()->length());
    to.put(valuelength);

    // write the string bytes (no terminator)
    to.write(SubstringSymbolp->get_valuep()->c_str(), valuelength);

    // write the weight; weights are expected to be representable as unsigned int,
    // so a plain static_cast is used for the conversion
    TSubStrParserWeight weightvalue = SubstringSymbolp->get_weight();
    unsigned int weightvalue_uint = static_cast<unsigned int>(weightvalue);
    to.put(weightvalue_uint);

    // return success
    return true;
}
bool SubstringParser::ReadSubstringSymbol(bitreader &from)
{
    // SubstringSymbol reader - derived classes will take this over if necessary.
    // Reads one record (1-byte length, string bytes, unsigned int weight) and
    // adds it as a symbol.
    // Returns true after adding a symbol, false when the end-of-table marker
    // (a length byte of 255) is read instead of a record.
    const unsigned char endoftablemarker = 255;

    // length byte first
    unsigned char valuelength;
    from.get(valuelength);
    if (valuelength >= endoftablemarker)
    {
        // that was the terminator, not a record
        return false;
    }

    // string bytes; a valid length is at most 254, so the buffer is large enough
    char rawtext[256];
    from.read(rawtext, valuelength);
    std::string valuestring(rawtext, valuelength);

    // weight
    unsigned int storedweight;
    from.get(storedweight);

    // register the symbol with its saved weight
    AddSymbol(valuestring, static_cast<TSubStrParserWeight>(storedweight));
    return true;
}
//---------------------------------------------------------------------------
//---------------------------------------------------------------------------
// Internal symbol construction
void SubstringParser::AddPrimitiveCharacterSubstringSymbols()
{
// Push every possible character as SubstringSymbol into the set
string valuestring;
char valuestr[20];
for ( int i = 0 ; i < 256 ; i++ )
{
// form the string value of this (just the dictionary character or word)
valuestr[0]=i;valuestr[1]='\0';
valuestring=string(valuestr,1);
// create a new SubstringSymbol node with this character, with a count of 1 (if you try to add a weight of 0 you will mess up the tree since sums must be strictly increasing)
UpdateValueStringInSymbolSet(valuestring,1);
}
// we are going to use the empty string "" as our EOS (or End of Encoding) symbol, and record its number
valuestring="";
UpdateValueStringInSymbolSet(valuestring,1);
}
bool SubstringParser::UpdateValueStringInSymbolSet(const string &stringvalue,int increment)
{
// add a symbol as a SubstringSymbol with 0 count
// return false on a FAILURE to add the symbol (ran out of memory)
bool bretv=true;
// use a static local variable to reduce memory allocation-deallocation thrashing, and use it for searching
static SubstringSymbol searchSubstringSymbol("",1);
// now we use this searchSubstringSymbol for searching
searchSubstringSymbol.set_value(stringvalue);
// look it up
symbolset_pos=symbolset.find(&searchSubstringSymbol);
// if its not found, create it
if (symbolset_pos==symbolset.end())
{
// create the SubstringSymbol and add it, with weight = increment
AddSymbol(stringvalue,increment);
}
else
{
// we found it, so increment its count
(*symbolset_pos)->increment_weight(increment);
}
// return false on failure
return bretv;
}
void SubstringParser::ResetWeights()
{
    // Set the weight of every symbol in the set back to 0.
    // The member iterator symbolset_pos is used as the cursor and ends at end().
    symbolset_pos = symbolset.begin();
    while (symbolset_pos != symbolset.end())
    {
        (*symbolset_pos)->set_weight(0);
        ++symbolset_pos;
    }
}
//---------------------------------------------------------------------------
//---------------------------------------------------------------------------
// Internal symbol construction
void SubstringParser::AddSubstringsFromSlidingWindowString(string &slidingwindowstring)
{
// According to our parameters, increment frequency counts (adding symbols when needed) of some Substrings from our sliding window
// The way we do this is, our primary character is the rightmost one; thats the one that will be different on each call.
// If we want to do standard huffman on characters, we would simply update the frequency of the rightmost character.
// Alternatively, we have various other parameters which can govern which Substrings we track.
// Some of many settings can generate HUGE trees and sets, but these trees will eventually be pruned of least-useful entries.
// Note this procedure is the most convoluted one in the file, because it deals with various parameters
// which regulate what kinds of Substrings from the input stream get tokenized.
string valuestring;
bool bretv;
int len=(int)(slidingwindowstring.length());
unsigned char c,c2;
int leftpos,rightpos,curpos;
bool delimiteronborder;
// first add the rightmost character, by incrementing its usage count by 1
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -