📄 substringparser.cpp
字号:
searchSubstringSymbol.set_value(inputstring);
}
}
else
{
// dumb method (67:00 seconds on a test, about 3x slower than smart method)
for(;;)
{
symbolset_pos=symbolset.find(&searchSubstringSymbol);
if (symbolset_pos!=symbolset.end())
{
// we found an exact match
break;
}
// not found, so shorten string by removing rightmost character
--inputqueuestrlen;
if (inputqueuestrlen<=0)
{
cout << "ERROR: unexpectedly didn't find any match for a string that started out with "<<originputqueuestrlen<<" characters (we hit "<<inputqueuestrlen<<"). last char was "<<(int)inputqueuestr[0]<<"."<<endl;
return NULL;
}
// decrease length of Substring currently being searched for and continue
inputstring=inputstring.substr(0,inputqueuestrlen);
searchSubstringSymbol.set_value(inputstring);
}
}
// return the pointer
return *symbolset_pos;
}
double SubstringParser::CalculateEncodeCost(char *inputqueuestr,int inputqueuestrlen)
{
// try to find the optimal cost of encoding the string, using currently selected methods
// and return its cost, ASSUMING that we start by parsing longest matching leftmost substr
SubstringSymbol* symbolnodep;
double cost=0;
// start by greedy parse of biggest leftmost substr
symbolnodep = FindNextSymbolToEncode_Greedy(inputqueuestr,inputqueuestrlen);
if (symbolnodep==NULL)
{
// no match found - this should never happen
cout << "$no match found."<<endl;
return 99999;
}
// add cost of initial Substring
cost+=GetSymbolCost(symbolnodep);
// advance pointer
inputqueuestr+=symbolnodep->get_valuelen();
inputqueuestrlen-=symbolnodep->get_valuelen();
// now the remainder of the string could be done smart or greedy
if (Parameter_ParseMode!=Dictionary_ParseMode_GREEDY && inputqueuestrlen>2)
{
// SMARTER HEURISTIC PARSE (slower than greedy, but *maybe* better)
int bestindex=inputqueuestrlen;
double bestscore=0;
double encodecost;
bool firstscore=true;
for (int index=1;index<=inputqueuestrlen;++index)
{
// find cost if we encode based on split at index
encodecost=CalculateEncodeCost(inputqueuestr,index);
if (index<inputqueuestrlen)
encodecost+=CalculateEncodeCost(&inputqueuestr[index],inputqueuestrlen-index);
// update our tracking of best score
if (firstscore || encodecost<bestscore)
{
bestscore=encodecost;
bestindex=index;
firstscore=false;
}
}
// now set cost to best cost found
cost+=bestscore;
}
else if (inputqueuestrlen>0)
{
// GREEDY PARSE cost
cost+=CalculateEncodeCost(inputqueuestr,inputqueuestrlen);
}
else
{
// nothing left to score
}
// now return cost
return cost;
}
//---------------------------------------------------------------------------
//---------------------------------------------------------------------------
// Internal Pruning functions
void SubstringParser::PruneSymbolSet_WeightThreshold(TSubStrParserWeight thresholdweight)
{
// remove all weights below some threshold set by parameter
// this is a fast and easy step that will remove almost the entire current symbol set :)
TSymbolSetTYPE::iterator nextsymbolpos;
TSubStrParserWeight weight;
int index=0;
// First simplest pruning operation - simply remove all symbols which don't have some threshold count
for (symbolset_pos=symbolset.begin();symbolset_pos!=symbolset.end();)
{
++index;
nextsymbolpos=symbolset_pos;
++nextsymbolpos;
// only show non-zero weights (i.e. symbols used at least once)
if (!((*symbolset_pos)->dontprune()))
{
weight=(*symbolset_pos)->get_weight();
if (weight<thresholdweight)
{
// delete this symbol
delete (*symbolset_pos);
symbolset.erase(symbolset_pos);
}
}
// advance pointer
symbolset_pos=nextsymbolpos;
}
}
void SubstringParser::PruneSymbolSet_RemoveRedundantPrefixes()
{
// some Substring symbols are simply prefixes of others, with no independent occurrences
// for example the Substring "supercomput" is a redundant Substring of "supercomputer"
// so we can remove those with no cost.
TSymbolSetTYPE::iterator nextsymbolpos;
TSymbolSetTYPE::iterator lastsymbolpos;
TSubStrParserWeight weight,lastweight;
string lastvalue;
int lastvaluelen;
lastweight=0;
for (symbolset_pos=symbolset.begin();symbolset_pos!=symbolset.end();)
{
nextsymbolpos=symbolset_pos;
++nextsymbolpos;
// some symbols (like primitive characters, cannot be pruned)
if ((*symbolset_pos)->dontprune())
lastweight=0;
else
{
weight=(*symbolset_pos)->get_weight();
if (weight==lastweight && weight!=0)
{
// got a candidate for pruning, IFF our prior string is a proper prefix of us
lastvaluelen=(int)(lastvalue.length());
if (((*symbolset_pos)->get_value()).compare(0,lastvaluelen,lastvalue.c_str(),lastvaluelen)==0)
{
// yes it is, so kill our last value
delete *lastsymbolpos;
symbolset.erase(lastsymbolpos);
}
}
// remember last symbol details
lastweight=weight;
lastvalue=(*symbolset_pos)->get_value();
lastsymbolpos=symbolset_pos;
}
// advance pointer
symbolset_pos=nextsymbolpos;
}
}
void SubstringParser::PruneSymbolSet_ReduceSizeTo(unsigned int targetsize)
{
// we have exceeded our maximum # of symbols allowed during build process, so reduce the size
// this is done quite inefficiently, but we can worry about that later, or not, since its not in a time-critical place
// ATTN: a smarter way would be to sort by weight and pop off smallest weights.
if (symbolset.size()<targetsize)
return;
for (int count=2;count<10000;++count)
{
// remove all removable symbols with a weight < count and then check set size again
PruneSymbolSet_WeightThreshold((TSubStrParserWeight)count);
if (symbolset.size()<=targetsize)
break;
}
}
void SubstringParser::PruneSymbolSet()
{
// we need to reduce the size of the dictionary SymbolSet by pruning symbols we think are not worth the coding space
// having silly symbols increases our cost in two main ways:
// 1) cost in memory needed to store the dictionary in memory and time needed to search dictionary
// 2) cost incurred due to the fact that the more words in dictionary, the more bits needed to encode other symbols.
// this is a very serious issue and suggests the dictionary should be as small as possible.
// the bitreader is passed so that we can reparse the input in order to test our created symbol set.
// display info
cout << endl << "Total symbols in dictionary before pruning: " << GetSetSymbolCount() << endl;
// remove weights below a threshold
PruneSymbolSet_WeightThreshold(Parameter_PruneMinimumWeight);
cout << "After Stage 1 Pruning (removed scores below "<<Parameter_PruneMinimumWeight<<"), symbols: " << GetSetSymbolCount() << endl;
// remove redundant prefixes
PruneSymbolSet_RemoveRedundantPrefixes();
cout << "After Stage 2 Pruning (removed redundant prefixes), symbols: " << GetSetSymbolCount() << endl;
// pre-capping recompression
if (Parameter_PruneReCalculations)
{
// now actually re-compress the file using current symbol set to get better real-world weight counts, and repeat prunings
ReParseFileReComputeCosts(true);
PruneSymbolSet_WeightThreshold(Parameter_PruneMinimumWeight);
PruneSymbolSet_RemoveRedundantPrefixes();
// some info for user
cout << "After pre-ceil pre-compression, re-evaluation, and repruning, symbols: " << GetSetSymbolCount() << endl;
}
// finally, cap symbol count. there are smart and dumb ways this can be done
PruneSymbolSet_ReduceSizeTo(Parameter_MaxSymbols_Final);
cout << "After Stage 3 Pruning (killed worst scores to reach target), symbols: " << GetSetSymbolCount() << endl;
// more recompression loops after capping
if (Parameter_PruneReCalculations)
{
// now actually re-compress the file using current symbol set to get better real-world weight counts, and repeat prunings
ReParseFileReComputeCosts(true);
PruneSymbolSet_WeightThreshold(Parameter_PruneMinimumWeight);
PruneSymbolSet_RemoveRedundantPrefixes();
cout << "After re-compression, re-evaluation, and repruning, symbols: " << GetSetSymbolCount() << endl;
}
}
//---------------------------------------------------------------------------
//---------------------------------------------------------------------------
// For reparsing and recomputing scores
double SubstringParser::ReParseFileReComputeCosts(bool updateweights)
{
// USING the current symbolset we have just created, reparse it and recalculate the 'cost'
// and frequency of each symbol.
// This is useful because as we create and modify(prune) the symbolset, the symbols interact
// and effect frequencies, so its helpful to reparse the input and recalculate frequencies.
// return sum cost of parsed symbols IFF updateweights == false
// First we need to rebuild the symbol vector before we can parse
BuildSymbolVector();
int symbolnum;
SubstringSymbol *symbolp;
double totalcost=0;
// reset input bitreader to where we started dictionarizing
current_bitreaderp->seek_bit(start_bitreaderposition);
// reset weights if we are recalculating them
if (updateweights)
ResetWeights();
// loop through input grabbing symbols
// ATTN: are we properly updating weights?
while (ParseNextSymbolFromInput(*current_bitreaderp,symbolnum))
{
symbolp=symbolvector[symbolnum];
if (updateweights)
{
// we can be run in a mode to actually update weights while encoding
if (updateweights)
{
// note this will not change priority queue, and is therefore somewhat skidding on what is 'allowed' by stl
// but we use this purposefully so that we are modifying the weights while preserving current priority queue.
symbolp->increment_weight(1);
// we need to also keep track of CRs for EOS's if appropriate
if (Parameter_CountCRsAsEOTs && strchr(symbolp->get_valuep()->c_str(),13)!=NULL)
{
symbolp=FindNextSymbolToEncode_Greedy("",0);
if (symbolp!=NULL)
symbolp->increment_weight(1);
}
}
}
else
totalcost+=symbolp->get_cost();
}
// return success
return totalcost;
}
//---------------------------------------------------------------------------
//---------------------------------------------------------------------------
// Internal symbol construction
void SubstringParser::BuildSymbolVector()
{
// build a vector of symbol pointers, for efficient random access
// clear any existing vector
symbolvector.clear();
// add symbols from set
int index=0;
for (symbolset_pos=symbolset.begin();symbolset_pos!=symbolset.end();++symbolset_pos)
{
// add symbol to vector
symbolvector.push_back(*symbolset_pos);
// tell symbol what number it is
(*symbolset_pos)->set_symbolvectorpos(index++);
}
// now we need to update our count of how many symbols we have
symbolcount=(int)(symbolvector.size());
// and record the symbol representing the end-of-stream (EOS is "")
endofstreamsymbolnum=LookupSymbolNumberFromText(string(""));
}
int SubstringParser::LookupSymbolNumberFromText(string wordstring)
{
// return the number of a symbol in the parsing symbolset (or -1 if not found)
static SubstringSymbol searchSubstringSymbol("",1);
// now we use this searchSubstringSymbol for searching
searchSubstringSymbol.set_value(wordstring);
symbolset_pos=symbolset.find(&searchSubstringSymbol);
// if its not found, return -1
if (symbolset_pos==symbolset.end())
return -1;
// otherwise return its index symbol number
return ((*symbolset_pos)->get_symbolvectorpos());
}
//---------------------------------------------------------------------------
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -