📄 substringparser.hpp
字号:
int inputbufferlen;
/// The parser needs to return an end-of-symbol character before finishing reading the stream, so we need to track whether we sent an EOS yet.
/// @todo this is an odd responsibility for the parser to have to deal with, so we should probably find a way to eliminate this.
bool senteos;
protected:
// Encoding parameters which govern how dictionary is built (see SetDefaultDictionaryBuildingParameters implementation for descriptions)
/// The dictionary will never be allowed to get bigger than this size (smaller will also increase building speed)
/// when this size is exceeded, we will eliminate all symbols with frequencies of smaller than some small #
unsigned int Parameter_MaxSymbols_DuringBuild;
/// After the entire training file is parsed, we will repeat pruning steps to reduce it to a *maximum* of this many entries
/// the actual symbols kept may be smaller depending on other pruning parameters
unsigned int Parameter_MaxSymbols_Final;
/// How many characters is largest (sub)string stored in dictionary
int Parameter_MaxSubstringSize;
/// Only store characters and whole words?
bool Parameter_OnlyCodeWholeWords;
/// Store Substrings that span words (i.e. they contain multiple words)
bool Parameter_SpanWordBoundaries;
// When encoding single lines, each line must end with an end-of-stream (EOS) symbol
// but if we train on a file, which is large and has only one EOS at end of file
// then the freq. of EOS will be very low (1), so here we can say to count a pretend EOS
// whenever we see a CR.
bool Parameter_CountCRsAsEOTs;
/// Minimum count for a word or Substring in order to even consider leaving it in the dictionary
unsigned int Parameter_PruneMinimumWeight;
/// We can either use a greedy search to find longest Substring starting with first char or we can use smarter but slower heuristics
int Parameter_ParseMode;
/// We can either use a smart search using stl find_lower_bound() to find longest Substring or we can search starting at biggest string backwards
bool Parameter_UseSmartLookup;
/// How many iterations should we recompress to test heuristic symbolset cost, if any
/// this tests the currently built symbol set to 'reencode' the input stream and calculates actual usage counts and then reprunes.
bool Parameter_PruneReCalculations;
protected:
/// Keeps track of how many symbols have we created for fast replies
int symbolcount;
/// Keeps track of end of stream symbol number for fast replies
int endofstreamsymbolnum;
/// Stores last bitreader , so that we can rewind when we want to reparse and check our heuristic symbolset costs
bitreader *current_bitreaderp;
/// Stores last bitreader starting position before parsing, so that we can rewind when we want to reparse and check our heuristic symbolset costs
size_t start_bitreaderposition;
public:
//---------------------------------------------------------------------------
/// Constructor
SubstringParser();
/// Destructor
~SubstringParser();
//---------------------------------------------------------------------------
public:
//---------------------------------------------------------------------------
// PARSER PUBLIC API - PREPARATIONS FOR PARSING
virtual bool CreateSymbolSetUsingStream(bitreader &from);
virtual bool IsReadyToParse() {if (symbolcount>0) return true; else return false;};
virtual bool RewindAnyBufferedInput(bitreader &from);
//---------------------------------------------------------------------------
public:
//---------------------------------------------------------------------------
// PARSER PUBLIC API - MAIN PARSING FUNCTIONS
/// Synchronize state for a new stream - this is called every time a new stream is parsed, to allow parser to get into 'start' state,
virtual void SynchronizeStateForNewStream();
virtual int GetSymbolCount() {return symbolcount;};
virtual bool ParseNextSymbolFromInput(bitreader &from, int &symbolnum);
virtual bool WriteSymbolText(bitwriter &to, int symbolnum,bool &isendofstreamsymbol);
virtual string LookupSymbolText(int symbolnum) {assert(symbolnum<(int)(symbolvector.size()));return symbolvector[symbolnum]->get_value();};
//---------------------------------------------------------------------------
public:
//---------------------------------------------------------------------------
// OCTANE PUBLIC API - AUXILIARY FUNCTIONS - these are supported by all derived classes
virtual void ShowDebuggingInfo();
virtual unsigned int GetMemoryUsed();
virtual unsigned int GetDiskspaceUsed(bool fortempdecompressiononly);
virtual std::string GetParametersInformation();
virtual void SetDefaultParameters();
virtual bool SetParameter(const std::string ¶metername,const std::string ¶metervalue);
virtual bool SaveState(bitwriter &to,bool fortempdecompressiononly);
virtual bool LoadState(bitreader &from);
//---------------------------------------------------------------------------
public:
//---------------------------------------------------------------------------
// OCTANE PARSER PUBLIC API - AUXILIARY FUNCTIONS - symbol helpers
int GetSetSymbolCount() {return (int)(symbolset.size());};
int LookupSymbolNumberFromText(string wordstring);
//---------------------------------------------------------------------------
private:
// save and load state information
/// Save complete state info (the entire symbolset and parameters).
/// @return true on success.
bool SaveSymbols(bitwriter &to);
/// Load complete state info (the entire symbolset and parameters).
/// @return true on success.
bool LoadSymbols(bitreader &from);
private:
// symbol manipulation
/// Add the ascii characters 0-255 to the symbol set, in addition to the end-of-stream symbol
void AddPrimitiveCharacterSubstringSymbols();
/// Lookup a symbol based on its text and Increase the frequency of the symbol.
/// The symbol will be created if it doesn't already exist.
/// @return true on success.
bool UpdateValueStringInSymbolSet(const string &stringvalue,int increment);
/// Add all substrings contained in a window are added to dictionary (based on parameters defining which substrings to add).
void AddSubstringsFromSlidingWindowString(string &slidingwindowstring);
/// Create a new symbol and add it to symbol set with specific starting frequency.
SubstringSymbol* AddSymbol(const std::string &stringvalue, const TSubStrParserWeight weightvalue) {SubstringSymbol* p=new SubstringSymbol(stringvalue,weightvalue);if (p!=NULL) symbolset.insert(p);return p;}
private:
// internal save/load helper functions
/// Save parameters to stream.
/// @return true on success.
bool SaveParameters(bitwriter &to);
/// Load parameters from stream.
/// @return true on success.
bool LoadParameters(bitreader &from);
/// Save symbol to stream.
/// @return true on success.
bool ReadSubstringSymbol(bitreader &from);
/// Load symbol from stream.
/// @return true on success.
bool WriteSubstringSymbol(SubstringSymbol *SubstringSymbolp,bitwriter &to);
private:
/// Reset weights on all symbols, in preparation for reparsing.
void ResetWeights();
private:
// Prune symbols from the set
/// Engage pruning routine, calls various other pruning steps.
void PruneSymbolSet();
/// Eliminate least frequent symbols in order to achieve a target symbol set size.
void PruneSymbolSet_ReduceSizeTo(unsigned int targetsize);
/// Eliminate all symbols with frequency counts below some threshold.
void PruneSymbolSet_WeightThreshold(TSubStrParserWeight thresholdweight);
/// Eliminate all symbols whose string values are proper prefixes of other symbols with equal frequencies
void PruneSymbolSet_RemoveRedundantPrefixes();
/// Re-parse the training file and compute heuristic costs of encoding and recompute symbol weight given current symbols.
/// This is necessary because after adding symbols and pruning, the frequency counts change due to interactions.
double ReParseFileReComputeCosts(bool updateweights);
private:
/// A heuristic cost computation that returns a cost of the symbol (just inverse of frequency).
/// This is used to help decide the heuristic cost of a symbol set on a whole file.
double GetSymbolCost(SubstringSymbol* symbolnodep) {return symbolnodep->get_cost();};
private:
// Internal encoding and finding symbols to encode
/// Given an input string window(buffer), pick out the symbol corresponding to leftmost string, according to parsing rules.
/// @return symbol pointer or NULL if no symbol could be found (which would be considered an error)
SubstringSymbol* FindNextSymbolToEncode(char *inputqueuestr,int inputqueuestrlen);
/// Use simple greedy mode to choose longest leftmost symbol from input buffer
/// @return symbol pointer or NULL if no symbol could be found (which would be consider an error)
/// @see FindNextSymbolToEncode
SubstringSymbol* FindNextSymbolToEncode_Greedy(char *inputqueuestr,int inputqueuestrlen);
/// After choosing a symbol to encode, this eats it from he input window buffer (and optionally displays some debugging info)
int SwallowSymbolFromInputQueueStr(SubstringSymbol *symbolp,char *inputqueuestr,int inputqueuestrlen,bool debugmode);
/// Calculate the heuristic cost of parsing a window of text, using current parsing mode.
/// This is used to optimally parse an input string by finding best way to divide it up into symbols.
double CalculateEncodeCost(char *inputqueuestr,int inputqueuestrlen);
private:
/// build a vector of symbols for easy random access
void BuildSymbolVector();
private:
/// Free data held by the class.
void FreeData();
/// Free symbol data held by the class.
void FreeData_Symbols();
};
//---------------------------------------------------------------------------
//---------------------------------------------------------------------------
#endif
//---------------------------------------------------------------------------
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -