📄 substringparser.hpp

📁 Octane v1.01.20 The Open Compression Toolkit for C++ . The Open Compression Toolkit is a set of mo
💻 HPP
📖 第 1 页 / 共 2 页
字号:
上一页 12
	int inputbufferlen;
	/// The parser needs to return an end-of-symbol character before finishing reading the stream, so we need to track whether we sent an EOS yet.
	/// @todo this is an odd responsibility for the parser to have to deal with, so we should probably find a way to eliminate this.
	bool senteos;
protected:
	// Encoding parameters which govern how dictionary is built (see SetDefaultDictionaryBuildingParameters implementation for descriptions)
	/// The dictionary will never be allowed to get bigger than this size (smaller will also increase building speed)
	///  when this size is exceeded, we will eliminate all symbols with frequencies of smaller than some small #
	unsigned int Parameter_MaxSymbols_DuringBuild;
	/// After the entire training file is parsed, we will repeat pruning steps to reduce it to a *maximum* of this many entries
	///  the actual symbols kept may be smaller depending on other pruning parameters
	unsigned int Parameter_MaxSymbols_Final;
	/// How many characters is largest (sub)string stored in dictionary
	int Parameter_MaxSubstringSize;
	/// Only store characters and whole words?
	bool Parameter_OnlyCodeWholeWords;
	/// Store Substrings that span words (i.e. they contain multiple words)
	bool Parameter_SpanWordBoundaries;
	// When encoding single lines, each line must end with an end-of-stream (EOS) symbol
	//  but if we train on a file, which is large and has only one EOS at end of file
	//  then the freq. of EOS will be very low (1), so here we can say to count a pretend EOS
	//  whenever we see a CR.
	bool Parameter_CountCRsAsEOTs;
	/// Minimum count for a word or Substring in order to even consider leaving it in the dictionary
	unsigned int Parameter_PruneMinimumWeight;
	/// We can either use a greedy search to find longest Substring starting with first char or we can use smarter but slower heuristics
	int Parameter_ParseMode;
	/// We can either use a smart search using stl find_lower_bound() to find longest Substring or we can search starting at biggest string backwards
	bool Parameter_UseSmartLookup;
	/// How many iterations should we recompress to test heuristic symbolset cost, if any
	///  this tests the currently built symbol set to 'reencode' the input stream and calculates actual usage counts and then reprunes.
	bool Parameter_PruneReCalculations;
protected:
	/// Keeps track of how many symbols have we created for fast replies
	int symbolcount;
	/// Keeps track of end of stream symbol number for fast replies
	int endofstreamsymbolnum;
	/// Stores last bitreader , so that we can rewind when we want to reparse and check our heuristic symbolset costs
	bitreader *current_bitreaderp;
	/// Stores last bitreader starting position before parsing, so that we can rewind when we want to reparse and check our heuristic symbolset costs
	size_t start_bitreaderposition;
public:
	//---------------------------------------------------------------------------
	/// Constructor
	SubstringParser();
	/// Destructor
	~SubstringParser();
	//---------------------------------------------------------------------------
public:
	//---------------------------------------------------------------------------
	// PARSER PUBLIC API - PREPARATIONS FOR PARSING
	virtual bool CreateSymbolSetUsingStream(bitreader &from);
	virtual bool IsReadyToParse() {if (symbolcount>0) return true; else return false;};
	virtual bool RewindAnyBufferedInput(bitreader &from);
	//---------------------------------------------------------------------------
public:
	//---------------------------------------------------------------------------
	// PARSER PUBLIC API - MAIN PARSING FUNCTIONS
	/// Synchronize state for a new stream - this is called every time a new stream is parsed, to allow parser to get into 'start' state,
	virtual void SynchronizeStateForNewStream();
	virtual int GetSymbolCount() {return symbolcount;};
	virtual bool ParseNextSymbolFromInput(bitreader &from, int &symbolnum);
	virtual bool WriteSymbolText(bitwriter &to, int symbolnum,bool &isendofstreamsymbol);
	virtual string LookupSymbolText(int symbolnum) {assert(symbolnum<(int)(symbolvector.size()));return symbolvector[symbolnum]->get_value();};
	//---------------------------------------------------------------------------
public:
	//---------------------------------------------------------------------------
	// OCTANE PUBLIC API - AUXILIARY FUNCTIONS - these are supported by all derived classes
	virtual void ShowDebuggingInfo();
	virtual unsigned int GetMemoryUsed();
	virtual unsigned int GetDiskspaceUsed(bool fortempdecompressiononly);
	virtual std::string GetParametersInformation();
	virtual void SetDefaultParameters();
	virtual bool SetParameter(const std::string &parametername,const std::string &parametervalue);
	virtual bool SaveState(bitwriter &to,bool fortempdecompressiononly);
	virtual bool LoadState(bitreader &from);
	//---------------------------------------------------------------------------
public:
	//---------------------------------------------------------------------------
	// OCTANE PARSER PUBLIC API - AUXILIARY FUNCTIONS - symbol helpers
	int GetSetSymbolCount() {return (int)(symbolset.size());};
	int LookupSymbolNumberFromText(string wordstring);
	//---------------------------------------------------------------------------
private:
	// save and load state information
	/// Save complete state info (the entire symbolset and parameters).
	/// @return true on success.
	bool SaveSymbols(bitwriter &to);
	/// Load complete state info (the entire symbolset and parameters).
	/// @return true on success.
	bool LoadSymbols(bitreader &from);
private:
	// symbol manipulation
	/// Add the ascii characters 0-255 to the symbol set, in addition to the end-of-stream symbol
	void AddPrimitiveCharacterSubstringSymbols();
	/// Lookup a symbol based on its text and Increase the frequency of the symbol.
	/// The symbol will be created if it doesn't already exist.
	/// @return true on success.
	bool UpdateValueStringInSymbolSet(const string &stringvalue,int increment);
	/// Add all substrings contained in a window are added to dictionary (based on parameters defining which substrings to add).
	void AddSubstringsFromSlidingWindowString(string &slidingwindowstring);
	/// Create a new symbol and add it to symbol set with specific starting frequency.
	SubstringSymbol* AddSymbol(const std::string &stringvalue, const TSubStrParserWeight weightvalue) {SubstringSymbol* p=new SubstringSymbol(stringvalue,weightvalue);if (p!=NULL) symbolset.insert(p);return p;}
private:
	// internal save/load helper functions
	/// Save parameters to stream.
	/// @return true on success.
	bool SaveParameters(bitwriter &to);
	/// Load parameters from stream.
	/// @return true on success.
	bool LoadParameters(bitreader &from);
	/// Save symbol to stream.
	/// @return true on success.
	bool ReadSubstringSymbol(bitreader &from);
	/// Load symbol from stream.
	/// @return true on success.
	bool WriteSubstringSymbol(SubstringSymbol *SubstringSymbolp,bitwriter &to);
private:
	/// Reset weights on all symbols, in preparation for reparsing.
	void ResetWeights();
private:
	// Prune symbols from the set
	/// Engage pruning routine, calls various other pruning steps.
	void PruneSymbolSet();
	/// Eliminate least frequent symbols in order to achieve a target symbol set size.
	void PruneSymbolSet_ReduceSizeTo(unsigned int targetsize);
	/// Eliminate all symbols with frequency counts below some threshold.
	void PruneSymbolSet_WeightThreshold(TSubStrParserWeight thresholdweight);
	/// Eliminate all symbols whose string values are proper prefixes of other symbols with equal frequencies
	void PruneSymbolSet_RemoveRedundantPrefixes();
	/// Re-parse the training file and compute heuristic costs of encoding and recompute symbol weight given current symbols.
	/// This is necessary because after adding symbols and pruning, the frequency counts change due to interactions.
	double ReParseFileReComputeCosts(bool updateweights);
private:
	/// A heuristic cost computation that returns a cost of the symbol (just inverse of frequency).
	/// This is used to help decide the heuristic cost of a symbol set on a whole file.
	double GetSymbolCost(SubstringSymbol* symbolnodep) {return symbolnodep->get_cost();};
private:
	// Internal encoding and finding symbols to encode
	/// Given an input string window(buffer), pick out the symbol corresponding to leftmost string, according to parsing rules.
	/// @return symbol pointer or NULL if no symbol could be found (which would be considered an error)
	SubstringSymbol* FindNextSymbolToEncode(char *inputqueuestr,int inputqueuestrlen);
	/// Use simple greedy mode to choose longest leftmost symbol from input buffer
	/// @return symbol pointer or NULL if no symbol could be found (which would be consider an error)
	/// @see FindNextSymbolToEncode
	SubstringSymbol* FindNextSymbolToEncode_Greedy(char *inputqueuestr,int inputqueuestrlen);
	/// After choosing a symbol to encode, this eats it from he input window buffer (and optionally displays some debugging info)
	int SwallowSymbolFromInputQueueStr(SubstringSymbol *symbolp,char *inputqueuestr,int inputqueuestrlen,bool debugmode);
	/// Calculate the heuristic cost of parsing a window of text, using current parsing mode.
	/// This is used to optimally parse an input string by finding best way to divide it up into symbols.
	double CalculateEncodeCost(char *inputqueuestr,int inputqueuestrlen);
private:
	/// build a vector of symbols for easy random access
	void BuildSymbolVector();
private:
	/// Free data held by the class.
	void FreeData();
	/// Free symbol data held by the class.
	void FreeData_Symbols();
};
//---------------------------------------------------------------------------





//---------------------------------------------------------------------------
#endif
//---------------------------------------------------------------------------
上一页 12
💿 文件大小 170 K
👤 上传用户 boobyb
📂 所属分类压缩解压
🏷️ 相关标签

#Compression #Toolkit #Open #The
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -