⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 substringparser.cpp

📁 Octane v1.01.20 The Open Compression Toolkit for C++ . The Open Compression Toolkit is a set of mo
💻 CPP
📖 第 1 页 / 共 4 页
字号:
/// @file substringparser.cpp
/// A flexible parser which can creates a symbol dictionary of substrings
///
/// @remarks
///  This is a general purpose class which is designed to be used with any
///   compressor that wants to parse files and count occurrences of certain
///   characters/words/Substrings.  Parameters can be set to regulate how
///   symbols are extracted.  Several utility functions are provided so that
///   this class can by default do much of the work in parsing files and
///   pruning dictionaries.
///
/// @author mouser
/// @date   2003.07.29
///
/// @version 2004.05.19 mouser [-] fixed bug in smartlookup mode of SubStringParser that could cause crashing when incoming stream matched last symbol in set.<br>
//---------------------------------------------------------------------------

//---------------------------------------------------------------------------
// Application includes
#include "substringparser.hpp"
// System includes
#include <iostream>
#include <iomanip>
#include <string>
//---------------------------------------------------------------------------




//---------------------------------------------------------------------------
SubstringParser::SubstringParser()
{
	// constructor
	// initialize default dictionary-building parameters
	SetDefaultParameters();
	// initialize our current symbol count and end-of-stream-symbol
	symbolcount=0;
	endofstreamsymbolnum=-1;
	// initialize input buffer pos
	inputbufferlen=0;
	senteos=false;
}

SubstringParser::~SubstringParser()
{
	// destructor frees the SubstringSymbols in symbolset
	FreeData();
}
//---------------------------------------------------------------------------



//---------------------------------------------------------------------------
// OCTANE PUBLIC API - AUXILIARY FUNCTIONS
bool SubstringParser::SaveState(bitwriter &to,bool fortempdecompressiononly)
{
	// save state
	// return true on success
	bool bretv;
	bretv=SaveParameters(to);
	if (bretv)
		bretv=SaveSymbols(to);
	return bretv;
}

bool SubstringParser::LoadState(bitreader &from)
{
	// load state
	// return true on success
	bool bretv;
	bretv=LoadParameters(from);
	if (bretv)
		bretv=LoadSymbols(from);
	return bretv;
}


unsigned int SubstringParser::GetMemoryUsed()
	{
	// return memory used by this data structure
	unsigned int memoryused=sizeof(this);

	// this returns the size of the symbolset, as it would take up on disk if saved in straightforward way
	//  memory usage will be almost identical.
	memoryused+=sizeof(symbolset);
	// write entries in compact binary form
	for (symbolset_pos=symbolset.begin();symbolset_pos!=symbolset.end();++symbolset_pos)
		{
		memoryused+=sizeof(*symbolset_pos);
		memoryused+=(*symbolset_pos)->get_memoryused();
		}
	// for vector
	for (symbolvector_pos=symbolvector.begin();symbolvector_pos!=symbolvector.end();++symbolvector_pos)
		memoryused+=sizeof(*symbolvector_pos);
	
	return memoryused;
	}


unsigned int SubstringParser::GetDiskspaceUsed(bool fortempdecompressiononly)
{
	// this returns the size of the symbolset, as it would take up on disk if saved in straightforward way
	//  memory usage will be almost identical.
	unsigned int memoryused=0;

	// byte for file type (see WriteSymbolSet routine)
	memoryused+=1;
	
	// write entries in compact binary form
	for (symbolset_pos=symbolset.begin();symbolset_pos!=symbolset.end();++symbolset_pos)
		{
		// length of the string
		memoryused+=1;
		// write string
		memoryused+=(unsigned int)(((*symbolset_pos)->get_valuep())->length());
		// weight of symbol
		memoryused+=sizeof(unsigned short int);
		}

	// end-of-header
	memoryused+=1;
	return memoryused;
}




void SubstringParser::ShowDebuggingInfo()
{
	// show some debugging info
	using namespace std;
	string bitset_string;
	string value;

	// size of tree is
	std::cout << " Memory used by SubStringParser: " << GetMemoryUsed() << " bytes ("<< (unsigned int)(GetSetSymbolCount())<<" elements).\n";
	
	// show the dictionary sorted alphabetically
	if (symbolset.size()>0)
		{
		cout << "Symbol Set - Sorted Alphabetically ("<<(unsigned int)(symbolset.size())<<" entries)\n";
		cout << " " << setiosflags(ios::left) << setw(Parameter_MaxSubstringSize) << "symboltext" << " " << setw(10) << "frequency\n";
		cout << " " << setiosflags(ios::left) << setw(Parameter_MaxSubstringSize) << "----------" << " " << setw(10) << "---------\n";
		for (symbolset_pos=symbolset.begin();symbolset_pos!=symbolset.end();++symbolset_pos)
			{
			value=(*symbolset_pos)->get_value();
			NiceifyHighAsciiString(value);
			cout << " " << setiosflags(ios::left) << setw(Parameter_MaxSubstringSize) << value << " " << setw(10) << (*symbolset_pos)->get_weight() << "\n";
			}
		cout << "\n";
		}
}
//---------------------------------------------------------------------------


//---------------------------------------------------------------------------
// OCTANE PUBLIC API - AUXILIARY FUNCTIONS
	
string SubstringParser::GetParametersInformation()
{
	// show user available variables and their current values
	// ATTN: this has not been converted from cout to string
	string returnstring;
	cout << setiosflags(ios::left) << setw(17) << "maxbuildsymbols" << setiosflags(ios::left) << " " << setw(10) << Parameter_MaxSymbols_DuringBuild << " " << "parsing dictionary pruned to not exceed this size" << endl;
	cout << setiosflags(ios::left) << setw(17) << "maxsymbols" << setiosflags(ios::left) << " " << setw(10) << Parameter_MaxSymbols_Final << " " << "final dictionary pruned to reach this size" << endl;
	cout << setiosflags(ios::left) << setw(17) << "maxSubstring" << setiosflags(ios::left) << " " << setw(10) << Parameter_MaxSubstringSize << " " << "max size of Substrings in dictionary" << endl;
	cout << setiosflags(ios::left) << setw(17) << "onlywords" << setiosflags(ios::left) << " " << setw(10) << Parameter_OnlyCodeWholeWords << " " << "only add whole word symbols? (bool)" << endl;
	cout << setiosflags(ios::left) << setw(17) << "spanwords" << setiosflags(ios::left) << " " << setw(10) << Parameter_SpanWordBoundaries << " " << "encode symbols that span words? (bool)" << endl;
	cout << setiosflags(ios::left) << setw(17) << "crends" << setiosflags(ios::left) << " " << setw(10) << Parameter_CountCRsAsEOTs << " " << "count CRs as end-of-stream symbols? (bool)" << endl;
	cout << setiosflags(ios::left) << setw(17) << "minweight" << setiosflags(ios::left) << " " << setw(10) << Parameter_PruneMinimumWeight << " " << "minimum threshold weight for final pruning" << endl;
	cout << setiosflags(ios::left) << setw(17) << "smartlookup" << setiosflags(ios::left) << " " << setw(10) << Parameter_UseSmartLookup << " " << "when true, use lower_bound stl dictionary search" << endl;
	cout << setiosflags(ios::left) << setw(17) << "parsemode" << setiosflags(ios::left) << " " << setw(10) << Parameter_ParseMode << " " << "parse heuristic (0=greedy 1=lff 2=deep&slow)" << endl;
	cout << setiosflags(ios::left) << setw(17) << "recalculations" << setiosflags(ios::left) << " " << setw(10) << Parameter_PruneReCalculations << " " << "recalculate symbol costs during pruning?" << endl;
	return returnstring;
}


bool SubstringParser::SetParameter(const std::string &parametername,const std::string &parametervalue)
{
	// generic function for user-interactive modification of parameters
	// (see SetDefaultDictionaryBuildingParameters implementation for descriptions)
	// return true if we know this variable and use it
	bool bretv=false;
	if (parametername=="maxbuildsymbols")
		bretv=ParseParameter(parametervalue,Parameter_MaxSymbols_DuringBuild);
	else if (parametername=="maxsymbols")
		bretv=ParseParameter(parametervalue,Parameter_MaxSymbols_Final);
	else if (parametername=="maxSubstring")
		bretv=ParseParameter(parametervalue,Parameter_MaxSubstringSize);
	else if (parametername=="onlywords")
		bretv=ParseParameter(parametervalue,Parameter_OnlyCodeWholeWords);
	else if (parametername=="spanwords")
		bretv=ParseParameter(parametervalue,Parameter_SpanWordBoundaries);
	else if (parametername=="crends")
		bretv=ParseParameter(parametervalue,Parameter_CountCRsAsEOTs);
	else if (parametername=="minweight")
		bretv=ParseParameter(parametervalue,Parameter_PruneMinimumWeight);
	else if (parametername=="smartlookup")
		bretv=ParseParameter(parametervalue,Parameter_UseSmartLookup);
	else if (parametername=="parsemode")
		bretv=ParseParameter(parametervalue,Parameter_ParseMode);
	else if (parametername=="recalculations")
		bretv=ParseParameter(parametervalue,Parameter_PruneReCalculations);
	return bretv;
}


void SubstringParser::SetDefaultParameters()
{
	// encoding parameters that govern how dictionary is built

	// the dictionary will never be allowed to get bigger than this size (smaller will also increase building speed)
	//  when this size is exceeded, we will eliminate all symbols with frequencies of smaller than some small #
	Parameter_MaxSymbols_DuringBuild=200000;

	// after the entire training file is parsed, we will repeat pruning steps to reduce it to a *maximum* of this many entries
	// the actual symbols kept may be smaller depending on other pruning parameters
	Parameter_MaxSymbols_Final=2000;

	// how many characters is largest (sub)string stored in dictionary
	Parameter_MaxSubstringSize=12;
	//	Parameter_MaxSubstringSize=3;
	//	Parameter_MaxSubstringSize=1;

	// only store characters and whole words?
	//	Parameter_OnlyCodeWholeWords=false;
	Parameter_OnlyCodeWholeWords=false;

	// store Substrings that span words (i.e. they contain multiple words)
	Parameter_SpanWordBoundaries=false;

	// minimum count for a word or Substring in order to even consider leaving it in the dictionary
	Parameter_PruneMinimumWeight=10;

	// when encoding single lines, each line must end with an end-of-stream (EOS) symbol
	//  but if we train on a file, which is large and has only one EOS at end of file
	//  then the freq. of EOS will be very low (1), so here we can say to count a pretend EOS
	//  whenever we see a CR.
	//Parameter_CountCRsAsEOTs=false;
	Parameter_CountCRsAsEOTs=true;

	// we can either use a smart search using stl find_lower_bound() to find longest Substring or we can search starting at biggest string backwards
	Parameter_UseSmartLookup=true;

	// we can either use a greedy search to find longest Substring starting with first char or we can use smarter but slower heuristics
	Parameter_ParseMode=Dictionary_ParseMode_GREEDY;

	// should we recompute symbol costs during pruning? (this actually employs the dictionary set and calculates actual usage counts and then reprunes)
	Parameter_PruneReCalculations=true;
}
//---------------------------------------------------------------------------



//---------------------------------------------------------------------------
// PARSER PUBLIC API - PREPARATIONS FOR PARSING
	
bool SubstringParser::RewindAnyBufferedInput(bitreader &from)
{
	// let go of any buffered stream - this is an odd function that can be called be compressor if it wants to hand
	//  off the input stream to a new parser or otherwise access the input bitstream from after last symbol parsed.
	//  it is necessary because some parsers can read-ahead in input buffer, and so must rewind the bitstream.
	// we have to implement this since we do a read-ahead.  note this wouldn't be called in normal circumstances,
	//  just if compressors wants us to stop prematurely and hand off the job to someone else.
	// return true on success

	// the number of characters we have read ahead in buffer is inputbufferlen
	from.seek_bit(from.tell_bit()-(inputbufferlen*8));
	return true;
}

void SubstringParser::SynchronizeStateForNewStream()
{
	// synchronize state for a new stream
	// this MUST be called before beginning a new parsing stream
	// reset input buffer and end-of-stream sent flag
	inputbufferlen=0;
	senteos=false;
}

bool SubstringParser::CreateSymbolSetUsingStream(bitreader &from)
{
	// scan a file and add its "tokens" into our symbol set
	string valuestring;
	string slidingwindowstring;
	unsigned char c;
	bool slidingwindowfull=false;
	string eotstring=string("");
	
	// clear any existing symbols
	FreeData_Symbols();
	// create primitive characters
	AddPrimitiveCharacterSubstringSymbols();
	
	// save the current bitreader and bitreader position for reparsing during pruning
	current_bitreaderp = &from;
	start_bitreaderposition = from.tell_bit();

	// now parse the file
	while (!from.empty())
		{
		// grab a character
		c=from.get_byte();

		// add the character to our window
		slidingwindowstring+=c;

		// trim lefthand side if it is too big
		if (!slidingwindowfull)
			{
			// we keep a sliding window which is slightly bigger than the maxSubstring, to account for delimiter boundaries
			if ((int)(slidingwindowstring.length())>Parameter_MaxSubstringSize+5)
				{
				// set the flag saying from now on, we are full
				slidingwindowfull=true;
				}
			}
		if (slidingwindowfull)
			{
			// trim leftmost character
			slidingwindowstring.erase(0,1);
			}

		// now add the appropriate Substrings of slidingwindowstring to our dictionary SymbolSet
		AddSubstringsFromSlidingWindowString(slidingwindowstring);

		// when encoding single lines, each line must end with an end-of-stream (EOS) symbol
		//  but if we train on a file, which is large and has only one EOS at end of file
		//  then the freq. of EOS will be very low (1), so here we can say to count a pretend EOS
		//  whenever we see a CR
		if (c==13 && Parameter_CountCRsAsEOTs)
			{
			// increment frequency for EOS
			UpdateValueStringInSymbolSet(eotstring,1);
			}

		if (symbolset.size()>Parameter_MaxSymbols_DuringBuild)
			{
			// periodically we may have to prune away lowest frequency symbols in order to keep dictionary manageable
				// how many should we remove - we don't want to remove just one since that would result in many repetitive calls here
			//  so we just pick some fraction to remove
			PruneSymbolSet_ReduceSizeTo(((unsigned int)(symbolset.size())/5)+1);
			}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -