⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 substringparser.cpp

📁 Octane v1.01.20 The Open Compression Toolkit for C++ . The Open Compression Toolkit is a set of mo
💻 CPP
📖 第 1 页 / 共 4 页
字号:
		}

	// now final pruning
	PruneSymbolSet();

	// and now build the vector for fast random access
	BuildSymbolVector();
	
	// return success
	return true;
}
//---------------------------------------------------------------------------



//---------------------------------------------------------------------------
// Parser API

bool SubstringParser::ParseNextSymbolFromInput(bitreader &from, int &symbolnum)
{
	// Take the next symbol off the input stream and hand back its INDEX (in the symbol vector) through symbolnum.
	// Returns true when a symbol was produced (the final one being the EOS symbol),
	//  and false on every call made after the EOS symbol has been handed out.
	// Note that inputbufferstr[] and inputbufferlen are member state and survive from one call to the next.
	const int bufferlimit=Parameter_MaxSubstringSize;

	// remember which reader we are currently pulling bytes from
	current_bitreaderp=&from;

	// top the input buffer up to Parameter_MaxSubstringSize bytes, or to as many as the reader still has
	for (;inputbufferlen<bufferlimit && !from.empty();++inputbufferlen)
		{
		const unsigned char nextbyte=from.get_byte();
		inputbufferstr[inputbufferlen]=nextbyte;
		}

	// an empty buffer means the input is exhausted: the first time we fall through so that the EOS symbol
	//  gets returned, and on any later call we report that there are no more symbols
	if (inputbufferlen==0)
		{
		if (senteos)
			return false;
		senteos=true;
		}

	// The buffer now holds up to Parameter_MaxSubstringSize bytes from the front of the input.
	// Pick the symbol to encode for the front of the buffer
	//  (an older lookup routine was FindNextSubstringSymbolpFromInputQueueStr, called with the same arguments).
	SubstringSymbol *chosensymbolp=FindNextSymbolToEncode(inputbufferstr,inputbufferlen);

	// report its index, then swallow it from the buffer; the call returns the number of bytes left in the buffer
	symbolnum=chosensymbolp->get_symbolvectorpos();
	inputbufferlen=SwallowSymbolFromInputQueueStr(chosensymbolp,inputbufferstr,inputbufferlen,false);
	return true;
}


bool SubstringParser::WriteSymbolText(bitwriter &to, int symbolnum,bool &isendofstreamsymbol)
{
	// Output the text of the symbol found at index symbolnum of the symbol vector.
	// isendofstreamsymbol receives true when symbolnum is the EOS symbol number, and false otherwise.
	// Always returns true; note that symbolnum is used as an index without any range check.

	// look the symbol up and copy its text to the output
	const std::string &symboltext=*(symbolvector[symbolnum]->get_valuep());
	int textlength=static_cast<int>(symboltext.length());
	to.write(symboltext.c_str(),textlength);

	// tell the caller whether that was the EOS symbol
	isendofstreamsymbol=(endofstreamsymbolnum==symbolnum);
	return true;
}
//---------------------------------------------------------------------------














//---------------------------------------------------------------------------
// Internal functions for freeing data structure

void SubstringParser::FreeData()
{
	// Free the parser's symbol data.
	// This simply delegates to FreeData_Symbols(), which deletes the SubstringSymbol nodes
	//  and empties the symbol set and the symbol vector.
	FreeData_Symbols();
}

void SubstringParser::FreeData_Symbols()
{
	// Delete every SubstringSymbol node, then empty both the symbol set and the symbol vector.

	// The nodes are deleted by walking the set only, so a node that is referenced
	//  from both the set and the vector still gets freed exactly once.
	symbolset_pos=symbolset.begin();
	while (symbolset_pos!=symbolset.end())
		{
		delete (*symbolset_pos);
		++symbolset_pos;
		}

	// every pointer left in the set is now dangling, so drop them all
	symbolset.clear();

	// rebuilding the vector at this point clears it as well
	BuildSymbolVector();
}
//---------------------------------------------------------------------------


//---------------------------------------------------------------------------
// Internal Helper functions for saving and loading state

bool SubstringParser::SaveParameters(bitwriter &to)
{
	// Write all the parser parameter settings to 'to'.
	// The parameters are written in exactly the order in which LoadParameters() reads them back,
	//  so the two lists must be kept in step whenever a parameter is added or removed.
	// Always returns true - no result of to.put() is examined here.
	to.put(Parameter_MaxSymbols_DuringBuild);
	to.put(Parameter_MaxSymbols_Final);
	to.put(Parameter_MaxSubstringSize);
	to.put(Parameter_OnlyCodeWholeWords);
	to.put(Parameter_SpanWordBoundaries);
	to.put(Parameter_PruneMinimumWeight);
	to.put(Parameter_CountCRsAsEOTs);
	to.put(Parameter_UseSmartLookup);
	to.put(Parameter_ParseMode);
	to.put(Parameter_PruneReCalculations);
	return true;
}


bool SubstringParser::LoadParameters(bitreader &from)
{
	// Read all the parser parameter settings from 'from', overwriting the current values.
	// The parameters are read in exactly the order in which SaveParameters() writes them,
	//  so the two lists must be kept in step whenever a parameter is added or removed.
	// Always returns true - no result of from.get() is examined here.
	from.get(Parameter_MaxSymbols_DuringBuild);
	from.get(Parameter_MaxSymbols_Final);
	from.get(Parameter_MaxSubstringSize);
	from.get(Parameter_OnlyCodeWholeWords);
	from.get(Parameter_SpanWordBoundaries);
	from.get(Parameter_PruneMinimumWeight);
	from.get(Parameter_CountCRsAsEOTs);
	from.get(Parameter_UseSmartLookup);
	from.get(Parameter_ParseMode);
	from.get(Parameter_PruneReCalculations);
	return true;
}


bool SubstringParser::SaveSymbols(bitwriter &to)
{
	// Write the whole symbolset to 'to' in compact binary form, followed by an end-of-table marker.
	// Always returns true.

	// A length byte of 255 can never belong to a real entry (no strings longer than 254 characters are allowed),
	//  so it serves as the end-of-table marker; having one lets this table be used as the prefix of another file.
	unsigned char endoftablemarker=255;

	// walk the set with the member iterator symbolset_pos, so that it always refers to the symbol being written
	symbolset_pos=symbolset.begin();
	while (symbolset_pos!=symbolset.end())
		{
		WriteSubstringSymbol((*symbolset_pos),to);
		++symbolset_pos;
		}

	// close the table
	to.put(endoftablemarker);
	return true;
}


bool SubstringParser::LoadSymbols(bitreader &from)
{
	// Load a previously saved dictionary from 'from' and rebuild the symbol vector.
	// Always returns true.

	// keep reading entries until the input runs out or ReadSubstringSymbol reports the end of the table
	while (!from.empty() && ReadSubstringSymbol(from))
		{
		// nothing to do here - ReadSubstringSymbol has already added the symbol
		}

	// the set is complete, so build the vector used for fast random access
	BuildSymbolVector();

	return true;
}
//---------------------------------------------------------------------------


//---------------------------------------------------------------------------
// Internal Parsing
bool SubstringParser::WriteSubstringSymbol(SubstringSymbol *SubstringSymbolp,bitwriter &to)
{
	// SubstringSymbol reader/writer - derived classes will take these over if necessary
	// Writes the symbol pointed to by SubstringSymbolp as: one length byte, the string bytes, then the weight as an unsigned int.
	// Returns true on success.
	// Returns false, writing nothing, when SubstringSymbolp is null or when the string is longer than 254 characters:
	//  the length has to fit in one byte and the value 255 is reserved as the end-of-table marker, so writing
	//  such an entry would corrupt the table for the reader.
	TSubStrParserWeight weightvalue;
	unsigned int weightvalue_uint;
	unsigned char valuelength;

	// work on the symbol we were handed, not on whatever the member iterator symbolset_pos refers to
	if (!SubstringSymbolp)
		return false;
	if ((SubstringSymbolp->get_valuep())->length()>=255)
		return false;

	// write length of the string, as an unsigned char (known to be at most 254 here)
	valuelength=static_cast<unsigned char>((SubstringSymbolp->get_valuep())->length());
	to.put(valuelength);
	// write string
	to.write((SubstringSymbolp->get_valuep())->c_str(),valuelength);
	// get weight of symbol
	weightvalue=SubstringSymbolp->get_weight();
	// the weight is stored as an unsigned int; this is a plain static_cast, so no run-time check is made
	weightvalue_uint=static_cast<unsigned int>(weightvalue);
	to.put(weightvalue_uint);

	// return success
	return true;
}

bool SubstringParser::ReadSubstringSymbol(bitreader &from)
{
	// virtual SubstringSymbol reader/writer - derived classes will take these over if necessary
	// Reads one entry (length byte, string bytes, weight as an unsigned int) and adds it as a symbol.
	// Returns true after adding a symbol, and false when the length byte is the end-of-table value 255.
	unsigned char entrylength;

	from.get(entrylength);
	if (entrylength<255)
		{
		char entrytext[256];
		unsigned int entryweight;

		// the string bytes come first; they are not null terminated, so build the string with an explicit length
		from.read(entrytext,entrylength);
		std::string entrystring(entrytext,entrylength);
		// then the weight
		from.get(entryweight);
		// and now the entry can be added as a symbol
		AddSymbol(entrystring,static_cast<TSubStrParserWeight>(entryweight));
		return true;
		}

	// length byte of 255 - we hit the end of the table
	return false;
}
//---------------------------------------------------------------------------



//---------------------------------------------------------------------------
// Internal symbol construction

void SubstringParser::AddPrimitiveCharacterSubstringSymbols()
	{
	// Push every possible character as SubstringSymbol into the set
	string valuestring;
	char valuestr[20];

	for ( int i = 0 ; i < 256 ; i++ )
		{
		// form the string value of this (just the dictionary character or word)
		valuestr[0]=i;valuestr[1]='\0';
		valuestring=string(valuestr,1);
		// create a new SubstringSymbol node with this character, with a count of 1 (if you try to add a weight of 0 you will mess up the tree since sums must be strictly increasing)
		UpdateValueStringInSymbolSet(valuestring,1);
		}

	// we are going to use the empty string "" as our EOS (or End of Encoding) symbol, and record its number
	valuestring="";
	UpdateValueStringInSymbolSet(valuestring,1);
	}


bool SubstringParser::UpdateValueStringInSymbolSet(const string &stringvalue,int increment)
{
	// add a symbol as a SubstringSymbol with 0 count
	// return false on a FAILURE to add the symbol (ran out of memory)
	bool bretv=true;

	// use a static local variable to reduce memory allocation-deallocation thrashing, and use it for searching
	static SubstringSymbol searchSubstringSymbol("",1);
	// now we use this searchSubstringSymbol for searching
	searchSubstringSymbol.set_value(stringvalue);

	// look it up
	symbolset_pos=symbolset.find(&searchSubstringSymbol);
	// if its not found, create it
	if (symbolset_pos==symbolset.end())
		{
		// create the SubstringSymbol and add it, with weight = increment
		AddSymbol(stringvalue,increment);
		}
	else
		{
		// we found it, so increment its count
		(*symbolset_pos)->increment_weight(increment);
		}

	// return false on failure
	return bretv;
}

void SubstringParser::ResetWeights()
{
	// Set the weight of every symbol in the set back to 0.
	symbolset_pos=symbolset.begin();
	while (symbolset_pos!=symbolset.end())
		{
		(*symbolset_pos)->set_weight(0);
		++symbolset_pos;
		}
}
//---------------------------------------------------------------------------



//---------------------------------------------------------------------------
// Internal symbol construction

void SubstringParser::AddSubstringsFromSlidingWindowString(string &slidingwindowstring)
{
	// According to our parameters, increment frequency counts (adding symbols when needed) of some Substrings from our sliding window
	// The way we do this is, our primary character is the rightmost one; thats the one that will be different on each call.
	// If we want to do standard huffman on characters, we would simply update the frequency of the rightmost character.
	// Alternatively, we have various other parameters which can govern which Substrings we track.
	// Some of many settings can generate HUGE trees and sets, but these trees will eventually be pruned of least-useful entries.
	// Note this procedure is the most convoluted one in the file, because it deals with various parameters
	//  which regulate what kinds of Substrings from the input stream get tokenized.

	string valuestring;
	bool bretv;
	int len=(int)(slidingwindowstring.length());
	unsigned char c,c2;
	int leftpos,rightpos,curpos;
	bool delimiteronborder;

	// first add the rightmost character, by incrementing its usage count by 1

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -