📄 substringparser.cpp

📁 Octane v1.01.20 The Open Compression Toolkit for C++ . The Open Compression Toolkit is a set of mo
💻 CPP
📖 第 1 页 / 共 4 页
字号:
上一页 1 2 34
			searchSubstringSymbol.set_value(inputstring);
			}
		}
	else
		{
		// dumb method (67:00 seconds on a test, about 3x slower than smart method)
		for(;;)
			{
			symbolset_pos=symbolset.find(&searchSubstringSymbol);
			if (symbolset_pos!=symbolset.end())
				{
				// we found an exact match
				break;
				}
			// not found, so shorten string by removing rightmost character
			--inputqueuestrlen;
			if (inputqueuestrlen<=0)
				{
				cout << "ERROR: unexpectedly didn't find any match for a string that started out with "<<originputqueuestrlen<<" characters (we hit "<<inputqueuestrlen<<").  last char was "<<(int)inputqueuestr[0]<<"."<<endl;
				return NULL;
				}
			// decrease length of Substring currently being searched for and continue
			inputstring=inputstring.substr(0,inputqueuestrlen);
			searchSubstringSymbol.set_value(inputstring);
			}
		}

	// return the pointer
	return *symbolset_pos;
}


double SubstringParser::CalculateEncodeCost(char *inputqueuestr,int inputqueuestrlen)
{
	// try to find the optimal cost of encoding the string, using currently selected methods
	//  and return its cost, ASSUMING that we start by parsing longest matching leftmost substr
	SubstringSymbol* symbolnodep;
	double cost=0;

	// start by greedy parse of biggest leftmost substr
	symbolnodep = FindNextSymbolToEncode_Greedy(inputqueuestr,inputqueuestrlen);
	if (symbolnodep==NULL)
		{
		// no match found - this should never happen
		cout << "$no match found."<<endl;
		return 99999;
		}

	// add cost of initial Substring
	cost+=GetSymbolCost(symbolnodep);

	// advance pointer
	inputqueuestr+=symbolnodep->get_valuelen();
	inputqueuestrlen-=symbolnodep->get_valuelen();

	// now the remainder of the string could be done smart or greedy
	if (Parameter_ParseMode!=Dictionary_ParseMode_GREEDY && inputqueuestrlen>2)
		{
		// SMARTER HEURISTIC PARSE (slower than greedy, but *maybe* better)
		int bestindex=inputqueuestrlen;
		double bestscore=0;
		double encodecost;
		bool firstscore=true;
		for (int index=1;index<=inputqueuestrlen;++index)
			{
			// find cost if we encode based on split at index
			encodecost=CalculateEncodeCost(inputqueuestr,index);
   			if (index<inputqueuestrlen)
      			encodecost+=CalculateEncodeCost(&inputqueuestr[index],inputqueuestrlen-index);
			// update our tracking of best score
			if (firstscore || encodecost<bestscore)
				{
				bestscore=encodecost;
				bestindex=index;
				firstscore=false;
				}
			}
		// now set cost to best cost found
		cost+=bestscore;
		}
	else if (inputqueuestrlen>0)
		{
		// GREEDY PARSE cost
		cost+=CalculateEncodeCost(inputqueuestr,inputqueuestrlen);
		}
	else
		{
		// nothing left to score
		}

	// now return cost
	return cost;
}
//---------------------------------------------------------------------------




//---------------------------------------------------------------------------
// Internal Pruning functions

void SubstringParser::PruneSymbolSet_WeightThreshold(TSubStrParserWeight thresholdweight)
{
	// remove all weights below some threshold set by parameter
	// this is a fast and easy step that will remove almost the entire current symbol set :)
	TSymbolSetTYPE::iterator nextsymbolpos;
	TSubStrParserWeight weight;
	int index=0;

	// First simplest pruning operation - simply remove all symbols which don't have some threshold count
	for (symbolset_pos=symbolset.begin();symbolset_pos!=symbolset.end();)
		{
		++index;
		nextsymbolpos=symbolset_pos;
		++nextsymbolpos;
		// only show non-zero weights (i.e. symbols used at least once)
		if (!((*symbolset_pos)->dontprune()))
			{
			weight=(*symbolset_pos)->get_weight();
			if (weight<thresholdweight)
				{
				// delete this symbol
				delete (*symbolset_pos);
				symbolset.erase(symbolset_pos);
				}
			}
		// advance pointer
		symbolset_pos=nextsymbolpos;
		}
}


void SubstringParser::PruneSymbolSet_RemoveRedundantPrefixes()
{
	// some Substring symbols are simply prefixes of others, with no independent occurrences
	//  for example the Substring "supercomput" is a redundant Substring of "supercomputer"
	//  so we can remove those with no cost.
	TSymbolSetTYPE::iterator nextsymbolpos;
	TSymbolSetTYPE::iterator lastsymbolpos;
	TSubStrParserWeight weight,lastweight;
	string lastvalue;
	int lastvaluelen;

	lastweight=0;
	for (symbolset_pos=symbolset.begin();symbolset_pos!=symbolset.end();)
		{
		nextsymbolpos=symbolset_pos;
		++nextsymbolpos;
		// some symbols (like primitive characters, cannot be pruned)
		if ((*symbolset_pos)->dontprune())
			lastweight=0;
		else
			{
			weight=(*symbolset_pos)->get_weight();
			if (weight==lastweight && weight!=0)
				{
				// got a candidate for pruning, IFF our prior string is a proper prefix of us
				lastvaluelen=(int)(lastvalue.length());
				if (((*symbolset_pos)->get_value()).compare(0,lastvaluelen,lastvalue.c_str(),lastvaluelen)==0)
					{
					// yes it is, so kill our last value
					delete *lastsymbolpos;
					symbolset.erase(lastsymbolpos);
					}
				}
			// remember last symbol details
			lastweight=weight;
			lastvalue=(*symbolset_pos)->get_value();
			lastsymbolpos=symbolset_pos;
			}
		// advance pointer
		symbolset_pos=nextsymbolpos;
		}
}


void SubstringParser::PruneSymbolSet_ReduceSizeTo(unsigned int targetsize)
{
	// we have exceeded our maximum # of symbols allowed during build process, so reduce the size
	//  this is done quite inefficiently, but we can worry about that later, or not, since its not in a time-critical place
	// ATTN: a smarter way would be to sort by weight and pop off smallest weights.
	if (symbolset.size()<targetsize)
		return;

	for (int count=2;count<10000;++count)
		{
		// remove all removable symbols with a weight < count and then check set size again
		PruneSymbolSet_WeightThreshold((TSubStrParserWeight)count);
		if (symbolset.size()<=targetsize)
			break;
		}
}


void SubstringParser::PruneSymbolSet()
{
	// we need to reduce the size of the dictionary SymbolSet by pruning symbols we think are not worth the coding space
	//  having silly symbols increases our cost in two main ways:
	//   1) cost in memory needed to store the dictionary in memory and time needed to search dictionary
	//   2) cost incurred due to the fact that the more words in dictionary, the more bits needed to encode other symbols.
	//      this is a very serious issue and suggests the dictionary should be as small as possible.
	// the bitreader is passed so that we can reparse the input in order to test our created symbol set.

	// display info
	cout << endl << "Total symbols in dictionary before pruning: " << GetSetSymbolCount() << endl;

	// remove weights below a threshold
	PruneSymbolSet_WeightThreshold(Parameter_PruneMinimumWeight);
	cout << "After Stage 1 Pruning (removed scores below "<<Parameter_PruneMinimumWeight<<"), symbols: " << GetSetSymbolCount() << endl;

	// remove redundant prefixes
	PruneSymbolSet_RemoveRedundantPrefixes();
	cout << "After Stage 2 Pruning (removed redundant prefixes), symbols: " << GetSetSymbolCount() << endl;

	// pre-capping recompression
	if (Parameter_PruneReCalculations)
		{
		// now actually re-compress the file using current symbol set to get better real-world weight counts, and repeat prunings
		ReParseFileReComputeCosts(true);
		PruneSymbolSet_WeightThreshold(Parameter_PruneMinimumWeight);
		PruneSymbolSet_RemoveRedundantPrefixes();
		// some info for user
		cout << "After pre-ceil pre-compression, re-evaluation, and repruning, symbols: " << GetSetSymbolCount() << endl;
		}

	// finally, cap symbol count.  there are smart and dumb ways this can be done
	PruneSymbolSet_ReduceSizeTo(Parameter_MaxSymbols_Final);
	cout << "After Stage 3 Pruning (killed worst scores to reach target), symbols: " << GetSetSymbolCount() << endl;

	// more recompression loops after capping
	if (Parameter_PruneReCalculations)
		{
		// now actually re-compress the file using current symbol set to get better real-world weight counts, and repeat prunings
		ReParseFileReComputeCosts(true);
		PruneSymbolSet_WeightThreshold(Parameter_PruneMinimumWeight);
		PruneSymbolSet_RemoveRedundantPrefixes();
		cout << "After re-compression, re-evaluation, and repruning, symbols: " << GetSetSymbolCount() << endl;
		}
}
//---------------------------------------------------------------------------




//---------------------------------------------------------------------------
// For reparsing and recomputing scores

double SubstringParser::ReParseFileReComputeCosts(bool updateweights)
{
	// USING the current symbolset we have just created, reparse it and recalculate the 'cost'
	//  and frequency of each symbol.
	// This is useful because as we create and modify(prune) the symbolset, the symbols interact
	//  and effect frequencies, so its helpful to reparse the input and recalculate frequencies.
	// return sum cost of parsed symbols IFF updateweights == false

	// First we need to rebuild the symbol vector before we can parse
	BuildSymbolVector();

	int symbolnum;
	SubstringSymbol *symbolp;
	double totalcost=0;

	// reset input bitreader to where we started dictionarizing
	current_bitreaderp->seek_bit(start_bitreaderposition);

	// reset weights if we are recalculating them
	if (updateweights)
		ResetWeights();

	// loop through input grabbing symbols
	// ATTN: are we properly updating weights?
	while (ParseNextSymbolFromInput(*current_bitreaderp,symbolnum))
		{
		symbolp=symbolvector[symbolnum];
		if (updateweights)
			{
			// we can be run in a mode to actually update weights while encoding
			if (updateweights)
				{
				// note this will not change priority queue, and is therefore somewhat skidding on what is 'allowed' by stl
				//  but we use this purposefully so that we are modifying the weights while preserving current priority queue.
				symbolp->increment_weight(1);
				// we need to also keep track of CRs for EOS's if appropriate
				if (Parameter_CountCRsAsEOTs && strchr(symbolp->get_valuep()->c_str(),13)!=NULL)
					{
					symbolp=FindNextSymbolToEncode_Greedy("",0);
					if (symbolp!=NULL)
						symbolp->increment_weight(1);
					}
				}
			}
		else
			totalcost+=symbolp->get_cost();
		}

	// return success
	return totalcost;
}
//---------------------------------------------------------------------------


//---------------------------------------------------------------------------
// Internal symbol construction

void SubstringParser::BuildSymbolVector()
{
	// build a vector of symbol pointers, for efficient random access
	// clear any existing vector
	symbolvector.clear();
	// add symbols from set
	int index=0;
	for (symbolset_pos=symbolset.begin();symbolset_pos!=symbolset.end();++symbolset_pos)
		{
		// add symbol to vector
		symbolvector.push_back(*symbolset_pos);
		// tell symbol what number it is
		(*symbolset_pos)->set_symbolvectorpos(index++);
		}
	// now we need to update our count of how many symbols we have
	symbolcount=(int)(symbolvector.size());
	// and record the symbol representing the end-of-stream (EOS is "")
	endofstreamsymbolnum=LookupSymbolNumberFromText(string(""));
}


int SubstringParser::LookupSymbolNumberFromText(string wordstring)
{
	// return the number of a symbol in the parsing symbolset (or -1 if not found)
	static SubstringSymbol searchSubstringSymbol("",1);
	// now we use this searchSubstringSymbol for searching
	searchSubstringSymbol.set_value(wordstring);
	symbolset_pos=symbolset.find(&searchSubstringSymbol);
	// if its not found, return -1
	if (symbolset_pos==symbolset.end())
		return -1;
	// otherwise return its index symbol number
	return ((*symbolset_pos)->get_symbolvectorpos());
}
//---------------------------------------------------------------------------
上一页 1 2 34
💿 文件大小 170 K
👤 上传用户 boobyb
📂 所属分类压缩解压
🏷️ 相关标签

#Compression #Toolkit #Open #The
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -