📄 substringparser.cpp

📁 Octane v1.01.20 The Open Compression Toolkit for C++ . The Open Compression Toolkit is a set of mo
💻 CPP
📖 第 1 页 / 共 4 页
字号:
	valuestring=slidingwindowstring.substr(len-1,1);
	bretv=UpdateValueStringInSymbolSet(valuestring,1);

	// if the string length is only one character, then we are done.
	if (len==1)
		return;

	// reset flag which helps us prevent from adding a keyword which ends in a delimiter
	delimiteronborder=false;

	if (Parameter_OnlyCodeWholeWords)
		{
		// we only want to store full words, so if we have not hit a new word boundary, then return
		c=slidingwindowstring[len-1];
		c2=slidingwindowstring[len-2];
		if (len<3)
			{
			// too small to be on a word boundary
			return;
			}
		if (!IsNonWordCharacter(c) && !IsNonWordCharacter(c2))
			{
			// we are within a word
			return;
			}
		if (IsNonWordCharacter(c) && IsNonWordCharacter(c2))
			{
			// we are within a word
			return;
			}
		// start end pointer at last non-delimiter
		rightpos=len-2;
		}
	else
		{
		// start end position at last character
		rightpos=len-1;
		}

	// set flag to tell us whether the new character is a delimiter and shouldn't be coded on the end of a real word
	if (IsNonWordCharacter(slidingwindowstring[rightpos]))
		{
		// other than adding delimiters as their own characters (done above), we don't want to code them as part of a word
		// so we set a flag here saying that we are only going to move leftward to catch a sequence of delimiters as a Substring, and NOT
		// allows a Substring which has delimiters on the *borders* of words
		delimiteronborder=true;
		}
	else
		delimiteronborder=false;

	// set left pos
	leftpos=rightpos-Parameter_MaxSubstringSize;
	if (leftpos<0)
		leftpos=0;

	// start walking backwards from right to left and add Substrings to dictionary
	//  always encoding from curpos to end (rightpos)
	for (curpos=rightpos-1;curpos>leftpos;--curpos)
		{
		c=slidingwindowstring[curpos];
		if (delimiteronborder && IsNonWordCharacter(c))
			{
			if (Parameter_OnlyCodeWholeWords && curpos>leftpos)
				{
				// since we only code words, we don't want to code this Substring of delimiters yet
				continue;
				}
			// because we are parsing a string of delimiters, we want to just drop through to write out this Substring
			}
		else if (IsNonWordCharacter(c) || delimiteronborder)
			{
			if (IsNonWordCharacter(c) && IsNonWordCharacter(slidingwindowstring[curpos+1]))
				{
				// we have hit a series of delimiters to the left of a word, so we've already encoded the non-delimiter-bordered stuff to our right
				// so we don't want to encode it again.
				continue;
				}
			if (Parameter_OnlyCodeWholeWords || delimiteronborder)
				{
				// we found a new word (or sequence of words), so encode it
				valuestring=slidingwindowstring.substr(curpos+1,rightpos-curpos);
				//cout << "type 1 word: "<<valuestring<< endl;
				bretv=UpdateValueStringInSymbolSet(valuestring,1);
				if (delimiteronborder)
					{
					// we just finished writing out a string of delimiters bordered on left by non-delimiter, so we stop now
					break;
					}
				else if (!Parameter_SpanWordBoundaries)
					{
					// since we don't span word boundaries, we are done
					break;
					}
				else
					{
					// let it continue to look for next whole word
					continue;
					}
				}
			else if (!Parameter_SpanWordBoundaries)
				{
				// we hit a word boundary, so we are done
				break;
				}
			else if (IsNonWordCharacter(c) && !delimiteronborder)
				{
				// we don't want to let any word Substring start with a delimiter (or end with one, which is checked above)
				// actually i think by the time we drop down to here this is always true
				continue;
				}
			}
		else if (delimiteronborder)
			{
			// our rightmost character was a delimiter so we wont consider any non-delimiters words to the left of it
			break;
			}
		else if (Parameter_OnlyCodeWholeWords)
			{
			// we are in the middle of a word, so we do nothing
			continue;
			}

		// update the Substring
		valuestring=slidingwindowstring.substr(curpos,(rightpos-curpos)+1);
		bretv=UpdateValueStringInSymbolSet(valuestring,1);
		}
}
//---------------------------------------------------------------------------




//---------------------------------------------------------------------------
// Internal Parsing

int SubstringParser::SwallowSymbolFromInputQueueStr(SubstringSymbol *symbolp,char *inputqueuestr,int inputqueuestrlen,bool debugmode)
{
	// find which symbol to encode
	if (symbolp==NULL)
		{
		// no match found
		return -1;
		}

	// debugmode shows the sub-string used for encoding
	if (debugmode)
		cout << symbolp->get_value() << "*";

	// shift inputqueuestrlen
	int writepos=0;
	for (int pos=(int)((symbolp->get_valuep())->length());pos<inputqueuestrlen;++pos)
		{
		inputqueuestr[writepos]=inputqueuestr[pos];
		++writepos;
		}
	inputqueuestrlen=writepos;

	// return length of shifted inputqueuestr
	return inputqueuestrlen;
}


SubstringSymbol* SubstringParser::FindNextSymbolToEncode(char *inputqueuestr,int inputqueuestrlen)
{
	// ok now we have a block of up to Parameter_MaxSubstringSize bytes from the left of the inputstr
	// now find the longest leftmost (prefix) Substring in our dictionary and encode it, and shift inputqueuestr to the left with remaining bytes.
	// return the new inputqueuestr with remaining bytes, and return length of remaining bytes.
	SubstringSymbol *symbolnodep = 0;

	if (Parameter_ParseMode==Dictionary_ParseMode_GREEDY || inputqueuestrlen<=2)
		{
		// GREEDY PARSE
		// find biggest matching prefix in symbol dictionary - this should never fail unless the dictionary is not built at all,
		//  since the dictionary is supposed to contain every character, so at least the leftmost character should match
		symbolnodep = FindNextSymbolToEncode_Greedy(inputqueuestr,inputqueuestrlen);
		}
	else if (Parameter_ParseMode==Dictionary_ParseMode_LFF)
		{
		// SMARTER HEURISTIC PARSE (slower than greedy, but *maybe* better)
		int bestindex=inputqueuestrlen;
		int bestscore=-1;
		for (int index=0;index<=inputqueuestrlen-1;++index)
			{
			// find length of the string beginning at index
			// early stop if we know we cant do better
			if (bestscore>=inputqueuestrlen-index)
				break;
			symbolnodep = FindNextSymbolToEncode_Greedy(&inputqueuestr[index],inputqueuestrlen-index);
			if (symbolnodep==NULL)
				break;
			if (bestscore==-1 || symbolnodep->get_valuelen()>bestscore)
				{
				bestscore=symbolnodep->get_valuelen();
				bestindex=index;
				}
			}
		// now that we found the longest Substring, parse the string BEFORE it
		if (bestindex==0)
			bestindex=inputqueuestrlen;
		symbolnodep = FindNextSymbolToEncode_Greedy(inputqueuestr,bestindex);
		}
	else if (Parameter_ParseMode==Dictionary_ParseMode_DEEP)
		{
		// SMARTER HEURISTIC PARSE (slower than greedy, but *maybe* better)
		double encodecost;
		double bestscore;
		int longestlen=inputqueuestrlen;
		int bestindex=longestlen;
		bool firstscore=true;
		do
			{
			bestscore=-1;
			longestlen=bestindex;
			for (int index=1;index<=longestlen;++index)
				{
				// find cost if we encode based on split at index
				encodecost=CalculateEncodeCost(inputqueuestr,index);
   				if (index<longestlen)
	      			encodecost+=CalculateEncodeCost(&inputqueuestr[index],longestlen-index);
				// update our tracking of best score
				if (firstscore || encodecost<=bestscore)
					{
					bestscore=encodecost;
					bestindex=index;
					firstscore=false;
					}
				}
			} while (bestindex!=longestlen);
		// and do the search with this limit (note we could have asked CalculateEncodeCost to return best found first match
		symbolnodep = FindNextSymbolToEncode_Greedy(inputqueuestr,bestindex);
		}
	else
		{
		// unknown parse mode
		cout << "ERROR: unknown parse mode "<<Parameter_ParseMode<<"."<<endl;
		}

	return symbolnodep;
	}


SubstringSymbol* SubstringParser::FindNextSymbolToEncode_Greedy(char *inputqueuestr,int inputqueuestrlen)
{
	// find biggest matching prefix in symbol dictionary - this should never fail unless the dictionary is not built at all,
	//  since the dictionary is supposed to contain every character, so at least the leftmost character should match
	// return a pointer to the symbol, or NULL if there is some error and we cant find one.

	// there are several ways to do this dictionary search within the SymbolSet, which is a binary tree
	// one way is to start with full string and gradually shorten it until we get an exact match
	// another is to ask the stl set to find the nearest match, and search back from there for shortest exact match
	int originputqueuestrlen=inputqueuestrlen;
	string inputstring;

	// use a static local variable to reduce memory allocation-deallocation thrashing
	static SubstringSymbol searchSubstringSymbol("",1);

	// convert char * to string, using exactly inputqueuestrlen, regardless of any '\0'
	if (inputqueuestrlen==0)
		inputstring="";
	else
		inputstring=string(inputqueuestr,inputqueuestrlen);

	// truncate it to max length of any symbol, incase caller sometimes passes more
	if (inputqueuestrlen>Parameter_MaxSubstringSize)
		{
		inputqueuestrlen=Parameter_MaxSubstringSize;
		inputstring=inputstring.substr(0,inputqueuestrlen);
		}

	// now we use this searchSubstringSymbol for searching
	searchSubstringSymbol.set_value(inputstring);

	// find the longest Substring, using a smarter use of stl lower_bound instead of find()
	if (Parameter_UseSmartLookup)
		{
		// smarter method (22:00 seconds on a test, about 3x faster than slow method below
		//  ATTN: can we simplify this?
		int len;
		int index;
		string teststring;
		for (;;)
			{
			symbolset_pos=symbolset.lower_bound(&searchSubstringSymbol);
			// check for error
			if (symbolset_pos==symbolset.end())
				{
				// nothing smaller so the LAST symbol, single character (we verify to make sure there isnt an error)
				--symbolset_pos;
				// shorten main string to match
				inputqueuestrlen=1;
				inputstring=inputstring.substr(0,inputqueuestrlen);
				teststring=(*symbolset_pos)->get_value();
				if (inputstring[0]!=teststring[0])
					{
					cout << "ERROR: Unexpectedly didn't find any match for a string that started out "<<originputqueuestrlen<<" characters (we hit "<<inputqueuestrlen<<").  last char was "<<(int)((unsigned char)(inputqueuestr[0]))<<"."<<endl;
					return NULL;
					}
				// ok the best match is the last symbol
				searchSubstringSymbol.set_value(inputstring);
				break;
				}

			//teststring=(*symbolset_pos)->get_value();
			//len=(int)(teststring.length());
			//cout << "lowerbound for '"<<inputstring<<"' = '" << teststring <<"' (len="<<len<<")."<<endl;;

			// if we have an exact match we are done
			teststring=(*symbolset_pos)->get_value();
			len=(int)(teststring.length());
			if (teststring==inputstring)
				break;

			// try one previous
			--symbolset_pos;
			teststring=(*symbolset_pos)->get_value();
			len=(int)(teststring.length());
			if (len<inputqueuestrlen)
				{
				// shorten main string to match
				inputqueuestrlen=len;
				inputstring=inputstring.substr(0,inputqueuestrlen);
				}
			// if we have an exact match we are done
			if (teststring==inputstring)
				break;

			// if not, find the last character they agree on and find next bound
			len=(int)(teststring.length());
			for (index=0;index<len;++index)
				{
				if (teststring[index]!=inputstring[index])
					break;
				}

			// now shorten it and try again
		    inputqueuestrlen=index;
		    if (inputqueuestrlen==0)
				{
				cout << "ERROR: unexpectedly truncated search string that started out "<<originputqueuestrlen<<" characters (we hit "<<inputqueuestrlen<<").  last char was "<<(int)inputqueuestr[0]<<"."<<endl;
		    	inputstring="";
		    	}
			else
				inputstring=inputstring.substr(0,inputqueuestrlen);
💿 文件大小 170 K
👤 上传用户 boobyb
📂 所属分类压缩解压
🏷️ 相关标签

#Compression #Toolkit #Open #The
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -