📄 substringparser.cpp
字号:
valuestring=slidingwindowstring.substr(len-1,1);
bretv=UpdateValueStringInSymbolSet(valuestring,1);
// if the string length is only one character, then we are done.
if (len==1)
return;
// reset flag which helps us prevent from adding a keyword which ends in a delimiter
delimiteronborder=false;
if (Parameter_OnlyCodeWholeWords)
{
// we only want to store full words, so if we have not hit a new word boundary, then return
c=slidingwindowstring[len-1];
c2=slidingwindowstring[len-2];
if (len<3)
{
// too small to be on a word boundary
return;
}
if (!IsNonWordCharacter(c) && !IsNonWordCharacter(c2))
{
// we are within a word
return;
}
if (IsNonWordCharacter(c) && IsNonWordCharacter(c2))
{
// we are within a word
return;
}
// start end pointer at last non-delimiter
rightpos=len-2;
}
else
{
// start end position at last character
rightpos=len-1;
}
// set flag to tell us whether the new character is a delimiter and shouldn't be coded on the end of a real word
if (IsNonWordCharacter(slidingwindowstring[rightpos]))
{
// other than adding delimiters as their own characters (done above), we don't want to code them as part of a word
// so we set a flag here saying that we are only going to move leftward to catch a sequence of delimiters as a Substring, and NOT
// allows a Substring which has delimiters on the *borders* of words
delimiteronborder=true;
}
else
delimiteronborder=false;
// set left pos
leftpos=rightpos-Parameter_MaxSubstringSize;
if (leftpos<0)
leftpos=0;
// start walking backwards from right to left and add Substrings to dictionary
// always encoding from curpos to end (rightpos)
for (curpos=rightpos-1;curpos>leftpos;--curpos)
{
c=slidingwindowstring[curpos];
if (delimiteronborder && IsNonWordCharacter(c))
{
if (Parameter_OnlyCodeWholeWords && curpos>leftpos)
{
// since we only code words, we don't want to code this Substring of delimiters yet
continue;
}
// because we are parsing a string of delimiters, we want to just drop through to write out this Substring
}
else if (IsNonWordCharacter(c) || delimiteronborder)
{
if (IsNonWordCharacter(c) && IsNonWordCharacter(slidingwindowstring[curpos+1]))
{
// we have hit a series of delimiters to the left of a word, so we've already encoded the non-delimiter-bordered stuff to our right
// so we don't want to encode it again.
continue;
}
if (Parameter_OnlyCodeWholeWords || delimiteronborder)
{
// we found a new word (or sequence of words), so encode it
valuestring=slidingwindowstring.substr(curpos+1,rightpos-curpos);
//cout << "type 1 word: "<<valuestring<< endl;
bretv=UpdateValueStringInSymbolSet(valuestring,1);
if (delimiteronborder)
{
// we just finished writing out a string of delimiters bordered on left by non-delimiter, so we stop now
break;
}
else if (!Parameter_SpanWordBoundaries)
{
// since we don't span word boundaries, we are done
break;
}
else
{
// let it continue to look for next whole word
continue;
}
}
else if (!Parameter_SpanWordBoundaries)
{
// we hit a word boundary, so we are done
break;
}
else if (IsNonWordCharacter(c) && !delimiteronborder)
{
// we don't want to let any word Substring start with a delimiter (or end with one, which is checked above)
// actually i think by the time we drop down to here this is always true
continue;
}
}
else if (delimiteronborder)
{
// our rightmost character was a delimiter so we wont consider any non-delimiters words to the left of it
break;
}
else if (Parameter_OnlyCodeWholeWords)
{
// we are in the middle of a word, so we do nothing
continue;
}
// update the Substring
valuestring=slidingwindowstring.substr(curpos,(rightpos-curpos)+1);
bretv=UpdateValueStringInSymbolSet(valuestring,1);
}
}
//---------------------------------------------------------------------------
//---------------------------------------------------------------------------
// Internal Parsing
int SubstringParser::SwallowSymbolFromInputQueueStr(SubstringSymbol *symbolp,char *inputqueuestr,int inputqueuestrlen,bool debugmode)
{
// find which symbol to encode
if (symbolp==NULL)
{
// no match found
return -1;
}
// debugmode shows the sub-string used for encoding
if (debugmode)
cout << symbolp->get_value() << "*";
// shift inputqueuestrlen
int writepos=0;
for (int pos=(int)((symbolp->get_valuep())->length());pos<inputqueuestrlen;++pos)
{
inputqueuestr[writepos]=inputqueuestr[pos];
++writepos;
}
inputqueuestrlen=writepos;
// return length of shifted inputqueuestr
return inputqueuestrlen;
}
SubstringSymbol* SubstringParser::FindNextSymbolToEncode(char *inputqueuestr,int inputqueuestrlen)
{
// ok now we have a block of up to Parameter_MaxSubstringSize bytes from the left of the inputstr
// now find the longest leftmost (prefix) Substring in our dictionary and encode it, and shift inputqueuestr to the left with remaining bytes.
// return the new inputqueuestr with remaining bytes, and return length of remaining bytes.
SubstringSymbol *symbolnodep = 0;
if (Parameter_ParseMode==Dictionary_ParseMode_GREEDY || inputqueuestrlen<=2)
{
// GREEDY PARSE
// find biggest matching prefix in symbol dictionary - this should never fail unless the dictionary is not built at all,
// since the dictionary is supposed to contain every character, so at least the leftmost character should match
symbolnodep = FindNextSymbolToEncode_Greedy(inputqueuestr,inputqueuestrlen);
}
else if (Parameter_ParseMode==Dictionary_ParseMode_LFF)
{
// SMARTER HEURISTIC PARSE (slower than greedy, but *maybe* better)
int bestindex=inputqueuestrlen;
int bestscore=-1;
for (int index=0;index<=inputqueuestrlen-1;++index)
{
// find length of the string beginning at index
// early stop if we know we cant do better
if (bestscore>=inputqueuestrlen-index)
break;
symbolnodep = FindNextSymbolToEncode_Greedy(&inputqueuestr[index],inputqueuestrlen-index);
if (symbolnodep==NULL)
break;
if (bestscore==-1 || symbolnodep->get_valuelen()>bestscore)
{
bestscore=symbolnodep->get_valuelen();
bestindex=index;
}
}
// now that we found the longest Substring, parse the string BEFORE it
if (bestindex==0)
bestindex=inputqueuestrlen;
symbolnodep = FindNextSymbolToEncode_Greedy(inputqueuestr,bestindex);
}
else if (Parameter_ParseMode==Dictionary_ParseMode_DEEP)
{
// SMARTER HEURISTIC PARSE (slower than greedy, but *maybe* better)
double encodecost;
double bestscore;
int longestlen=inputqueuestrlen;
int bestindex=longestlen;
bool firstscore=true;
do
{
bestscore=-1;
longestlen=bestindex;
for (int index=1;index<=longestlen;++index)
{
// find cost if we encode based on split at index
encodecost=CalculateEncodeCost(inputqueuestr,index);
if (index<longestlen)
encodecost+=CalculateEncodeCost(&inputqueuestr[index],longestlen-index);
// update our tracking of best score
if (firstscore || encodecost<=bestscore)
{
bestscore=encodecost;
bestindex=index;
firstscore=false;
}
}
} while (bestindex!=longestlen);
// and do the search with this limit (note we could have asked CalculateEncodeCost to return best found first match
symbolnodep = FindNextSymbolToEncode_Greedy(inputqueuestr,bestindex);
}
else
{
// unknown parse mode
cout << "ERROR: unknown parse mode "<<Parameter_ParseMode<<"."<<endl;
}
return symbolnodep;
}
SubstringSymbol* SubstringParser::FindNextSymbolToEncode_Greedy(char *inputqueuestr,int inputqueuestrlen)
{
// find biggest matching prefix in symbol dictionary - this should never fail unless the dictionary is not built at all,
// since the dictionary is supposed to contain every character, so at least the leftmost character should match
// return a pointer to the symbol, or NULL if there is some error and we cant find one.
// there are several ways to do this dictionary search within the SymbolSet, which is a binary tree
// one way is to start with full string and gradually shorten it until we get an exact match
// another is to ask the stl set to find the nearest match, and search back from there for shortest exact match
int originputqueuestrlen=inputqueuestrlen;
string inputstring;
// use a static local variable to reduce memory allocation-deallocation thrashing
static SubstringSymbol searchSubstringSymbol("",1);
// convert char * to string, using exactly inputqueuestrlen, regardless of any '\0'
if (inputqueuestrlen==0)
inputstring="";
else
inputstring=string(inputqueuestr,inputqueuestrlen);
// truncate it to max length of any symbol, incase caller sometimes passes more
if (inputqueuestrlen>Parameter_MaxSubstringSize)
{
inputqueuestrlen=Parameter_MaxSubstringSize;
inputstring=inputstring.substr(0,inputqueuestrlen);
}
// now we use this searchSubstringSymbol for searching
searchSubstringSymbol.set_value(inputstring);
// find the longest Substring, using a smarter use of stl lower_bound instead of find()
if (Parameter_UseSmartLookup)
{
// smarter method (22:00 seconds on a test, about 3x faster than slow method below
// ATTN: can we simplify this?
int len;
int index;
string teststring;
for (;;)
{
symbolset_pos=symbolset.lower_bound(&searchSubstringSymbol);
// check for error
if (symbolset_pos==symbolset.end())
{
// nothing smaller so the LAST symbol, single character (we verify to make sure there isnt an error)
--symbolset_pos;
// shorten main string to match
inputqueuestrlen=1;
inputstring=inputstring.substr(0,inputqueuestrlen);
teststring=(*symbolset_pos)->get_value();
if (inputstring[0]!=teststring[0])
{
cout << "ERROR: Unexpectedly didn't find any match for a string that started out "<<originputqueuestrlen<<" characters (we hit "<<inputqueuestrlen<<"). last char was "<<(int)((unsigned char)(inputqueuestr[0]))<<"."<<endl;
return NULL;
}
// ok the best match is the last symbol
searchSubstringSymbol.set_value(inputstring);
break;
}
//teststring=(*symbolset_pos)->get_value();
//len=(int)(teststring.length());
//cout << "lowerbound for '"<<inputstring<<"' = '" << teststring <<"' (len="<<len<<")."<<endl;;
// if we have an exact match we are done
teststring=(*symbolset_pos)->get_value();
len=(int)(teststring.length());
if (teststring==inputstring)
break;
// try one previous
--symbolset_pos;
teststring=(*symbolset_pos)->get_value();
len=(int)(teststring.length());
if (len<inputqueuestrlen)
{
// shorten main string to match
inputqueuestrlen=len;
inputstring=inputstring.substr(0,inputqueuestrlen);
}
// if we have an exact match we are done
if (teststring==inputstring)
break;
// if not, find the last character they agree on and find next bound
len=(int)(teststring.length());
for (index=0;index<len;++index)
{
if (teststring[index]!=inputstring[index])
break;
}
// now shorten it and try again
inputqueuestrlen=index;
if (inputqueuestrlen==0)
{
cout << "ERROR: unexpectedly truncated search string that started out "<<originputqueuestrlen<<" characters (we hit "<<inputqueuestrlen<<"). last char was "<<(int)inputqueuestr[0]<<"."<<endl;
inputstring="";
}
else
inputstring=inputstring.substr(0,inputqueuestrlen);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -