📄 wcpattern.cpp
字号:
/**
From the author (Jeff Stuart)
"
Let me start by saying this file is pretty big. If you feel up to it, you can
try making changes yourself, but you would be better off to just email me at
stuart@cs.ucdavis.edu if you think there is a bug, or have something useful you
would like added. This project is very "near and dear" to me, so I am fairly quick
to make bug fixes. The header files for WCPattern and WCMatcher are fairly well
documented and the function names are pretty self-explanatory, but if you are having
any trouble, feel free to email me at stuart@cs.ucdavis.edu.
If you email me, make sure you put something like C++RE in the subject because
I tend to delete email if I don't recognize the name and the subject is
something like "I Need Your Help" or "Got A Second" or "I Found It".
"
*/
/*
Detailed documentation is provided in this class' header file
@author Jeffery Stuart
@since November 2004
@version 1.07.00
*/
#ifdef _WIN32
#pragma warning(push)
#pragma warning(disable:4996)
#endif
#include <nlkit/WCPattern.h>
#include <nlkit/WCMatcher.h>
#include <cstring>
#include <wchar.h>
#include <algorithm>
#ifndef _WIN32
#include <wctype.h>
#endif
using namespace std;
namespace nlkit
{
std::map<std::wstring, WCPattern *> WCPattern::compiledWCPatterns;
std::map<std::wstring, std::pair<std::wstring, unsigned long> > WCPattern::registeredWCPatterns;
const int WCPattern::MIN_QMATCH = 0x00000000;
const int WCPattern::MAX_QMATCH = 0x7FFFFFFF;
const unsigned long WCPattern::CASE_INSENSITIVE = 0x01;
const unsigned long WCPattern::LITERAL = 0x02;
const unsigned long WCPattern::DOT_MATCHES_ALL = 0x04;
const unsigned long WCPattern::MULTILINE_MATCHING = 0x08;
const unsigned long WCPattern::UNIX_LINE_MODE = 0x10;
#if defined(_WIN32)
#define str_icmp _wcsicmp
#elif defined(__CYGWIN__) || defined(__APPLE__) || defined(__hpux) || defined(__sun)
static inline int str_icmp(const wchar_t * a, const wchar_t * b)
{
while (*a && *b)
{
const int t = (int)towlower(*a) - (int)towlower(*b);
if (t) return t;
++a; ++b;
}
if (*a)
{
if (*b) return (int)towlower(*a) - (int)towlower(*b);
return 1;
}
else if (*b) return 1;
return 0;
}
#else
#define str_icmp wcscasecmp
#endif
WCPattern::WCPattern(const std::wstring & rhs)
{
matcher = NULL;
pattern = rhs;
curInd = 0;
groupCount = 0;
nonCapGroupCount = 0;
error = 0;
head = NULL;
}
// convenience function in case we want to add any extra debugging output
void WCPattern::raiseError()
{
switch (pattern[curInd - 1])
{
case '*':
case ')':
case '+':
case '?':
case ']':
case '}':
fwprintf(stderr, L"%s\n%*c^\n", pattern.c_str(), curInd - 1, ' ');
fwprintf(stderr, L"Syntax Error near here. Possible unescaped meta character.\n");
break;
default:
fwprintf(stderr, L"%s\n%*c^\n", pattern.c_str(), curInd - 1, ' ');
fwprintf(stderr, L"Syntax Error near here. \n");
break;
}
error = 1;
}
NFAUNode * WCPattern::registerNode(NFAUNode * node)
{
nodes[node] = 1;
return node;
}
std::wstring WCPattern::classUnion (std::wstring s1, std::wstring s2) const
{
wchar_t * out = new wchar_t[66000];
std::sort(s1.begin(), s1.end());
std::sort(s2.begin(), s2.end());
*std::set_union(s1.begin(), s1.end(), s2.begin(), s2.end(), out) = 0;
std::wstring ret = out;
delete [] out;
return ret;
}
std::wstring WCPattern::classIntersect (std::wstring s1, std::wstring s2) const
{
wchar_t * out = new wchar_t[66000];
std::sort(s1.begin(), s1.end());
std::sort(s2.begin(), s2.end());
*std::set_intersection(s1.begin(), s1.end(), s2.begin(), s2.end(), out) = 0;
std::wstring ret = out;
delete [] out;
return ret;
}
std::wstring WCPattern::classNegate (std::wstring s1) const
{
wchar_t * out = new wchar_t[66000];
int i, ind = 0;
std::map<wchar_t, bool> m;
for (i = 0; i < (int)s1.size(); ++i) m[s1[i]] = 1;
for (i = 0xFF; i >= 0; --i) if (m.find((wchar_t)i) == m.end()) out[ind++] = (wchar_t)i;
out[ind] = 0;
std::wstring ret(out, ind);
delete [] out;
return ret;
}
std::wstring WCPattern::classCreateRange(wchar_t low, wchar_t hi) const
{
wchar_t out[300];
int ind = 0;
while (low != hi) out[ind++] = low++;
out[ind++] = low;
return std::wstring(out, ind);
}
int WCPattern::getInt(int start, int end)
{
int ret = 0;
for (; start <= end; ++start) ret = ret * 10 + (pattern[start] - (wchar_t)'0');
return ret;
}
bool WCPattern::quantifyCurly(int & sNum, int & eNum)
{
bool good = 1;
int i, ci = curInd + 1;
int commaInd = ci, endInd = ci, len = pattern.size();
sNum = eNum = 0;
while (endInd < len && pattern[endInd ] != (wchar_t)'}') ++endInd;
while (commaInd < endInd && pattern[commaInd] != (wchar_t)',') ++commaInd;
if (endInd >= len) { raiseError(); return 0; }
for (i = ci; good && i < endInd; ++i) if (i != commaInd && !iswdigit(pattern[i])) good = 0;
if (!good && commaInd < endInd) { raiseError(); return 0; }
if (!good) return 0;
/* so now everything in here is either a comma (and there is at most one comma) or a digit */
if (commaInd == ci) // {,*}
{
if (endInd == commaInd + 1) { sNum = MIN_QMATCH; eNum = MAX_QMATCH; } // {,} = *
else { sNum = MIN_QMATCH; eNum = getInt(commaInd + 1, endInd - 1); } // {,+}
}
else if (commaInd == endInd - 1) { sNum = getInt(ci, commaInd - 1); eNum = MAX_QMATCH; } // {+,}
else if (commaInd == endInd) { sNum = getInt(ci, endInd - 1); eNum = sNum; } // {+}
else { sNum = getInt(ci, commaInd - 1); eNum = getInt(commaInd + 1, endInd - 1); } // {+,+}
curInd = endInd + 1;
return 1;
}
NFAUNode * WCPattern::quantifyGroup(NFAUNode * start, NFAUNode * stop, const int gn)
{
NFAUNode * newNode = NULL;
int type = 0;
if (curInd < (int)pattern.size())
{
wchar_t ch = (curInd + 1 >= (int)pattern.size()) ? -1 : pattern[curInd + 1];
switch (pattern[curInd])
{
case (wchar_t)'*':
++curInd;
switch (ch)
{
case (wchar_t)'?': ++curInd; type = 1; break;
case (wchar_t)'+': ++curInd; type = 2; break;
}
newNode = registerNode(new NFAGroupLoopPrologueUNode(gn));
newNode->next = registerNode(new NFAGroupLoopUNode(start, MIN_QMATCH, MAX_QMATCH, gn, type));
stop->next = newNode->next;
return newNode;
case (wchar_t)'?':
++curInd;
switch (ch)
{
case (wchar_t)'?': ++curInd; type = 1; break;
case (wchar_t)'+': ++curInd; type = 2; break;
}
newNode = registerNode(new NFAGroupLoopPrologueUNode(gn));
newNode->next = registerNode(new NFAGroupLoopUNode(start, MIN_QMATCH, 1, gn, type));
stop->next = newNode->next;
return newNode;
case (wchar_t)'+':
++curInd;
switch (ch)
{
case (wchar_t)'?': ++curInd; type = 1; break;
case (wchar_t)'+': ++curInd; type = 2; break;
}
newNode = registerNode(new NFAGroupLoopPrologueUNode(gn));
newNode->next = registerNode(new NFAGroupLoopUNode(start, 1, MAX_QMATCH, gn, type));
stop->next = newNode->next;
return newNode;
case (wchar_t)'{':
{
int s, e;
if (quantifyCurly(s, e))
{
ch = (curInd < (int)pattern.size()) ? pattern[curInd] : -1;
switch (ch)
{
case (wchar_t)'?': ++curInd; type = 1; break;
case (wchar_t)'+': ++curInd; type = 2; break;
}
newNode = registerNode(new NFAGroupLoopPrologueUNode(gn));
newNode->next = registerNode(new NFAGroupLoopUNode(start, s, e, gn, type));
stop->next = newNode->next;
return newNode;
}
}
default:
break;
}
}
return NULL;
}
NFAUNode * WCPattern::quantify(NFAUNode * newNode)
{
if (curInd < (int)pattern.size())
{
wchar_t ch = (curInd + 1 >= (int)pattern.size()) ? -1 : pattern[curInd + 1];
switch (pattern[curInd])
{
case (wchar_t)'*':
++curInd;
switch (ch)
{
case (wchar_t)'?': ++curInd; newNode = registerNode(new NFALazyQuantifierUNode (this, newNode, MIN_QMATCH, MAX_QMATCH)); break;
case (wchar_t)'+': ++curInd; newNode = registerNode(new NFAPossessiveQuantifierUNode(this, newNode, MIN_QMATCH, MAX_QMATCH)); break;
default: newNode = registerNode(new NFAGreedyQuantifierUNode (this, newNode, MIN_QMATCH, MAX_QMATCH)); break;
}
break;
case (wchar_t)'?':
++curInd;
switch (ch)
{
case (wchar_t)'?': ++curInd; newNode = registerNode(new NFALazyQuantifierUNode (this, newNode, MIN_QMATCH, 1)); break;
case (wchar_t)'+': ++curInd; newNode = registerNode(new NFAPossessiveQuantifierUNode(this, newNode, MIN_QMATCH, 1)); break;
default: newNode = registerNode(new NFAGreedyQuantifierUNode (this, newNode, MIN_QMATCH, 1)); break;
}
break;
case (wchar_t)'+':
++curInd;
switch (ch)
{
case (wchar_t)'?': ++curInd; newNode = registerNode(new NFALazyQuantifierUNode (this, newNode, 1, MAX_QMATCH)); break;
case (wchar_t)'+': ++curInd; newNode = registerNode(new NFAPossessiveQuantifierUNode(this, newNode, 1, MAX_QMATCH)); break;
default: newNode = registerNode(new NFAGreedyQuantifierUNode (this, newNode, 1, MAX_QMATCH)); break;
}
break;
case (wchar_t)'{':
{
int s, e;
if (quantifyCurly(s, e))
{
ch = (curInd < (int)pattern.size()) ? pattern[curInd] : -1;
switch (ch)
{
case (wchar_t)'?': ++curInd; newNode = registerNode(new NFALazyQuantifierUNode (this, newNode, s, e)); break;
case (wchar_t)'+': ++curInd; newNode = registerNode(new NFAPossessiveQuantifierUNode(this, newNode, s, e)); break;
default: newNode = registerNode(new NFAGreedyQuantifierUNode (this, newNode, s, e)); break;
}
}
}
break;
default:
break;
}
}
return newNode;
}
std::wstring WCPattern::parseClass()
{
std::wstring t, ret = L"";
wchar_t ch, c1, c2;
bool inv = 0, neg = 0, quo = 0;
if (curInd < (int)pattern.size() && pattern[curInd] == (wchar_t)'^')
{
++curInd;
neg = 1;
}
while (curInd < (int)pattern.size() && pattern[curInd] != (wchar_t)']')
{
ch = pattern[curInd++];
if (ch == (wchar_t)'[')
{
t = parseClass();
ret = classUnion(ret, t);
}
/*else if (ch == (wchar_t)'-')
{
raiseError();
curInd = pattern.size();
}*/
else if (ch == (wchar_t)'&' && curInd < (int)pattern.size() && pattern[curInd] == (wchar_t)'&')
{
if (pattern[++curInd] != (wchar_t)'[')
{
raiseError();
curInd = pattern.size();
}
else
{
++curInd;
t = parseClass();
ret = classIntersect(ret, t);
}
}
else if (ch == (wchar_t)'\\')
{
t = parseEscape(inv, quo);
if (quo)
{
raiseError();
curInd = pattern.size();
}
else if (inv || t.size() > 1) // cant be part of a range (a-z)
{
if (inv) t = classNegate(t);
ret = classUnion(ret, t);
}
else if (curInd < (int)pattern.size() && pattern[curInd] == (wchar_t)'-') // part of a range (a-z)
{
c1 = t[0];
++curInd;
if (curInd >= (int)pattern.size()) raiseError();
else
{
c2 = pattern[curInd++];
if (c2 == (wchar_t)'\\')
{
t = parseEscape(inv, quo);
if (quo)
{
raiseError();
curInd = pattern.size();
}
else if (inv || t.size() > 1) raiseError();
else ret = classUnion(ret, classCreateRange(c1, c2));
}
else if (c2 == (wchar_t)'[' || c2 == (wchar_t)']' || c2 == (wchar_t)'-' || c2 == (wchar_t)'&')
{
raiseError();
curInd = pattern.size();
}
else ret = classUnion(ret, classCreateRange(c1, c2));
}
}
else
{
ret = classUnion(ret, t);
}
}
else if (curInd < (int)pattern.size() && pattern[curInd] == (wchar_t)'-')
{
c1 = ch;
++curInd;
if (curInd >= (int)pattern.size()) raiseError();
else
{
c2 = pattern[curInd++];
if (c2 == (wchar_t)'\\')
{
t = parseEscape(inv, quo);
if (quo)
{
raiseError();
curInd = pattern.size();
}
else if (inv || t.size() > 1) raiseError();
else ret = classUnion(ret, classCreateRange(c1, c2));
}
else if (c2 == (wchar_t)'[' || c2 == (wchar_t)']' || c2 == (wchar_t)'-' || c2 == (wchar_t)'&')
{
raiseError();
curInd = pattern.size();
}
else
{
ret = classUnion(ret, classCreateRange(c1, c2));
}
}
}
else
{
ret += L" ";
ret[ret.size() - 1] = ch;
}
}
if (curInd >= (int)pattern.size() || pattern[curInd] != (wchar_t)']')
{
raiseError();
ret = L"";
}
else
{
++curInd;
if (neg) ret = classNegate(ret);
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -