⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 pattern.cpp

📁 一些unix下的c/c++的util包
💻 CPP
📖 第 1 页 / 共 4 页
字号:
/**
  From the author (Jeff Stuart)
  "
  Let me start by saying this file is pretty big. If you feel up to it, you can
  try making changes yourself, but you would be better off to just email me at
  stuart@cs.ucdavis.edu if you think there is a bug, or have something useful you
  would like added. This project is very "near and dear" to me, so I am fairly
  quick to make bug fixes. The header files for Pattern and Matcher are fairly
  well documented and the function names are pretty self-explanatory, but if you
  are having any trouble, feel free to email me at stuart@cs.ucdavis.edu.

  If you email me, make sure you put something like C++RE in the subject because
  I tend to delete email if I don't recognize the name and the subject is
  something like "I Need Your Help" or "Got A Second" or "I Found It".
  "
 */

/*
  Detailed documentation is provided in this class' header file

  @author   Jeffery Stuart
  @since    November 2004
  @version  1.07.00
*/

#ifdef _WIN32
  #pragma warning(push)
  #pragma warning(disable:4996)
  #define str_icmp stricmp
#else
  #define str_icmp strcasecmp
#endif

#include "nlkit/Pattern.h"
#include "nlkit/Matcher.h"
#include <ctype.h>
#include <cstring>
#include <algorithm>
using namespace nlkit;
using namespace std;
std::map<std::string, Pattern *> Pattern::compiledPatterns;
std::map<std::string, std::pair<std::string, unsigned long> > Pattern::registeredPatterns;

const int Pattern::MIN_QMATCH = 0x00000000;
const int Pattern::MAX_QMATCH = 0x7FFFFFFF;

const unsigned long Pattern::CASE_INSENSITIVE       = 0x01;
const unsigned long Pattern::LITERAL                = 0x02;
const unsigned long Pattern::DOT_MATCHES_ALL        = 0x04;
const unsigned long Pattern::MULTILINE_MATCHING     = 0x08;
const unsigned long Pattern::UNIX_LINE_MODE         = 0x10;

Pattern::Pattern(const std::string & rhs)
{
  matcher = NULL;
  pattern = rhs;
  curInd = 0;
  groupCount = 0;
  nonCapGroupCount = 0;
  error = 0;
  head = NULL;
}
// convenience function in case we want to add any extra debugging output
void Pattern::raiseError()
{
  switch (pattern[curInd - 1])
  {
  case '*':
  case ')':
  case '+':
  case '?':
  case ']':
  case '}':
    fprintf(stderr, "%s\n%*c^\n", pattern.c_str(), curInd - 1, ' ');
    fprintf(stderr, "Syntax Error near here. Possible unescaped meta character.\n");
    break;
  default:
    fprintf(stderr, "%s\n%*c^\n", pattern.c_str(), curInd - 1, ' ');
    fprintf(stderr, "Syntax Error near here. \n");
    break;
  }
  error = 1;
}
NFANode * Pattern::registerNode(NFANode * node)
{
  nodes[node] = 1;
  return node;
}

std::string Pattern::classUnion      (std::string s1, std::string s2)  const
{
  char out[300];
  std::sort(s1.begin(), s1.end());
  std::sort(s2.begin(), s2.end());
  *std::set_union(s1.begin(), s1.end(), s2.begin(), s2.end(), out) = 0;
  return out;
}
std::string Pattern::classIntersect  (std::string s1, std::string s2)  const
{
  char out[300];
  std::sort(s1.begin(), s1.end());
  std::sort(s2.begin(), s2.end());
  *std::set_intersection(s1.begin(), s1.end(), s2.begin(), s2.end(), out) = 0;
  return out;
}
std::string Pattern::classNegate     (std::string s1)                  const
{
  char out[300];
  int i, ind = 0;
  std::map<char, bool> m;

  for (i = 0; i < (int)s1.size(); ++i) m[s1[i]] = 1;
  for (i = 0xFF; i >= 0; --i) if (m.find((char)i) == m.end()) out[ind++] = (char)i;
  out[ind] = 0;
  return std::string(out, ind);
}
std::string Pattern::classCreateRange(char low, char hi)    const
{
  char out[300];
  int ind = 0;
  while (low != hi) out[ind++] = low++;
  out[ind++] = low;
  return std::string(out, ind);
}

int Pattern::getInt(int start, int end)
{
  int ret = 0;
  for (; start <= end; ++start) ret = ret * 10 + (pattern[start] - '0');
  return ret;
}
bool Pattern::quantifyCurly(int & sNum, int & eNum)
{
  bool good = 1;
  int i, ci = curInd + 1;
  int commaInd = ci, endInd = ci, len = pattern.size();
  sNum = eNum = 0;

  while (endInd   < len     && pattern[endInd  ] != '}') ++endInd;
  while (commaInd < endInd  && pattern[commaInd] != ',') ++commaInd;
  if (endInd >= len) { raiseError(); return 0; }
  for (i = ci; good && i < endInd; ++i) if (i != commaInd && !isdigit(pattern[i])) good = 0;
  if (!good && commaInd < endInd) { raiseError(); return 0; }
  if (!good) return 0;
  /* so now everything in here is either a comma (and there is at most one comma) or a digit */
  if (commaInd == ci) // {,*}
  {
    if (endInd == commaInd + 1)    { sNum = MIN_QMATCH;               eNum = MAX_QMATCH;                        } // {,} = *
    else                           { sNum = MIN_QMATCH;               eNum = getInt(commaInd + 1, endInd - 1);  } // {,+}
  }
  else if (commaInd == endInd - 1) { sNum = getInt(ci, commaInd - 1); eNum = MAX_QMATCH;                        } // {+,}
  else if (commaInd == endInd)     { sNum = getInt(ci, endInd - 1);   eNum = sNum;                              } // {+}
  else                             { sNum = getInt(ci, commaInd - 1); eNum = getInt(commaInd + 1, endInd - 1);  } // {+,+}
  curInd = endInd + 1;
  return 1;
}
NFANode * Pattern::quantifyGroup(NFANode * start, NFANode * stop, const int gn)
{
  NFANode * newNode = NULL;
  int type = 0;

  if (curInd < (int)pattern.size())
  {
    char ch = (curInd + 1 >= (int)pattern.size()) ? -1 : pattern[curInd + 1];
    switch (pattern[curInd])
    {
    case '*':
      ++curInd;
      switch (ch)
      {
      case '?': ++curInd; type = 1; break;
      case '+': ++curInd; type = 2; break;
      }
      newNode = registerNode(new NFAGroupLoopPrologueNode(gn));
      newNode->next = registerNode(new NFAGroupLoopNode(start, MIN_QMATCH, MAX_QMATCH, gn, type));
      stop->next = newNode->next;
      return newNode;
    case '?':
      ++curInd;
      switch (ch)
      {
      case '?': ++curInd; type = 1; break;
      case '+': ++curInd; type = 2; break;
      }
      newNode = registerNode(new NFAGroupLoopPrologueNode(gn));
      newNode->next = registerNode(new NFAGroupLoopNode(start, MIN_QMATCH, 1, gn, type));
      stop->next = newNode->next;
      return newNode;
    case '+':
      ++curInd;
      switch (ch)
      {
      case '?': ++curInd; type = 1; break;
      case '+': ++curInd; type = 2; break;
      }
      newNode = registerNode(new NFAGroupLoopPrologueNode(gn));
      newNode->next = registerNode(new NFAGroupLoopNode(start, 1, MAX_QMATCH, gn, type));
      stop->next = newNode->next;
      return newNode;
    case '{':
      {
        int s, e;
        if (quantifyCurly(s, e))
        {
          ch = (curInd < (int)pattern.size()) ? pattern[curInd] : -1;
          switch (ch)
          {
          case '?': ++curInd; type = 1; break;
          case '+': ++curInd; type = 2; break;
          }
          newNode = registerNode(new NFAGroupLoopPrologueNode(gn));
          newNode->next = registerNode(new NFAGroupLoopNode(start, s, e, gn, type));
          stop->next = newNode->next;
          return newNode;
        }
      }
    default:
      break;
    }
  }
  return NULL;
}

NFANode * Pattern::quantify(NFANode * newNode)
{
  if (curInd < (int)pattern.size())
  {
    char ch = (curInd + 1 >= (int)pattern.size()) ? -1 : pattern[curInd + 1];
    switch (pattern[curInd])
    {
    case '*':
      ++curInd;
      switch (ch)
      {
      case '?': ++curInd; newNode = registerNode(new NFALazyQuantifierNode      (this, newNode, MIN_QMATCH, MAX_QMATCH)); break;
      case '+': ++curInd; newNode = registerNode(new NFAPossessiveQuantifierNode(this, newNode, MIN_QMATCH, MAX_QMATCH)); break;
      default:            newNode = registerNode(new NFAGreedyQuantifierNode    (this, newNode, MIN_QMATCH, MAX_QMATCH)); break;
      }
      break;
    case '?':
      ++curInd;
      switch (ch)
      {
      case '?': ++curInd; newNode = registerNode(new NFALazyQuantifierNode      (this, newNode, MIN_QMATCH, 1)); break;
      case '+': ++curInd; newNode = registerNode(new NFAPossessiveQuantifierNode(this, newNode, MIN_QMATCH, 1)); break;
      default:            newNode = registerNode(new NFAGreedyQuantifierNode    (this, newNode, MIN_QMATCH, 1)); break;
      }
      break;
    case '+':
      ++curInd;
      switch (ch)
      {
      case '?': ++curInd; newNode = registerNode(new NFALazyQuantifierNode      (this, newNode, 1, MAX_QMATCH)); break;
      case '+': ++curInd; newNode = registerNode(new NFAPossessiveQuantifierNode(this, newNode, 1, MAX_QMATCH)); break;
      default:            newNode = registerNode(new NFAGreedyQuantifierNode    (this, newNode, 1, MAX_QMATCH)); break;
      }
      break;
    case '{':
      {
        int s, e;
        if (quantifyCurly(s, e))
        {
          ch = (curInd < (int)pattern.size()) ? pattern[curInd] : -1;
          switch (ch)
          {
          case '?': ++curInd; newNode = registerNode(new NFALazyQuantifierNode      (this, newNode, s, e)); break;
          case '+': ++curInd; newNode = registerNode(new NFAPossessiveQuantifierNode(this, newNode, s, e)); break;
          default:            newNode = registerNode(new NFAGreedyQuantifierNode    (this, newNode, s, e)); break;
          }
        }
      }
      break;
    default:
      break;
    }
  }
  return newNode;
}
std::string Pattern::parseClass()
{
  std::string t, ret = "";
  char ch, c1, c2;
  bool inv = 0, neg = 0, quo = 0;

  if (curInd < (int)pattern.size() && pattern[curInd] == '^')
  {
    ++curInd;
    neg = 1;
  }
  while (curInd < (int)pattern.size() && pattern[curInd] != ']')
  {
    ch = pattern[curInd++];
    if (ch == '[')
    {
      t = parseClass();
      ret = classUnion(ret, t);
    }
    /*else if (ch == '-')
    {
      raiseError();
      curInd = pattern.size();
    }*/
    else if (ch == '&' && curInd < (int)pattern.size() && pattern[curInd] == '&')
    {
      if (pattern[++curInd] != '[')
      {
        raiseError();
        curInd = pattern.size();
      }
      else
      {
        ++curInd;
        t = parseClass();
        ret = classIntersect(ret, t);
      }
    }
    else if (ch == '\\')
    {
      t = parseEscape(inv, quo);
      if (quo)
      {
        raiseError();
        curInd = pattern.size();
      }
      else if (inv || t.size() > 1) // cant be part of a range (a-z)
      {
        if (inv) t = classNegate(t);
        ret = classUnion(ret, t);
      }
      else if (curInd < (int)pattern.size() && pattern[curInd] == '-') // part of a range (a-z)
      {
        c1 = t[0];
        ++curInd;
        if (curInd >= (int)pattern.size()) raiseError();
        else
        {
          c2 = pattern[curInd++];
          if (c2 == '\\')
          {
            t = parseEscape(inv, quo);
            if (quo)
            {
              raiseError();
              curInd = pattern.size();
            }
            else if (inv || t.size() > 1) raiseError();
            else ret = classUnion(ret, classCreateRange(c1, c2));
          }
          else if (c2 == '[' || c2 == ']' || c2 == '-' || c2 == '&')
          {
            raiseError();
            curInd = pattern.size();
          }
          else ret = classUnion(ret, classCreateRange(c1, c2));
        }
      }
      else
      {
        ret = classUnion(ret, t);
      }
    }
    else if (curInd < (int)pattern.size() && pattern[curInd] == '-')
    {
      c1 = ch;
      ++curInd;
      if (curInd >= (int)pattern.size()) raiseError();
      else
      {
        c2 = pattern[curInd++];
        if (c2 == '\\')
        {
          t = parseEscape(inv, quo);
          if (quo)
          {
            raiseError();
            curInd = pattern.size();
          }
          else if (inv || t.size() > 1) raiseError();
          else ret = classUnion(ret, classCreateRange(c1, c2));
        }
        else if (c2 == '[' || c2 == ']' || c2 == '-' || c2 == '&')
        {
          raiseError();
          curInd = pattern.size();
        }
        else
        {
          ret = classUnion(ret, classCreateRange(c1, c2));
        }
      }
    }
    else
    {
      ret += " ";
      ret[ret.size() - 1] = ch;
    }
  }
  if (curInd >= (int)pattern.size() || pattern[curInd] != ']')
  {
    raiseError();
    ret = "";
  }
  else
  {
    ++curInd;
    if (neg) ret = classNegate(ret);
  }
  return ret;
}
std::string Pattern::parsePosix()
{
  std::string s7 = pattern.substr(curInd, 7);
  if (s7 == "{Lower}")  { curInd += 7; return "abcdefghijklmnopqrstuvwxyz";                                                                       }
  if (s7 == "{Upper}")  { curInd += 7; return "ABCDEFGHIJKLMNOPQRSTUVWXYZ";                                                                       }
  if (s7 == "{Alpha}")  { curInd += 7; return "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";                                             }
  if (s7 == "{Digit}")  { curInd += 7; return "0123456789";                                                                                       }
  if (s7 == "{Alnum}")  { curInd += 7; return "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";                                   }
  if (s7 == "{Punct}")  { curInd += 7; return "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~";                                                               }
  if (s7 == "{Graph}")  { curInd += 7; return "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"; }
  if (s7 == "{Print}")  { curInd += 7; return "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"; }
  if (s7 == "{Blank}")  { curInd += 7; return " \t";                                                                                              }
  if (s7 == "{Space}")  { curInd += 7; return " \t\n\x0B\f\r";                                                                                    }
  if (s7 == "{Cntrl}")
  {
    int i;
    std::string s = " ";

    for (i = 0; i < 5; ++i) s += s;
    s += " ";

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -