📄 wcpattern.cpp

📁 C++正则表达式解析
💻 CPP
📖 第 1 页 / 共 4 页
字号:
{
  std::wstring s7 = pattern.substr(curInd, 7);
  if (s7 == L"{Lower}") { curInd += 7; return L"abcdefghijklmnopqrstuvwxyz";                                                                       }
  if (s7 == L"{Upper}") { curInd += 7; return L"ABCDEFGHIJKLMNOPQRSTUVWXYZ";                                                                       }
  if (s7 == L"{Alpha}") { curInd += 7; return L"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";                                             }
  if (s7 == L"{Digit}") { curInd += 7; return L"0123456789";                                                                                       }
  if (s7 == L"{Alnum}") { curInd += 7; return L"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";                                   }
  if (s7 == L"{Punct}") { curInd += 7; return L"!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~";                                                               }
  if (s7 == L"{Graph}") { curInd += 7; return L"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"; }
  if (s7 == L"{Print}") { curInd += 7; return L"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"; }
  if (s7 == L"{Blank}") { curInd += 7; return L" \t";                                                                                              }
  if (s7 == L"{Space}") { curInd += 7; return L" \t\n\x0B\f\r";                                                                                    }
  if (s7 == L"{Cntrl}")
  {
    int i;
    std::wstring s = L" ";

    for (i = 0; i < 5; ++i) s += s;
    s += L" ";
    for (i = 0; i <= 0x1F; ++i) s[i] = i;
    s[0x20] = 0x7F;
    curInd += 7;
    return s;
  }
  if (s7 == L"{ASCII}")
  {
    std::wstring s(0x80, (wchar_t)' ');
    for (int i = 0; i < 0x80; ++i) s[i] = i;
    curInd += 7;
    return s;
  }
  if (pattern.substr(curInd, 8) == L"{XDigit}") { curInd += 8; return L"abcdefABCDEF0123456789"; }
  raiseError();
  return L"";
}
/**
 * Reads a run of decimal digits beginning at curInd as a back-reference
 * group number and returns a registered NFAReferenceUNode for that number.
 *
 * Digits keep being consumed while the number accumulated so far is below 10
 * or below groupCount. When the scan stops exactly at the end of the pattern
 * the scan position is bumped one further, so curInd may end up one past
 * pattern.size() on return.
 *
 * If no digit was consumed (or the previous accumulated value is negative)
 * raiseError() is called, curInd is left untouched and a reference node for
 * group -1 is returned.
 */
NFAUNode * WCPattern::parseBackref()
{
  const int patLen = (int)pattern.size();
  int scan = curInd;
  int prevRef = 0;
  int ref = 0;

  for (;;)
  {
    if (scan >= patLen) break;

    const wchar_t digit = pattern[scan];
    if (digit < (wchar_t)'0' || digit > (wchar_t)'9') break;

    // stop extending once the number can no longer name an existing group
    if (ref >= 10 && ref >= groupCount) break;

    prevRef = ref;
    ref = ref * 10 + (digit - (wchar_t)'0');
    ++scan;
  }

  if (scan == patLen)
  {
    // the digits ran up to the very end of the pattern
    prevRef = ref;
    ++scan;
  }

  const bool nothingConsumed = (scan <= curInd);
  if (prevRef < 0 || nothingConsumed)
  {
    raiseError();
    return registerNode(new NFAReferenceUNode(-1));
  }

  curInd = scan;
  return registerNode(new NFAReferenceUNode(ref));
}
std::wstring WCPattern::parseOctal()
{
  #define islowoc(x)  ((x) >= (wchar_t)'0' && (x) <= (wchar_t)'3')
  #define isoc(x)     ((x) >= (wchar_t)'0' && (x) <= (wchar_t)'7')
  #define fromoc(x)   ((x) - (wchar_t)'0')
  int ci = curInd;
  wchar_t ch1 = (ci + 0 < (int)pattern.size()) ? pattern[ci + 0] : -1;
  wchar_t ch2 = (ci + 1 < (int)pattern.size()) ? pattern[ci + 1] : -1;
  wchar_t ch3 = (ci + 2 < (int)pattern.size()) ? pattern[ci + 2] : -1;
  std::wstring s = L" ";

  if (islowoc(ch1) && isoc(ch2))
  {
    curInd += 2;
    s[0] = fromoc(ch1) * 8 + fromoc(ch2);
    if (isoc(ch3))
    {
      ++curInd;
      s[0] = s[0] * 8 + fromoc(ch3);
    }
  }
  else if (isoc(ch1) && isoc(ch2))
  {
    curInd += 2;
    s[0] = fromoc(ch1) * 8 + fromoc(ch2);
  }
  else raiseError();

  return s;
  #undef islowoc
  #undef isoc
  #undef fromoc
}
std::wstring WCPattern::parseHex()
{
  #define to_low(x)   (((x) >= (wchar_t)'A' && (x) <= (wchar_t)'Z') ? ((x) - (wchar_t)'A' + (wchar_t)'a') : (x))
  #define is_dig(x)   ((x) >= (wchar_t)'0' && (x) <= (wchar_t)'9')
  #define is_hex(x)   (is_dig(x) || (to_low(x) >= (wchar_t)'a' && to_low(x) <= (wchar_t)'f'))
  #define to_int(x)   ((is_dig(x)) ? ((x) - (wchar_t)'0') : (to_low(x) - (wchar_t)'a' + 10))

  int ci = curInd;
  wchar_t ch1 = (ci + 0 < (int)pattern.size()) ? pattern[ci + 0] : -1;
  wchar_t ch2 = (ci + 1 < (int)pattern.size()) ? pattern[ci + 1] : -1;
  wchar_t ch3 = (ci + 2 < (int)pattern.size()) ? pattern[ci + 2] : -1;
  wchar_t ch4 = (ci + 3 < (int)pattern.size()) ? pattern[ci + 3] : -1;
  std::wstring s = L" ";

  if (is_hex(ch1) && is_hex(ch2) && is_hex(ch3) && is_hex(ch4))
  {
    curInd += 2;
    s[0] = (to_int(ch1) << 12 & 0xF000) | (to_int(ch2) << 8 & 0x0F00) |
           (to_int(ch3) <<  4 & 0x0F00) | (to_int(ch4)      & 0x000F);
  }
  else if (is_hex(ch1) && is_hex(ch2))
  {
    curInd += 2;
    s[0] = (to_int(ch1) << 4 & 0xF0) | (to_int(ch2) & 0x0F);
  }

  return s;
  #undef to_low
  #undef is_dig
  #undef is_hex
  #undef to_int
}
/**
 * Parses the escape whose selector character (the character after the
 * backslash) is at curInd, and returns the set of characters it stands for.
 *
 * @param inv  set to true when the escape denotes a negated class
 *             (\P, \D, \S, \W), false otherwise
 * @param quo  set to true when the escape is \Q (start of a quoted section),
 *             false otherwise
 * @return     the characters of the class, or a one-character string for a
 *             single-character escape; empty for \Q and on error
 *
 * Negated classes are built internally with a leading "!!" marker which is
 * stripped before returning.
 *
 * Fixes relative to the previous implementation:
 *   - the end-of-pattern check now happens before the pattern is read, and
 *     the error path returns an empty string instead of constructing a
 *     std::wstring from NULL (undefined behaviour);
 *   - inv and quo are always initialised, including on the error path;
 *   - \e now yields the escape character (0x1B) instead of carriage return.
 */
std::wstring WCPattern::parseEscape(bool & inv, bool & quo)
{
  quo = false;
  inv = false;

  if (curInd >= (int)pattern.size())
  {
    raiseError();
    return std::wstring();
  }

  const wchar_t ch = pattern[curInd++];
  std::wstring classes;

  switch (ch)
  {
  case (wchar_t)'p': classes = parsePosix();                                                         break;
  case (wchar_t)'P': classes = L"!!"; classes += parsePosix();                                        break;
  case (wchar_t)'d': classes = L"0123456789";                                                         break;
  case (wchar_t)'D': classes = L"!!0123456789";                                                       break;
  case (wchar_t)'s': classes = L" \t\r\n\f";                                                          break;
  case (wchar_t)'S': classes = L"!! \t\r\n\f";                                                        break;
  case (wchar_t)'w': classes = L"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_";    break;
  case (wchar_t)'W': classes = L"!!abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_";  break;
  case (wchar_t)'0': classes = parseOctal(); break;
  case (wchar_t)'x': classes = parseHex();   break;

  case (wchar_t)'Q': quo = true;         break;
  case (wchar_t)'t': classes = L"\t";    break;
  case (wchar_t)'r': classes = L"\r";    break;
  case (wchar_t)'n': classes = L"\n";    break;
  case (wchar_t)'f': classes = L"\f";    break;
  case (wchar_t)'a': classes = L"\a";    break;
  case (wchar_t)'e': classes = L"\x1B";  break; // ESC, not carriage return
  default:
    // any other character stands for itself
    classes.assign(1, ch);
    break;
  }

  // A leading "!!" is the internal marker for a negated class.
  if (classes.substr(0, 2) == L"!!")
  {
    classes = classes.substr(2);
    inv = true;
  }
  return classes;
}
/**
 * Parses a reference to a registered pattern. curInd is expected to point at
 * the first character of the name; the name runs up to the next '}'.
 *
 * The name must be non-empty, start with a letter or underscore and contain
 * only letters, digits and underscores, and must be a key of
 * registeredWCPatterns. The registered pattern text and flags temporarily
 * replace this object's pattern, curInd and flags while parse() builds the
 * sub-graph; afterwards the originals are restored and curInd is placed just
 * after the closing '}'.
 *
 * @param end  receives the last node of the parsed sub-graph (as set by
 *             parse()); set to NULL if the error flag is set when the lookup
 *             stage finishes. Not written on the early validation failures.
 * @return     the first node of the sub-graph, or NULL on error
 *
 * Fixes relative to the previous implementation:
 *   - the empty-name test compared against curInd + 1, which rejected valid
 *     one-character names such as {a}; it now tests for an empty name;
 *   - a missing '}' is detected by the scan index instead of by reading
 *     pattern[pattern.size()].
 */
NFAUNode * WCPattern::parseRegisteredWCPattern(NFAUNode ** end)
{
  const int patLen = (int)pattern.size();
  int close, j;
  std::wstring s;
  NFAUNode * ret = NULL;

  // locate the closing brace
  for (close = curInd; close < patLen && pattern[close] != (wchar_t)'}'; ++close) { }
  if (close >= patLen)  { raiseError(); return NULL; } // no closing '}'
  if (close == curInd)  { raiseError(); return NULL; } // {}

  // first character: letter or underscore
  const wchar_t first = pattern[curInd];
  const bool validStart =
      (first >= (wchar_t)'a' && first <= (wchar_t)'z') ||
      (first >= (wchar_t)'A' && first <= (wchar_t)'Z') ||
      (first == (wchar_t)'_');
  if (!validStart)
  {
    raiseError();
    return NULL;
  }

  // every character: letter, digit or underscore
  for (j = curInd; !error && j < close; ++j)
  {
    const wchar_t c = pattern[j];
    const bool validChar =
        (c >= (wchar_t)'a' && c <= (wchar_t)'z') ||
        (c >= (wchar_t)'A' && c <= (wchar_t)'Z') ||
        (c >= (wchar_t)'0' && c <= (wchar_t)'9') ||
        (c == (wchar_t)'_');
    if (!validChar)
    {
      raiseError();
      return NULL;
    }
  }

  s = pattern.substr(curInd, close - curInd);
  if (registeredWCPatterns.find(s) == registeredWCPatterns.end()) raiseError();
  else
  {
    // save the parser state that the nested parse will overwrite
    unsigned long oflags = flags;
    std::wstring op = pattern;
    int ci = close + 1;

    pattern = registeredWCPatterns[s].first;
    curInd = 0;
    flags = registeredWCPatterns[s].second;

    // parse() post-increments groupCount on entry; cancel that out here
    --groupCount;
    ret = parse(0, 0, end);

    pattern = op;
    curInd = ci;
    flags = oflags;
  }
  if (error) { *end = ret = NULL; }
  return ret;
}

/**
 * Parses the body of a look-behind group, starting at curInd and running up
 * to the closing ')'.
 *
 * Everything is taken literally, because a look-behind needs text of a
 * concrete length; the only special character is the backslash, which makes
 * the following character literal.
 *
 * @param pos  forwarded to the NFALookBehindUNode constructor
 * @param end  receives the same node that is returned
 * @return     a registered NFALookBehindUNode holding the collected text.
 *             If a backslash leaves fewer than two characters in the
 *             pattern, raiseError() is called and a registered NFACharUNode
 *             for a space is returned instead. If the closing ')' is
 *             missing, raiseError() is called but the look-behind node is
 *             still returned.
 */
NFAUNode * WCPattern::parseBehind(const bool pos, NFAUNode ** end)
{
  const int patLen = (int)pattern.size();
  std::wstring literal;

  for (;;)
  {
    if (curInd >= patLen) break;
    if (pattern[curInd] == (wchar_t)')') break;

    wchar_t c = pattern[curInd++];
    if (c == (wchar_t)'\\')
    {
      // the escaped character and at least one more must still follow
      if (curInd + 1 >= patLen)
      {
        raiseError();
        *end = registerNode(new NFACharUNode((wchar_t)' '));
        return *end;
      }
      c = pattern[curInd++];
    }
    literal += c;
  }

  const bool closed = (curInd < patLen && pattern[curInd] == (wchar_t)')');
  if (closed) ++curInd;
  else        raiseError();

  *end = registerNode(new NFALookBehindUNode(literal, pos));
  return *end;
}
/**
 * Collects the text of a quoted section, starting at curInd and ending at
 * the terminating \E, and returns a registered quote node for it.
 *
 * Inside the section a backslash that is not part of \E is dropped and the
 * character after it is taken literally. curInd ends up just past the \E.
 * Reaching the end of the pattern before \E calls raiseError(); the node for
 * the text collected so far is still returned.
 *
 * The node is an NFACIQuoteUNode when the CASE_INSENSITIVE flag is set and
 * an NFAQuoteUNode otherwise.
 */
NFAUNode * WCPattern::parseQuote()
{
  std::wstring literal;

  for (;;)
  {
    if (curInd >= (int)pattern.size())
    {
      // ran off the end without seeing \E
      raiseError();
      break;
    }
    if (pattern.compare(curInd, 2, L"\\E") == 0)
    {
      curInd += 2;
      break;
    }

    // skip an escaping backslash, then take the next character verbatim
    if (pattern[curInd] == (wchar_t)'\\') ++curInd;
    literal += pattern[curInd];
    ++curInd;
  }

  const bool ignoreCase = (flags & WCPattern::CASE_INSENSITIVE) != 0;
  if (ignoreCase) return registerNode(new NFACIQuoteUNode(literal));
  return registerNode(new NFAQuoteUNode(literal));
}
NFAUNode * WCPattern::parse(const bool inParen, const bool inOr, NFAUNode ** end)
{
  NFAUNode * start, * cur, * next = NULL;
  std::wstring t;
  int grc = groupCount++;
  bool inv, quo;
  bool ahead = 0, pos = 0, noncap = 0, indep = 0;
  unsigned long oldFlags = flags;

  if (inParen)
  {
    if (pattern[curInd] == (wchar_t)'?')
    {
      ++curInd;
      --groupCount;
      if      (pattern[curInd]           == (wchar_t)':')   { noncap = 1; ++curInd;     grc = --nonCapGroupCount; }
      else if (pattern[curInd]           == (wchar_t)'=')   { ++curInd;     ahead = 1;  pos = 1;                  }
      else if (pattern[curInd]           == (wchar_t)'!')   { ++curInd;     ahead = 1;  pos = 0;                  }
      else if (pattern.substr(curInd, 2) == L"<=")  { curInd += 2;  return parseBehind(1, end);                   }
      else if (pattern.substr(curInd, 2) == L"<!")  { curInd += 2;  return parseBehind(0, end);                   }
      else if (pattern[curInd]           == (wchar_t)'>')   { ++curInd;     indep = 1;                            }
      else
      {
        bool negate = false, done = false;
        while (!done)
        {
          if (curInd >= (int)pattern.size())
          {
            raiseError();
            return NULL;
          }
          else if (negate)
          {
            switch (pattern[curInd])
            {
            case (wchar_t)'i': flags &= ~WCPattern::CASE_INSENSITIVE;   break;
            case (wchar_t)'d': flags &= ~WCPattern::UNIX_LINE_MODE;     break;
            case (wchar_t)'m': flags &= ~WCPattern::MULTILINE_MATCHING; break;
            case (wchar_t)'s': flags &= ~WCPattern::DOT_MATCHES_ALL;    break;
            case (wchar_t)':': done = true;                             break;
            case (wchar_t)'-':
            default: raiseError(); return NULL;
            }
          }
          else
          {
            switch (pattern[curInd])
            {
            case (wchar_t)'i': flags |= WCPattern::CASE_INSENSITIVE;    break;
            case (wchar_t)'d': flags |= WCPattern::UNIX_LINE_MODE;      break;
            case (wchar_t)'m': flags |= WCPattern::MULTILINE_MATCHING;  break;
            case (wchar_t)'s': flags |= WCPattern::DOT_MATCHES_ALL;     break;
            case (wchar_t)':': done = true;                             break;
            case (wchar_t)'-': negate = true;                           break;
            default:  raiseError(); return NULL;
            }
          }
          ++curInd;
        }
        noncap = 1;
        grc = --nonCapGroupCount;
      }

      if (noncap) cur = start = registerNode(new NFAGroupHeadUNode(grc));
      else        cur = start = registerNode(new NFASubStartUNode);
    }
    else cur = start = registerNode(new NFAGroupHeadUNode(grc));
  }
  else cur = start = registerNode(new NFASubStartUNode);
  while (curInd < (int)pattern.size())
  {
    wchar_t ch = pattern[curInd++];

    next = NULL;
    if (error) return NULL;
    switch (ch)
    {
    case (wchar_t)'^':
      if ((flags & WCPattern::MULTILINE_MATCHING) != 0) next = registerNode(new NFAStartOfLineUNode);
      else                                            next = registerNode(new NFAStartOfInputUNode);
      break;
    case (wchar_t)'$':
      if ((flags & WCPattern::MULTILINE_MATCHING) != 0) next = registerNode(new NFAEndOfLineUNode);
      else                                            next = registerNode(new NFAEndOfInputUNode(0));
      break;
    case (wchar_t)'|':
      --groupCount;
      cur->next = registerNode(new NFAAcceptUNode);
      cur = start = registerNode(new NFAOrUNode(start, parse(inParen, 1)));
      break;
    case (wchar_t)'\\':
      if      (curInd < (int)pattern.size())
      {
        bool eoi = 0;
        switch (pattern[curInd])
        {
        case (wchar_t)'1':
        case (wchar_t)'2':
        case (wchar_t)'3':
        case (wchar_t)'4':
        case (wchar_t)'5':
        case (wchar_t)'6':
        case (wchar_t)'7':
        case (wchar_t)'8':
        case (wchar_t)'9': next = parseBackref(); break;
        case (wchar_t)'A': ++curInd; next = registerNode(new NFAStartOfInputUNode);     break;
        case (wchar_t)'B': ++curInd; next = registerNode(new NFAWordBoundaryUNode(0));  break;
        case (wchar_t)'b': ++curInd; next = registerNode(new NFAWordBoundaryUNode(1));  break;
        case (wchar_t)'G': ++curInd; next = registerNode(new NFAEndOfMatchUNode);       break;
        case (wchar_t)'Z': eoi = 1;
        case (wchar_t)'z': ++curInd; next = registerNode(new NFAEndOfInputUNode(eoi));  break;
        default:
          t = parseEscape(inv, quo);
          //printf("inv quo classes { %c %c %s }\n", inv ? (wchar_t)'t' : (wchar_t)'f', quo ? (wchar_t)'t' : (wchar_t)'f', t.c_str());
          if (!quo)
          {
            if (t.size() > 1 || inv)
            {
              if ((flags & WCPattern::CASE_INSENSITIVE) != 0) next = registerNode(new NFACIClassUNode(t, inv));
              else                                            next = registerNode(new NFAClassUNode(t, inv));
            }
            else
            {
              next = registerNode(new NFACharUNode(t[0]));
            }
          }
          else
          {
            next = parseQuote();
          }
        }
      }
      else raiseError();
      break;
    case (wchar_t)'[':
      if ((flags & WCPattern::CASE_INSENSITIVE) == 0)
      {
        NFAClassUNode * clazz = new NFAClassUNode();
        std::wstring s = parseClass();
        for (int i = 0; i < (int)s.size(); ++i) clazz->vals[s[i]] = 1;
        next = registerNode(clazz);
      }
      else
      {
        NFACIClassUNode * clazz = new NFACIClassUNode();
        std::wstring s = parseClass();
        for (int i = 0; i < (int)s.size(); ++i) clazz->vals[towlower(s[i])] = 1;
        next = registerNode(clazz);
      }
      break;
    case (wchar_t)'.':
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -