📄 pcre_compile.c

📁 Scheme跨平台编译器
💻 C
📖 第 1 页 / 共 5 页
字号:
上一页 1 2 3 4 5
    case OP_TYPEMINUPTO:    case OP_TYPEPOSUPTO:    if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;    break;    /* End of branch */    case OP_KET:    case OP_KETRMAX:    case OP_KETRMIN:    case OP_ALT:    return TRUE;    /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,    MINUPTO, and POSUPTO may be followed by a multibyte character */#ifdef SUPPORT_UTF8    case OP_STAR:    case OP_MINSTAR:    case OP_POSSTAR:    case OP_QUERY:    case OP_MINQUERY:    case OP_POSQUERY:    case OP_UPTO:    case OP_MINUPTO:    case OP_POSUPTO:    if (utf8) while ((code[2] & 0xc0) == 0x80) code++;    break;#endif    }  }return TRUE;}/**************************************************    Scan compiled regex for non-emptiness       **************************************************//* This function is called to check for left recursive calls. We want to checkthe current branch of the current pattern to see if it could match the emptystring. If it could, we must look outwards for branches at other levels,stopping when we pass beyond the bracket which is the subject of the recursion.Arguments:  code        points to start of the recursion  endcode     points to where to stop (current RECURSE item)  bcptr       points to the chain of current (unclosed) branch starts  utf8        TRUE if in UTF-8 modeReturns:      TRUE if what is matched could be empty*/static BOOLcould_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,  BOOL utf8){while (bcptr != NULL && bcptr->current >= code)  {  if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;  bcptr = bcptr->outer;  }return TRUE;}/**************************************************           Check for POSIX class syntax         **************************************************//* This function is called when the sequence "[:" or "[." or "[=" isencountered in a character class. 
It checks whether this is followed by asequence of characters terminated by a matching ":]" or ".]" or "=]". If wereach an unescaped ']' without the special preceding character, return FALSE.Originally, this function only recognized a sequence of letters between theterminators, but it seems that Perl recognizes any sequence of characters,though of course unknown POSIX names are subsequently rejected. Perl gives an"Unknown POSIX class" error for [:f\oo:] for example, where previously PCREdidn't consider this to be a POSIX class. Likewise for [:1234:].The problem in trying to be exactly like Perl is in the handling of escapes. Wehave to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIXclass, but [abc[:x\]pqr:]] is (so that an error can be generated). The codebelow handles the special case of \], but does not try to do any other escapeprocessing. This makes it different from Perl for cases such as [:l\ower:]where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize"l\ower". This is a lesser evil that not diagnosing bad classes when Perl does,I think.Arguments:  ptr      pointer to the initial [  endptr   where to return the end pointerReturns:   TRUE or FALSE*/static BOOLcheck_posix_syntax(const uschar *ptr, const uschar **endptr){int terminator;          /* Don't combine these lines; the Solaris cc */terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. 
*/for (++ptr; *ptr != 0; ptr++)  {  if (*ptr == '\\' && ptr[1] == ']') ptr++; else    {    if (*ptr == ']') return FALSE;    if (*ptr == terminator && ptr[1] == ']')      {      *endptr = ptr;      return TRUE;      }    }  }return FALSE;}/**************************************************          Check POSIX class name                **************************************************//* This function is called to check the name given in a POSIX-style class entrysuch as [:alnum:].Arguments:  ptr        points to the first letter  len        the length of the nameReturns:     a value representing the name, or -1 if unknown*/static intcheck_posix_name(const uschar *ptr, int len){const char *pn = posix_names;register int yield = 0;while (posix_name_lengths[yield] != 0)  {  if (len == posix_name_lengths[yield] &&    strncmp((const char *)ptr, pn, len) == 0) return yield;  pn += posix_name_lengths[yield] + 1;  yield++;  }return -1;}/**************************************************    Adjust OP_RECURSE items in repeated group   **************************************************//* OP_RECURSE items contain an offset from the start of the regex to the groupthat is referenced. This means that groups can be replicated for fixedrepetition simply by copying (because the recursion is allowed to refer toearlier groups that are outside the current group). However, when a group isoptional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO isinserted before it, after it has been compiled. This means that any OP_RECURSEitems within it that refer to the group itself or any contained groups have tohave their offsets adjusted. That one of the jobs of this function. Before itis called, the partially compiled regex must be temporarily terminated withOP_END.This function has been extended with the possibility of forward references forrecursions and subroutine calls. It must also check the list of such referencesfor the group we are dealing with. 
If it finds that one of the recursions inthe current group is on this list, it adjusts the offset in the list, not thevalue in the reference (which is a group number).Arguments:  group      points to the start of the group  adjust     the amount by which the group is to be moved  utf8       TRUE in UTF-8 mode  cd         contains pointers to tables etc.  save_hwm   the hwm forward reference pointer at the start of the groupReturns:     nothing*/static voidadjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,  uschar *save_hwm){uschar *ptr = group;while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)  {  int offset;  uschar *hc;  /* See if this recursion is on the forward reference list. If so, adjust the  reference. */  for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)    {    offset = GET(hc, 0);    if (cd->start_code + offset == ptr + 1)      {      PUT(hc, 0, offset + adjust);      break;      }    }  /* Otherwise, adjust the recursion offset if it's after the start of this  group. 
*/  if (hc >= cd->hwm)    {    offset = GET(ptr, 1);    if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);    }  ptr += 1 + LINK_SIZE;  }}/**************************************************        Insert an automatic callout point       **************************************************//* This function is called when the PCRE_AUTO_CALLOUT option is set, to insertcallout points before each pattern item.Arguments:  code           current code pointer  ptr            current pattern pointer  cd             pointers to tables etcReturns:         new code pointer*/static uschar *auto_callout(uschar *code, const uschar *ptr, compile_data *cd){*code++ = OP_CALLOUT;*code++ = 255;PUT(code, 0, ptr - cd->start_pattern);  /* Pattern offset */PUT(code, LINK_SIZE, 0);                /* Default length */return code + 2*LINK_SIZE;}/**************************************************         Complete a callout item                **************************************************//* A callout item contains the length of the next item in the pattern, whichwe can't fill in till after we have reached the relevant point. This is usedfor both automatic and manual callouts.Arguments:  previous_callout   points to previous callout item  ptr                current pattern pointer  cd                 pointers to tables etcReturns:             nothing*/static voidcomplete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd){int length = ptr - cd->start_pattern - GET(previous_callout, 2);PUT(previous_callout, 2 + LINK_SIZE, length);}#ifdef SUPPORT_UCP/**************************************************           Get othercase range                  **************************************************//* This function is passed the start and end of a class range, in UTF-8 modewith UCP support. It searches up the characters, looking for internal ranges ofcharacters in the "other" case. 
Each call returns the next one, updating thestart address.Arguments:  cptr        points to starting character value; updated  d           end value  ocptr       where to put start of othercase range  odptr       where to put end of othercase rangeYield:        TRUE when range returned; FALSE when no more*/static BOOLget_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,  unsigned int *odptr){unsigned int c, othercase, next;for (c = *cptr; c <= d; c++)  { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }if (c > d) return FALSE;*ocptr = othercase;next = othercase + 1;for (++c; c <= d; c++)  {  if (_pcre_ucp_othercase(c) != next) break;  next++;  }*odptr = next - 1;*cptr = c;return TRUE;}#endif  /* SUPPORT_UCP *//**************************************************     Check if auto-possessifying is possible    **************************************************//* This function is called for unlimited repeats of certain items, to seewhether the next thing could possibly match the repeated item. If not, it makessense to automatically possessify the repeated item.Arguments:  op_code       the repeated op code  this          data for this item, depends on the opcode  utf8          TRUE in UTF-8 mode  utf8_char     used for utf8 character bytes, NULL if not relevant  ptr           next character in pattern  options       options bits  cd            contains pointers to tables etc.Returns:        TRUE if possessifying is wanted*/static BOOLcheck_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,  const uschar *ptr, int options, compile_data *cd){int next;/* Skip whitespace and comments in extended mode */if ((options & PCRE_EXTENDED) != 0)  {  for (;;)    {    while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;    if (*ptr == '#')      {      while (*(++ptr) != 0)        if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }      }    else break;    }  }/* If the next item is one that we can handle, get its value. 
A non-negativevalue is a character, a negative value is an escape value. */if (*ptr == '\\')  {  int temperrorcode = 0;  next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);  if (temperrorcode != 0) return FALSE;  ptr++;    /* Point after the escape sequence */  }else if ((cd->ctypes[*ptr] & ctype_meta) == 0)  {#ifdef SUPPORT_UTF8  if (utf8) { GETCHARINC(next, ptr); } else#endif  next = *ptr++;  }else return FALSE;/* Skip whitespace and comments in extended mode */if ((options & PCRE_EXTENDED) != 0)  {  for (;;)    {    while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;    if (*ptr == '#')      {      while (*(++ptr) != 0)        if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }      }    else break;    }  }/* If the next thing is itself optional, we have to give up. */if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)  return FALSE;/* Now compare the next item with the previous opcode. If the previous is apositive single character match, "item" either contains the character or, if"item" is greater than 127 in utf8 mode, the character's bytes are inutf8_char. *//* Handle cases when the next item is a character. */if (next >= 0) switch(op_code)  {  case OP_CHAR:#ifdef SUPPORT_UTF8  if (utf8 && item > 127) { GETCHAR(item, utf8_char); }#endif  return item != next;  /* For CHARNC (caseless character) we must check the other case. If we have  Unicode property support, we can use it to
上一页 1 2 3 4 5
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -