首页 › 资源下载 › 生物技术 › ncbi源码 › 源码查看
pcre.c

来自「ncbi源码」· C语言代码 · 共 2,222 行 · 第 1/5 页
2,222 行
      if ((cc[1] << 8) + cc[2] != (cc[3] << 8) + cc[4]) return -1;
      branchlength += (cc[1] << 8) + cc[2];
      cc += 5;
      break;

      default:
      branchlength++;
      }
    break;

    /* Anything else is variable length */

    default:
    return -1;
    }
  }
/* Control never gets here */
}



/*************************************************
*           Check for POSIX class syntax         *
*************************************************/

/* This function is called when the sequence "[:" or "[." or "[=" is
encountered in a character class. It checks whether this is followed by an
optional ^ and then a sequence of letters, terminated by a matching ":]" or
".]" or "=]".

On success, *endptr is set to point at the terminator character (the ':',
'.' or '=' that immediately precedes the closing ']'). On failure *endptr is
left untouched.

Argument:
  ptr      pointer to the initial [
  endptr   where to return the end pointer
  cd       pointer to compile data (its ctypes table is used to recognize
             letters)

Returns:   TRUE or FALSE
*/

static BOOL
check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
{
int terminator;          /* Don't combine these lines; the Solaris cc */
terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */

/* Step past the terminator character and an optional '^' that follows it. */

if (*(++ptr) == '^') ptr++;

/* Skip the run of letters that forms the candidate class name. */

while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;

/* The name must be closed by the same character that opened it, followed
immediately by ']'. */

if (*ptr == terminator && ptr[1] == ']')
  {
  *endptr = ptr;
  return TRUE;
  }
return FALSE;
}



/*************************************************
*          Check POSIX class name                *
*************************************************/

/* This function is called to check the name given in a POSIX-style class entry
such as [:alnum:]. The name is looked up by walking the parallel tables
posix_name_lengths[] and posix_names[]; a zero entry in posix_name_lengths[]
marks the end of the tables.

Arguments:
  ptr        points to the first letter
  len        the length of the name

Returns:     a value representing the name (the index of the matching table
               entry), or -1 if unknown
*/

static int
check_posix_name(const uschar *ptr, int len)
{
register int yield = 0;
while (posix_name_lengths[yield] != 0)
  {
  /* Compare lengths first so that strncmp() is only called for candidates of
  exactly the right length; this also stops a name matching as a prefix of a
  longer table entry. */

  if (len == posix_name_lengths[yield] &&
    strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
  yield++;
  }
return -1;
}



/*************************************************
*           Compile one branch                   *
*************************************************/

/* Scan the pattern, compiling it into the code vector.

Arguments:
  options      the option bits
  brackets     points to number of extracting brackets used
  code         points to the pointer to the current code point
  ptrptr       points to the current pattern pointer
  errorptr     points to pointer to error message
  optchanged   set to the value of the last OP_OPT item compiled
  reqchar      set to the last literal character required, else -1
  countlits    set to count of mandatory literal characters
  cd           contains pointers to tables

Returns:       TRUE on success
               FALSE, with *errorptr set on error
*/

static BOOL
compile_branch(int options, int *brackets, uschar **codeptr,
  const uschar **ptrptr, const char **errorptr, int *optchanged,
  int *reqchar, int *countlits, compile_data *cd)
{
int repeat_type, op_type;
int repeat_min, repeat_max;
int bravalue, length;
int greedy_default, greedy_non_default;
int prevreqchar;
int condcount = 0;
int subcountlits = 0;
register int c;
register uschar *code = 
*codeptr;uschar *tempcode;const uschar *ptr = *ptrptr;const uschar *tempptr;uschar *previous = NULL;uschar class[32];/* Set up the default and non-default settings for greediness */greedy_default = ((options & PCRE_UNGREEDY) != 0);greedy_non_default = greedy_default ^ 1;/* Initialize no required char, and count of literals */*reqchar = prevreqchar = -1;*countlits = 0;/* Switch on next character until the end of the branch */for (;; ptr++)  {  BOOL negate_class;  int class_charcount;  int class_lastchar;  int newoptions;  int skipbytes;  int subreqchar;  c = *ptr;  if ((options & PCRE_EXTENDED) != 0)    {    if ((cd->ctypes[c] & ctype_space) != 0) continue;    if (c == '#')      {      /* The space before the ; is to avoid a warning on a silly compiler      on the Macintosh. */      while ((c = *(++ptr)) != 0 && c != NEWLINE) ;      continue;      }    }  switch(c)    {    /* The branch terminates at end of string, |, or ). */    case 0:    case '|':    case ')':    *codeptr = code;    *ptrptr = ptr;    return TRUE;    /* Handle single-character metacharacters */    case '^':    previous = NULL;    *code++ = OP_CIRC;    break;    case '$':    previous = NULL;    *code++ = OP_DOLL;    break;    case '.':    previous = code;    *code++ = OP_ANY;    break;    /* Character classes. These always build a 32-byte bitmap of the permitted    characters, except in the special case where there is only one character.    For negated classes, we build the map as usual, then invert it at the end.    */    case '[':    previous = code;    *code++ = OP_CLASS;    /* If the first character is '^', set the negation flag and skip it. */    if ((c = *(++ptr)) == '^')      {      negate_class = TRUE;      c = *(++ptr);      }    else negate_class = FALSE;    /* Keep a count of chars so that we can optimize the case of just a single    character. */    class_charcount = 0;    class_lastchar = -1;    /* Initialize the 32-char bit map to all zeros. 
We have to build the    map in a temporary bit of store, in case the class contains only 1    character, because in that case the compiled code doesn't use the    bit map. */    memset(class, 0, 32 * sizeof(uschar));    /* Process characters until ] is reached. By writing this as a "do" it    means that an initial ] is taken as a data character. */    do      {      if (c == 0)        {        *errorptr = ERR6;        goto FAILED;        }      /* Handle POSIX class names. Perl allows a negation extension of the      form [:^name]. A square bracket that doesn't match the syntax is      treated as a literal. We also recognize the POSIX constructions      [.ch.] and [=ch=] ("collating elements") and fault them, as Perl      5.6 does. */      if (c == '[' &&          (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&          check_posix_syntax(ptr, &tempptr, cd))        {        BOOL local_negate = FALSE;        int posix_class, i;        register const uschar *cbits = cd->cbits;        if (ptr[1] != ':')          {          *errorptr = ERR31;          goto FAILED;          }        ptr += 2;        if (*ptr == '^')          {          local_negate = TRUE;          ptr++;          }        posix_class = check_posix_name(ptr, tempptr - ptr);        if (posix_class < 0)          {          *errorptr = ERR30;          goto FAILED;          }        /* If matching is caseless, upper and lower are converted to        alpha. This relies on the fact that the class table starts with        alpha, lower, upper as the first 3 entries. */        if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)          posix_class = 0;        /* Or into the map we are building up to 3 of the static class        tables, or their negations. 
*/        posix_class *= 3;        for (i = 0; i < 3; i++)          {          int taboffset = posix_class_maps[posix_class + i];          if (taboffset < 0) break;          if (local_negate)            for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset];          else            for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset];          }        ptr = tempptr + 1;        class_charcount = 10;  /* Set > 1; assumes more than 1 per class */        continue;        }      /* Backslash may introduce a single character, or it may introduce one      of the specials, which just set a flag. Escaped items are checked for      validity in the pre-compiling pass. The sequence \b is a special case.      Inside a class (and only there) it is treated as backspace. Elsewhere      it marks a word boundary. Other escapes have preset maps ready to      or into the one we are building. We assume they have more than one      character in them, so set class_count bigger than one. */      if (c == '\\')        {        c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);        if (-c == ESC_b) c = '\b';        else if (c < 0)          {          register const uschar *cbits = cd->cbits;          class_charcount = 10;          switch (-c)            {            case ESC_d:            for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];            continue;            case ESC_D:            for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];            continue;            case ESC_w:            for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_word];            continue;            case ESC_W:            for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_word];            continue;            case ESC_s:            for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];            continue;            case ESC_S:            for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];            continue;            default:            *errorptr = 
ERR7;            goto FAILED;            }          }        /* Fall through if single character, but don't at present allow        chars > 255 in UTF-8 mode. */#ifdef SUPPORT_UTF8        if (c > 255)          {          *errorptr = ERR33;          goto FAILED;          }#endif        }      /* A single character may be followed by '-' to form a range. However,      Perl does not permit ']' to be the end of the range. A '-' character      here is treated as a literal. */      if (ptr[1] == '-' && ptr[2] != ']')        {        int d;        ptr += 2;        d = *ptr;        if (d == 0)          {          *errorptr = ERR6;          goto FAILED;          }        /* The second part of a range can be a single-character escape, but        not any of the other escapes. Perl 5.6 treats a hyphen as a literal        in such circumstances. */        if (d == '\\')          {          const uschar *oldptr = ptr;          d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);#ifdef SUPPORT_UTF8          if (d > 255)            {            *errorptr = ERR33;            goto FAILED;            }#endif          /* \b is backslash; any other special means the '-' was literal */          if (d < 0)            {            if (d == -ESC_b) d = '\b'; else              {              ptr = oldptr - 2;              goto SINGLE_CHARACTER;  /* A few lines below */              }            }          }        if (d < c)          {          *errorptr = ERR8;          goto FAILED;          }        for (; c <= d; c++)          {          class[c/8] |= (1 << (c&7));          if ((options & PCRE_CASELESS) != 0)            {            int uc = cd->fcc[c];           /* flip case */            class[uc/8] |= (1 << (uc&7));            }          class_charcount++;                /* in case a one-char range */          class_lastchar = c;          }        continue;   /* Go get the next char in the class */        }      /* Handle a lone single character - we can get here for a 
normal      non-escape char, or after \ that introduces a single character. */      SINGLE_CHARACTER:      class [c/8] |= (1 << (c&7));      if ((options & PCRE_CASELESS) != 0)        {        c = cd->fcc[c];   /* flip case */        class[c/8] |= (1 << (c&7));        }      class_charcount++;      class_lastchar = c;      }    /* Loop until ']' reached; the check for end of string happens inside the    loop. This "while" is the end of the "do" above. */    while ((c = *(++ptr)) != ']');    /* If class_charcount is 1 and class_lastchar is not negative, we saw    precisely one character. This doesn't need the whole 32-byte bit map.    We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if    it's negative. */    if (class_charcount == 1 && class_lastchar >= 0)      {      if (negate_class)        {
pcre.c - 源码说明

本页面展示了「ncbi源码」中的 pcre.c 源码文件，采用 C语言编程语言编写，共 2,222 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与ncbi相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?