📄 pypcre.c

📁 python s60 1.4.5版本的源代码
💻 C
📖 第 1 页 / 共 5 页
字号:
      case OP_CLASS:
      case OP_NEGCLASS:
      case OP_REF:
      case OP_CLASS_L:
	switch(*cc)
	  {
	  case (OP_REF):    cc += 2; break;
	  case (OP_CLASS): case (OP_NEGCLASS): cc += 1+32; break;
	  case (OP_CLASS_L): cc += 1+1+32; break;
	  }

      switch (*cc)
        {
        case OP_CRSTAR:
        case OP_CRMINSTAR:
        case OP_CRQUERY:
        case OP_CRMINQUERY:
        cc++;
        break;

        case OP_CRRANGE:
        case OP_CRMINRANGE:
        if ((cc[1] << 8) + cc[2] != 0) goto NEXT_BRANCH;
        cc += 3;
        break;

        default:
        goto NEXT_BRANCH;
        }
      break;

      /* Anything else matches at least one character */

      default:
      goto NEXT_BRANCH;
      }
    }

  NEXT_BRANCH:
  code += (code[1] << 8) + code[2];
  }
while (*code == OP_ALT);

/* No branches match the empty string */

return FALSE;
}

/* Determine the length of a group ID in an expression like
   (?P<foo_123>...) 
Arguments:
  ptr        pattern position pointer (say that 3 times fast)
  finalchar  the character that will mark the end of the ID
  errorptr   points to the pointer to the error message
*/
  
static int 
get_group_id(const uschar *ptr, char finalchar, const char **errorptr)
{
  const uschar *start = ptr;

  /* If the first character is not in \w, or is in \w but is a digit,
     report an error */
  if (!(pcre_ctypes[*ptr] & ctype_word) ||
      (pcre_ctypes[*ptr++] & ctype_digit))
    {
      *errorptr = "(?P identifier must start with a letter or underscore";
      return 0;
    }

  /* Increment ptr until we either hit a null byte, the desired 
     final character, or a non-word character */
  for(; (*ptr != 0) && (*ptr != finalchar) && 
	(pcre_ctypes[*ptr] & ctype_word); ptr++)
    {
      /* Empty loop body */
    }
  if (*ptr==finalchar)
    return ptr-start;
  if (*ptr==0)
    {
      *errorptr = "unterminated (?P identifier";
      return 0;
    }
  *errorptr = "illegal character in (?P identifier";
  return 0;
}

/*************************************************
*            Handle escapes                      *
*************************************************/

/* This function is called when a \ has been encountered. It either returns a
positive value for a simple escape such as \n, or a negative value which
encodes one of the more complicated things such as \d. On entry, ptr is
pointing at the \. On exit, it is on the final character of the escape
sequence.

Arguments:
  ptrptr     points to the pattern position pointer
  errorptr   points to the pointer to the error message
  bracount   number of previous extracting brackets
  options    the options bits
  isclass    TRUE if inside a character class

Returns:     zero or positive => a data character
             negative => a special escape sequence
             on error, errorptr is set
*/

static int
check_escape(const uschar **ptrptr, const char **errorptr, int bracount, 
	     int options, BOOL isclass)
{
const uschar *ptr = *ptrptr;
int c = *(++ptr) & 255;   /* Ensure > 0 on signed-char systems */
int i;

if (c == 0) *errorptr = ERR1;

/* Digits or letters may have special meaning; all others are literals. */

else if (c < '0' || c > 'z') {}

/* Do an initial lookup in a table. A non-zero result is something that can be
returned immediately. Otherwise further processing may be required. */

else if ((i = escapes[c - '0']) != 0) c = i;

/* Escapes that need further processing, or are illegal. */

else
  {

  switch (c)
    {
    /* The handling of escape sequences consisting of a string of digits
    starting with one that is not zero is not straightforward. By experiment,
    the way Perl works seems to be as follows:

    Outside a character class, the digits are read as a decimal number. If the
    number is less than 10, or if there are that many previous extracting
    left brackets, then it is a back reference. Otherwise, up to three octal
    digits are read to form an escaped byte. Thus \123 is likely to be octal
    123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
    value is greater than 377, the least significant 8 bits are taken. Inside a
    character class, \ followed by a digit is always an octal number. */

    case '1': case '2': case '3': case '4': case '5':
    case '6': case '7': case '8': case '9':

    {
      /* PYTHON: Try to compute an octal value for a character */
      for(c=0, i=0; ptr[i]!=0 && i<3; i++) 
	{
	  if (( pcre_ctypes[ ptr[i] ] & ctype_odigit) != 0)
	    c = (c * 8 + ptr[i]-'0') & 255;
	  else
	    break; /* Non-octal character--break out of the loop */
	}
      /* It's a character if there were exactly 3 octal digits, or if
	 we're inside a character class and there was at least one
	 octal digit. */
      if ( (i == 3) || (isclass && i!=0) )
	{
	  ptr += i-1;
	  break;
	}
      c = ptr[0]; /* Restore the first character after the \ */
      c -= '0'; i = 1;
      while (i<2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0)
	{
	  c = c * 10 + ptr[1] - '0'; 
	  ptr++; i++;
	}
      if (c > 255 - ESC_REF) *errorptr = "back reference too big";
      c = -(ESC_REF + c);
    }
  break;

    /* \0 always starts an octal number, but we may drop through to here with a
    larger first octal digit */

    case '0':
    c -= '0';
    while(i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0 &&
      ptr[1] != '8' && ptr[1] != '9')
        c = (c * 8 + *(++ptr) - '0') & 255;
    break;

    /* Special escapes not starting with a digit are straightforward */

    case 'x':
  c = 0;
  while ( (pcre_ctypes[ptr[1]] & ctype_xdigit) != 0)
    {
    ptr++;
    c = c * 16 + pcre_lcc[*ptr] -
      (((pcre_ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
    c &= 255;
    }
  break;


    /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
    other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
    for Perl compatibility, it is a literal. */

    default:
    if ((options & PCRE_EXTRA) != 0) switch(c)
      {
      case 'X':
      c = -ESC_X;      /* This could be a lookup if it ever got into Perl */
      break;

      default:
      *errorptr = ERR3;
      break;
      }
    break;
    }
  }

*ptrptr = ptr;
return c;
}



/*************************************************
*            Check for counted repeat            *
*************************************************/

/* This function is called when a '{' is encountered in a place where it might
start a quantifier. It looks ahead to see if it really is a quantifier or not.
It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
where the ddds are digits.

Arguments:
  p         pointer to the first char after '{'

Returns:    TRUE or FALSE
*/

static BOOL
is_counted_repeat(const uschar *p)
{
if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
if (*p == '}') return TRUE;

if (*p++ != ',') return FALSE;
if (*p == '}') return TRUE;

if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
return (*p == '}');
}



/*************************************************
*         Read repeat counts                     *
*************************************************/

/* Read an item of the form {n,m} and return the values. This is called only
after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
so the syntax is guaranteed to be correct, but we need to check the values.

Arguments:
  p          pointer to first char after '{'
  minp       pointer to int for min
  maxp       pointer to int for max
             returned as -1 if no max
  errorptr   points to pointer to error message

Returns:     pointer to '}' on success;
             current ptr on error, with errorptr set
*/

static const uschar *
read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)
{
int min = 0;
int max = -1;

while ((pcre_ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';

if (*p == '}') max = min; else
  {
  if (*(++p) != '}')
    {
    max = 0;
    while((pcre_ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
    if (max < min)
      {
      *errorptr = ERR4;
      return p;
      }
    }
  }

/* Do paranoid checks, then fill in the required variables, and pass back the
pointer to the terminating '}'. */

if (min > 65535 || max > 65535)
  *errorptr = ERR5;
else
  {
  *minp = min;
  *maxp = max;
  }
return p;
}



/*************************************************
*           Compile one branch                   *
*************************************************/

/* Scan the pattern, compiling it into the code vector.

Arguments:
  options    the option bits
  bracket    points to number of brackets used
  code       points to the pointer to the current code point
  ptrptr     points to the current pattern pointer
  errorptr   points to pointer to error message

Returns:     TRUE on success
             FALSE, with *errorptr set on error
*/

static BOOL
compile_branch(int options, int *brackets, uschar **codeptr,
	       const uschar **ptrptr, const char **errorptr, PyObject *dictionary)
{
int repeat_type, op_type;
int repeat_min, repeat_max;
int bravalue, length;
int greedy_default, greedy_non_default;
register int c;
register uschar *code = *codeptr;
const uschar *ptr = *ptrptr;
const uschar *oldptr;
uschar *previous = NULL;
uschar class[32];
uschar *class_flag;  /* Pointer to the single-byte flag for OP_CLASS_L */

/* Set up the default and non-default settings for greediness */
 
greedy_default = ((options & PCRE_UNGREEDY) != 0);
greedy_non_default = greedy_default ^ 1;
 
/* Switch on next character until the end of the branch */

for (;; ptr++)
  {
  BOOL negate_class;
  int  class_charcount;
  int  class_lastchar;

  c = *ptr;
  if ((options & PCRE_EXTENDED) != 0)
    {
    if ((pcre_ctypes[c] & ctype_space) != 0) continue;
    if (c == '#')
      {
      while ((c = *(++ptr)) != 0 && c != '\n');
      continue;
      }
    }

  switch(c)
    {
    /* The branch terminates at end of string, |, or ). */

    case 0:
    case '|':
    case ')':
    *codeptr = code;
    *ptrptr = ptr;
    return TRUE;

    /* Handle single-character metacharacters */

    case '^':
    previous = NULL;
    *code++ = OP_CIRC;
    break;

    case '$':
    previous = NULL;
    *code++ = OP_DOLL;
    break;

    case '.':
    previous = code;
    *code++ = OP_ANY;
    break;

    /* Character classes. These always build a 32-byte bitmap of the permitted
    characters, except in the special case where there is only one character.
    For negated classes, we build the map as usual, then invert it at the end.
    */

    case '[':
    previous = code;
    if (options & PCRE_LOCALE) 
      {
	*code++ = OP_CLASS_L;
	/* Set the flag for localized classes (like \w) to 0 */
	class_flag = code;
	*class_flag = 0;
      }
    else
      {
	*code++ = OP_CLASS;
	class_flag = NULL;
      }
    
    /* If the first character is '^', set the negation flag, and use a
    different opcode. This only matters if caseless matching is specified at
    runtime. */

    if ((c = *(++ptr)) == '^')
      {
      negate_class = TRUE;
      if (*(code-1)==OP_CLASS) *(code-1) = OP_NEGCLASS;
      c = *(++ptr);
      }
    else negate_class = FALSE;

    /* Keep a count of chars so that we can optimize the case of just a single
    character. */

    class_charcount = 0;
    class_lastchar = -1;
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -