📄 pypcre.c

📁 python s60 1.4.5版本的源代码
💻 C
📖 第 1 页 / 共 5 页
字号:
上一页 1 2 3 45
        }

      /* If the maximum is unlimited, set a repeater in the final copy. */

      if (repeat_max == -1) code[-3] = OP_KETRMAX + repeat_type;
      }

    /* Else there's some kind of shambles */

    else
      {
      *errorptr = ERR11;
      goto FAILED;
      }

    /* In all case we no longer have a previous item. */

    previous = NULL;
    break;


    /* Start of nested bracket sub-expression, or comment or lookahead.
    First deal with special things that can come after a bracket; all are
    introduced by ?, and the appearance of any of them means that this is not a
    referencing group. They were checked for validity in the first pass over
    the string, so we don't have to check for syntax errors here.  */

    case '(':
    previous = code;              /* Only real brackets can be repeated */
    if (*(++ptr) == '?')
      {
      bravalue = OP_BRA;

      switch (*(++ptr))
        {
        case '#':
        case 'i':
        case 'L':
        case 'm':
        case 's':
        case 'x':
        ptr++;
        while (*ptr != ')') ptr++;
        previous = NULL;
        continue;

        case ':':                 /* Non-extracting bracket */
        ptr++;
        break;

        case '=':                 /* Assertions can't be repeated */
        bravalue = OP_ASSERT;
        ptr++;
        previous = NULL;
        break;

        case '!':
        bravalue = OP_ASSERT_NOT;
        ptr++;
        previous = NULL;
        break;

	case ('P'):
	  ptr++;
	  if (*ptr=='<')
	    {
	      /* (?P<groupname>...) */
	      int idlen;
	      PyObject *string, *intobj;

	      ptr++;
	      idlen = get_group_id(ptr, '>', errorptr);
	      if (*errorptr) {
		goto FAILED;
	      }
	      string = PyString_FromStringAndSize((char*)ptr, idlen);
	      intobj = PyInt_FromLong( brackets[0] + 1 );
	      if (intobj == NULL || string == NULL)
		{
		  Py_XDECREF(string);
		  Py_XDECREF(intobj);
		  *errorptr = "exception raised";
		  goto FAILED;
		}
	      PyDict_SetItem(dictionary, string, intobj);
	      Py_DECREF(string); Py_DECREF(intobj); /* XXX DECREF commented out! */
	      ptr += idlen+1;  /* Point to rest of expression */
	      goto do_grouping_bracket;
	    }
	  if (*ptr=='=')
	    {
	      /* (?P=groupname) */
	      int idlen, refnum;
	      PyObject *string, *intobj;

	      ptr++;
	      idlen = get_group_id(ptr, ')', errorptr);
	      if (*errorptr) {
		goto FAILED;
	      }
	      string = PyString_FromStringAndSize((char *)ptr, idlen);
	      if (string==NULL)	{
		  *errorptr = "exception raised";
		  goto FAILED;
		}
	      intobj = PyDict_GetItem(dictionary, string);
	      if (intobj==NULL) {
		Py_DECREF(string);
		*errorptr = "?P= group identifier isn't defined";
		goto FAILED;
	      }

	      refnum = PyInt_AsLong(intobj);
	      Py_DECREF(string); 
	      /* The caller doesn't own the reference to the value
		 returned from PyDict_GetItem, so intobj is not
		 DECREF'ed. */

	      *code++ = OP_REF;
	      *code++ = refnum;
	      /* The continue will cause the top-level for() loop to
		 be resumed, so ptr will be immediately incremented.
		 Therefore, the following line adds just idlen, not
		 idlen+1 */
	      ptr += idlen;
	      continue;
	    }
	  /* The character after ?P is neither < nor =, so 
	     report an error.  Add more Python-extensions here. */
	  *errorptr="unknown after (?P";
	  goto FAILED;

        case '>':                         /* "Match once" brackets */
        if ((options & PCRE_EXTRA) != 0)  /* Not yet standard */
          {
          bravalue = OP_ONCE;
          ptr++;
          previous = NULL;
          break;
          }
        /* Else fall through */

        default:
        *errorptr = ERR12;
        goto FAILED;
        }
      }

    /* Else we have a referencing group */

    else
      {
      do_grouping_bracket:
      if (++(*brackets) > EXTRACT_MAX)
        {
        *errorptr = ERR13;
        goto FAILED;
        }
      bravalue = OP_BRA + *brackets;
      }

    /* Process nested bracketed re; at end pointer is on the bracket. We copy
    code into a non-register variable in order to be able to pass its address
    because some compilers complain otherwise. */

    *code = bravalue;
      {
      uschar *mcode = code;
      if (!compile_regex(options, brackets, &mcode, &ptr, errorptr, dictionary))
        goto FAILED;
      code = mcode;
      }

    if (*ptr != ')')
      {
      *errorptr = ERR14;
      goto FAILED;
      }
    break;

    /* Check \ for being a real metacharacter; if not, fall through and handle
    it as a data character at the start of a string. Escape items are checked
    for validity in the pre-compiling pass. */

    case '\\':
    oldptr = ptr;
    c = check_escape(&ptr, errorptr, *brackets, options, FALSE);

    /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
    are arranged to be the negation of the corresponding OP_values. For the
    back references, the values are ESC_REF plus the reference number. Only
    back references and those types that consume a character may be repeated.
    We can test for values between ESC_b and ESC_Z for the latter; this may
    have to change if any new ones are ever created. */

    if (c < 0)
      {
      if (-c >= ESC_REF)
        {
        int refnum = -c - ESC_REF;
        if (*brackets < refnum)
          {
          *errorptr = ERR15;
          goto FAILED;
          }
        previous = code;
        *code++ = OP_REF;
        *code++ = refnum;
        }
      else
        {
        previous = (-c > ESC_b && -c < ESC_X)? code : NULL;
        if ( (options & PCRE_LOCALE) != 0)
	  {
	    switch (c)
	      {
		case (-ESC_b): c = -OP_WORD_BOUNDARY_L; break;
		case (-ESC_B): c = -OP_NOT_WORD_BOUNDARY_L; break;
		case (-ESC_w): c = -OP_WORDCHAR_L; break;
		case (-ESC_W): c = -OP_NOT_WORDCHAR_L; break;
	      }
	  }
        *code++ = -c;
        }
      continue;
      }

    /* Data character: Reset and fall through */

    ptr = oldptr;
    c = '\\';

    /* Handle a run of data characters until a metacharacter is encountered.
    The first character is guaranteed not to be whitespace or # when the
    extended flag is set. */

    NORMAL_CHAR:
    default:
    previous = code;
    *code = OP_CHARS;
    code += 2;
    length = 0;

    do
      {
      if ((options & PCRE_EXTENDED) != 0)
        {
        if ((pcre_ctypes[c] & ctype_space) != 0) continue;
        if (c == '#')
          {
          while ((c = *(++ptr)) != 0 && c != '\n');
          if (c == 0) break;
          continue;
          }
        }

      /* Backslash may introduce a data char or a metacharacter. Escaped items
      are checked for validity in the pre-compiling pass. Stop the string
      before a metaitem. */

      if (c == '\\')
        {
        oldptr = ptr;
        c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
        if (c < 0) { ptr = oldptr; break; }
        }

      /* Ordinary character or single-char escape */

      *code++ = c;
      length++;
      }

    /* This "while" is the end of the "do" above. */

    while (length < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);

    /* Compute the length and set it in the data vector, and advance to
    the next state. */

    previous[1] = length;
    if (length < 255) ptr--;
    break;
    }
  }                   /* end of big loop */

/* Control never reaches here by falling through, only by a goto for all the
error states. Pass back the position in the pattern so that it can be displayed
to the user for diagnosing the error. */

FAILED:
*ptrptr = ptr;
return FALSE;
}




/*************************************************
*     Compile sequence of alternatives           *
*************************************************/

/* On entry, ptr is pointing past the bracket character, but on return
it points to the closing bracket, or vertical bar, or end of string.
The code variable is pointing at the byte into which the BRA operator has been
stored.

Argument:
  options   the option bits
  brackets  -> int containing the number of extracting brackets used
  codeptr   -> the address of the current code pointer
  ptrptr    -> the address of the current pattern pointer
  errorptr  -> pointer to error message

Returns:    TRUE on success
*/

static BOOL
compile_regex(int options, int *brackets, uschar **codeptr,
  const uschar **ptrptr, const char **errorptr, PyObject *dictionary)
{
const uschar *ptr = *ptrptr;
uschar *code = *codeptr;
uschar *start_bracket = code;

for (;;)
  {
  int length;
  uschar *last_branch = code;

  code += 3;
  if (!compile_branch(options, brackets, &code, &ptr, errorptr, dictionary))
    {
    *ptrptr = ptr;
    return FALSE;
    }

  /* Fill in the length of the last branch */

  length = code - last_branch;
  last_branch[1] = length >> 8;
  last_branch[2] = length & 255;

  /* Reached end of expression, either ')' or end of pattern. Insert a
  terminating ket and the length of the whole bracketed item, and return,
  leaving the pointer at the terminating char. */

  if (*ptr != '|')
    {
    length = code - start_bracket;
    *code++ = OP_KET;
    *code++ = length >> 8;
    *code++ = length & 255;
    *codeptr = code;
    *ptrptr = ptr;
    return TRUE;
    }

  /* Another branch follows; insert an "or" node and advance the pointer. */

  *code = OP_ALT;
  ptr++;
  }
/* Control never reaches here */
}



/*************************************************
*          Check for anchored expression         *
*************************************************/

/* Try to find out if this is an anchored regular expression. Consider each
alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
it's anchored. However, if this is a multiline pattern, then only OP_SOD
counts, since OP_CIRC can match in the middle.

A branch is also implicitly anchored if it starts with .* because that will try
the rest of the pattern at all possible matching points, so there is no point
trying them again.

Argument:  points to start of expression (the bracket)
Returns:   TRUE or FALSE
*/

static BOOL
is_anchored(register const uschar *code, BOOL multiline)
{
do {
   int op = (int)code[3];
   if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE)
     { if (!is_anchored(code+3, multiline)) return FALSE; }
   else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
     { if (code[4] != OP_ANY) return FALSE; }
   else if (op != OP_SOD && (multiline || op != OP_CIRC)) return FALSE;
   code += (code[1] << 8) + code[2];
   }
while (*code == OP_ALT);
return TRUE;
}



/*************************************************
*     Check for start with \n line expression    *
*************************************************/

/* This is called for multiline expressions to try to find out if every branch
starts with ^ so that "first char" processing can be done to speed things up.

Argument:  points to start of expression (the bracket)
Returns:   TRUE or FALSE
*/

static BOOL
is_startline(const uschar *code)
{
do {
   if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
     { if (!is_startline(code+3)) return FALSE; }
   else if (code[3] != OP_CIRC) return FALSE;
   code += (code[1] << 8) + code[2];
   }
while (*code == OP_ALT);
return TRUE;
}



/*************************************************
*          Check for fixed first char            *
*************************************************/

/* Try to find out if there is a fixed first character. This is called for
unanchored expressions, as it speeds up their processing quite considerably.
Consider each alternative branch. If they all start with the same char, or with
a bracket all of whose alternatives start with the same char (recurse ad lib),
then
上一页 1 2 3 45
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -