📄 pcre_compile.c
字号:
Returns: pointer to the opcode for the bracket, or NULL if not found*/static const uschar *find_bracket(const uschar *code, BOOL utf8, int number){for (;;) { register int c = *code; if (c == OP_END) return NULL; /* XCLASS is used for classes that cannot be represented just by a bit map. This includes negated single high-valued characters. The length in the table is zero; the actual length is stored in the compiled code. */ if (c == OP_XCLASS) code += GET(code, 1); /* Handle capturing bracket */ else if (c == OP_CBRA) { int n = GET2(code, 1+LINK_SIZE); if (n == number) return (uschar *)code; code += _pcre_OP_lengths[c]; } /* Otherwise, we can get the item's length from the table, except that for repeated character types, we have to test for \p and \P, which have an extra two bytes of parameters. */ else { switch(c) { case OP_TYPESTAR: case OP_TYPEMINSTAR: case OP_TYPEPLUS: case OP_TYPEMINPLUS: case OP_TYPEQUERY: case OP_TYPEMINQUERY: case OP_TYPEPOSSTAR: case OP_TYPEPOSPLUS: case OP_TYPEPOSQUERY: if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; break; case OP_TYPEUPTO: case OP_TYPEMINUPTO: case OP_TYPEEXACT: case OP_TYPEPOSUPTO: if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2; break; } /* Add in the fixed length from the table */ code += _pcre_OP_lengths[c]; /* In UTF-8 mode, opcodes that are followed by a character may be followed by a multi-byte character. The length in the table is a minimum, so we have to arrange to skip the extra bytes. */#ifdef SUPPORT_UTF8 if (utf8) switch(c) { case OP_CHAR: case OP_CHARNC: case OP_EXACT: case OP_UPTO: case OP_MINUPTO: case OP_POSUPTO: case OP_STAR: case OP_MINSTAR: case OP_POSSTAR: case OP_PLUS: case OP_MINPLUS: case OP_POSPLUS: case OP_QUERY: case OP_MINQUERY: case OP_POSQUERY: if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f]; break; }#endif } }}/************************************************** Scan compiled regex for recursion reference **************************************************//* This little function scans through a compiled pattern until it finds aninstance of OP_RECURSE.Arguments: code points to start of expression utf8 TRUE in UTF-8 modeReturns: pointer to the opcode for OP_RECURSE, or NULL if not found*/static const uschar *find_recurse(const uschar *code, BOOL utf8){for (;;) { register int c = *code; if (c == OP_END) return NULL; if (c == OP_RECURSE) return code; /* XCLASS is used for classes that cannot be represented just by a bit map. This includes negated single high-valued characters. The length in the table is zero; the actual length is stored in the compiled code. */ if (c == OP_XCLASS) code += GET(code, 1); /* Otherwise, we can get the item's length from the table, except that for repeated character types, we have to test for \p and \P, which have an extra two bytes of parameters. */ else { switch(c) { case OP_TYPESTAR: case OP_TYPEMINSTAR: case OP_TYPEPLUS: case OP_TYPEMINPLUS: case OP_TYPEQUERY: case OP_TYPEMINQUERY: case OP_TYPEPOSSTAR: case OP_TYPEPOSPLUS: case OP_TYPEPOSQUERY: if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; break; case OP_TYPEPOSUPTO: case OP_TYPEUPTO: case OP_TYPEMINUPTO: case OP_TYPEEXACT: if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2; break; } /* Add in the fixed length from the table */ code += _pcre_OP_lengths[c]; /* In UTF-8 mode, opcodes that are followed by a character may be followed by a multi-byte character. The length in the table is a minimum, so we have to arrange to skip the extra bytes. */#ifdef SUPPORT_UTF8 if (utf8) switch(c) { case OP_CHAR: case OP_CHARNC: case OP_EXACT: case OP_UPTO: case OP_MINUPTO: case OP_POSUPTO: case OP_STAR: case OP_MINSTAR: case OP_POSSTAR: case OP_PLUS: case OP_MINPLUS: case OP_POSPLUS: case OP_QUERY: case OP_MINQUERY: case OP_POSQUERY: if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f]; break; }#endif } }}/************************************************** Scan compiled branch for non-emptiness **************************************************//* This function scans through a branch of a compiled pattern to see whether itcan match the empty string or not. It is called from could_be_empty()below and from compile_branch() when checking for an unlimited repeat of agroup that can match nothing. Note that first_significant_code() skips overassertions. If we hit an unclosed bracket, we return "empty" - this means we'vestruck an inner bracket whose current branch will already have been scanned.Arguments: code points to start of search endcode points to where to stop utf8 TRUE if in UTF8 modeReturns: TRUE if what is matched could be empty*/static BOOLcould_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8){register int c;for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE); code < endcode; code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE)) { const uschar *ccode; c = *code; /* Groups with zero repeats can of course be empty; skip them. */ if (c == OP_BRAZERO || c == OP_BRAMINZERO) { code += _pcre_OP_lengths[c]; do code += GET(code, 1); while (*code == OP_ALT); c = *code; continue; } /* For other groups, scan the branches. */ if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND) { BOOL empty_branch; if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */ /* Scan a closed bracket */ empty_branch = FALSE; do { if (!empty_branch && could_be_empty_branch(code, endcode, utf8)) empty_branch = TRUE; code += GET(code, 1); } while (*code == OP_ALT); if (!empty_branch) return FALSE; /* All branches are non-empty */ c = *code; continue; } /* Handle the other opcodes */ switch (c) { /* Check for quantifiers after a class. XCLASS is used for classes that cannot be represented just by a bit map. This includes negated single high-valued characters. The length in _pcre_OP_lengths[] is zero; the actual length is stored in the compiled code, so we must update "code" here. */#ifdef SUPPORT_UTF8 case OP_XCLASS: ccode = code += GET(code, 1); goto CHECK_CLASS_REPEAT;#endif case OP_CLASS: case OP_NCLASS: ccode = code + 33;#ifdef SUPPORT_UTF8 CHECK_CLASS_REPEAT:#endif switch (*ccode) { case OP_CRSTAR: /* These could be empty; continue */ case OP_CRMINSTAR: case OP_CRQUERY: case OP_CRMINQUERY: break; default: /* Non-repeat => class must match */ case OP_CRPLUS: /* These repeats aren't empty */ case OP_CRMINPLUS: return FALSE; case OP_CRRANGE: case OP_CRMINRANGE: if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */ break; } break; /* Opcodes that must match a character */ case OP_PROP: case OP_NOTPROP: case OP_EXTUNI: case OP_NOT_DIGIT: case OP_DIGIT: case OP_NOT_WHITESPACE: case OP_WHITESPACE: case OP_NOT_WORDCHAR: case OP_WORDCHAR: case OP_ANY: case OP_ANYBYTE: case OP_CHAR: case OP_CHARNC: case OP_NOT: case OP_PLUS: case OP_MINPLUS: case OP_POSPLUS: case OP_EXACT: case OP_NOTPLUS: case OP_NOTMINPLUS: case OP_NOTPOSPLUS: case OP_NOTEXACT: case OP_TYPEPLUS: case OP_TYPEMINPLUS: case OP_TYPEPOSPLUS: case OP_TYPEEXACT: return FALSE; /* These are going to continue, as they may be empty, but we have to fudge the length for the \p and \P cases. */ case OP_TYPESTAR: case OP_TYPEMINSTAR: case OP_TYPEPOSSTAR: case OP_TYPEQUERY: case OP_TYPEMINQUERY: case OP_TYPEPOSQUERY: if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; break; /* Same for these */ case OP_TYPEUPTO: case OP_TYPEMINUPTO: case OP_TYPEPOSUPTO: if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2; break; /* End of branch */ case OP_KET: case OP_KETRMAX: case OP_KETRMIN: case OP_ALT: return TRUE; /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO, MINUPTO, and POSUPTO may be followed by a multibyte character */#ifdef SUPPORT_UTF8 case OP_STAR: case OP_MINSTAR: case OP_POSSTAR: case OP_QUERY: case OP_MINQUERY: case OP_POSQUERY: case OP_UPTO: case OP_MINUPTO: case OP_POSUPTO: if (utf8) while ((code[2] & 0xc0) == 0x80) code++; break;#endif } }return TRUE;}/************************************************** Scan compiled regex for non-emptiness **************************************************//* This function is called to check for left recursive calls. We want to checkthe current branch of the current pattern to see if it could match the emptystring. If it could, we must look outwards for branches at other levels,stopping when we pass beyond the bracket which is the subject of the recursion.Arguments: code points to start of the recursion endcode points to where to stop (current RECURSE item) bcptr points to the chain of current (unclosed) branch starts utf8 TRUE if in UTF-8 modeReturns: TRUE if what is matched could be empty*/static BOOLcould_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr, BOOL utf8){while (bcptr != NULL && bcptr->current >= code) { if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE; bcptr = bcptr->outer; }return TRUE;}/************************************************** Check for POSIX class syntax **************************************************//* This function is called when the sequence "[:" or "[." or "[=" isencountered in a character class. It checks whether this is followed by anoptional ^ and then a sequence of letters, terminated by a matching ":]" or".]" or "=]".Argument: ptr pointer to the initial [ endptr where to return the end pointer cd pointer to compile dataReturns: TRUE or FALSE*/static BOOLcheck_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd){int terminator; /* Don't combine these lines; the Solaris cc */terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */if (*(++ptr) == '^') ptr++;while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;if (*ptr == terminator && ptr[1] == ']') { *endptr = ptr; return TRUE; }return FALSE;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -