📄 pcre_compile.c
字号:
/************************************************** Check POSIX class name **************************************************//* This function is called to check the name given in a POSIX-style class entrysuch as [:alnum:].Arguments: ptr points to the first letter len the length of the nameReturns: a value representing the name, or -1 if unknown*/static intcheck_posix_name(const uschar *ptr, int len){const char *pn = posix_names;register int yield = 0;while (posix_name_lengths[yield] != 0) { if (len == posix_name_lengths[yield] && strncmp((const char *)ptr, pn, len) == 0) return yield; pn += posix_name_lengths[yield] + 1; yield++; }return -1;}/************************************************** Adjust OP_RECURSE items in repeated group **************************************************//* OP_RECURSE items contain an offset from the start of the regex to the groupthat is referenced. This means that groups can be replicated for fixedrepetition simply by copying (because the recursion is allowed to refer toearlier groups that are outside the current group). However, when a group isoptional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted beforeit, after it has been compiled. This means that any OP_RECURSE items within itthat refer to the group itself or any contained groups have to have theiroffsets adjusted. That one of the jobs of this function. Before it is called,the partially compiled regex must be temporarily terminated with OP_END.This function has been extended with the possibility of forward references forrecursions and subroutine calls. It must also check the list of such referencesfor the group we are dealing with. If it finds that one of the recursions inthe current group is on this list, it adjusts the offset in the list, not thevalue in the reference (which is a group number).Arguments: group points to the start of the group adjust the amount by which the group is to be moved utf8 TRUE in UTF-8 mode cd contains pointers to tables etc. save_hwm the hwm forward reference pointer at the start of the groupReturns: nothing*/static voidadjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd, uschar *save_hwm){uschar *ptr = group;while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL) { int offset; uschar *hc; /* See if this recursion is on the forward reference list. If so, adjust the reference. */ for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE) { offset = GET(hc, 0); if (cd->start_code + offset == ptr + 1) { PUT(hc, 0, offset + adjust); break; } } /* Otherwise, adjust the recursion offset if it's after the start of this group. */ if (hc >= cd->hwm) { offset = GET(ptr, 1); if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust); } ptr += 1 + LINK_SIZE; }}/************************************************** Insert an automatic callout point **************************************************//* This function is called when the PCRE_AUTO_CALLOUT option is set, to insertcallout points before each pattern item.Arguments: code current code pointer ptr current pattern pointer cd pointers to tables etcReturns: new code pointer*/static uschar *auto_callout(uschar *code, const uschar *ptr, compile_data *cd){*code++ = OP_CALLOUT;*code++ = 255;PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */PUT(code, LINK_SIZE, 0); /* Default length */return code + 2*LINK_SIZE;}/************************************************** Complete a callout item **************************************************//* A callout item contains the length of the next item in the pattern, whichwe can't fill in till after we have reached the relevant point. This is usedfor both automatic and manual callouts.Arguments: previous_callout points to previous callout item ptr current pattern pointer cd pointers to tables etcReturns: nothing*/static voidcomplete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd){int length = ptr - cd->start_pattern - GET(previous_callout, 2);PUT(previous_callout, 2 + LINK_SIZE, length);}#ifdef SUPPORT_UCP/************************************************** Get othercase range **************************************************//* This function is passed the start and end of a class range, in UTF-8 modewith UCP support. It searches up the characters, looking for internal ranges ofcharacters in the "other" case. Each call returns the next one, updating thestart address.Arguments: cptr points to starting character value; updated d end value ocptr where to put start of othercase range odptr where to put end of othercase rangeYield: TRUE when range returned; FALSE when no more*/static BOOLget_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr, unsigned int *odptr){unsigned int c, othercase, next;for (c = *cptr; c <= d; c++) { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }if (c > d) return FALSE;*ocptr = othercase;next = othercase + 1;for (++c; c <= d; c++) { if (_pcre_ucp_othercase(c) != next) break; next++; }*odptr = next - 1;*cptr = c;return TRUE;}#endif /* SUPPORT_UCP *//************************************************** Check if auto-possessifying is possible **************************************************//* This function is called for unlimited repeats of certain items, to seewhether the next thing could possibly match the repeated item. If not, it makessense to automatically possessify the repeated item.Arguments: op_code the repeated op code this data for this item, depends on the opcode utf8 TRUE in UTF-8 mode utf8_char used for utf8 character bytes, NULL if not relevant ptr next character in pattern options options bits cd contains pointers to tables etc.Returns: TRUE if possessifying is wanted*/static BOOLcheck_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char, const uschar *ptr, int options, compile_data *cd){int next;/* Skip whitespace and comments in extended mode */if ((options & PCRE_EXTENDED) != 0) { for (;;) { while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++; if (*ptr == '#') { while (*(++ptr) != 0) if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; } } else break; } }/* If the next item is one that we can handle, get its value. A non-negativevalue is a character, a negative value is an escape value. */if (*ptr == '\\') { int temperrorcode = 0; next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE); if (temperrorcode != 0) return FALSE; ptr++; /* Point after the escape sequence */ }else if ((cd->ctypes[*ptr] & ctype_meta) == 0) {#ifdef SUPPORT_UTF8 if (utf8) { GETCHARINC(next, ptr); } else#endif next = *ptr++; }else return FALSE;/* Skip whitespace and comments in extended mode */if ((options & PCRE_EXTENDED) != 0) { for (;;) { while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++; if (*ptr == '#') { while (*(++ptr) != 0) if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; } } else break; } }/* If the next thing is itself optional, we have to give up. */if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0) return FALSE;/* Now compare the next item with the previous opcode. If the previous is apositive single character match, "item" either contains the character or, if"item" is greater than 127 in utf8 mode, the character's bytes are inutf8_char. *//* Handle cases when the next item is a character. */if (next >= 0) switch(op_code) { case OP_CHAR:#ifdef SUPPORT_UTF8 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }#endif return item != next; /* For CHARNC (caseless character) we must check the other case. If we have Unicode property support, we can use it to test the other case of high-valued characters. */ case OP_CHARNC:#ifdef SUPPORT_UTF8 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }#endif if (item == next) return FALSE;#ifdef SUPPORT_UTF8 if (utf8) { unsigned int othercase; if (next < 128) othercase = cd->fcc[next]; else#ifdef SUPPORT_UCP othercase = _pcre_ucp_othercase((unsigned int)next);#else othercase = NOTACHAR;#endif return (unsigned int)item != othercase; } else#endif /* SUPPORT_UTF8 */ return (item != cd->fcc[next]); /* Non-UTF-8 mode */ /* For OP_NOT, "item" must be a single-byte character. */ case OP_NOT: if (next < 0) return FALSE; /* Not a character */ if (item == next) return TRUE; if ((options & PCRE_CASELESS) == 0) return FALSE;#ifdef SUPPORT_UTF8 if (utf8) { unsigned int othercase; if (next < 128) othercase = cd->fcc[next]; else#ifdef SUPPORT_UCP othercase = _pcre_ucp_othercase(next);#else othercase = NOTACHAR;#endif return (unsigned int)item == othercase; } else#endif /* SUPPORT_UTF8 */ return (item == cd->fcc[next]); /* Non-UTF-8 mode */ case OP_DIGIT: return next > 127 || (cd->ctypes[next] & ctype_digit) == 0; case OP_NOT_DIGIT: return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0; case OP_WHITESPACE: return next > 127 || (cd->ctypes[next] & ctype_space) == 0; case OP_NOT_WHITESPACE: return next <= 127 && (cd->ctypes[next] & ctype_space) != 0; case OP_WORDCHAR: return next > 127 || (cd->ctypes[next] & ctype_word) == 0; case OP_NOT_WORDCHAR: return next <= 127 && (cd->ctypes[next] & ctype_word) != 0; case OP_HSPACE: case OP_NOT_HSPACE: switch(next) { case 0x09: case 0x20: case 0xa0: case 0x1680: case 0x180e: case 0x2000: case 0x2001: case 0x2002: case 0x2003: case 0x2004: case 0x2005: case 0x2006: case 0x2007: case 0x2008: case 0x2009: case 0x200A: case 0x202f: case 0x205f: case 0x3000: return op_code != OP_HSPACE; default: return op_code == OP_HSPACE; } case OP_VSPACE: case OP_NOT_VSPACE: switch(next) { case 0x0a: case 0x0b: case 0x0c: case 0x0d: case 0x85: case 0x2028: case 0x2029: return op_code != OP_VSPACE; default: return op_code == OP_VSPACE; } default: return FALSE; }/* Handle the case when the next item is \d, \s, etc. */switch(op_code) { case OP_CHAR: case OP_CHARNC:#ifdef SUPPORT_UTF8 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }#endif switch(-next) { case ESC_d: return item > 127 || (cd->ctypes[item] & ctype_digit) == 0; case ESC_D: return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0; case ESC_s: return item > 127 || (cd->ctypes[item] & ctype_space) == 0; case ESC_S: return item <= 127 && (cd->ctypes[item] & cty
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -