📄 pcre_compile.cpp
字号:
put2ByteValueAndAdvance(code, repeat_min); /* If the maximum is unlimited, insert an OP_STAR. Before doing so, we have to insert the character for the previous code. For a repeated Unicode property match, there are two extra bytes that define the required property. In UTF-8 mode, long characters have their length in c, with the 0x80 bit as a flag. */ if (repeat_max < 0) { if (c >= 128) { memcpy(code, utf8_char, c & 7); code += c & 7; } else { *code++ = c; if (prop_type >= 0) { *code++ = prop_type; *code++ = prop_value; } } *code++ = OP_STAR + repeat_type; } /* Else insert an UPTO if the max is greater than the min, again preceded by the character, for the previously inserted code. */ else if (repeat_max != repeat_min) { if (c >= 128) { memcpy(code, utf8_char, c & 7); code += c & 7; } else *code++ = c; if (prop_type >= 0) { *code++ = prop_type; *code++ = prop_value; } repeat_max -= repeat_min; *code++ = OP_UPTO + repeat_type; put2ByteValueAndAdvance(code, repeat_max); } } /* The character or character type itself comes last in all cases. */ if (c >= 128) { memcpy(code, utf8_char, c & 7); code += c & 7; } else *code++ = c; /* For a repeated Unicode property match, there are two extra bytes that define the required property. */ if (prop_type >= 0) { *code++ = prop_type; *code++ = prop_value; } } /* If previous was a character class or a back reference, we put the repeat stuff after it, but just skip the item if the repeat was {0,0}. */ else if (*previous == OP_CLASS || *previous == OP_NCLASS || *previous == OP_XCLASS || *previous == OP_REF) { if (repeat_max == 0) { code = previous; goto END_REPEAT; } if (repeat_min == 0 && repeat_max == -1) *code++ = OP_CRSTAR + repeat_type; else if (repeat_min == 1 && repeat_max == -1) *code++ = OP_CRPLUS + repeat_type; else if (repeat_min == 0 && repeat_max == 1) *code++ = OP_CRQUERY + repeat_type; else { *code++ = OP_CRRANGE + repeat_type; put2ByteValueAndAdvance(code, repeat_min); if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */ put2ByteValueAndAdvance(code, repeat_max); } } /* If previous was a bracket group, we may have to replicate it in certain cases. */ else if (*previous >= OP_BRA) { int ketoffset = 0; int len = code - previous; unsigned char* bralink = NULL; /* If the maximum repeat count is unlimited, find the end of the bracket by scanning through from the start, and compute the offset back to it from the current code pointer. There may be an OP_OPT setting following the final KET, so we can't find the end just by going back from the code pointer. */ if (repeat_max == -1) { const unsigned char* ket = previous; advanceToEndOfBracket(ket); ketoffset = code - ket; } /* The case of a zero minimum is special because of the need to stick OP_BRAZERO in front of it, and because the group appears once in the data, whereas in other cases it appears the minimum number of times. For this reason, it is simplest to treat this case separately, as otherwise the code gets far too messy. There are several special subcases when the minimum is zero. */ if (repeat_min == 0) { /* If the maximum is also zero, we just omit the group from the output altogether. */ if (repeat_max == 0) { code = previous; goto END_REPEAT; } /* If the maximum is 1 or unlimited, we just have to stick in the BRAZERO and do no more at this point. However, we do need to adjust any OP_RECURSE calls inside the group that refer to the group itself or any internal group, because the offset is from the start of the whole regex. Temporarily terminate the pattern while doing this. */ if (repeat_max <= 1) { *code = OP_END; memmove(previous+1, previous, len); code++; *previous++ = OP_BRAZERO + repeat_type; } /* If the maximum is greater than 1 and limited, we have to replicate in a nested fashion, sticking OP_BRAZERO before each set of brackets. The first one has to be handled carefully because it's the original copy, which has to be moved up. The remainder can be handled by code that is common with the non-zero minimum case below. We have to adjust the value of repeat_max, since one less copy is required. */ else { *code = OP_END; memmove(previous + 2 + LINK_SIZE, previous, len); code += 2 + LINK_SIZE; *previous++ = OP_BRAZERO + repeat_type; *previous++ = OP_BRA; /* We chain together the bracket offset fields that have to be filled in later when the ends of the brackets are reached. */ int offset = (!bralink) ? 0 : previous - bralink; bralink = previous; putLinkValueAllowZeroAndAdvance(previous, offset); } repeat_max--; } /* If the minimum is greater than zero, replicate the group as many times as necessary, and adjust the maximum to the number of subsequent copies that we need. If we set a first char from the group, and didn't set a required char, copy the latter from the former. */ else { if (repeat_min > 1) { if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte; for (int i = 1; i < repeat_min; i++) { memcpy(code, previous, len); code += len; } } if (repeat_max > 0) repeat_max -= repeat_min; } /* This code is common to both the zero and non-zero minimum cases. If the maximum is limited, it replicates the group in a nested fashion, remembering the bracket starts on a stack. In the case of a zero minimum, the first one was set up above. In all cases the repeat_max now specifies the number of additional copies needed. */ if (repeat_max >= 0) { for (int i = repeat_max - 1; i >= 0; i--) { *code++ = OP_BRAZERO + repeat_type; /* All but the final copy start a new nesting, maintaining the chain of brackets outstanding. */ if (i != 0) { *code++ = OP_BRA; int offset = (!bralink) ? 0 : code - bralink; bralink = code; putLinkValueAllowZeroAndAdvance(code, offset); } memcpy(code, previous, len); code += len; } /* Now chain through the pending brackets, and fill in their length fields (which are holding the chain links pro tem). */ while (bralink) { int offset = code - bralink + 1; unsigned char* bra = code - offset; int oldlinkoffset = getLinkValueAllowZero(bra + 1); bralink = (!oldlinkoffset) ? 0 : bralink - oldlinkoffset; *code++ = OP_KET; putLinkValueAndAdvance(code, offset); putLinkValue(bra + 1, offset); } } /* If the maximum is unlimited, set a repeater in the final copy. We can't just offset backwards from the current code point, because we don't know if there's been an options resetting after the ket. The correct offset was computed above. */ else code[-ketoffset] = OP_KETRMAX + repeat_type; } /* Else there's some kind of shambles */ else { *errorcodeptr = ERR11; goto FAILED; } /* In all case we no longer have a previous item. We also set the "follows varying string" flag for subsequently encountered reqbytes if it isn't already set and we have just passed a varying length item. */ END_REPEAT: previous = NULL; cd.req_varyopt |= reqvary; break; /* Start of nested bracket sub-expression, or comment or lookahead or lookbehind or option setting or condition. First deal with special things that can come after a bracket; all are introduced by ?, and the appearance of any of them means that this is not a referencing group. They were checked for validity in the first pass over the string, so we don't have to check for syntax errors here. */ case '(': skipbytes = 0; if (*(++ptr) == '?') { switch (*(++ptr)) { case ':': /* Non-extracting bracket */ bravalue = OP_BRA; ptr++; break; case '=': /* Positive lookahead */ bravalue = OP_ASSERT; ptr++; break; case '!': /* Negative lookahead */ bravalue = OP_ASSERT_NOT; ptr++; break; /* Character after (? not specially recognized */ default: *errorcodeptr = ERR12; goto FAILED; } } /* Else we have a referencing group; adjust the opcode. If the bracket number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and arrange for the true number to follow later, in an OP_BRANUMBER item. */ else { if (++(*brackets) > EXTRACT_BASIC_MAX) { bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1; code[1 + LINK_SIZE] = OP_BRANUMBER; put2ByteValue(code + 2 + LINK_SIZE, *brackets); skipbytes = 3; } else bravalue = OP_BRA + *brackets; } /* Process nested bracketed re. Assertions may not be repeated, but other kinds can be. We copy code into a non-variable in order to be able to pass its address because some compilers complain otherwise. Pass in a
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -