📄 regex_internal.c
字号:
} else memcpy (pstr->mbs + byte_idx, p, mbclen); if (BE (pstr->offsets_needed != 0, 0)) { size_t i; for (i = 0; i < mbclen; ++i) pstr->offsets[byte_idx + i] = src_idx + i; } src_idx += mbclen; pstr->wcs[byte_idx++] = wcu; /* Write paddings. */ for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) pstr->wcs[byte_idx++] = WEOF; } else if (mbclen == (size_t) -1 || mbclen == 0) { /* It is an invalid character or '\0'. Just use the byte. */ int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx]; if (BE (pstr->trans != NULL, 0)) ch = pstr->trans [ch]; pstr->mbs[byte_idx] = ch; if (BE (pstr->offsets_needed != 0, 0)) pstr->offsets[byte_idx] = src_idx; ++src_idx; /* And also cast it to wide char. */ pstr->wcs[byte_idx++] = (wchar_t) ch; if (BE (mbclen == (size_t) -1, 0)) pstr->cur_state = prev_st; } else { /* The buffer doesn't have enough space, finish to build. */ pstr->cur_state = prev_st; break; } } pstr->valid_len = byte_idx; pstr->valid_raw_len = src_idx; return REG_NOERROR;}/* Skip characters until the index becomes greater than NEW_RAW_IDX. Return the index. */static Idxinternal_functionre_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc){ mbstate_t prev_st; Idx rawbuf_idx; size_t mbclen; wint_t wc = WEOF; /* Skip the characters which are not necessary to check. */ for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len; rawbuf_idx < new_raw_idx;) { wchar_t wc2; Idx remain_len; remain_len = pstr->len - rawbuf_idx; prev_st = pstr->cur_state; mbclen = mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx, remain_len, &pstr->cur_state); if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0)) { /* We treat these cases as a single byte character. */ if (mbclen == 0 || remain_len == 0) wc = L'\0'; else wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx); mbclen = 1; pstr->cur_state = prev_st; } else wc = wc2; /* Then proceed the next character. */ rawbuf_idx += mbclen; } *last_wc = wc; return rawbuf_idx;}#endif /* RE_ENABLE_I18N *//* Build the buffer PSTR->MBS, and apply the translation if we need. This function is used in case of REG_ICASE. */static voidinternal_functionbuild_upper_buffer (re_string_t *pstr){ Idx char_idx, end_idx; end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx) { int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx]; if (BE (pstr->trans != NULL, 0)) ch = pstr->trans[ch]; if (islower (ch)) pstr->mbs[char_idx] = toupper (ch); else pstr->mbs[char_idx] = ch; } pstr->valid_len = char_idx; pstr->valid_raw_len = char_idx;}/* Apply TRANS to the buffer in PSTR. */static voidinternal_functionre_string_translate_buffer (re_string_t *pstr){ Idx buf_idx, end_idx; end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx) { int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx]; pstr->mbs[buf_idx] = pstr->trans[ch]; } pstr->valid_len = buf_idx; pstr->valid_raw_len = buf_idx;}/* This function re-construct the buffers. Concretely, convert to wide character in case of pstr->mb_cur_max > 1, convert to upper case in case of REG_ICASE, apply translation. */static reg_errcode_tinternal_functionre_string_reconstruct (re_string_t *pstr, Idx idx, int eflags){ Idx offset; if (BE (pstr->raw_mbs_idx <= idx, 0)) offset = idx - pstr->raw_mbs_idx; else { /* Reset buffer. */#ifdef RE_ENABLE_I18N if (pstr->mb_cur_max > 1) memset (&pstr->cur_state, '\0', sizeof (mbstate_t));#endif /* RE_ENABLE_I18N */ pstr->len = pstr->raw_len; pstr->stop = pstr->raw_stop; pstr->valid_len = 0; pstr->raw_mbs_idx = 0; pstr->valid_raw_len = 0; pstr->offsets_needed = 0; pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF : CONTEXT_NEWLINE | CONTEXT_BEGBUF); if (!pstr->mbs_allocated) pstr->mbs = (unsigned char *) pstr->raw_mbs; offset = idx; } if (BE (offset != 0, 1)) { /* Should the already checked characters be kept? */ if (BE (offset < pstr->valid_raw_len, 1)) { /* Yes, move them to the front of the buffer. */#ifdef RE_ENABLE_I18N if (BE (pstr->offsets_needed, 0)) { Idx low = 0, high = pstr->valid_len, mid; do { mid = (high + low) / 2; if (pstr->offsets[mid] > offset) high = mid; else if (pstr->offsets[mid] < offset) low = mid + 1; else break; } while (low < high); if (pstr->offsets[mid] < offset) ++mid; pstr->tip_context = re_string_context_at (pstr, mid - 1, eflags); /* This can be quite complicated, so handle specially only the common and easy case where the character with different length representation of lower and upper case is present at or after offset. */ if (pstr->valid_len > offset && mid == offset && pstr->offsets[mid] == offset) { memmove (pstr->wcs, pstr->wcs + offset, (pstr->valid_len - offset) * sizeof (wint_t)); memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset); pstr->valid_len -= offset; pstr->valid_raw_len -= offset; for (low = 0; low < pstr->valid_len; low++) pstr->offsets[low] = pstr->offsets[low + offset] - offset; } else { /* Otherwise, just find out how long the partial multibyte character at offset is and fill it with WEOF/255. */ pstr->len = pstr->raw_len - idx + offset; pstr->stop = pstr->raw_stop - idx + offset; pstr->offsets_needed = 0; while (mid > 0 && pstr->offsets[mid - 1] == offset) --mid; while (mid < pstr->valid_len) if (pstr->wcs[mid] != WEOF) break; else ++mid; if (mid == pstr->valid_len) pstr->valid_len = 0; else { pstr->valid_len = pstr->offsets[mid] - offset; if (pstr->valid_len) { for (low = 0; low < pstr->valid_len; ++low) pstr->wcs[low] = WEOF; memset (pstr->mbs, 255, pstr->valid_len); } } pstr->valid_raw_len = pstr->valid_len; } } else#endif { pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);#ifdef RE_ENABLE_I18N if (pstr->mb_cur_max > 1) memmove (pstr->wcs, pstr->wcs + offset, (pstr->valid_len - offset) * sizeof (wint_t));#endif /* RE_ENABLE_I18N */ if (BE (pstr->mbs_allocated, 0)) memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset); pstr->valid_len -= offset; pstr->valid_raw_len -= offset;#if DEBUG assert (pstr->valid_len > 0);#endif } } else { /* No, skip all characters until IDX. */ Idx prev_valid_len = pstr->valid_len;#ifdef RE_ENABLE_I18N if (BE (pstr->offsets_needed, 0)) { pstr->len = pstr->raw_len - idx + offset; pstr->stop = pstr->raw_stop - idx + offset; pstr->offsets_needed = 0; }#endif pstr->valid_len = 0;#ifdef RE_ENABLE_I18N if (pstr->mb_cur_max > 1) { Idx wcs_idx; wint_t wc = WEOF; if (pstr->is_utf8) { const unsigned char *raw, *p, *end; /* Special case UTF-8. Multi-byte chars start with any byte other than 0x80 - 0xbf. */ raw = pstr->raw_mbs + pstr->raw_mbs_idx; end = raw + (offset - pstr->mb_cur_max); if (end < pstr->raw_mbs) end = pstr->raw_mbs; p = raw + offset - 1;#ifdef _LIBC /* We know the wchar_t encoding is UCS4, so for the simple case, ASCII characters, skip the conversion step. */ if (isascii (*p) && BE (pstr->trans == NULL, 1)) { memset (&pstr->cur_state, '\0', sizeof (mbstate_t)); /* pstr->valid_len = 0; */ wc = (wchar_t) *p; } else#endif for (; p >= end; --p) if ((*p & 0xc0) != 0x80) { mbstate_t cur_state; wchar_t wc2; Idx mlen = raw + pstr->len - p; unsigned char buf[6]; size_t mbclen; if (BE (pstr->trans != NULL, 0)) { int i = mlen < 6 ? mlen : 6; while (--i >= 0) buf[i] = pstr->trans[p[i]]; } /* XXX Don't use mbrtowc, we know which conversion to use (UTF-8 -> UCS4). */ memset (&cur_state, 0, sizeof (cur_state)); mbclen = mbrtowc (&wc2, (const char *) p, mlen, &cur_state); if (raw + offset - p <= mbclen && mbclen < (size_t) -2) { memset (&pstr->cur_state, '\0', sizeof (mbstate_t)); pstr->valid_len = mbclen - (raw + offset - p); wc = wc2; } break; } } if (wc == WEOF) pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx; if (wc == WEOF) pstr->tip_context = re_string_context_at (pstr, prev_valid_len - 1, eflags); else pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc)) ? CONTEXT_WORD : ((IS_WIDE_NEWLINE (wc) && pstr->newline_anchor) ? CONTEXT_NEWLINE : 0)); if (BE (pstr->valid_len, 0)) { for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx) pstr->wcs[wcs_idx] = WEOF; if (pstr->mbs_allocated) memset (pstr->mbs, 255, pstr->valid_len); } pstr->valid_raw_len = pstr->valid_len; } else#endif /* RE_ENABLE_I18N */ { int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1]; pstr->valid_raw_len = 0; if (pstr->trans) c = pstr->trans[c]; pstr->tip_context = (bitset_contain (pstr->word_char, c) ? CONTEXT_WORD : ((IS_NEWLINE (c) && pstr->newline_anchor) ? CONTEXT_NEWLINE : 0)); } } if (!BE (pstr->mbs_allocated, 0)) pstr->mbs += offset; } pstr->raw_mbs_idx = idx; pstr->len -= offset; pstr->stop -= offset; /* Then build the buffers. */#ifdef RE_ENABLE_I18N if (pstr->mb_cur_max > 1) { if (pstr->icase) { reg_errcode_t ret = build_wcs_upper_buffer (pstr); if (BE (ret != REG_NOERROR, 0)) return ret; } else build_wcs_buffer (pstr); } else#endif /* RE_ENABLE_I18N */ if (BE (pstr->mbs_allocated, 0)) { if (pstr->icase) build_upper_buffer (pstr); else if (pstr->trans != NULL) re_string_translate_buffer (pstr); } else pstr->valid_len = pstr->len; pstr->cur_idx = 0; return REG_NOERROR;}static unsigned charinternal_function __attribute ((pure))re_string_peek_byte_case (const re_string_t *pstr, Idx idx){ int ch; Idx off; /* Handle the common (easiest) cases first. */ if (BE (!pstr->mbs_allocated, 1)) return re_string_peek_byte (pstr, idx);#ifdef RE_ENABLE_I18N if (pstr->mb_cur_max > 1 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx)) return re_string_peek_byte (pstr, idx);#endif off = pstr->cur_idx + idx;#ifdef RE_ENABLE_I18N if (pstr->offsets_needed) off = pstr->offsets[off];#endif ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];#ifdef RE_ENABLE_I18N /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I this function returns CAPITAL LETTER I instead of first byte of DOTLESS SMALL LETTER I. The latter would confuse the parser, since peek_byte_case doesn't advance cur_idx in any way. */ if (pstr->offsets_needed && !isascii (ch)) return re_string_peek_byte (pstr, idx);#endif return ch;}static unsigned charinternal_function __attribute ((pure))
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -