📄 utf8.c
字号:
/********************************************************************** utf8.c - Oniguruma (regular expression library)**********************************************************************//*- * Copyright (c) 2002-2005 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */#include "regenc.h"#define USE_INVALID_CODE_SCHEME#ifdef USE_INVALID_CODE_SCHEME/* virtual codepoint values for invalid encoding byte 0xfe and 0xff */#define INVALID_CODE_FE 0xfffffffe#define INVALID_CODE_FF 0xffffffff#define VALID_CODE_LIMIT 0x7fffffff#endif#define utf8_islead(c) ((UChar )((c) & 0xc0) != 0x80)static int EncLen_UTF8[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1};static intutf8_mbc_enc_len(const UChar* p){ return EncLen_UTF8[*p];}static OnigCodePointutf8_mbc_to_code(const UChar* p, const UChar* end){ int c, len; OnigCodePoint n; len = enc_len(ONIG_ENCODING_UTF8, p); c = *p++; if (len > 1) { len--; n = c & ((1 << (6 - len)) - 1); while (len--) { c = *p++; n = (n << 6) | (c & ((1 << 6) - 1)); } return n; } else {#ifdef USE_INVALID_CODE_SCHEME if (c > 0xfd) { return ((c == 0xfe) ? INVALID_CODE_FE : INVALID_CODE_FF); }#endif return (OnigCodePoint )c; }}static intutf8_code_to_mbclen(OnigCodePoint code){ if ((code & 0xffffff80) == 0) return 1; else if ((code & 0xfffff800) == 0) { if (code <= 0xff && code >= 0xfe) return 1; return 2; } else if ((code & 0xffff0000) == 0) return 3; else if ((code & 0xffe00000) == 0) return 4; else if ((code & 0xfc000000) == 0) return 5; else if ((code & 0x80000000) == 0) return 6;#ifdef USE_INVALID_CODE_SCHEME else if (code == INVALID_CODE_FE) return 1; else if (code == INVALID_CODE_FF) return 1;#endif else return ONIGENCERR_TOO_BIG_WIDE_CHAR_VALUE;}#if 0static intutf8_code_to_mbc_first(OnigCodePoint code){ if ((code & 0xffffff80) == 0) return code; else { if ((code & 0xfffff800) == 0) return ((code>>6)& 0x1f) | 0xc0; else if ((code & 0xffff0000) == 0) return ((code>>12) & 0x0f) | 0xe0; else if ((code & 0xffe00000) == 0) return ((code>>18) & 0x07) | 0xf0; else if ((code & 0xfc000000) == 0) return ((code>>24) & 0x03) | 0xf8; else if ((code & 0x80000000) == 0) return ((code>>30) & 0x01) | 0xfc; else { return ONIGENCERR_TOO_BIG_WIDE_CHAR_VALUE; } }}#endifstatic intutf8_code_to_mbc(OnigCodePoint code, UChar *buf){#define UTF8_TRAILS(code, shift) (UChar )((((code) >> (shift)) & 0x3f) | 0x80)#define UTF8_TRAIL0(code) (UChar )(((code) & 0x3f) | 0x80) if ((code & 0xffffff80) == 0) { *buf = (UChar )code; return 1; } else { UChar *p = buf; if ((code & 0xfffff800) == 0) { *p++ = (UChar )(((code>>6)& 0x1f) | 0xc0); } else if ((code & 0xffff0000) == 0) { *p++ = (UChar )(((code>>12) & 0x0f) | 0xe0); *p++ = UTF8_TRAILS(code, 6); } else if ((code & 0xffe00000) == 0) { *p++ = (UChar )(((code>>18) & 0x07) | 0xf0); *p++ = UTF8_TRAILS(code, 12); *p++ = UTF8_TRAILS(code, 6); } else if ((code & 0xfc000000) == 0) { *p++ = (UChar )(((code>>24) & 0x03) | 0xf8); *p++ = UTF8_TRAILS(code, 18); *p++ = UTF8_TRAILS(code, 12); *p++ = UTF8_TRAILS(code, 6); } else if ((code & 0x80000000) == 0) { *p++ = (UChar )(((code>>30) & 0x01) | 0xfc); *p++ = UTF8_TRAILS(code, 24); *p++ = UTF8_TRAILS(code, 18); *p++ = UTF8_TRAILS(code, 12); *p++ = UTF8_TRAILS(code, 6); }#ifdef USE_INVALID_CODE_SCHEME else if (code == INVALID_CODE_FE) { *p = 0xfe; return 1; } else if (code == INVALID_CODE_FF) { *p = 0xff; return 1; }#endif else { return ONIGENCERR_TOO_BIG_WIDE_CHAR_VALUE; } *p++ = UTF8_TRAIL0(code); return p - buf; }}static intutf8_mbc_to_normalize(OnigAmbigType flag, const UChar** pp, const UChar* end, UChar* lower){ const UChar* p = *pp; if (ONIGENC_IS_MBC_ASCII(p)) { if (end > p + 1 && (flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0 && ((*p == 's' && *(p+1) == 's') || ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && (*p == 'S' && *(p+1) == 'S')))) { *lower++ = '\303'; *lower = '\237'; (*pp) += 2; return 2; } if ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0) { *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p); } else { *lower = *p; } (*pp)++; return 1; /* return byte length of converted char to lower */ } else { int len; if (*p == 195) { /* 195 == '\303' */ int c = *(p + 1); if (c >= 128) { if (c <= (UChar )'\236' && /* upper */ (flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0) { if (c != (UChar )'\227') { *lower++ = *p; *lower = (UChar )(c + 32); (*pp) += 2; return 2; } }#if 0 else if (c == (UChar )'\237' && (flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0) { *lower++ = '\303'; *lower = '\237'; (*pp) += 2; return 2; }#endif } } len = enc_len(ONIG_ENCODING_UTF8, p); if (lower != p) { int i; for (i = 0; i < len; i++) { *lower++ = *p++; } } (*pp) += len; return len; /* return byte length of converted char to lower */ }}static intutf8_is_mbc_ambiguous(OnigAmbigType flag, const UChar** pp, const UChar* end){ const UChar* p = *pp; if (ONIGENC_IS_MBC_ASCII(p)) { if (end > p + 1 && (flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0 && ((*p == 's' && *(p+1) == 's') || ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && (*p == 'S' && *(p+1) == 'S')))) { (*pp) += 2; return TRUE; } (*pp)++; if ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0) { return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*p); } } else { (*pp) += enc_len(ONIG_ENCODING_UTF8, p); if (*p == 195) { /* 195 == '\303' */ int c = *(p + 1); if (c >= 128) { if ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0) { if (c <= (UChar )'\236') { /* upper */ if (c == (UChar )'\227') return FALSE; return TRUE; } else if (c >= (UChar )'\240' && c <= (UChar )'\276') { /* lower */ if (c == (UChar )'\267') return FALSE; return TRUE; } } else if (c == (UChar )'\237' && (flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0) { return TRUE; } } } } return FALSE;}static OnigCodePoint EmptyRange[] = { 0 };static OnigCodePoint SBAlnum[] = { 3, 0x0030, 0x0039, 0x0041, 0x005a, 0x0061, 0x007a};static OnigCodePoint MBAlnum[] = {#ifdef USE_UNICODE_FULL_RANGE_CTYPE 411,#else 6,#endif 0x00aa, 0x00aa, 0x00b5, 0x00b5, 0x00ba, 0x00ba, 0x00c0, 0x00d6, 0x00d8, 0x00f6, 0x00f8, 0x0236#ifdef USE_UNICODE_FULL_RANGE_CTYPE , 0x0250, 0x02c1, 0x02c6, 0x02d1, 0x02e0, 0x02e4, 0x02ee, 0x02ee, 0x0300, 0x0357, 0x035d, 0x036f, 0x037a, 0x037a, 0x0386, 0x0386, 0x0388, 0x038a, 0x038c, 0x038c, 0x038e, 0x03a1, 0x03a3, 0x03ce, 0x03d0, 0x03f5, 0x03f7, 0x03fb, 0x0400, 0x0481, 0x0483, 0x0486, 0x0488, 0x04ce, 0x04d0, 0x04f5, 0x04f8, 0x04f9, 0x0500, 0x050f, 0x0531, 0x0556, 0x0559, 0x0559, 0x0561, 0x0587, 0x0591, 0x05a1, 0x05a3, 0x05b9, 0x05bb, 0x05bd, 0x05bf, 0x05bf, 0x05c1, 0x05c2, 0x05c4, 0x05c4, 0x05d0, 0x05ea, 0x05f0, 0x05f2, 0x0610, 0x0615, 0x0621, 0x063a, 0x0640, 0x0658, 0x0660, 0x0669, 0x066e, 0x06d3, 0x06d5, 0x06dc, 0x06de, 0x06e8, 0x06ea, 0x06fc, 0x06ff, 0x06ff, 0x0710, 0x074a, 0x074d, 0x074f, 0x0780, 0x07b1, 0x0901, 0x0939, 0x093c, 0x094d, 0x0950, 0x0954, 0x0958, 0x0963, 0x0966, 0x096f, 0x0981, 0x0983, 0x0985, 0x098c, 0x098f, 0x0990, 0x0993, 0x09a8, 0x09aa, 0x09b0, 0x09b2, 0x09b2, 0x09b6, 0x09b9, 0x09bc, 0x09c4, 0x09c7, 0x09c8, 0x09cb, 0x09cd, 0x09d7, 0x09d7, 0x09dc, 0x09dd, 0x09df, 0x09e3, 0x09e6, 0x09f1, 0x0a01, 0x0a03, 0x0a05, 0x0a0a, 0x0a0f, 0x0a10, 0x0a13, 0x0a28, 0x0a2a, 0x0a30, 0x0a32, 0x0a33, 0x0a35, 0x0a36, 0x0a38, 0x0a39, 0x0a3c, 0x0a3c, 0x0a3e, 0x0a42, 0x0a47, 0x0a48, 0x0a4b, 0x0a4d, 0x0a59, 0x0a5c, 0x0a5e, 0x0a5e, 0x0a66, 0x0a74, 0x0a81, 0x0a83, 0x0a85, 0x0a8d, 0x0a8f, 0x0a91, 0x0a93, 0x0aa8, 0x0aaa, 0x0ab0, 0x0ab2, 0x0ab3, 0x0ab5, 0x0ab9, 0x0abc, 0x0ac5, 0x0ac7, 0x0ac9, 0x0acb, 0x0acd, 0x0ad0, 0x0ad0, 0x0ae0, 0x0ae3, 0x0ae6, 0x0aef, 0x0b01, 0x0b03, 0x0b05, 0x0b0c, 0x0b0f, 0x0b10, 0x0b13, 0x0b28, 0x0b2a, 0x0b30, 0x0b32, 0x0b33, 0x0b35, 0x0b39, 0x0b3c, 0x0b43, 0x0b47, 0x0b48, 0x0b4b, 0x0b4d, 0x0b56, 0x0b57, 0x0b5c, 0x0b5d, 0x0b5f, 0x0b61, 0x0b66, 0x0b6f, 0x0b71, 0x0b71, 0x0b82, 0x0b83, 0x0b85, 0x0b8a, 0x0b8e, 0x0b90, 0x0b92, 0x0b95, 0x0b99, 0x0b9a, 0x0b9c, 0x0b9c, 0x0b9e, 0x0b9f, 0x0ba3, 0x0ba4, 0x0ba8, 0x0baa, 0x0bae, 0x0bb5, 0x0bb7, 0x0bb9, 0x0bbe, 0x0bc2, 0x0bc6, 0x0bc8, 0x0bca, 0x0bcd, 0x0bd7, 0x0bd7, 0x0be7, 0x0bef, 0x0c01, 0x0c03, 0x0c05, 0x0c0c, 0x0c0e, 0x0c10, 0x0c12, 0x0c28, 0x0c2a, 0x0c33, 0x0c35, 0x0c39, 0x0c3e, 0x0c44, 0x0c46, 0x0c48, 0x0c4a, 0x0c4d, 0x0c55, 0x0c56, 0x0c60, 0x0c61, 0x0c66, 0x0c6f, 0x0c82, 0x0c83, 0x0c85, 0x0c8c, 0x0c8e, 0x0c90, 0x0c92, 0x0ca8, 0x0caa, 0x0cb3, 0x0cb5, 0x0cb9, 0x0cbc, 0x0cc4, 0x0cc6, 0x0cc8, 0x0cca, 0x0ccd, 0x0cd5, 0x0cd6, 0x0cde, 0x0cde, 0x0ce0, 0x0ce1, 0x0ce6, 0x0cef, 0x0d02, 0x0d03, 0x0d05, 0x0d0c, 0x0d0e, 0x0d10, 0x0d12, 0x0d28, 0x0d2a, 0x0d39, 0x0d3e, 0x0d43, 0x0d46, 0x0d48, 0x0d4a, 0x0d4d, 0x0d57, 0x0d57, 0x0d60, 0x0d61, 0x0d66, 0x0d6f, 0x0d82, 0x0d83, 0x0d85, 0x0d96, 0x0d9a, 0x0db1, 0x0db3, 0x0dbb, 0x0dbd, 0x0dbd, 0x0dc0, 0x0dc6, 0x0dca, 0x0dca, 0x0dcf, 0x0dd4, 0x0dd6, 0x0dd6, 0x0dd8, 0x0ddf, 0x0df2, 0x0df3, 0x0e01, 0x0e3a, 0x0e40, 0x0e4e, 0x0e50, 0x0e59, 0x0e81, 0x0e82, 0x0e84, 0x0e84, 0x0e87, 0x0e88, 0x0e8a, 0x0e8a, 0x0e8d, 0x0e8d, 0x0e94, 0x0e97, 0x0e99, 0x0e9f, 0x0ea1, 0x0ea3, 0x0ea5, 0x0ea5, 0x0ea7, 0x0ea7, 0x0eaa, 0x0eab, 0x0ead, 0x0eb9, 0x0ebb, 0x0ebd, 0x0ec0, 0x0ec4, 0x0ec6, 0x0ec6, 0x0ec8, 0x0ecd, 0x0ed0, 0x0ed9, 0x0edc, 0x0edd, 0x0f00, 0x0f00, 0x0f18, 0x0f19, 0x0f20, 0x0f29, 0x0f35, 0x0f35, 0x0f37, 0x0f37, 0x0f39, 0x0f39, 0x0f3e, 0x0f47, 0x0f49, 0x0f6a, 0x0f71, 0x0f84, 0x0f86, 0x0f8b, 0x0f90, 0x0f97, 0x0f99, 0x0fbc, 0x0fc6, 0x0fc6, 0x1000, 0x1021, 0x1023, 0x1027, 0x1029, 0x102a, 0x102c, 0x1032, 0x1036, 0x1039, 0x1040, 0x1049, 0x1050, 0x1059, 0x10a0, 0x10c5, 0x10d0, 0x10f8, 0x1100, 0x1159, 0x115f, 0x11a2, 0x11a8, 0x11f9, 0x1200, 0x1206, 0x1208, 0x1246, 0x1248, 0x1248, 0x124a, 0x124d, 0x1250, 0x1256, 0x1258, 0x1258, 0x125a, 0x125d, 0x1260, 0x1286, 0x1288, 0x1288, 0x128a, 0x128d, 0x1290, 0x12ae, 0x12b0, 0x12b0, 0x12b2, 0x12b5, 0x12b8, 0x12be, 0x12c0, 0x12c0, 0x12c2, 0x12c5, 0x12c8, 0x12ce, 0x12d0, 0x12d6, 0x12d8, 0x12ee, 0x12f0, 0x130e, 0x1310, 0x1310, 0x1312, 0x1315, 0x1318, 0x131e, 0x1320, 0x1346, 0x1348, 0x135a, 0x1369, 0x1371, 0x13a0, 0x13f4, 0x1401, 0x166c, 0x166f, 0x1676, 0x1681, 0x169a, 0x16a0, 0x16ea, 0x1700, 0x170c, 0x170e, 0x1714, 0x1720, 0x1734, 0x1740, 0x1753, 0x1760, 0x176c, 0x176e, 0x1770, 0x1772, 0x1773, 0x1780, 0x17b3, 0x17b6, 0x17d3, 0x17d7, 0x17d7, 0x17dc, 0x17dd, 0x17e0, 0x17e9, 0x180b, 0x180d, 0x1810, 0x1819, 0x1820, 0x1877, 0x1880, 0x18a9, 0x1900, 0x191c, 0x1920, 0x192b, 0x1930, 0x193b, 0x1946, 0x196d, 0x1970, 0x1974, 0x1d00, 0x1d6b, 0x1e00, 0x1e9b, 0x1ea0, 0x1ef9, 0x1f00, 0x1f15, 0x1f18, 0x1f1d, 0x1f20, 0x1f45, 0x1f48, 0x1f4d, 0x1f50, 0x1f57, 0x1f59, 0x1f59, 0x1f5b, 0x1f5b, 0x1f5d, 0x1f5d, 0x1f5f, 0x1f7d, 0x1f80, 0x1fb4, 0x1fb6, 0x1fbc, 0x1fbe, 0x1fbe, 0x1fc2, 0x1fc4, 0x1fc6, 0x1fcc, 0x1fd0, 0x1fd3, 0x1fd6, 0x1fdb, 0x1fe0, 0x1fec, 0x1ff2, 0x1ff4, 0x1ff6, 0x1ffc, 0x2071, 0x2071, 0x207f, 0x207f, 0x20d0, 0x20ea, 0x2102, 0x2102, 0x2107, 0x2107, 0x210a, 0x2113, 0x2115, 0x2115, 0x2119, 0x211d, 0x2124, 0x2124, 0x2126, 0x2126, 0x2128, 0x2128, 0x212a, 0x212d, 0x212f, 0x2131, 0x2133, 0x2139, 0x213d, 0x213f, 0x2145, 0x2149, 0x3005, 0x3006, 0x302a, 0x302f, 0x3031, 0x3035, 0x303b, 0x303c, 0x3041, 0x3096, 0x3099, 0x309a, 0x309d, 0x309f, 0x30a1, 0x30fa, 0x30fc, 0x30ff, 0x3105, 0x312c, 0x3131, 0x318e, 0x31a0, 0x31b7, 0x31f0, 0x31ff, 0x3400, 0x4db5, 0x4e00, 0x9fa5, 0xa000, 0xa48c, 0xac00, 0xd7a3, 0xf900, 0xfa2d, 0xfa30, 0xfa6a, 0xfb00, 0xfb06, 0xfb13, 0xfb17, 0xfb1d, 0xfb28, 0xfb2a, 0xfb36, 0xfb38, 0xfb3c, 0xfb3e, 0xfb3e, 0xfb40, 0xfb41, 0xfb43, 0xfb44, 0xfb46, 0xfbb1, 0xfbd3, 0xfd3d, 0xfd50, 0xfd8f, 0xfd92, 0xfdc7, 0xfdf0, 0xfdfb, 0xfe00, 0xfe0f, 0xfe20, 0xfe23, 0xfe70, 0xfe74, 0xfe76, 0xfefc, 0xff10, 0xff19, 0xff21, 0xff3a, 0xff41, 0xff5a, 0xff66, 0xffbe, 0xffc2, 0xffc7, 0xffca, 0xffcf, 0xffd2, 0xffd7, 0xffda, 0xffdc, 0x10000, 0x1000b, 0x1000d, 0x10026, 0x10028, 0x1003a, 0x1003c, 0x1003d, 0x1003f, 0x1004d, 0x10050, 0x1005d, 0x10080, 0x100fa, 0x10300, 0x1031e, 0x10330, 0x10349, 0x10380, 0x1039d,
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -