📄 wchar.c
字号:
l = 1; } break; } return l;}static intpg_johab_verifier(const unsigned char *s, int len){ int l, mbl; unsigned char c; l = mbl = pg_johab_mblen(s); if (len < l) return -1; if (!IS_HIGHBIT_SET(*s)) return mbl; while (--l > 0) { c = *++s; if (!IS_EUC_RANGE_VALID(c)) return -1; } return mbl;}static intpg_mule_verifier(const unsigned char *s, int len){ int l, mbl; unsigned char c; l = mbl = pg_mule_mblen(s); if (len < l) return -1; while (--l > 0) { c = *++s; if (!IS_HIGHBIT_SET(c)) return -1; } return mbl;}static intpg_latin1_verifier(const unsigned char *s, int len){ return 1;}static intpg_sjis_verifier(const unsigned char *s, int len){ int l, mbl; unsigned char c1, c2; l = mbl = pg_sjis_mblen(s); if (len < l) return -1; if (l == 1) /* pg_sjis_mblen already verified it */ return mbl; c1 = *s++; c2 = *s; if (!ISSJISHEAD(c1) || !ISSJISTAIL(c2)) return -1; return mbl;}static intpg_big5_verifier(const unsigned char *s, int len){ int l, mbl; l = mbl = pg_big5_mblen(s); if (len < l) return -1; while (--l > 0) { if (*++s == '\0') return -1; } return mbl;}static intpg_gbk_verifier(const unsigned char *s, int len){ int l, mbl; l = mbl = pg_gbk_mblen(s); if (len < l) return -1; while (--l > 0) { if (*++s == '\0') return -1; } return mbl;}static intpg_uhc_verifier(const unsigned char *s, int len){ int l, mbl; l = mbl = pg_uhc_mblen(s); if (len < l) return -1; while (--l > 0) { if (*++s == '\0') return -1; } return mbl;}static intpg_gb18030_verifier(const unsigned char *s, int len){ int l, mbl; l = mbl = pg_gb18030_mblen(s); if (len < l) return -1; while (--l > 0) { if (*++s == '\0') return -1; } return mbl;}static intpg_utf8_verifier(const unsigned char *s, int len){ int l = pg_utf_mblen(s); if (len < l) return -1; if (!pg_utf8_islegal(s, l)) return -1; return l;}/* * Check for validity of a single UTF-8 encoded character * * This directly implements the rules in RFC3629. The bizarre-looking * restrictions on the second byte are meant to ensure that there isn't * more than one encoding of a given Unicode character point; that is, * you may not use a longer-than-necessary byte sequence with high order * zero bits to represent a character that would fit in fewer bytes. * To do otherwise is to create security hazards (eg, create an apparent * non-ASCII character that decodes to plain ASCII). * * length is assumed to have been obtained by pg_utf_mblen(), and the * caller must have checked that that many bytes are present in the buffer. */boolpg_utf8_islegal(const unsigned char *source, int length){ unsigned char a; switch (length) { default: /* reject lengths 5 and 6 for now */ return false; case 4: a = source[3]; if (a < 0x80 || a > 0xBF) return false; /* FALL THRU */ case 3: a = source[2]; if (a < 0x80 || a > 0xBF) return false; /* FALL THRU */ case 2: a = source[1]; switch (*source) { case 0xE0: if (a < 0xA0 || a > 0xBF) return false; break; case 0xED: if (a < 0x80 || a > 0x9F) return false; break; case 0xF0: if (a < 0x90 || a > 0xBF) return false; break; case 0xF4: if (a < 0x80 || a > 0x8F) return false; break; default: if (a < 0x80 || a > 0xBF) return false; break; } /* FALL THRU */ case 1: a = *source; if (a >= 0x80 && a < 0xC2) return false; if (a > 0xF4) return false; break; } return true;}/* *------------------------------------------------------------------- * encoding info table * XXX must be sorted by the same order as enum pg_enc (in mb/pg_wchar.h) *------------------------------------------------------------------- */pg_wchar_tbl pg_wchar_table[] = { {pg_ascii2wchar_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifier, 1}, /* 0; PG_SQL_ASCII */ {pg_eucjp2wchar_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3}, /* 1; PG_EUC_JP */ {pg_euccn2wchar_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifier, 2}, /* 2; PG_EUC_CN */ {pg_euckr2wchar_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifier, 3}, /* 3; PG_EUC_KR */ {pg_euctw2wchar_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifier, 4}, /* 4; PG_EUC_TW */ {pg_eucjp2wchar_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3}, /* 5; PG_EUC_JIS_2004 */ {pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifier, 4}, /* 6; PG_UTF8 */ {pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifier, 4}, /* 7; PG_MULE_INTERNAL */ {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 8; PG_LATIN1 */ {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 9; PG_LATIN2 */ {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 10; PG_LATIN3 */ {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 11; PG_LATIN4 */ {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 12; PG_LATIN5 */ {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 13; PG_LATIN6 */ {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 14; PG_LATIN7 */ {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 15; PG_LATIN8 */ {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 16; PG_LATIN9 */ {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 17; PG_LATIN10 */ {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 18; PG_WIN1256 */ {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 19; PG_WIN1258 */ {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 20; PG_WIN866 */ {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 21; PG_WIN874 */ {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 22; PG_KOI8R */ {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 23; PG_WIN1251 */ {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 24; PG_WIN1252 */ {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 25; ISO-8859-5 */ {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 26; ISO-8859-6 */ {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 27; ISO-8859-7 */ {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 28; ISO-8859-8 */ {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 29; PG_WIN1250 */ {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 30; PG_WIN1253 */ {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 31; PG_WIN1254 */ {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 32; PG_WIN1255 */ {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 33; PG_WIN1257 */ {0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2}, /* 34; PG_SJIS */ {0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifier, 2}, /* 35; PG_BIG5 */ {0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifier, 2}, /* 36; PG_GBK */ {0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifier, 2}, /* 37; PG_UHC */ {0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifier, 4}, /* 38; PG_GB18030 */ {0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifier, 3}, /* 39; PG_JOHAB */ {0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2} /* 40; PG_SHIFT_JIS_2004 */};/* returns the byte length of a word for mule internal code */intpg_mic_mblen(const unsigned char *mbstr){ return pg_mule_mblen(mbstr);}/* * Returns the byte length of a multibyte character. */intpg_encoding_mblen(int encoding, const char *mbstr){ Assert(PG_VALID_ENCODING(encoding)); return ((encoding >= 0 && encoding < sizeof(pg_wchar_table) / sizeof(pg_wchar_tbl)) ? ((*pg_wchar_table[encoding].mblen) ((const unsigned char *) mbstr)) : ((*pg_wchar_table[PG_SQL_ASCII].mblen) ((const unsigned char *) mbstr)));}/* * Returns the display length of a multibyte character. */intpg_encoding_dsplen(int encoding, const char *mbstr){ Assert(PG_VALID_ENCODING(encoding)); return ((encoding >= 0 && encoding < sizeof(pg_wchar_table) / sizeof(pg_wchar_tbl)) ? ((*pg_wchar_table[encoding].dsplen) ((const unsigned char *) mbstr)) : ((*pg_wchar_table[PG_SQL_ASCII].dsplen) ((const unsigned char *) mbstr)));}/* * Verify the first multibyte character of the given string. * Return its byte length if good, -1 if bad. (See comments above for * full details of the mbverify API.) */intpg_encoding_verifymb(int encoding, const char *mbstr, int len){ Assert(PG_VALID_ENCODING(encoding)); return ((encoding >= 0 && encoding < sizeof(pg_wchar_table) / sizeof(pg_wchar_tbl)) ? ((*pg_wchar_table[encoding].mbverify) ((const unsigned char *) mbstr, len)) : ((*pg_wchar_table[PG_SQL_ASCII].mbverify) ((const unsigned char *) mbstr, len)));}/* * fetch maximum length of a given encoding */intpg_encoding_max_length(int encoding){ Assert(PG_VALID_ENCODING(encoding)); return pg_wchar_table[encoding].maxmblen;}#ifndef FRONTEND/* * fetch maximum length of the encoding for the current database */intpg_database_encoding_max_length(void){ return pg_wchar_table[GetDatabaseEncoding()].maxmblen;}/* * Verify mbstr to make sure that it is validly encoded in the specified * encoding. */boolpg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError){ return pg_verify_mbstr_len(encoding, mbstr, len, noError) >= 0;}/* * Verify mbstr to make sure that it is validly encoded in the specified encoding. * 验证是不是特定的编码 * mbstr is not necessarily zero terminated; length of mbstr is * specified by len. * * If OK, return length of string in the encoding. If a problem is found, return -1 when noError is * true; when noError is false, ereport() a descriptive message. * 如果成功,返回字符串的长度,如果出现问题,返回-1。 */intpg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError){ mbverifier mbverify; int mb_len; Assert(PG_VALID_ENCODING(encoding)); /* * In single-byte encodings, we need only reject nulls (\0). */ if (pg_encoding_max_length(encoding) <= 1) { const char *nullpos = (const char *)memchr(mbstr, 0, len); if (nullpos == NULL) return len; if (noError) return -1; report_invalid_encoding(encoding, nullpos, 1); } /* fetch function pointer just once */ mbverify = pg_wchar_table[encoding].mbverify; mb_len = 0; while (len > 0) { int l; /* fast path for ASCII-subset characters */ if (!IS_HIGHBIT_SET(*mbstr)) { if (*mbstr != '\0') { mb_len++; mbstr++; len--; continue; } if (noError) return -1; report_invalid_encoding(encoding, mbstr, len); } l = (*mbverify) ((const unsigned char *) mbstr, len); if (l < 0) { if (noError) return -1; report_invalid_encoding(encoding, mbstr, len); } mbstr += l; len -= l; mb_len++; } return mb_len;}/* * report_invalid_encoding: complain about invalid multibyte character * * note: len is remaining length of string, not length of character; * len must be greater than zero, as we always examine the first byte. */voidreport_invalid_encoding(int encoding, const char *mbstr, int len){ int l = pg_encoding_mblen(encoding, mbstr); char buf[8 * 2 + 1]; char *p = buf; int j, jlimit; jlimit = Min(l, len); jlimit = Min(jlimit, 8); /* prevent buffer overrun */ for (j = 0; j < jlimit; j++) p += sprintf(p, "%02x", (unsigned char) mbstr[j]);/* ereport(ERROR, (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), errmsg("invalid byte sequence for encoding \"%s\": 0x%s", pg_enc2name_tbl[encoding].name, buf), errhint("This error can also happen if the byte sequence does not " "match the encoding expected by the server, which is controlled " "by \"client_encoding\".")));*/ fprintf(stderr,"invalid byte sequence for encoding \"%s\": 0x%s",pg_enc2name_tbl[encoding].name,buf);}/* * report_untranslatable_char: complain about untranslatable character * * note: len is remaining length of string, not length of character; * len must be greater than zero, as we always examine the first byte. */voidreport_untranslatable_char(int src_encoding, int dest_encoding, const char *mbstr, int len){ int l = pg_encoding_mblen(src_encoding, mbstr); char buf[8 * 2 + 1]; char *p = buf; int j, jlimit; jlimit = Min(l, len); jlimit = Min(jlimit, 8); /* prevent buffer overrun */ for (j = 0; j < jlimit; j++) p += sprintf(p, "%02x", (unsigned char) mbstr[j]);/* ereport(ERROR, (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER), errmsg("character 0x%s of encoding \"%s\" has no equivalent in \"%s\"", buf, pg_enc2name_tbl[src_encoding].name, pg_enc2name_tbl[dest_encoding].name)));*/ fprintf(stderr,"character 0x%s of encoding \"%s\" has no equivalent in \"%s\"", buf, pg_enc2name_tbl[src_encoding].name, pg_enc2name_tbl[dest_encoding].name);}#endif
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -