📄 gutf8.c
字号:
/* unicode_strchr *//** * g_unichar_to_utf8: * @c: a ISO10646 character code * @outbuf: output buffer, must have at least 6 bytes of space. * If %NULL, the length will be computed and returned * and nothing will be written to @outbuf. * * Converts a single character to UTF-8. * * Return value: number of bytes written **/intg_unichar_to_utf8 (gunichar c, gchar *outbuf){ guint len = 0; int first; int i; if (c < 0x80) { first = 0; len = 1; } else if (c < 0x800) { first = 0xc0; len = 2; } else if (c < 0x10000) { first = 0xe0; len = 3; } else if (c < 0x200000) { first = 0xf0; len = 4; } else if (c < 0x4000000) { first = 0xf8; len = 5; } else { first = 0xfc; len = 6; } if (outbuf) { for (i = len - 1; i > 0; --i) { outbuf[i] = (c & 0x3f) | 0x80; c >>= 6; } outbuf[0] = c | first; } return len;}/** * g_utf8_strchr: * @p: a nul-terminated UTF-8 encoded string * @len: the maximum length of @p * @c: a ISO10646 character * * Finds the leftmost occurrence of the given ISO10646 character * in a UTF-8 encoded string, while limiting the search to @len bytes. * If @len is -1, allow unbounded search. * * Return value: %NULL if the string does not contain the character, * otherwise, a pointer to the start of the leftmost occurrence of * the character in the string. **/gchar *g_utf8_strchr (const char *p, gssize len, gunichar c){ gchar ch[10]; gint charlen = g_unichar_to_utf8 (c, ch); ch[charlen] = '\0'; return g_strstr_len (p, len, ch);}/** * g_utf8_strrchr: * @p: a nul-terminated UTF-8 encoded string * @len: the maximum length of @p * @c: a ISO10646 character * * Find the rightmost occurrence of the given ISO10646 character * in a UTF-8 encoded string, while limiting the search to @len bytes. * If @len is -1, allow unbounded search. * * Return value: %NULL if the string does not contain the character, * otherwise, a pointer to the start of the rightmost occurrence of the * character in the string. **/gchar *g_utf8_strrchr (const char *p, gssize len, gunichar c){ gchar ch[10]; gint charlen = g_unichar_to_utf8 (c, ch); ch[charlen] = '\0'; return g_strrstr_len (p, len, ch);}/* Like g_utf8_get_char, but take a maximum length * and return (gunichar)-2 on incomplete trailing character */static inline gunicharg_utf8_get_char_extended (const gchar *p, gssize max_len) { guint i, len; gunichar wc = (guchar) *p; if (wc < 0x80) { return wc; } else if (wc < 0xc0) { return (gunichar)-1; } else if (wc < 0xe0) { len = 2; wc &= 0x1f; } else if (wc < 0xf0) { len = 3; wc &= 0x0f; } else if (wc < 0xf8) { len = 4; wc &= 0x07; } else if (wc < 0xfc) { len = 5; wc &= 0x03; } else if (wc < 0xfe) { len = 6; wc &= 0x01; } else { return (gunichar)-1; } if (max_len >= 0 && len > max_len) { for (i = 1; i < max_len; i++) { if ((((guchar *)p)[i] & 0xc0) != 0x80) return (gunichar)-1; } return (gunichar)-2; } for (i = 1; i < len; ++i) { gunichar ch = ((guchar *)p)[i]; if ((ch & 0xc0) != 0x80) { if (ch) return (gunichar)-1; else return (gunichar)-2; } wc <<= 6; wc |= (ch & 0x3f); } if (UTF8_LENGTH(wc) != len) return (gunichar)-1; return wc;}/** * g_utf8_get_char_validated: * @p: a pointer to Unicode character encoded as UTF-8 * @max_len: the maximum number of bytes to read, or -1, for no maximum. * * Convert a sequence of bytes encoded as UTF-8 to a Unicode character. * This function checks for incomplete characters, for invalid characters * such as characters that are out of the range of Unicode, and for * overlong encodings of valid characters. * * Return value: the resulting character. If @p points to a partial * sequence at the end of a string that could begin a valid character, * returns (gunichar)-2; otherwise, if @p does not point to a valid * UTF-8 encoded Unicode character, returns (gunichar)-1. **/gunicharg_utf8_get_char_validated (const gchar *p, gssize max_len){ gunichar result = g_utf8_get_char_extended (p, max_len); if (result & 0x80000000) return result; else if (!UNICODE_VALID (result)) return (gunichar)-1; else return result;}/** * g_utf8_to_ucs4_fast: * @str: a UTF-8 encoded string * @len: the maximum length of @str to use. If @len < 0, then * the string is nul-terminated. * @items_written: location to store the number of characters in the * result, or %NULL. * * Convert a string from UTF-8 to a 32-bit fixed width * representation as UCS-4, assuming valid UTF-8 input. * This function is roughly twice as fast as g_utf8_to_ucs4() * but does no error checking on the input. * * Return value: a pointer to a newly allocated UCS-4 string. * This value must be freed with g_free(). **/gunichar *g_utf8_to_ucs4_fast (const gchar *str, glong len, glong *items_written) { gint j, charlen; gunichar *result; gint n_chars, i; const gchar *p; g_return_val_if_fail (str != NULL, NULL); p = str; n_chars = 0; if (len < 0) { while (*p) { p = g_utf8_next_char (p); ++n_chars; } } else { while (p < str + len && *p) { p = g_utf8_next_char (p); ++n_chars; } } result = g_new (gunichar, n_chars + 1); p = str; for (i=0; i < n_chars; i++) { gunichar wc = ((unsigned char *)p)[0]; if (wc < 0x80) { result[i] = wc; p++; } else { if (wc < 0xe0) { charlen = 2; wc &= 0x1f; } else if (wc < 0xf0) { charlen = 3; wc &= 0x0f; } else if (wc < 0xf8) { charlen = 4; wc &= 0x07; } else if (wc < 0xfc) { charlen = 5; wc &= 0x03; } else { charlen = 6; wc &= 0x01; } for (j = 1; j < charlen; j++) { wc <<= 6; wc |= ((unsigned char *)p)[j] & 0x3f; } result[i] = wc; p += charlen; } } result[i] = 0; if (items_written) *items_written = i; return result;}/** * g_unichar_validate: * @ch: a Unicode character * * Checks whether @ch is a valid Unicode character. Some possible * integer values of @ch will not be valid. 0 is considered a valid * character, though it's normally a string terminator. * * Return value: %TRUE if @ch is a valid Unicode character **/gbooleang_unichar_validate (gunichar ch){ return UNICODE_VALID (ch);}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -