📄 gutf8.c
字号:
/* gutf8.c - Operations on UTF-8 strings.
 *
 * Copyright (C) 1999 Tom Tromey
 * Copyright (C) 2000 Red Hat, Inc.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include "glibconfig.h"

#include <stdlib.h>
#ifdef HAVE_CODESET
#include <langinfo.h>
#endif
#include <string.h>

#include "glib.h"

#ifdef G_PLATFORM_WIN32
#include <stdio.h>
#define STRICT
#include <windows.h>
#undef STRICT
#endif

#include "libcharset.h"

#include "glibintl.h"

/* Classify the lead byte Char of a UTF-8 sequence: Len receives the
 * sequence length in bytes and Mask the bits of the lead byte that carry
 * payload.  Len is set to -1 (and Mask left untouched) when Char is not
 * a lead byte.  Expands to an if/else chain, so it must be used as a
 * complete statement.
 */
#define UTF8_COMPUTE(Char, Mask, Len)           \
  if (Char < 128)                               \
    {                                           \
      Len = 1;                                  \
      Mask = 0x7f;                              \
    }                                           \
  else if ((Char & 0xe0) == 0xc0)               \
    {                                           \
      Len = 2;                                  \
      Mask = 0x1f;                              \
    }                                           \
  else if ((Char & 0xf0) == 0xe0)               \
    {                                           \
      Len = 3;                                  \
      Mask = 0x0f;                              \
    }                                           \
  else if ((Char & 0xf8) == 0xf0)               \
    {                                           \
      Len = 4;                                  \
      Mask = 0x07;                              \
    }                                           \
  else if ((Char & 0xfc) == 0xf8)               \
    {                                           \
      Len = 5;                                  \
      Mask = 0x03;                              \
    }                                           \
  else if ((Char & 0xfe) == 0xfc)               \
    {                                           \
      Len = 6;                                  \
      Mask = 0x01;                              \
    }                                           \
  else                                          \
    Len = -1;

/* Number of bytes needed to encode the code point Char in UTF-8. */
#define UTF8_LENGTH(Char)                       \
  ((Char) < 0x80 ? 1 :                          \
   ((Char) < 0x800 ? 2 :                        \
    ((Char) < 0x10000 ? 3 :                     \
     ((Char) < 0x200000 ? 4 :                   \
      ((Char) < 0x4000000 ? 5 : 6)))))

/* Decode the Len-byte sequence at Chars into Result, using the Mask
 * produced by UTF8_COMPUTE and Count as the loop index.  Result is set
 * to -1 if any trailing byte is not a continuation byte (10xxxxxx).
 * Expands to several statements, so it must be used as a complete
 * statement.
 */
#define UTF8_GET(Result, Chars, Count, Mask, Len)       \
  (Result) = (Chars)[0] & (Mask);                       \
  for ((Count) = 1; (Count) < (Len); ++(Count))         \
    {                                                   \
      if (((Chars)[(Count)] & 0xc0) != 0x80)            \
        {                                               \
          (Result) = -1;                                \
          break;                                        \
        }                                               \
      (Result) <<= 6;                                   \
      (Result) |= ((Chars)[(Count)] & 0x3f);            \
    }

/* True if Char is below 0x110000, is not in the surrogate range
 * 0xD800-0xDFFF, and is neither 0xFFFE nor 0xFFFF.
 */
#define UNICODE_VALID(Char)                     \
  ((Char) < 0x110000 &&                         \
   ((Char) < 0xD800 || (Char) >= 0xE000) &&     \
   (Char) != 0xFFFE && (Char) != 0xFFFF)

/* Indexed by the first byte of a sequence: the number of bytes to skip
 * to reach the next character.  Continuation bytes (0x80-0xBF) and the
 * bytes 0xFE/0xFF map to 1.
 */
static const gchar utf8_skip_data[256] = {
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
};

const gchar * const g_utf8_skip = utf8_skip_data;

/**
 * g_utf8_find_prev_char:
 * @str: pointer to the beginning of a UTF-8 encoded string
 * @p: pointer to some position within @str
 *
 * Given a position @p within a UTF-8 encoded string @str, find the start
 * of the previous UTF-8 character starting before @p.  Returns %NULL if no
 * UTF-8 characters are present in @str before @p.
 *
 * @p does not have to be at the beginning of a UTF-8 character. No check
 * is made to see if the character found is actually valid other than
 * it starts with an appropriate byte.
 *
 * Return value: a pointer to the found character or %NULL.
 **/
gchar *
g_utf8_find_prev_char (const char *str,
                       const char *p)
{
  /* Test before decrementing so that a pointer to the byte before @str
   * is never formed (that would be undefined behaviour in C).
   */
  while (p > str)
    {
      --p;
      if ((*p & 0xc0) != 0x80)
        return (gchar *)p;
    }

  return NULL;
}

/**
 * g_utf8_find_next_char:
 * @p: a pointer to a position within a UTF-8 encoded string
 * @end: a pointer to the end of the string, or %NULL to indicate
 *        that the string is nul-terminated, in which case
 *        the returned value will be non-%NULL
 *
 * Finds the start of the next UTF-8 character in the string after @p.
 * If @p points at a nul byte, no advance is made.
 *
 * @p does not have to be at the beginning of a UTF-8 character. No check
 * is made to see if the character found is actually valid other than
 * it starts with an appropriate byte.
 *
 * Return value: a pointer to the found character, or %NULL if the
 *   scan reached @end
 **/
gchar *
g_utf8_find_next_char (const gchar *p,
                       const gchar *end)
{
  if (*p)
    {
      if (end)
        for (++p; p < end && (*p & 0xc0) == 0x80; ++p)
          ;
      else
        for (++p; (*p & 0xc0) == 0x80; ++p)
          ;
    }
  return (p == end) ? NULL : (gchar *)p;
}

/**
 * g_utf8_prev_char:
 * @p: a pointer to a position within a UTF-8 encoded string
 *
 * Finds the previous UTF-8 character in the string before @p.
 *
 * @p does not have to be at the beginning of a UTF-8 character. No check
 * is made to see if the character found is actually valid other than
 * it starts with an appropriate byte. If @p might be the first
 * character of the string, you must use g_utf8_find_prev_char() instead,
 * because this function scans backwards without any lower bound.
 *
 * Return value: a pointer to the found character.
 **/
gchar *
g_utf8_prev_char (const gchar *p)
{
  while (TRUE)
    {
      p--;
      if ((*p & 0xc0) != 0x80)
        return (gchar *)p;
    }
}

/**
 * g_utf8_strlen:
 * @p: pointer to the start of a UTF-8 encoded string.
 * @max: the maximum number of bytes to examine. If @max
 *       is less than 0, then the string is assumed to be
 *       nul-terminated.
 *
 * Returns the length of the string in characters.  When @max is
 * non-negative, a character that extends beyond the first @max bytes
 * is not counted.
 *
 * Return value: the length of the string in characters
 **/
glong
g_utf8_strlen (const gchar *p,
               gssize       max)
{
  glong len = 0;
  const gchar *start = p;

  if (max < 0)
    {
      while (*p)
        {
          p = g_utf8_next_char (p);
          ++len;
        }
    }
  else
    {
      if (max == 0 || !*p)
        return 0;

      /* p always points just past the character that has not yet been
       * counted; it is counted once we know it fits within @max bytes.
       */
      p = g_utf8_next_char (p);

      while (p - start < max && *p)
        {
          ++len;
          p = g_utf8_next_char (p);
        }

      /* only do the last len increment if we got a complete
       * char (don't count partial chars).  The comparison is "<=" and
       * not "==": the loop also stops at the nul terminator, in which
       * case the pending character ends before @max and is complete.
       */
      if (p - start <= max)
        ++len;
    }

  return len;
}

/**
 * g_utf8_get_char:
 * @p: a pointer to Unicode character encoded as UTF-8
 *
 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
 * If @p does not point to a valid UTF-8 encoded character, results are
 * undefined. If you are not sure that the bytes are complete
 * valid Unicode characters, you should use g_utf8_get_char_validated()
 * instead.
 *
 * Return value: the resulting character
 **/
gunichar
g_utf8_get_char (const gchar *p)
{
  int i, mask = 0, len;
  gunichar result;
  unsigned char c = (unsigned char) *p;

  UTF8_COMPUTE (c, mask, len);
  if (len == -1)
    return (gunichar)-1;
  UTF8_GET (result, p, i, mask, len);

  return result;
}

/**
 * g_utf8_offset_to_pointer:
 * @str: a UTF-8 encoded string
 * @offset: a character offset within @str
 *
 * Converts from an integer character offset to a pointer to a position
 * within the string.  No bounds checking is performed: @offset must be
 * non-negative and must not exceed the length of @str in characters.
 *
 * Return value: the resulting pointer
 **/
gchar *
g_utf8_offset_to_pointer (const gchar *str,
                          glong        offset)
{
  const gchar *s = str;

  while (offset--)
    s = g_utf8_next_char (s);

  return (gchar *)s;
}

/**
 * g_utf8_pointer_to_offset:
 * @str: a UTF-8 encoded string
 * @pos: a pointer to a position within @str
 *
 * Converts from a pointer to position within a string to a integer
 * character offset.  Returns 0 if @pos is not after @str.
 *
 * Return value: the resulting character offset
 **/
glong
g_utf8_pointer_to_offset (const gchar *str,
                          const gchar *pos)
{
  const gchar *s = str;
  glong offset = 0;

  while (s < pos)
    {
      s = g_utf8_next_char (s);
      offset++;
    }

  return offset;
}

/**
 * g_utf8_strncpy:
 * @dest: buffer to fill with characters from @src
 * @src: UTF-8 encoded string
 * @n: character count
 *
 * Like the standard C <function>strncpy()</function> function, but
 * copies a given number of characters instead of a given number of
 * bytes. The @src string must be valid UTF-8 encoded text.
 * (Use g_utf8_validate() on all text before trying to use UTF-8
 * utility functions with it.)
 *
 * Unlike <function>strncpy()</function>, the result is always
 * nul-terminated, so @dest must have room for the copied bytes plus
 * one terminating nul byte.
 *
 * Return value: @dest
 **/
gchar *
g_utf8_strncpy (gchar       *dest,
                const gchar *src,
                gsize        n)
{
  const gchar *s = src;

  /* Advance s over at most n characters, stopping early at the nul. */
  while (n && *s)
    {
      s = g_utf8_next_char(s);
      n--;
    }
  strncpy(dest, src, s - src);
  dest[s - src] = 0;
  return dest;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -