tclutf.c
来自「tcl是工具命令语言」· C语言 代码 · 共 1,949 行 · 第 1/4 页
C
1,949 行
/*
 * tclUtf.c --
 *
 *	Routines for manipulating UTF-8 strings.
 *
 * Copyright (c) 1997-1998 Sun Microsystems, Inc.
 *
 * See the file "license.terms" for information on usage and redistribution
 * of this file, and for a DISCLAIMER OF ALL WARRANTIES.
 *
 * RCS: @(#) $Id: tclUtf.c,v 1.30 2003/02/18 02:25:45 hobbs Exp $
 */

#include "tclInt.h"

/*
 * Include the static character classification tables and macros.
 */

#include "tclUniData.c"

/*
 * The following macros are used for fast character category tests.  The
 * x_BITS values are shifted right by the category value to determine whether
 * the given category is included in the set.
 */

#define ALPHA_BITS ((1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER) \
    | (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER) | (1 << OTHER_LETTER))

#define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER)

#define SPACE_BITS ((1 << SPACE_SEPARATOR) | (1 << LINE_SEPARATOR) \
    | (1 << PARAGRAPH_SEPARATOR))

#define CONNECTOR_BITS (1 << CONNECTOR_PUNCTUATION)

#define PRINT_BITS (ALPHA_BITS | DIGIT_BITS | SPACE_BITS | \
    (1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | \
    (1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) | \
    (1 << OTHER_NUMBER) | (1 << CONNECTOR_PUNCTUATION) | \
    (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \
    (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \
    (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION) | \
    (1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | \
    (1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL))

#define PUNCT_BITS ((1 << CONNECTOR_PUNCTUATION) | \
    (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \
    (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \
    (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION))

/*
 * Unicode characters less than this value are represented by themselves
 * in UTF-8 strings.
 */

#define UNICODE_SELF	0x80

/*
 * The following structures are used when mapping between Unicode (UCS-2)
 * and UTF-8.
 *
 * totalBytes[b] is the length, in bytes, of the UTF-8 sequence introduced
 * by lead byte b.  Bytes 0x00-0xBF (ASCII and naked trail bytes) count as
 * one byte.  Lead bytes for sequences longer than TCL_UTF_MAX are also
 * given a length of one, so they are decoded as single bytes.
 */

static CONST unsigned char totalBytes[256] = {
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
#if TCL_UTF_MAX > 3
    4,4,4,4,4,4,4,4,
#else
    1,1,1,1,1,1,1,1,
#endif
#if TCL_UTF_MAX > 4
    5,5,5,5,
#else
    1,1,1,1,
#endif
#if TCL_UTF_MAX > 5
    6,6,6,6
#else
    1,1,1,1
#endif
};

/*
 * Procedures used only in this module.
 */

static int UtfCount _ANSI_ARGS_((int ch));

/*
 *---------------------------------------------------------------------------
 *
 * UtfCount --
 *
 *	Find the number of bytes in the Utf character "ch".
 *
 * Results:
 *	The return value is the number of bytes that Tcl_UniCharToUtf()
 *	stores for "ch".  Note that 0 is reported as two bytes, because it
 *	is not stored as a single byte.  Values that cannot be encoded
 *	(negative, or too large for TCL_UTF_MAX bytes) are reported as
 *	three bytes, the size of the U+FFFD replacement that is stored in
 *	their place.
 *
 * Side effects:
 *	None.
 *
 *---------------------------------------------------------------------------
 */

INLINE static int
UtfCount(ch)
    int ch;			/* The Tcl_UniChar whose size is returned. */
{
    if ((ch > 0) && (ch < UNICODE_SELF)) {
	return 1;
    }
    if (ch >= 0) {
	if (ch <= 0x7FF) {
	    return 2;
	}
	if (ch <= 0xFFFF) {
	    return 3;
	}
	/*
	 * Each longer form is enabled only when TCL_UTF_MAX allows that
	 * many bytes, mirroring the gating of the totalBytes table.
	 */
#if TCL_UTF_MAX > 3
	if (ch <= 0x1FFFFF) {
	    return 4;
	}
#endif
#if TCL_UTF_MAX > 4
	if (ch <= 0x3FFFFFF) {
	    return 5;
	}
#endif
#if TCL_UTF_MAX > 5
	if (ch <= 0x7FFFFFFF) {
	    return 6;
	}
#endif
    }

    /*
     * Not encodable: counted as the three-byte replacement character.
     */

    return 3;
}

/*
 *---------------------------------------------------------------------------
 *
 * Tcl_UniCharToUtf --
 *
 *	Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the
 *	provided buffer.  Equivalent to Plan 9 runetochar().
 *
 *	The value 0 is stored as the two bytes 0xC0 0x80 rather than as a
 *	single zero byte.  A value that cannot be encoded (negative, or
 *	needing more than TCL_UTF_MAX bytes) is stored as U+FFFD.
 *
 * Results:
 *	The return value is the number of bytes in the buffer that
 *	were consumed.
 *
 * Side effects:
 *	None.
 *
 *---------------------------------------------------------------------------
 */

INLINE int
Tcl_UniCharToUtf(ch, str)
    int ch;			/* The Tcl_UniChar to be stored in the
				 * buffer. */
    char *str;			/* Buffer in which the UTF-8 representation
				 * of the Tcl_UniChar is stored.  Buffer must
				 * be large enough to hold the UTF-8 character
				 * (at most TCL_UTF_MAX bytes). */
{
    if ((ch > 0) && (ch < UNICODE_SELF)) {
	str[0] = (char) ch;
	return 1;
    }

    /*
     * Negative values must not reach the range tests below: they would
     * satisfy "ch <= 0x7FF" and be written as a meaningless two-byte
     * sequence.
     */

    if (ch >= 0) {
	if (ch <= 0x7FF) {
	    str[1] = (char) ((ch | 0x80) & 0xBF);
	    str[0] = (char) ((ch >> 6) | 0xC0);
	    return 2;
	}
	if (ch <= 0xFFFF) {
	three:
	    str[2] = (char) ((ch | 0x80) & 0xBF);
	    str[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
	    str[0] = (char) ((ch >> 12) | 0xE0);
	    return 3;
	}

	/*
	 * Longer forms are compiled in only when the caller's buffer of
	 * TCL_UTF_MAX bytes is guaranteed to be able to hold them.
	 */

#if TCL_UTF_MAX > 3
	if (ch <= 0x1FFFFF) {
	    str[3] = (char) ((ch | 0x80) & 0xBF);
	    str[2] = (char) (((ch >> 6) | 0x80) & 0xBF);
	    str[1] = (char) (((ch >> 12) | 0x80) & 0xBF);
	    str[0] = (char) ((ch >> 18) | 0xF0);
	    return 4;
	}
#endif
#if TCL_UTF_MAX > 4
	if (ch <= 0x3FFFFFF) {
	    str[4] = (char) ((ch | 0x80) & 0xBF);
	    str[3] = (char) (((ch >> 6) | 0x80) & 0xBF);
	    str[2] = (char) (((ch >> 12) | 0x80) & 0xBF);
	    str[1] = (char) (((ch >> 18) | 0x80) & 0xBF);
	    str[0] = (char) ((ch >> 24) | 0xF8);
	    return 5;
	}
#endif
#if TCL_UTF_MAX > 5
	if (ch <= 0x7FFFFFFF) {
	    str[5] = (char) ((ch | 0x80) & 0xBF);
	    str[4] = (char) (((ch >> 6) | 0x80) & 0xBF);
	    str[3] = (char) (((ch >> 12) | 0x80) & 0xBF);
	    str[2] = (char) (((ch >> 18) | 0x80) & 0xBF);
	    str[1] = (char) (((ch >> 24) | 0x80) & 0xBF);
	    str[0] = (char) ((ch >> 30) | 0xFC);
	    return 6;
	}
#endif
    }

    /*
     * Anything that could not be encoded above is replaced by U+FFFD,
     * which is written by the three-byte code.
     */

    ch = 0xFFFD;
    goto three;
}

/*
 *---------------------------------------------------------------------------
 *
 * Tcl_UniCharToUtfDString --
 *
 *	Convert the given Unicode string to UTF-8.
 *
 * Results:
 *	The return value is a pointer to the UTF-8 representation of the
 *	Unicode string.  Storage for the return value is appended to the
 *	end of dsPtr.
 *
 * Side effects:
 *	None.
 *
 *---------------------------------------------------------------------------
 */

char *
Tcl_UniCharToUtfDString(wString, numChars, dsPtr)
    CONST Tcl_UniChar *wString;	/* Unicode string to convert to UTF-8. */
    int numChars;		/* Length of Unicode string in Tcl_UniChars
				 * (must be >= 0). */
    Tcl_DString *dsPtr;		/* UTF-8 representation of string is
				 * appended to this previously initialized
				 * DString. */
{
    CONST Tcl_UniChar *w, *wEnd;
    char *p, *string;
    int oldLength;

    /*
     * UTF-8 string length in bytes will be <= Unicode string length *
     * TCL_UTF_MAX.  Only the appended part needs that worst-case room;
     * the bytes already in the DString keep their current size.
     */

    oldLength = Tcl_DStringLength(dsPtr);
    Tcl_DStringSetLength(dsPtr, oldLength + (numChars + 1) * TCL_UTF_MAX);
    string = Tcl_DStringValue(dsPtr) + oldLength;

    p = string;
    wEnd = wString + numChars;
    for (w = wString; w < wEnd; w++) {
	p += Tcl_UniCharToUtf(*w, p);
    }

    /*
     * Trim the DString back to the bytes that were actually written.
     */

    Tcl_DStringSetLength(dsPtr, oldLength + (int) (p - string));

    return string;
}

/*
 *---------------------------------------------------------------------------
 *
 * Tcl_UtfToUniChar --
 *
 *	Extract the Tcl_UniChar represented by the UTF-8 string.  Bad
 *	UTF-8 sequences are converted to valid Tcl_UniChars and processing
 *	continues.  Equivalent to Plan 9 chartorune().
 *
 *	The caller must ensure that the source buffer is long enough that
 *	this routine does not run off the end and dereference non-existent
 *	memory looking for trail bytes.  If the source buffer is known to
 *	be '\0' terminated, this cannot happen.  Otherwise, the caller
 *	should call Tcl_UtfCharComplete() before calling this routine to
 *	ensure that enough bytes remain in the string.
 *
 * Results:
 *	*chPtr is filled with the Tcl_UniChar, and the return value is the
 *	number of bytes from the UTF-8 string that were consumed.
 *
 * Side effects:
 *	None.
 *
 *---------------------------------------------------------------------------
 */

int
Tcl_UtfToUniChar(str, chPtr)
    register CONST char *str;	 /* The UTF-8 string. */
    register Tcl_UniChar *chPtr; /* Filled with the Tcl_UniChar represented
				  * by the UTF-8 string. */
{
    register int byte;

    /*
     * Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones.
     */

    byte = *((unsigned char *) str);
    if (byte < 0xC0) {
	/*
	 * Handles properly formed UTF-8 characters between 0x01 and 0x7F.
	 * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid
	 * characters representing themselves.
	 */

	*chPtr = (Tcl_UniChar) byte;
	return 1;
    } else if (byte < 0xE0) {
	if ((str[1] & 0xC0) == 0x80) {
	    /*
	     * Two-byte-character lead-byte followed by a trail-byte.
	     */

	    *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (str[1] & 0x3F));
	    return 2;
	}

	/*
	 * A two-byte-character lead-byte not followed by trail-byte
	 * represents itself.
	 */

	*chPtr = (Tcl_UniChar) byte;
	return 1;
    } else if (byte < 0xF0) {
	if (((str[1] & 0xC0) == 0x80) && ((str[2] & 0xC0) == 0x80)) {
	    /*
	     * Three-byte-character lead byte followed by two trail bytes.
	     */

	    *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12)
		    | ((str[1] & 0x3F) << 6) | (str[2] & 0x3F));
	    return 3;
	}

	/*
	 * A three-byte-character lead-byte not followed by two trail-bytes
	 * represents itself.
	 */

	*chPtr = (Tcl_UniChar) byte;
	return 1;
    }
#if TCL_UTF_MAX > 3
    else {
	int ch, total, trail;

	/*
	 * Sequences of four or more bytes: the table gives the total
	 * length, and the loop folds in six payload bits per trail byte.
	 * If any expected trail byte is missing, the lead byte represents
	 * itself.
	 */

	total = totalBytes[byte];
	trail = total - 1;
	if (trail > 0) {
	    ch = byte & (0x3F >> trail);
	    do {
		str++;
		if ((*str & 0xC0) != 0x80) {
		    *chPtr = byte;
		    return 1;
		}
		ch <<= 6;
		ch |= (*str & 0x3F);
		trail--;
	    } while (trail > 0);
	    *chPtr = ch;
	    return total;
	}
    }
#endif

    /*
     * Reached for lead bytes whose sequence length is not supported by
     * this build (table entry of 1): the byte represents itself.
     */

    *chPtr = (Tcl_UniChar) byte;
    return 1;
}

/*
 *---------------------------------------------------------------------------
 *
 * Tcl_UtfToUniCharDString --
 *
 *	Convert the UTF-8 string to Unicode.
 *
 * Results:
 *	The return value is a pointer to the Unicode representation of the
 *	UTF-8 string.  Storage for the return value is appended to the
 *	end of dsPtr.  The Unicode string is terminated with a Unicode
 *	NULL character.
 *
 * Side effects:
 *	None.
 *
 *---------------------------------------------------------------------------
 */

Tcl_UniChar *
Tcl_UtfToUniCharDString(string, length, dsPtr)
    CONST char *string;		/* UTF-8 string to convert to Unicode. */
    int length;			/* Length of UTF-8 string in bytes, or -1
				 * for strlen(). */
    Tcl_DString *dsPtr;		/* Unicode representation of string is
				 * appended to this previously initialized
				 * DString. */
{
    Tcl_UniChar *w, *wString;
    CONST char *p, *end;
    int oldLength;

    if (length < 0) {
	length = (int) strlen(string);
    }

    /*
     * Unicode string length in Tcl_UniChars will be <= UTF-8 string length
     * in bytes.  Only the appended part (plus its terminator) is scaled by
     * the size of a Tcl_UniChar; oldLength is already a byte count.
     */

    oldLength = Tcl_DStringLength(dsPtr);
    Tcl_DStringSetLength(dsPtr,
	    (int) (oldLength + (length + 1) * sizeof(Tcl_UniChar)));

    /*
     * NOTE(review): the result pointer is the DString buffer offset by
     * oldLength bytes; this presumably relies on callers appending to an
     * empty or suitably aligned DString -- confirm.
     */

    wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength);

    w = wString;
    end = string + length;
    for (p = string; p < end; w++) {
	if (Tcl_UtfCharComplete(p, (int) (end - p))) {
	    p += TclUtfToUniChar(p, w);
	} else {
	    /*
	     * The sequence announced by this lead byte would extend past
	     * the given length.  Do not look at bytes beyond "end"; let
	     * the byte represent itself, as the decoder does for a lead
	     * byte that is not followed by its trail bytes.
	     */

	    *w = (Tcl_UniChar) ((unsigned char) *p);
	    p++;
	}
    }
    *w = '\0';
    Tcl_DStringSetLength(dsPtr,
	    oldLength + (int) ((char *) w - (char *) wString));

    return wString;
}

/*
 *---------------------------------------------------------------------------
 *
 * Tcl_UtfCharComplete --
 *
 *	Determine if the UTF-8 string of the given length is long enough
 *	to be decoded by Tcl_UtfToUniChar().  This does not ensure that the
 *	UTF-8 string is properly formed.  Equivalent to Plan 9 fullrune().
 *
 * Results:
 *	The return value is 0 if the string is not long enough, non-zero
 *	otherwise.
 *
 * Side effects:
 *	None.
 *
 *---------------------------------------------------------------------------
 */

int
Tcl_UtfCharComplete(str, len)
    CONST char *str;		/* String to check if first few bytes
				 * contain a complete UTF-8 character. */
    int len;			/* Length of above string in bytes. */
{
    int ch;

    /*
     * Only the first byte is examined: it determines how many bytes the
     * decoder will want to read.
     */

    ch = *((unsigned char *) str);
    return len >= totalBytes[ch];
}

/*
 *---------------------------------------------------------------------------
 *
 * Tcl_NumUtfChars --
 *
 *	Returns the number of characters (not bytes) in the UTF-8 string,
 *	not including the terminating NULL byte.  This is equivalent to
 *	Plan 9 utflen() and utfnlen().
 *
 * Results:
 *	As above.
 *
 * Side effects:
 *	None.
 *
 *---------------------------------------------------------------------------
 */

int
Tcl_NumUtfChars(str, len)
    register CONST char *str;	/* The UTF-8 string to measure. */
    int len;			/* The length of the string in bytes, or -1
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?