tclutf.c

来自「tcl是工具命令语言」· C语言 代码 · 共 1,949 行 · 第 1/4 页

C
1,949
字号
/*
 * tclUtf.c --
 *
 *	Routines for manipulating UTF-8 strings.
 *
 * Copyright (c) 1997-1998 Sun Microsystems, Inc.
 *
 * See the file "license.terms" for information on usage and redistribution
 * of this file, and for a DISCLAIMER OF ALL WARRANTIES.
 *
 * RCS: @(#) $Id: tclUtf.c,v 1.30 2003/02/18 02:25:45 hobbs Exp $
 */

#include "tclInt.h"

/*
 * Include the static character classification tables and macros.
 */

#include "tclUniData.c"

/*
 * The following macros are used for fast character category tests.  The
 * x_BITS values are shifted right by the category value to determine whether
 * the given category is included in the set.
 *
 * Each x_BITS macro is an int bit mask in which bit N is set when category
 * number N belongs to the class.  The category constants themselves
 * (UPPERCASE_LETTER, ...) are not defined in this part of the file;
 * presumably they come from tclUniData.c -- confirm.
 */

/*
 * ALPHA_BITS: the five letter categories.
 */

#define ALPHA_BITS ((1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER) \
    | (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER) | (1 << OTHER_LETTER))

/*
 * DIGIT_BITS: decimal digits only.
 */

#define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER)

/*
 * SPACE_BITS: the space, line and paragraph separator categories.
 */

#define SPACE_BITS ((1 << SPACE_SEPARATOR) | (1 << LINE_SEPARATOR) \
    | (1 << PARAGRAPH_SEPARATOR))

/*
 * CONNECTOR_BITS: connector punctuation only.
 */

#define CONNECTOR_BITS (1 << CONNECTOR_PUNCTUATION)

/*
 * PRINT_BITS: union of the alpha, digit and space sets with the mark,
 * number, punctuation and symbol categories listed below.
 */

#define PRINT_BITS (ALPHA_BITS | DIGIT_BITS | SPACE_BITS | \
	    (1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | \
	    (1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) | \
	    (1 << OTHER_NUMBER) | (1 << CONNECTOR_PUNCTUATION) | \
	    (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \
	    (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \
	    (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION) | \
	    (1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | \
	    (1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL))

/*
 * PUNCT_BITS: the seven punctuation categories.
 */

#define PUNCT_BITS ((1 << CONNECTOR_PUNCTUATION) | \
	    (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \
	    (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \
	    (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION))

/*
 * Unicode characters less than this value are represented by themselves 
 * in UTF-8 strings. 
 */

#define UNICODE_SELF	0x80

/*
 * The following structures are used when mapping between Unicode (UCS-2)
 * and UTF-8.
*/static CONST unsigned char totalBytes[256] = {    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,#if TCL_UTF_MAX > 3    4,4,4,4,4,4,4,4,#else    1,1,1,1,1,1,1,1,#endif#if TCL_UTF_MAX > 4    5,5,5,5,#else    1,1,1,1,#endif#if TCL_UTF_MAX > 5    6,6,6,6#else    1,1,1,1#endif};/* * Procedures used only in this module. */static int UtfCount _ANSI_ARGS_((int ch));/* *--------------------------------------------------------------------------- * * UtfCount -- * *	Find the number of bytes in the Utf character "ch". * * Results: *	The return values is the number of bytes in the Utf character "ch". * * Side effects: *	None. * *--------------------------------------------------------------------------- */ INLINE static intUtfCount(ch)    int ch;			/* The Tcl_UniChar whose size is returned. */{    if ((ch > 0) && (ch < UNICODE_SELF)) {	return 1;    }    if (ch <= 0x7FF) {	return 2;    }    if (ch <= 0xFFFF) {	return 3;    }#if TCL_UTF_MAX > 3    if (ch <= 0x1FFFFF) {	return 4;    }    if (ch <= 0x3FFFFFF) {	return 5;    }    if (ch <= 0x7FFFFFFF) {	return 6;    }#endif    return 3;}/* *--------------------------------------------------------------------------- * * Tcl_UniCharToUtf -- * *	Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the *	provided buffer.  Equivalent to Plan 9 runetochar(). * * Results: *	The return values is the number of bytes in the buffer that *	were consumed.   * * Side effects: *	None. 
* *--------------------------------------------------------------------------- */ INLINE intTcl_UniCharToUtf(ch, str)    int ch;			/* The Tcl_UniChar to be stored in the				 * buffer. */    char *str;			/* Buffer in which the UTF-8 representation				 * of the Tcl_UniChar is stored.  Buffer must				 * be large enough to hold the UTF-8 character				 * (at most TCL_UTF_MAX bytes). */{    if ((ch > 0) && (ch < UNICODE_SELF)) {	str[0] = (char) ch;	return 1;    }    if (ch <= 0x7FF) {	str[1] = (char) ((ch | 0x80) & 0xBF);	str[0] = (char) ((ch >> 6) | 0xC0);	return 2;    }    if (ch <= 0xFFFF) {	three:	str[2] = (char) ((ch | 0x80) & 0xBF);	str[1] = (char) (((ch >> 6) | 0x80) & 0xBF);	str[0] = (char) ((ch >> 12) | 0xE0);	return 3;    }#if TCL_UTF_MAX > 3    if (ch <= 0x1FFFFF) {	str[3] = (char) ((ch | 0x80) & 0xBF);	str[2] = (char) (((ch >> 6) | 0x80) & 0xBF);	str[1] = (char) (((ch >> 12) | 0x80) & 0xBF);	str[0] = (char) ((ch >> 18) | 0xF0);	return 4;    }    if (ch <= 0x3FFFFFF) {	str[4] = (char) ((ch | 0x80) & 0xBF);	str[3] = (char) (((ch >> 6) | 0x80) & 0xBF);	str[2] = (char) (((ch >> 12) | 0x80) & 0xBF);	str[1] = (char) (((ch >> 18) | 0x80) & 0xBF);	str[0] = (char) ((ch >> 24) | 0xF8);	return 5;    }    if (ch <= 0x7FFFFFFF) {	str[5] = (char) ((ch | 0x80) & 0xBF);	str[4] = (char) (((ch >> 6) | 0x80) & 0xBF);	str[3] = (char) (((ch >> 12) | 0x80) & 0xBF);	str[2] = (char) (((ch >> 18) | 0x80) & 0xBF);	str[1] = (char) (((ch >> 24) | 0x80) & 0xBF);	str[0] = (char) ((ch >> 30) | 0xFC);	return 6;    }#endif    ch = 0xFFFD;    goto three;}/* *--------------------------------------------------------------------------- * * Tcl_UniCharToUtfDString -- * *	Convert the given Unicode string to UTF-8. * * Results: *	The return value is a pointer to the UTF-8 representation of the *	Unicode string.  Storage for the return value is appended to the *	end of dsPtr. * * Side effects: *	None. 
* *--------------------------------------------------------------------------- */ char *Tcl_UniCharToUtfDString(wString, numChars, dsPtr)    CONST Tcl_UniChar *wString;	/* Unicode string to convert to UTF-8. */    int numChars;		/* Length of Unicode string in Tcl_UniChars				 * (must be >= 0). */    Tcl_DString *dsPtr;		/* UTF-8 representation of string is				 * appended to this previously initialized				 * DString. */{    CONST Tcl_UniChar *w, *wEnd;    char *p, *string;    int oldLength;    /*     * UTF-8 string length in bytes will be <= Unicode string length *     * TCL_UTF_MAX.     */    oldLength = Tcl_DStringLength(dsPtr);    Tcl_DStringSetLength(dsPtr, (oldLength + numChars + 1) * TCL_UTF_MAX);    string = Tcl_DStringValue(dsPtr) + oldLength;    p = string;    wEnd = wString + numChars;    for (w = wString; w < wEnd; ) {	p += Tcl_UniCharToUtf(*w, p);	w++;    }    Tcl_DStringSetLength(dsPtr, oldLength + (p - string));    return string;}/* *--------------------------------------------------------------------------- * * Tcl_UtfToUniChar -- * *	Extract the Tcl_UniChar represented by the UTF-8 string.  Bad *	UTF-8 sequences are converted to valid Tcl_UniChars and processing *	continues.  Equivalent to Plan 9 chartorune(). * *	The caller must ensure that the source buffer is long enough that *	this routine does not run off the end and dereference non-existent *	memory looking for trail bytes.  If the source buffer is known to *	be '\0' terminated, this cannot happen.  Otherwise, the caller *	should call Tcl_UtfCharComplete() before calling this routine to *	ensure that enough bytes remain in the string. * * Results: *	*chPtr is filled with the Tcl_UniChar, and the return value is the *	number of bytes from the UTF-8 string that were consumed. * * Side effects: *	None. * *--------------------------------------------------------------------------- */ intTcl_UtfToUniChar(str, chPtr)    register CONST char *str;	 /* The UTF-8 string. 
*/    register Tcl_UniChar *chPtr; /* Filled with the Tcl_UniChar represented				  * by the UTF-8 string. */{    register int byte;        /*     * Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones.     */    byte = *((unsigned char *) str);    if (byte < 0xC0) {	/*	 * Handles properly formed UTF-8 characters between 0x01 and 0x7F.	 * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid	 * characters representing themselves.	 */	*chPtr = (Tcl_UniChar) byte;	return 1;    } else if (byte < 0xE0) {	if ((str[1] & 0xC0) == 0x80) {	    /*	     * Two-byte-character lead-byte followed by a trail-byte.	     */	    *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (str[1] & 0x3F));	    return 2;	}	/*	 * A two-byte-character lead-byte not followed by trail-byte	 * represents itself.	 */	*chPtr = (Tcl_UniChar) byte;	return 1;    } else if (byte < 0xF0) {	if (((str[1] & 0xC0) == 0x80) && ((str[2] & 0xC0) == 0x80)) {	    /*	     * Three-byte-character lead byte followed by two trail bytes.	     */	    *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12) 		    | ((str[1] & 0x3F) << 6) | (str[2] & 0x3F));	    return 3;	}	/*	 * A three-byte-character lead-byte not followed by two trail-bytes	 * represents itself.	 */	*chPtr = (Tcl_UniChar) byte;	return 1;    }#if TCL_UTF_MAX > 3    else {	int ch, total, trail;	total = totalBytes[byte];	trail = total - 1;	if (trail > 0) {	    ch = byte & (0x3F >> trail);	    do {		str++;		if ((*str & 0xC0) != 0x80) {		    *chPtr = byte;		    return 1;		}		ch <<= 6;		ch |= (*str & 0x3F);		trail--;	    } while (trail > 0);	    *chPtr = ch;	    return total;	}    }#endif    *chPtr = (Tcl_UniChar) byte;    return 1;}/* *--------------------------------------------------------------------------- * * Tcl_UtfToUniCharDString -- * *	Convert the UTF-8 string to Unicode. * * Results: *	The return value is a pointer to the Unicode representation of the *	UTF-8 string.  Storage for the return value is appended to the *	end of dsPtr.  
The Unicode string is terminated with a Unicode *	NULL character. * * Side effects: *	None. * *--------------------------------------------------------------------------- */Tcl_UniChar *Tcl_UtfToUniCharDString(string, length, dsPtr)    CONST char *string;		/* UTF-8 string to convert to Unicode. */    int length;			/* Length of UTF-8 string in bytes, or -1				 * for strlen(). */    Tcl_DString *dsPtr;		/* Unicode representation of string is				 * appended to this previously initialized				 * DString. */{    Tcl_UniChar *w, *wString;    CONST char *p, *end;    int oldLength;    if (length < 0) {	length = strlen(string);    }    /*     * Unicode string length in Tcl_UniChars will be <= UTF-8 string length     * in bytes.     */    oldLength = Tcl_DStringLength(dsPtr);    Tcl_DStringSetLength(dsPtr,	    (int) ((oldLength + length + 1) * sizeof(Tcl_UniChar)));    wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength);    w = wString;    end = string + length;    for (p = string; p < end; ) {	p += TclUtfToUniChar(p, w);	w++;    }    *w = '\0';    Tcl_DStringSetLength(dsPtr,	    (oldLength + ((char *) w - (char *) wString)));    return wString;}/* *--------------------------------------------------------------------------- * * Tcl_UtfCharComplete -- * *	Determine if the UTF-8 string of the given length is long enough *	to be decoded by Tcl_UtfToUniChar().  This does not ensure that the *	UTF-8 string is properly formed.  Equivalent to Plan 9 fullrune(). * * Results: *	The return value is 0 if the string is not long enough, non-zero *	otherwise. * * Side effects: *	None. * *--------------------------------------------------------------------------- */intTcl_UtfCharComplete(str, len)    CONST char *str;		/* String to check if first few bytes				 * contain a complete UTF-8 character. */    int len;			/* Length of above string in bytes. 
*/{    int ch;    ch = *((unsigned char *) str);    return len >= totalBytes[ch];}/* *--------------------------------------------------------------------------- * * Tcl_NumUtfChars -- * *	Returns the number of characters (not bytes) in the UTF-8 string, *	not including the terminating NULL byte.  This is equivalent to *	Plan 9 utflen() and utfnlen(). * * Results: *	As above.   * * Side effects: *	None. * *--------------------------------------------------------------------------- */ int Tcl_NumUtfChars(str, len)    register CONST char *str;	/* The UTF-8 string to measure. */    int len;			/* The length of the string in bytes, or -1

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?