⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 unicodeobject.c

📁 python s60 1.4.5版本的源代码
💻 C
📖 第 1 页 / 共 5 页
字号:
    if (size == 0)
        return (PyObject *)unicode;

    /* Unpack UTF-16 encoded data */
    p = unicode->str;
    q = (unsigned char *)s;
    e = q + size;

    if (byteorder)
        bo = *byteorder;

    /* Check for BOM marks (U+FEFF) in the input and adjust current
       byte order setting accordingly. In native mode, the leading BOM
       mark is skipped, in all other modes, it is copied to the output
       stream as-is (giving a ZWNBSP character). */
    if (bo == 0) {
        const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
	if (bom == 0xFEFF) {
	    q += 2;
	    bo = -1;
	}
        else if (bom == 0xFFFE) {
	    q += 2;
	    bo = 1;
	}
#else
	if (bom == 0xFEFF) {
	    q += 2;
	    bo = 1;
	}
        else if (bom == 0xFFFE) {
	    q += 2;
	    bo = -1;
	}
#endif
    }

    if (bo == -1) {
        /* force LE */
        ihi = 1;
        ilo = 0;
    }
    else if (bo == 1) {
        /* force BE */
        ihi = 0;
        ilo = 1;
    }

    while (q < e) {
	Py_UNICODE ch = (q[ihi] << 8) | q[ilo];
	q += 2;

	if (ch < 0xD800 || ch > 0xDFFF) {
	    *p++ = ch;
	    continue;
	}

	/* UTF-16 code pair: */
	if (q >= e) {
	    errmsg = "unexpected end of data";
	    goto utf16Error;
	}
	if (0xD800 <= ch && ch <= 0xDBFF) {
	    Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
	    q += 2;
	    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
#ifndef Py_UNICODE_WIDE
		*p++ = ch;
		*p++ = ch2;
#else
		*p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
#endif
		continue;
	    }
	    else {
                errmsg = "illegal UTF-16 surrogate";
		goto utf16Error;
	    }

	}
	errmsg = "illegal encoding";
	/* Fall through to report the error */

    utf16Error:
	if (utf16_decoding_error(&p, errors, errmsg))
	    goto onError;
    }

    if (byteorder)
        *byteorder = bo;

    /* Adjust length */
    if (_PyUnicode_Resize(&unicode, p - unicode->str))
        goto onError;

    return (PyObject *)unicode;

onError:
    Py_DECREF(unicode);
    return NULL;
}

DL_EXPORT(PyObject *)
PyUnicode_EncodeUTF16(const Py_UNICODE *s,
		      int size,
		      const char *errors,
		      int byteorder)
{
    PyObject *v;
    unsigned char *p;
    int i, pairs;
    /* Offsets from p for storing byte pairs in the right order. */
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
    int ihi = 1, ilo = 0;
#else
    int ihi = 0, ilo = 1;
#endif

#define STORECHAR(CH)                   \
    do {                                \
        p[ihi] = ((CH) >> 8) & 0xff;    \
        p[ilo] = (CH) & 0xff;           \
        p += 2;                         \
    } while(0)

    for (i = pairs = 0; i < size; i++)
	if (s[i] >= 0x10000)
	    pairs++;
    v = PyString_FromStringAndSize(NULL,
		  2 * (size + pairs + (byteorder == 0)));
    if (v == NULL)
        return NULL;

    p = (unsigned char *)PyString_AS_STRING(v);
    if (byteorder == 0)
	STORECHAR(0xFEFF);
    if (size == 0)
        return v;

    if (byteorder == -1) {
        /* force LE */
        ihi = 1;
        ilo = 0;
    }
    else if (byteorder == 1) {
        /* force BE */
        ihi = 0;
        ilo = 1;
    }

    while (size-- > 0) {
	Py_UNICODE ch = *s++;
	Py_UNICODE ch2 = 0;
	if (ch >= 0x10000) {
	    ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
	    ch  = 0xD800 | ((ch-0x10000) >> 10);
	}
        STORECHAR(ch);
        if (ch2)
            STORECHAR(ch2);
    }
    return v;
#undef STORECHAR
}

DL_EXPORT(PyObject *)
PyUnicode_AsUTF16String(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
				 PyUnicode_GET_SIZE(unicode),
				 NULL,
				 0);
}

/* --- Unicode Escape Codec ----------------------------------------------- */

static
int unicodeescape_decoding_error(Py_UNICODE **x,
                                 const char *errors,
                                 const char *details)
{
    if ((errors == NULL) ||
        (strcmp(errors,"strict") == 0)) {
        PyErr_Format(PyExc_UnicodeError,
                     "Unicode-Escape decoding error: %.400s",
                     details);
        return -1;
    }
    else if (strcmp(errors,"ignore") == 0) {
        return 0;
    }
    else if (strcmp(errors,"replace") == 0) {
        **x = Py_UNICODE_REPLACEMENT_CHARACTER;
	(*x)++;
        return 0;
    }
    else {
        PyErr_Format(PyExc_ValueError,
                     "Unicode-Escape decoding error; "
                     "unknown error handling code: %.400s",
                     errors);
        return -1;
    }
}

// static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;

DL_EXPORT(PyObject *)
PyUnicode_DecodeUnicodeEscape(const char *s,
			      int size,
			      const char *errors)
{
    PyUnicodeObject *v;
    Py_UNICODE *p, *buf;
    const char *end;
    char* message;
    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */

    /* Escaped strings will always be longer than the resulting
       Unicode string, so we start with size here and then reduce the
       length after conversion to the true value. */
    v = _PyUnicode_New(size);
    if (v == NULL)
        goto onError;
    if (size == 0)
        return (PyObject *)v;

    p = buf = PyUnicode_AS_UNICODE(v);
    end = s + size;

    while (s < end) {
        unsigned char c;
        Py_UNICODE x;
        int i, digits;

        /* Non-escape characters are interpreted as Unicode ordinals */
        if (*s != '\\') {
            *p++ = (unsigned char) *s++;
            continue;
        }

        /* \ - Escapes */
        s++;
        switch (*s++) {

        /* \x escapes */
        case '\n': break;
        case '\\': *p++ = '\\'; break;
        case '\'': *p++ = '\''; break;
        case '\"': *p++ = '\"'; break;
        case 'b': *p++ = '\b'; break;
        case 'f': *p++ = '\014'; break; /* FF */
        case 't': *p++ = '\t'; break;
        case 'n': *p++ = '\n'; break;
        case 'r': *p++ = '\r'; break;
        case 'v': *p++ = '\013'; break; /* VT */
        case 'a': *p++ = '\007'; break; /* BEL, not classic C */

        /* \OOO (octal) escapes */
        case '0': case '1': case '2': case '3':
        case '4': case '5': case '6': case '7':
            x = s[-1] - '0';
            if ('0' <= *s && *s <= '7') {
                x = (x<<3) + *s++ - '0';
                if ('0' <= *s && *s <= '7')
                    x = (x<<3) + *s++ - '0';
            }
            *p++ = x;
            break;

        /* hex escapes */
        /* \xXX */
        case 'x':
            digits = 2;
            message = "truncated \\xXX escape";
            goto hexescape;

        /* \uXXXX */
        case 'u':
            digits = 4;
            message = "truncated \\uXXXX escape";
            goto hexescape;

        /* \UXXXXXXXX */
        case 'U':
            digits = 8;
            message = "truncated \\UXXXXXXXX escape";
        hexescape:
            chr = 0;
            for (i = 0; i < digits; i++) {
                c = (unsigned char) s[i];
                if (!isxdigit(c)) {
                    if (unicodeescape_decoding_error(&p, errors, message))
                        goto onError;
                    chr = 0xffffffff;
                    i++;
                    break;
                }
                chr = (chr<<4) & ~0xF;
                if (c >= '0' && c <= '9')
                    chr += c - '0';
                else if (c >= 'a' && c <= 'f')
                    chr += 10 + c - 'a';
                else
                    chr += 10 + c - 'A';
            }
            s += i;
	    if (chr == 0xffffffff)
		    /* _decoding_error will have already written into the
		       target buffer. */
		    break;
        store:
            /* when we get here, chr is a 32-bit unicode character */
            if (chr <= 0xffff)
                /* UCS-2 character */
                *p++ = (Py_UNICODE) chr;
            else if (chr <= 0x10ffff) {
                /* UCS-4 character. Either store directly, or as
		   surrogate pair. */
#ifdef Py_UNICODE_WIDE
                *p++ = chr;
#else
                chr -= 0x10000L;
                *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
                *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
#endif
            } else {
                if (unicodeescape_decoding_error(
                    &p, errors,
                    "illegal Unicode character")
                    )
                    goto onError;
            }
            break;

        /* \N{name} */
        case 'N':
            message = "malformed \\N character escape";
            if (PY_GLOB(ucnhash_CAPI) == NULL) {
                /* load the unicode data module */
                PyObject *m, *v;
                m = PyImport_ImportModule("unicodedata");
                if (m == NULL)
                    goto ucnhashError;
                v = PyObject_GetAttrString(m, "ucnhash_CAPI");
                Py_DECREF(m);
                if (v == NULL)
                    goto ucnhashError;
                PY_GLOB(ucnhash_CAPI) = PyCObject_AsVoidPtr(v);
                Py_DECREF(v);
                if (PY_GLOB(ucnhash_CAPI) == NULL)
                    goto ucnhashError;
            }
            if (*s == '{') {
                const char *start = s+1;
                /* look for the closing brace */
                while (*s != '}' && s < end)
                    s++;
                if (s > start && s < end && *s == '}') {
                    /* found a name.  look it up in the unicode database */
                    message = "unknown Unicode character name";
                    s++;
                    if (((_PyUnicode_Name_CAPI *)
			 PY_GLOB(ucnhash_CAPI))->getcode(start, s-start-1, &chr))
                        goto store;
                }
            }
            if (unicodeescape_decoding_error(&p, errors, message))
                goto onError;
            break;

        default:
	    if (s > end) {
		if (unicodeescape_decoding_error(&p, errors, "\\ at end of string"))
		    goto onError;
	    }
	    else {
		*p++ = '\\';
		*p++ = (unsigned char)s[-1];
	    }
            break;
        }
    }
    if (_PyUnicode_Resize(&v, (int)(p - buf)))
		goto onError;
    return (PyObject *)v;

ucnhashError:
    PyErr_SetString(
        PyExc_UnicodeError,
        "\\N escapes not supported (can't load unicodedata module)"
        );
    return NULL;

onError:
    Py_XDECREF(v);
    return NULL;
}

/* Return a Unicode-Escape string version of the Unicode object.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.

*/

static const Py_UNICODE *findchar(const Py_UNICODE *s,
				  int size,
				  Py_UNICODE ch);

static
PyObject *unicodeescape_string(const Py_UNICODE *s,
                               int size,
                               int quotes)
{
    PyObject *repr;
    char *p;

    static const char *const hexdigit = "0123456789abcdef";

    repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
    if (repr == NULL)
        return NULL;

    p = PyString_AS_STRING(repr);

    if (quotes) {
        *p++ = 'u';
        *p++ = (findchar(s, size, '\'') &&
                !findchar(s, size, '"')) ? '"' : '\'';
    }
    while (size-- > 0) {
        Py_UNICODE ch = *s++;

        /* Escape quotes */
        if (quotes &&
	    (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
            *p++ = '\\';
            *p++ = (char) ch;
	    continue;
        }

#ifdef Py_UNICODE_WIDE
        /* Map 21-bit characters to '\U00xxxxxx' */
        else if (ch >= 0x10000) {
	    int offset = p - PyString_AS_STRING(repr);

	    /* Resize the string if necessary */
	    if (offset + 12 > PyString_GET_SIZE(repr)) {
		if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
		    return NULL;
		p = PyString_AS_STRING(repr) + offset;
	    }

            *p++ = '\\';
            *p++ = 'U';
            *p++ = hexdigit[(ch >> 28) & 0x0000000F];
            *p++ = hexdigit[(ch >> 24) & 0x0000000F];
            *p++ = hexdigit[(ch >> 20) & 0x0000000F];
            *p++ = hexdigit[(ch >> 16) & 0x0000000F];
            *p++ = hexdigit[(ch >> 12) & 0x0000000F];
            *p++ = hexdigit[(ch >> 8) & 0x0000000F];
            *p++ = hexdigit[(ch >> 4) & 0x0000000F];
            *p++ = hexdigit[ch & 0x0000000F];
	    continue;
        }
#endif
	/* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
	else if (ch >= 0xD800 && ch < 0xDC00) {
	    Py_UNICODE ch2;
	    Py_UCS4 ucs;

	    ch2 = *s++;
	    size--;
	    if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
		ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
		*p++ = '\\';

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -