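/* unicodeobject.c -- excerpt as shown by a web code viewer (the viewer's
   "Font size:" control label followed the file name here). */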
if (size == 0)
return (PyObject *)unicode;
/* Unpack UTF-16 encoded data */
p = unicode->str;
q = (unsigned char *)s;
e = q + size;
if (byteorder)
bo = *byteorder;
/* Check for BOM marks (U+FEFF) in the input and adjust current
byte order setting accordingly. In native mode, the leading BOM
mark is skipped, in all other modes, it is copied to the output
stream as-is (giving a ZWNBSP character). */
if (bo == 0) {
const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
if (bom == 0xFEFF) {
q += 2;
bo = -1;
}
else if (bom == 0xFFFE) {
q += 2;
bo = 1;
}
#else
if (bom == 0xFEFF) {
q += 2;
bo = 1;
}
else if (bom == 0xFFFE) {
q += 2;
bo = -1;
}
#endif
}
if (bo == -1) {
/* force LE */
ihi = 1;
ilo = 0;
}
else if (bo == 1) {
/* force BE */
ihi = 0;
ilo = 1;
}
while (q < e) {
Py_UNICODE ch = (q[ihi] << 8) | q[ilo];
q += 2;
if (ch < 0xD800 || ch > 0xDFFF) {
*p++ = ch;
continue;
}
/* UTF-16 code pair: */
if (q >= e) {
errmsg = "unexpected end of data";
goto utf16Error;
}
if (0xD800 <= ch && ch <= 0xDBFF) {
Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
q += 2;
if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
#ifndef Py_UNICODE_WIDE
*p++ = ch;
*p++ = ch2;
#else
*p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
#endif
continue;
}
else {
errmsg = "illegal UTF-16 surrogate";
goto utf16Error;
}
}
errmsg = "illegal encoding";
/* Fall through to report the error */
utf16Error:
if (utf16_decoding_error(&p, errors, errmsg))
goto onError;
}
if (byteorder)
*byteorder = bo;
/* Adjust length */
if (_PyUnicode_Resize(&unicode, p - unicode->str))
goto onError;
return (PyObject *)unicode;
onError:
Py_DECREF(unicode);
return NULL;
}
/* Encode the `size` Py_UNICODE items starting at `s` as UTF-16 and return
   the bytes in a new string object, or NULL if the string object could not
   be created.

   byteorder selects the layout of each 16-bit unit:
       -1   little endian, no BOM
        1   big endian, no BOM
        0   native byte order, preceded by a BOM (U+FEFF)
   Any other value produces native byte order without a BOM.

   Items >= 0x10000 are written as a surrogate pair (two 16-bit units).
   The `errors` argument is accepted but not consulted here.

   NOTE(review): the output size is computed with plain int arithmetic;
   whether callers can pass sizes large enough to overflow it should be
   confirmed. */
DL_EXPORT(PyObject *)
PyUnicode_EncodeUTF16(const Py_UNICODE *s,
                      int size,
                      const char *errors,
                      int byteorder)
{
    PyObject *result;
    unsigned char *out;
    int idx;
    int npairs = 0;
    const int with_bom = (byteorder == 0);
    /* Offsets from `out` for the high and low byte of a unit; these start
       out describing the native byte order. */
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
    int ihi = 1, ilo = 0;
#else
    int ihi = 0, ilo = 1;
#endif

/* Write one 16-bit unit at `out` and advance past it. */
#define STORECHAR(CH)                   \
    do {                                \
        out[ihi] = ((CH) >> 8) & 0xff;  \
        out[ilo] = (CH) & 0xff;         \
        out += 2;                       \
    } while(0)

    /* Every item outside the BMP needs one additional 16-bit unit. */
    for (idx = 0; idx < size; idx++) {
        if (s[idx] >= 0x10000)
            npairs++;
    }

    result = PyString_FromStringAndSize(NULL,
                                        2 * (size + npairs + with_bom));
    if (result == NULL)
        return NULL;
    out = (unsigned char *)PyString_AS_STRING(result);

    /* The BOM is emitted before any forced byte order is applied, i.e. in
       native order (it is only written when byteorder == 0 anyway). */
    if (with_bom)
        STORECHAR(0xFEFF);
    if (size == 0)
        return result;

    switch (byteorder) {
    case -1:
        /* force LE */
        ihi = 1;
        ilo = 0;
        break;
    case 1:
        /* force BE */
        ihi = 0;
        ilo = 1;
        break;
    default:
        /* keep the native order chosen above */
        break;
    }

    for (idx = 0; idx < size; idx++) {
        const Py_UNICODE ch = s[idx];

        if (ch >= 0x10000) {
            /* Split into lead (high) and trail (low) surrogate. */
            const Py_UNICODE lead = 0xD800 | ((ch-0x10000) >> 10);
            const Py_UNICODE trail = 0xDC00 | ((ch-0x10000) & 0x3FF);

            STORECHAR(lead);
            STORECHAR(trail);
        }
        else {
            STORECHAR(ch);
        }
    }
    return result;
#undef STORECHAR
}
/* Encode a Unicode object as UTF-16 by handing its buffer and length to
   PyUnicode_EncodeUTF16() with errors == NULL and byteorder == 0.

   If `unicode` does not pass PyUnicode_Check(), PyErr_BadArgument() is
   called and NULL is returned. */
DL_EXPORT(PyObject *)
PyUnicode_AsUTF16String(PyObject *unicode)
{
    if (PyUnicode_Check(unicode)) {
        return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
                                     PyUnicode_GET_SIZE(unicode),
                                     NULL,
                                     0);
    }

    PyErr_BadArgument();
    return NULL;
}
/* --- Unicode Escape Codec ----------------------------------------------- */
/* Apply the error handling scheme named by `errors` to a Unicode-Escape
   decoding problem described by `details`.

   errors == NULL or "strict":  set UnicodeError, return -1
   "ignore":                    do nothing, return 0
   "replace":                   store Py_UNICODE_REPLACEMENT_CHARACTER at
                                **x, advance *x by one item, return 0
   anything else:               set ValueError, return -1

   `x` points at the caller's output cursor; it is only touched in the
   "replace" case. */
static
int unicodeescape_decoding_error(Py_UNICODE **x,
                                 const char *errors,
                                 const char *details)
{
    /* A missing scheme means "strict". */
    if (errors == NULL || strcmp(errors, "strict") == 0) {
        PyErr_Format(PyExc_UnicodeError,
                     "Unicode-Escape decoding error: %.400s",
                     details);
        return -1;
    }

    if (strcmp(errors, "ignore") == 0)
        return 0;

    if (strcmp(errors, "replace") == 0) {
        Py_UNICODE *cursor = *x;

        *cursor++ = Py_UNICODE_REPLACEMENT_CHARACTER;
        *x = cursor;
        return 0;
    }

    PyErr_Format(PyExc_ValueError,
                 "Unicode-Escape decoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
// static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
/* Decode `size` bytes of Unicode-Escape encoded data starting at `s` into a
   new Unicode object.

   s       -- start of the encoded input
   size    -- number of input bytes
   errors  -- error handling scheme, passed on to
              unicodeescape_decoding_error() (NULL means "strict")

   Returns the decoded object, or NULL on failure.

   Recognised escapes: \<newline>, \\, \', \", \b, \f, \t, \n, \r, \v, \a,
   \OOO (one to three octal digits), \xXX, \uXXXX, \UXXXXXXXX and \N{name}.
   Any other backslash sequence is copied through unchanged.  \N{name} is
   resolved through the getcode() entry of the unicodedata module's
   ucnhash_CAPI, which is loaded on first use.

   Input is only read inside [s, s + size); no terminating NUL byte is
   assumed.  The result object is released on every failure exit. */
DL_EXPORT(PyObject *)
PyUnicode_DecodeUnicodeEscape(const char *s,
                              int size,
                              const char *errors)
{
    PyUnicodeObject *v;
    Py_UNICODE *p, *buf;
    const char *end;
    const char *message;
    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */

    /* Escaped strings will always be longer than the resulting
       Unicode string, so we start with size here and then reduce the
       length after conversion to the true value. */
    v = _PyUnicode_New(size);
    if (v == NULL)
        goto onError;
    if (size == 0)
        return (PyObject *)v;
    p = buf = PyUnicode_AS_UNICODE(v);
    end = s + size;

    while (s < end) {
        unsigned char c;
        Py_UNICODE x;
        int i, digits;
        int bad_escape;

        /* Non-escape characters are interpreted as Unicode ordinals */
        if (*s != '\\') {
            *p++ = (unsigned char) *s++;
            continue;
        }

        /* \ - Escapes */
        s++;
        if (s >= end) {
            /* The backslash was the last input byte.  Report it here
               instead of peeking at the byte behind the buffer. */
            if (unicodeescape_decoding_error(&p, errors,
                                             "\\ at end of string"))
                goto onError;
            break;
        }

        switch (*s++) {

        /* \x escapes */
        case '\n': break;
        case '\\': *p++ = '\\'; break;
        case '\'': *p++ = '\''; break;
        case '\"': *p++ = '\"'; break;
        case 'b': *p++ = '\b'; break;
        case 'f': *p++ = '\014'; break; /* FF */
        case 't': *p++ = '\t'; break;
        case 'n': *p++ = '\n'; break;
        case 'r': *p++ = '\r'; break;
        case 'v': *p++ = '\013'; break; /* VT */
        case 'a': *p++ = '\007'; break; /* BEL, not classic C */

        /* \OOO (octal) escapes */
        case '0': case '1': case '2': case '3':
        case '4': case '5': case '6': case '7':
            x = s[-1] - '0';
            /* Up to two more octal digits, as long as input remains. */
            if (s < end && '0' <= *s && *s <= '7') {
                x = (x<<3) + *s++ - '0';
                if (s < end && '0' <= *s && *s <= '7')
                    x = (x<<3) + *s++ - '0';
            }
            *p++ = x;
            break;

        /* hex escapes */
        /* \xXX */
        case 'x':
            digits = 2;
            message = "truncated \\xXX escape";
            goto hexescape;

        /* \uXXXX */
        case 'u':
            digits = 4;
            message = "truncated \\uXXXX escape";
            goto hexescape;

        /* \UXXXXXXXX */
        case 'U':
            digits = 8;
            message = "truncated \\UXXXXXXXX escape";
        hexescape:
            chr = 0;
            /* A separate flag marks a failed escape, so that a well-formed
               \UFFFFFFFF is not mistaken for one; it reaches the range
               check below and is reported as an illegal character. */
            bad_escape = 0;
            for (i = 0; i < digits; i++) {
                int at_end = (end - s <= i);

                c = at_end ? 0 : (unsigned char) s[i];
                if (at_end || !isxdigit(c)) {
                    if (unicodeescape_decoding_error(&p, errors, message))
                        goto onError;
                    bad_escape = 1;
                    /* Skip the offending character as well, but never
                       step beyond the end of the input. */
                    if (!at_end)
                        i++;
                    break;
                }
                chr = (chr<<4) & ~0xF;
                if (c >= '0' && c <= '9')
                    chr += c - '0';
                else if (c >= 'a' && c <= 'f')
                    chr += 10 + c - 'a';
                else
                    chr += 10 + c - 'A';
            }
            s += i;
            if (bad_escape)
                /* _decoding_error will have already written into the
                   target buffer. */
                break;
        store:
            /* when we get here, chr is a 32-bit unicode character */
            if (chr <= 0xffff)
                /* UCS-2 character */
                *p++ = (Py_UNICODE) chr;
            else if (chr <= 0x10ffff) {
                /* UCS-4 character. Either store directly, or as
                   surrogate pair. */
#ifdef Py_UNICODE_WIDE
                *p++ = chr;
#else
                chr -= 0x10000L;
                *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
                *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
#endif
            } else {
                if (unicodeescape_decoding_error(
                        &p, errors,
                        "illegal Unicode character")
                    )
                    goto onError;
            }
            break;

        /* \N{name} */
        case 'N':
            message = "malformed \\N character escape";
            if (PY_GLOB(ucnhash_CAPI) == NULL) {
                /* load the unicode data module */
                PyObject *mod, *capi;

                mod = PyImport_ImportModule("unicodedata");
                if (mod == NULL)
                    goto ucnhashError;
                capi = PyObject_GetAttrString(mod, "ucnhash_CAPI");
                Py_DECREF(mod);
                if (capi == NULL)
                    goto ucnhashError;
                PY_GLOB(ucnhash_CAPI) = PyCObject_AsVoidPtr(capi);
                Py_DECREF(capi);
                if (PY_GLOB(ucnhash_CAPI) == NULL)
                    goto ucnhashError;
            }
            if (s < end && *s == '{') {
                const char *start = s+1;
                /* look for the closing brace */
                while (s < end && *s != '}')
                    s++;
                if (s > start && s < end && *s == '}') {
                    /* found a name.  look it up in the unicode database */
                    message = "unknown Unicode character name";
                    s++;
                    if (((_PyUnicode_Name_CAPI *)
                         PY_GLOB(ucnhash_CAPI))->getcode(start, s-start-1, &chr))
                        goto store;
                }
            }
            if (unicodeescape_decoding_error(&p, errors, message))
                goto onError;
            break;

        default:
            /* Unknown escape: keep the backslash and the character. */
            *p++ = '\\';
            *p++ = (unsigned char)s[-1];
            break;
        }
    }
    if (_PyUnicode_Resize(&v, (int)(p - buf)))
        goto onError;
    return (PyObject *)v;

ucnhashError:
    PyErr_SetString(
        PyExc_UnicodeError,
        "\\N escapes not supported (can't load unicodedata module)"
        );
    /* Fall through so the partially filled result is released too. */

onError:
    Py_XDECREF(v);
    return NULL;
}
/* Return a Unicode-Escape string version of the Unicode object.
If quotes is true, the string is enclosed in u"" or u'' quotes as
appropriate.
*/
static const Py_UNICODE *findchar(const Py_UNICODE *s,
int size,
Py_UNICODE ch);
static
PyObject *unicodeescape_string(const Py_UNICODE *s,
int size,
int quotes)
{
PyObject *repr;
char *p;
static const char *const hexdigit = "0123456789abcdef";
repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
if (repr == NULL)
return NULL;
p = PyString_AS_STRING(repr);
if (quotes) {
*p++ = 'u';
*p++ = (findchar(s, size, '\'') &&
!findchar(s, size, '"')) ? '"' : '\'';
}
while (size-- > 0) {
Py_UNICODE ch = *s++;
/* Escape quotes */
if (quotes &&
(ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
*p++ = '\\';
*p++ = (char) ch;
continue;
}
#ifdef Py_UNICODE_WIDE
/* Map 21-bit characters to '\U00xxxxxx' */
else if (ch >= 0x10000) {
int offset = p - PyString_AS_STRING(repr);
/* Resize the string if necessary */
if (offset + 12 > PyString_GET_SIZE(repr)) {
if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
return NULL;
p = PyString_AS_STRING(repr) + offset;
}
*p++ = '\\';
*p++ = 'U';
*p++ = hexdigit[(ch >> 28) & 0x0000000F];
*p++ = hexdigit[(ch >> 24) & 0x0000000F];
*p++ = hexdigit[(ch >> 20) & 0x0000000F];
*p++ = hexdigit[(ch >> 16) & 0x0000000F];
*p++ = hexdigit[(ch >> 12) & 0x0000000F];
*p++ = hexdigit[(ch >> 8) & 0x0000000F];
*p++ = hexdigit[(ch >> 4) & 0x0000000F];
*p++ = hexdigit[ch & 0x0000000F];
continue;
}
#endif
/* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
else if (ch >= 0xD800 && ch < 0xDC00) {
Py_UNICODE ch2;
Py_UCS4 ucs;
ch2 = *s++;
size--;
if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
*p++ = '\\';
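/* Code-viewer keyboard shortcut help (web page residue, not part of the
   C source):
     Copy code              Ctrl + C
     Search code            Ctrl + F
     Full-screen mode       F11
     Toggle theme           Ctrl + Shift + D
     Show shortcuts         ?
     Increase font size     Ctrl + =
     Decrease font size     Ctrl + -
*/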