📄 unicodeobject.c
字号:
start = out = PyString_AS_STRING(v);
for (;i < size; ++i) {
Py_UNICODE ch = s[i];
if (!inShift) {
if (ch == '+') {
*out++ = '+';
*out++ = '-';
} else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
charsleft = ch;
bitsleft = 16;
*out++ = '+';
/* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
inShift = bitsleft > 0;
} else {
*out++ = (char) ch;
}
} else {
if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
*out++ = B64(charsleft << (6-bitsleft));
charsleft = 0;
bitsleft = 0;
/* Characters not in the BASE64 set implicitly unshift the sequence
so no '-' is required, except if the character is itself a '-' */
if (B64CHAR(ch) || ch == '-') {
*out++ = '-';
}
inShift = 0;
*out++ = (char) ch;
} else {
bitsleft += 16;
charsleft = (charsleft << 16) | ch;
/* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
/* If the next character is special then we don't need to terminate
the shift sequence. If the next character is not a BASE64 character
or '-' then the shift sequence will be terminated implicitly and we
don't have to insert a '-'. */
if (bitsleft == 0) {
if (i + 1 < size) {
Py_UNICODE ch2 = s[i+1];
if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
} else if (B64CHAR(ch2) || ch2 == '-') {
*out++ = '-';
inShift = 0;
} else {
inShift = 0;
}
}
else {
*out++ = '-';
inShift = 0;
}
}
}
}
}
if (bitsleft) {
*out++= B64(charsleft << (6-bitsleft) );
*out++ = '-';
}
_PyString_Resize(&v, out - start);
return v;
}
#undef SPECIAL
#undef B64
#undef B64CHAR
#undef UB64
#undef ENCODE
#undef DECODE
/* --- UTF-8 Codec -------------------------------------------------------- */
/* Lookup table indexed by the first byte of a UTF-8 sequence; the entry is
   the total number of bytes in that sequence.  An entry of zero marks a byte
   that cannot start a sequence.  See RFC 2279 for details. */
static const char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single-byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: continuation bytes, never a valid prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xF7: four, 0xF8 - 0xFB: five, 0xFC - 0xFD: six,
       0xFE - 0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
/* Apply the caller-selected error policy to a UTF-8 decoding failure.

   source   in/out cursor into the input; advanced by one byte when the
            policy lets decoding continue
   dest     in/out cursor into the output; receives one replacement
            character under the "replace" policy
   errors   policy name: NULL or "strict", "ignore", or "replace"
   details  short description of the failure, used in the strict message

   Returns 0 when decoding may continue and -1 after setting an exception
   (UnicodeError for strict mode, ValueError for an unknown policy). */
static
int utf8_decoding_error(const char **source,
                        Py_UNICODE **dest,
                        const char *errors,
                        const char *details)
{
    /* No policy given behaves exactly like "strict". */
    if (errors == NULL || !strcmp(errors, "strict")) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 decoding error: %.400s",
                     details);
        return -1;
    }

    if (!strcmp(errors, "ignore")) {
        /* Drop the offending byte and carry on. */
        ++*source;
        return 0;
    }

    if (!strcmp(errors, "replace")) {
        /* Swap the offending byte for the replacement character. */
        *(*dest)++ = Py_UNICODE_REPLACEMENT_CHARACTER;
        ++*source;
        return 0;
    }

    PyErr_Format(PyExc_ValueError,
                 "UTF-8 decoding error; unknown error handling code: %.400s",
                 errors);
    return -1;
}
/* Decode `size` bytes of UTF-8 starting at `s` into a Unicode object.

   s       input bytes (need not be NUL-terminated; only `size` is used)
   size    number of input bytes
   errors  error policy handed to utf8_decoding_error(): NULL or "strict"
           abort with an exception, "ignore" skips one byte, "replace"
           substitutes Py_UNICODE_REPLACEMENT_CHARACTER for one byte

   Returns the decoded object, or NULL once an error has been reported; in
   the latter case the partially filled object is released. */
DL_EXPORT(PyObject *)
PyUnicode_DecodeUTF8(const char *s,
                     int size,
                     const char *errors)
{
    int n;
    const char *e;
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
    const char *errmsg = "";

    /* size is an upper bound on the number of output units: every path
       below consumes at least as many input bytes as it stores units. */
    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0)
        return (PyObject *)unicode;

    /* Unpack UTF-8 encoded data */
    p = unicode->str;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch = (unsigned char)*s;

        /* Fast path for ASCII. */
        if (ch < 0x80) {
            *p++ = (Py_UNICODE)ch;
            s++;
            continue;
        }

        n = utf8_code_length[ch];

        /* Compare lengths rather than pointers: s + n could point more
           than one element past the end of the buffer, and merely forming
           such a pointer is undefined behaviour. */
        if (n > e - s) {
            errmsg = "unexpected end of data";
            goto utf8Error;
        }

        switch (n) {

        case 0:
            /* Work-around for bug in Python 2.2.0 and 2.2.1: the
               UTF-8 encoder "forgot" to add the correct \xed prefix
               for lone surrogates.  A lead byte in 0xa0..0xaf followed
               by one continuation byte is therefore accepted here and
               decodes to a value in 0xd800..0xdbff. */
            if (((unsigned char)s[0] >= 0xa0) &&
                ((unsigned char)s[0] <= 0xaf)) {
                n = 2;
                if (n > e - s) {
                    errmsg = "unexpected end of data";
                    goto utf8Error;
                }
                /* s[0] is already known to be a continuation-style byte
                   (0xa0..0xaf), so only s[1] needs checking. */
                if ((s[1] & 0xc0) != 0x80) {
                    errmsg = "invalid data";
                    goto utf8Error;
                }
                /* The result is always >= 0xd800, so no minimum-value
                   ("illegal encoding") check is needed. */
                ch = 0xd000 + ((s[0] & 0x3f) << 6) + (s[1] & 0x3f);
                *p++ = (Py_UNICODE)ch;
                break;
            }
            errmsg = "unexpected code byte";
            goto utf8Error;

        case 1:
            /* Bytes < 0x80 were handled by the fast path above. */
            errmsg = "internal error";
            goto utf8Error;

        case 2:
            if ((s[1] & 0xc0) != 0x80) {
                errmsg = "invalid data";
                goto utf8Error;
            }
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            if (ch < 0x80) {
                /* Overlong form of a one-byte character. */
                errmsg = "illegal encoding";
                goto utf8Error;
            }
            else
                *p++ = (Py_UNICODE)ch;
            break;

        case 3:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80) {
                errmsg = "invalid data";
                goto utf8Error;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            if (ch < 0x0800) {
                /* Overlong form of a one- or two-byte character.

                   Note: UTF-8 encodings of surrogates are considered
                   legal UTF-8 sequences;

                   XXX For wide builds (UCS-4) we should probably try
                       to recombine the surrogates into a single code
                       unit.
                */
                errmsg = "illegal encoding";
                goto utf8Error;
            }
            else
                *p++ = (Py_UNICODE)ch;
            break;

        case 4:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80) {
                errmsg = "invalid data";
                goto utf8Error;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            /* validate and convert to UTF-16 */
            if ((ch < 0x10000)        /* minimum value allowed for 4
                                         byte encoding */
                || (ch > 0x10ffff))   /* maximum value allowed for
                                         UTF-16 */
            {
                errmsg = "illegal encoding";
                goto utf8Error;
            }
#ifdef Py_UNICODE_WIDE
            *p++ = (Py_UNICODE)ch;
#else
            /*  compute and append the two surrogates: */

            /*  translate from 10000..10FFFF to 0..FFFF */
            ch -= 0x10000;

            /*  high surrogate = top 10 bits added to D800 */
            *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));

            /*  low surrogate = bottom 10 bits added to DC00 */
            *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
#endif
            break;

        default:
            /* Other sizes are only needed for UCS-4 */
            errmsg = "unsupported Unicode code range";
            goto utf8Error;
        }
        s += n;
        continue;

    utf8Error:
        /* The handler either reports the error (non-zero return) or steps
           over one input byte so that the loop can resume. */
        if (utf8_decoding_error(&s, &p, errors, errmsg))
            goto onError;
    }

    /* Adjust length */
    if (_PyUnicode_Resize(&unicode, p - unicode->str))
        goto onError;

    return (PyObject *)unicode;

onError:
    Py_DECREF(unicode);
    return NULL;
}
/* Encode `size` Py_UNICODE units starting at `s` as UTF-8 and return the
   bytes as a string object (NULL on failure).

   s       input code units; must not be NULL
   size    number of input units; must be >= 0
   errors  accepted for interface compatibility; this implementation does
           not consult it

   A high surrogate immediately followed by a low surrogate is combined
   into one four-byte sequence; any other surrogate is written as an
   ordinary three-byte sequence.

   Allocation strategy:  if the string is short, convert into a stack buffer
   and allocate exactly as much space needed at the end.  Else allocate the
   maximum possible needed (4 result bytes per Unicode character), and return
   the excess memory at the end.
*/
DL_EXPORT(PyObject *)
PyUnicode_EncodeUTF8(const Py_UNICODE *s,
                     int size,
                     const char *errors)
{
#define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */

    int i;                /* index into s of next input code unit */
    PyObject *v;          /* result string object */
    char *p;              /* next free byte in output buffer */
    int nallocated;       /* number of result bytes allocated */
    int nneeded;          /* number of result bytes needed */
    char stackbuf[MAX_SHORT_UNICHARS * 4];

    assert(s != NULL);
    assert(size >= 0);

    if (size <= MAX_SHORT_UNICHARS) {
        /* Write into the stack buffer; nallocated can't overflow.
         * At the end, we'll allocate exactly as much heap space as it
         * turns out we need.
         */
        nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
        v = NULL;   /* will allocate after we're done */
        p = stackbuf;
    }
    else {
        /* Overallocate on the heap, and give the excess back at the end.
         * Reject oversized requests *before* multiplying: signed overflow
         * is undefined, so a check made after the fact may be optimised
         * away.  NOTE(review): INT_MAX is expected to be visible through
         * Python.h (<limits.h>) -- confirm for this tree.
         */
        if (size > INT_MAX / 4)
            return PyErr_NoMemory();
        nallocated = size * 4;
        v = PyString_FromStringAndSize(NULL, nallocated);
        if (v == NULL)
            return NULL;
        p = PyString_AS_STRING(v);
    }

    for (i = 0; i < size;) {
        Py_UCS4 ch = s[i++];

        if (ch < 0x80)
            /* Encode ASCII */
            *p++ = (char) ch;

        else if (ch < 0x0800) {
            /* Encode Latin-1 */
            *p++ = (char)(0xc0 | (ch >> 6));
            *p++ = (char)(0x80 | (ch & 0x3f));
        }
        else {
            /* Encode UCS2 Unicode ordinals */
            if (ch < 0x10000) {
                /* Special case: check for high surrogate */
                if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
                    Py_UCS4 ch2 = s[i];
                    /* Check for low surrogate and combine the two to
                       form a UCS4 value */
                    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
                        ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
                        i++;
                        goto encodeUCS4;
                    }
                    /* Fall through: handles isolated high surrogates */
                }
                *p++ = (char)(0xe0 | (ch >> 12));
                *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
                *p++ = (char)(0x80 | (ch & 0x3f));
                continue;
            }
encodeUCS4:
            /* Encode UCS4 Unicode ordinals */
            *p++ = (char)(0xf0 | (ch >> 18));
            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
            *p++ = (char)(0x80 | (ch & 0x3f));
        }
    }

    if (v == NULL) {
        /* This was stack allocated. */
        nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
        assert(nneeded <= nallocated);
        v = PyString_FromStringAndSize(stackbuf, nneeded);
    }
    else {
        /* Cut back to size actually needed. */
        nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
        assert(nneeded <= nallocated);
        _PyString_Resize(&v, nneeded);
    }
    return v;

#undef MAX_SHORT_UNICHARS
}
/* Return the UTF-8 encoding of a Unicode object, produced by
   PyUnicode_EncodeUTF8() with no error policy (errors == NULL).
   If the argument fails PyUnicode_Check(), PyErr_BadArgument() is called
   and NULL is returned. */
DL_EXPORT(PyObject *)
PyUnicode_AsUTF8String(PyObject *unicode)
{
    if (PyUnicode_Check(unicode))
        return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
                                    PyUnicode_GET_SIZE(unicode),
                                    NULL);

    PyErr_BadArgument();
    return NULL;
}
/* --- UTF-16 Codec ------------------------------------------------------- */
/* Apply the caller-selected error policy to a UTF-16 decoding failure.

   dest     in/out cursor into the output, or NULL when there is no output
            position to write to; receives one replacement character under
            the "replace" policy when non-NULL
   errors   policy name: NULL or "strict", "ignore", or "replace"
   details  short description of the failure, used in the strict message

   Returns 0 when decoding may continue and -1 after setting an exception
   (UnicodeError for strict mode, ValueError for an unknown policy). */
static
int utf16_decoding_error(Py_UNICODE **dest,
                         const char *errors,
                         const char *details)
{
    /* No policy given behaves exactly like "strict". */
    if (errors == NULL || !strcmp(errors, "strict")) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-16 decoding error: %.400s",
                     details);
        return -1;
    }

    if (!strcmp(errors, "ignore"))
        return 0;

    if (!strcmp(errors, "replace")) {
        if (dest != NULL)
            *(*dest)++ = Py_UNICODE_REPLACEMENT_CHARACTER;
        return 0;
    }

    PyErr_Format(PyExc_ValueError,
                 "UTF-16 decoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
DL_EXPORT(PyObject *)
PyUnicode_DecodeUTF16(const char *s,
int size,
const char *errors,
int *byteorder)
{
PyUnicodeObject *unicode;
Py_UNICODE *p;
const unsigned char *q, *e;
int bo = 0; /* assume native ordering by default */
const char *errmsg = "";
/* Offsets from q for retrieving byte pairs in the right order. */
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
int ihi = 1, ilo = 0;
#else
int ihi = 0, ilo = 1;
#endif
/* size should be an even number */
if (size & 1) {
if (utf16_decoding_error(NULL, errors, "truncated data"))
return NULL;
--size; /* else ignore the oddball byte */
}
/* Note: size will always be longer than the resulting Unicode
character count */
unicode = _PyUnicode_New(size);
if (!unicode)
return NULL;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -