📄 unicodeobject.c
字号:
*p++ = 'U';
*p++ = hexdigit[(ucs >> 28) & 0x0000000F];
*p++ = hexdigit[(ucs >> 24) & 0x0000000F];
*p++ = hexdigit[(ucs >> 20) & 0x0000000F];
*p++ = hexdigit[(ucs >> 16) & 0x0000000F];
*p++ = hexdigit[(ucs >> 12) & 0x0000000F];
*p++ = hexdigit[(ucs >> 8) & 0x0000000F];
*p++ = hexdigit[(ucs >> 4) & 0x0000000F];
*p++ = hexdigit[ucs & 0x0000000F];
continue;
}
/* Fall through: isolated surrogates are copied as-is */
s--;
size++;
}
/* Map 16-bit characters to '\uxxxx' */
if (ch >= 256) {
*p++ = '\\';
*p++ = 'u';
*p++ = hexdigit[(ch >> 12) & 0x000F];
*p++ = hexdigit[(ch >> 8) & 0x000F];
*p++ = hexdigit[(ch >> 4) & 0x000F];
*p++ = hexdigit[ch & 0x000F];
}
/* Map special whitespace to '\t', \n', '\r' */
else if (ch == '\t') {
*p++ = '\\';
*p++ = 't';
}
else if (ch == '\n') {
*p++ = '\\';
*p++ = 'n';
}
else if (ch == '\r') {
*p++ = '\\';
*p++ = 'r';
}
/* Map non-printable US ASCII to '\xhh' */
else if (ch < ' ' || ch >= 0x7F) {
*p++ = '\\';
*p++ = 'x';
*p++ = hexdigit[(ch >> 4) & 0x000F];
*p++ = hexdigit[ch & 0x000F];
}
/* Copy everything else as-is */
else
*p++ = (char) ch;
}
if (quotes)
*p++ = PyString_AS_STRING(repr)[1];
*p = '\0';
_PyString_Resize(&repr, p - PyString_AS_STRING(repr));
return repr;
}
/* Unicode-escape encode the `size` code units starting at `s`.

   Thin public wrapper: the work is done by unicodeescape_string(), which
   is called with 0 as its last argument (presumably the "add quotes"
   flag -- confirm against that function's definition).  Whatever that
   helper returns is handed back unchanged. */
DL_EXPORT(PyObject *)
PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
                              int size)
{
    PyObject *escaped = unicodeescape_string(s, size, 0);

    return escaped;
}
/* Unicode-escape encode a unicode object.

   If `unicode` does not pass PyUnicode_Check(), PyErr_BadArgument() is
   called and NULL is returned.  Otherwise the object's buffer and size
   are forwarded to PyUnicode_EncodeUnicodeEscape() and its result is
   returned. */
DL_EXPORT(PyObject *)
PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
{
    if (PyUnicode_Check(unicode))
        return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
                                             PyUnicode_GET_SIZE(unicode));

    PyErr_BadArgument();
    return NULL;
}
/* --- Raw Unicode Escape Codec ------------------------------------------- */
/* Decode `size` bytes starting at `s` from the raw-unicode-escape
   encoding.

   Each byte is taken as the Unicode ordinal of the same value, except
   that a \uXXXX sequence (exactly four hex digits) introduced by an odd
   number of backslashes is replaced by the code unit it names.  A \u
   escape that contains a non-hex character, or that is cut short by the
   end of the input, is reported through unicodeescape_decoding_error()
   together with the caller's `errors` setting; if that handler returns
   non-zero the decode is abandoned.

   Returns the decoded unicode object, or NULL on failure. */
DL_EXPORT(PyObject *)
PyUnicode_DecodeRawUnicodeEscape(const char *s,
                                 int size,
                                 const char *errors)
{
    PyUnicodeObject *v;
    Py_UNICODE *p, *buf;
    const char *end;
    const char *bs;

    /* Escaped strings will always be longer than the resulting
       Unicode string, so we start with size here and then reduce the
       length after conversion to the true value. */
    v = _PyUnicode_New(size);
    if (v == NULL)
        goto onError;
    if (size == 0)
        return (PyObject *)v;
    p = buf = PyUnicode_AS_UNICODE(v);
    end = s + size;
    while (s < end) {
        unsigned char c;
        Py_UCS4 x;
        int i;

        /* Non-escape characters are interpreted as Unicode ordinals */
        if (*s != '\\') {
            *p++ = (unsigned char)*s++;
            continue;
        }

        /* \u-escapes are only interpreted iff the number of leading
           backslashes is odd */
        bs = s;
        for (;s < end;) {
            if (*s != '\\')
                break;
            *p++ = (unsigned char)*s++;
        }
        if (((s - bs) & 1) == 0 ||
            s >= end ||
            *s != 'u') {
            continue;
        }
        /* The last backslash copied above introduces the escape: take it
           back out of the output and step over the 'u'. */
        p--;
        s++;

        /* \uXXXX with 4 hex digits */
        for (x = 0, i = 0; i < 4; i++) {
            /* Never read at or beyond `end`: an escape cut short by the
               end of the input is an error, not a reason to inspect
               whatever bytes happen to follow the buffer. */
            const int truncated = (end - s) <= i;

            c = truncated ? 0 : (unsigned char)s[i];
            if (truncated || !isxdigit(c)) {
                if (unicodeescape_decoding_error(&p, errors,
                                                 "truncated \\uXXXX"))
                    goto onError;
                /* Sentinel: no code unit is emitted for this escape */
                x = 0xffffffff;
                /* Skip the offending character as well, unless there is
                   none because the input simply ended. */
                if (!truncated)
                    i++;
                break;
            }
            x = (x<<4) & ~0xF;
            if (c >= '0' && c <= '9')
                x += c - '0';
            else if (c >= 'a' && c <= 'f')
                x += 10 + c - 'a';
            else
                x += 10 + c - 'A';
        }
        s += i;
        if (x != 0xffffffff)
            *p++ = x;
    }
    /* Shrink the object to the number of code units actually written */
    if (_PyUnicode_Resize(&v, (int)(p - buf)))
        goto onError;
    return (PyObject *)v;

 onError:
    Py_XDECREF(v);
    return NULL;
}
/* Raw-unicode-escape encode the `size` code units starting at `s`.

   Code units below 256 are emitted as a single byte of the same value;
   everything else becomes the six bytes \uxxxx with lowercase hex
   digits.  The string object is created at 6 * size bytes and then
   resized to the number of bytes written.

   Returns the string object, or NULL if it could not be created. */
DL_EXPORT(PyObject *)
PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
                                 int size)
{
    static const char *const hex = "0123456789abcdef";
    PyObject *result;
    char *base;
    char *out;

    /* Room for the worst case: every code unit needs a \uxxxx escape */
    result = PyString_FromStringAndSize(NULL, 6 * size);
    if (result == NULL)
        return NULL;
    if (size == 0)
        return result;

    base = PyString_AS_STRING(result);
    out = base;
    for (; size > 0; size--, s++) {
        const Py_UNICODE ch = *s;

        /* Values that fit in one byte are copied as-is */
        if (ch < 256) {
            *out++ = (char) ch;
            continue;
        }

        /* Map 16-bit characters to '\uxxxx' */
        out[0] = '\\';
        out[1] = 'u';
        out[2] = hex[(ch >> 12) & 0xf];
        out[3] = hex[(ch >> 8) & 0xf];
        out[4] = hex[(ch >> 4) & 0xf];
        out[5] = hex[ch & 15];
        out += 6;
    }
    *out = '\0';
    _PyString_Resize(&result, out - base);
    return result;
}
/* Raw-unicode-escape encode a unicode object.

   If `unicode` does not pass PyUnicode_Check(), PyErr_BadArgument() is
   called and NULL is returned.  Otherwise the object's buffer and size
   are forwarded to PyUnicode_EncodeRawUnicodeEscape() and its result is
   returned. */
DL_EXPORT(PyObject *)
PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
{
    if (PyUnicode_Check(unicode))
        return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
                                                PyUnicode_GET_SIZE(unicode));

    PyErr_BadArgument();
    return NULL;
}
/* --- Latin-1 Codec ------------------------------------------------------ */
/* Decode `size` Latin-1 bytes starting at `s`.

   Latin-1 is equivalent to the first 256 ordinals in Unicode, so every
   byte is widened to the code unit of the same value and no input can
   be invalid; `errors` is accepted but not consulted.  A one-byte input
   is built through PyUnicode_FromUnicode() instead of allocating a
   fresh object here.

   Returns the unicode object, or NULL if it could not be created. */
DL_EXPORT(PyObject *)
PyUnicode_DecodeLatin1(const char *s,
                       int size,
                       const char *errors)
{
    PyUnicodeObject *v;
    Py_UNICODE *dst;
    int i;

    if (size == 1) {
        Py_UNICODE single = *(const unsigned char *)s;
        return PyUnicode_FromUnicode(&single, 1);
    }

    v = _PyUnicode_New(size);
    if (v == NULL)
        return NULL;
    if (size == 0)
        return (PyObject *)v;

    dst = PyUnicode_AS_UNICODE(v);
    for (i = 0; i < size; i++)
        dst[i] = (unsigned char)s[i];
    return (PyObject *)v;
}
/* Apply the `errors` policy to a code unit that Latin-1 cannot represent.

   errors == NULL or "strict": set UnicodeError (message includes
                               `details`) and return -1.
   "ignore":                   write nothing, return 0.
   "replace":                  store '?' at *dest, advance *dest by one,
                               return 0.
   anything else:              set ValueError naming the unknown policy
                               and return -1.

   `source` is accepted but not used here. */
static
int latin1_encoding_error(const Py_UNICODE **source,
                          char **dest,
                          const char *errors,
                          const char *details)
{
    const int strict = (errors == NULL) || (strcmp(errors,"strict") == 0);

    if (strict) {
        PyErr_Format(PyExc_UnicodeError,
                     "Latin-1 encoding error: %.400s",
                     details);
        return -1;
    }
    if (strcmp(errors,"ignore") == 0)
        return 0;
    if (strcmp(errors,"replace") == 0) {
        *(*dest)++ = '?';
        return 0;
    }

    PyErr_Format(PyExc_ValueError,
                 "Latin-1 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
/* Encode the `size` code units starting at `p` as Latin-1.

   Code units below 256 become one byte each.  Any other code unit is
   passed to latin1_encoding_error() with the caller's `errors` policy;
   a non-zero result from the handler aborts the encode.  Because the
   handler may write nothing for a code unit, the result can be shorter
   than `size` and is resized in that case.

   Returns the string object, or NULL on failure. */
DL_EXPORT(PyObject *)
PyUnicode_EncodeLatin1(const Py_UNICODE *p,
                       int size,
                       const char *errors)
{
    PyObject *repr;
    char *base;
    char *out;

    repr = PyString_FromStringAndSize(NULL, size);
    if (repr == NULL)
        return NULL;
    if (size == 0)
        return repr;

    base = PyString_AS_STRING(repr);
    out = base;
    for (; size > 0; size--) {
        const Py_UNICODE ch = *p++;

        if (ch < 256) {
            *out++ = (char)ch;
            continue;
        }
        /* Not representable: let the error policy decide */
        if (latin1_encoding_error(&p, &out, errors,
                                  "ordinal not in range(256)")) {
            Py_DECREF(repr);
            return NULL;
        }
    }

    /* Resize if error handling skipped some characters */
    if (out - base < PyString_GET_SIZE(repr))
        _PyString_Resize(&repr, out - base);
    return repr;
}
/* Latin-1 encode a unicode object.

   If `unicode` does not pass PyUnicode_Check(), PyErr_BadArgument() is
   called and NULL is returned.  Otherwise the object's buffer and size
   are forwarded to PyUnicode_EncodeLatin1() with errors == NULL, and
   its result is returned. */
DL_EXPORT(PyObject *)
PyUnicode_AsLatin1String(PyObject *unicode)
{
    if (PyUnicode_Check(unicode))
        return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
                                      PyUnicode_GET_SIZE(unicode),
                                      NULL);

    PyErr_BadArgument();
    return NULL;
}
/* --- 7-bit ASCII Codec -------------------------------------------------- */
/* Apply the `errors` policy to an input byte that is not 7-bit ASCII.

   errors == NULL or "strict": set UnicodeError (message includes
                               `details`) and return -1.
   "ignore":                   write nothing, return 0.
   "replace":                  store Py_UNICODE_REPLACEMENT_CHARACTER at
                               *dest, advance *dest by one, return 0.
   anything else:              set ValueError naming the unknown policy
                               and return -1.

   `source` is accepted but not used here. */
static
int ascii_decoding_error(const char **source,
                         Py_UNICODE **dest,
                         const char *errors,
                         const char *details)
{
    const int strict = (errors == NULL) || (strcmp(errors,"strict") == 0);

    if (strict) {
        PyErr_Format(PyExc_UnicodeError,
                     "ASCII decoding error: %.400s",
                     details);
        return -1;
    }
    if (strcmp(errors,"ignore") == 0)
        return 0;
    if (strcmp(errors,"replace") == 0) {
        *(*dest)++ = Py_UNICODE_REPLACEMENT_CHARACTER;
        return 0;
    }

    PyErr_Format(PyExc_ValueError,
                 "ASCII decoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
/* Decode `size` bytes starting at `s` as 7-bit ASCII.

   Bytes below 128 are widened to the code unit of the same value.  Any
   other byte is passed to ascii_decoding_error() with the caller's
   `errors` policy; a non-zero result from the handler aborts the decode.
   Because the handler may write nothing for a byte, the result can be
   shorter than `size` and is resized in that case.  A single valid
   byte is built through PyUnicode_FromUnicode() instead of allocating a
   fresh object here.

   Returns the unicode object, or NULL on failure. */
DL_EXPORT(PyObject *)
PyUnicode_DecodeASCII(const char *s,
                      int size,
                      const char *errors)
{
    PyUnicodeObject *v;
    Py_UNICODE *p;
    /* Length requested for v below; kept because the decode loop counts
       `size` down. */
    const int allocated = size;
    int decoded;

    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
    if (size == 1 && *(unsigned char*)s < 128) {
        Py_UNICODE r = *(unsigned char*)s;
        return PyUnicode_FromUnicode(&r, 1);
    }

    v = _PyUnicode_New(size);
    if (v == NULL)
        goto onError;
    if (size == 0)
        return (PyObject *)v;
    p = PyUnicode_AS_UNICODE(v);
    while (size-- > 0) {
        register unsigned char c;

        c = (unsigned char)*s++;
        if (c < 128)
            *p++ = c;
        else if (ascii_decoding_error(&s, &p, errors,
                                      "ordinal not in range(128)"))
            goto onError;
    }

    /* Resize if error handling skipped some characters.  The comparison
       is made against the requested length rather than by applying a
       string-object accessor to a unicode object. */
    decoded = (int)(p - PyUnicode_AS_UNICODE(v));
    if (decoded < allocated)
        if (_PyUnicode_Resize(&v, decoded))
            goto onError;
    return (PyObject *)v;

 onError:
    Py_XDECREF(v);
    return NULL;
}
/* Apply the `errors` policy to a code unit that ASCII cannot represent.

   errors == NULL or "strict": set UnicodeError (message includes
                               `details`) and return -1.
   "ignore":                   write nothing, return 0.
   "replace":                  store '?' at *dest, advance *dest by one,
                               return 0.
   anything else:              set ValueError naming the unknown policy
                               and return -1.

   `source` is accepted but not used here. */
static
int ascii_encoding_error(const Py_UNICODE **source,
                         char **dest,
                         const char *errors,
                         const char *details)
{
    const int strict = (errors == NULL) || (strcmp(errors,"strict") == 0);

    if (strict) {
        PyErr_Format(PyExc_UnicodeError,
                     "ASCII encoding error: %.400s",
                     details);
        return -1;
    }
    if (strcmp(errors,"ignore") == 0)
        return 0;
    if (strcmp(errors,"replace") == 0) {
        *(*dest)++ = '?';
        return 0;
    }

    PyErr_Format(PyExc_ValueError,
                 "ASCII encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
/* Encode the `size` code units starting at `p` as 7-bit ASCII.

   Code units below 128 become one byte each.  Any other code unit is
   passed to ascii_encoding_error() with the caller's `errors` policy; a
   non-zero result from the handler aborts the encode.  Because the
   handler may write nothing for a code unit, the result can be shorter
   than `size` and is resized in that case.

   Returns the string object, or NULL on failure. */
DL_EXPORT(PyObject *)
PyUnicode_EncodeASCII(const Py_UNICODE *p,
                      int size,
                      const char *errors)
{
    PyObject *repr;
    char *base;
    char *out;

    repr = PyString_FromStringAndSize(NULL, size);
    if (repr == NULL)
        return NULL;
    if (size == 0)
        return repr;

    base = PyString_AS_STRING(repr);
    out = base;
    for (; size > 0; size--) {
        const Py_UNICODE ch = *p++;

        if (ch < 128) {
            *out++ = (char)ch;
            continue;
        }
        /* Not representable: let the error policy decide */
        if (ascii_encoding_error(&p, &out, errors,
                                 "ordinal not in range(128)")) {
            Py_DECREF(repr);
            return NULL;
        }
    }

    /* Resize if error handling skipped some characters */
    if (out - base < PyString_GET_SIZE(repr))
        _PyString_Resize(&repr, out - base);
    return repr;
}
/* ASCII encode a unicode object.

   If `unicode` does not pass PyUnicode_Check(), PyErr_BadArgument() is
   called and NULL is returned.  Otherwise the object's buffer and size
   are forwarded to PyUnicode_EncodeASCII() with errors == NULL, and its
   result is returned. */
DL_EXPORT(PyObject *)
PyUnicode_AsASCIIString(PyObject *unicode)
{
    if (PyUnicode_Check(unicode))
        return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
                                     PyUnicode_GET_SIZE(unicode),
                                     NULL);

    PyErr_BadArgument();
    return NULL;
}
#if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
/* --- MBCS codecs for Windows -------------------------------------------- */
DL_EXPORT(PyObject *)
PyUnicode_DecodeMBCS(const char *s,
int size,
const char *errors)
{
PyUnicodeObject *v;
Py_UNICODE *p;
/* First get the size of the result */
DWORD usize = MultiByteToWideChar(CP_AC
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -