📄 unicodeobject.c
字号:
/* Coerce obj to a true Unicode object.
 *
 *  - An exact Unicode object is returned unchanged, with a new reference.
 *  - An instance of a Unicode subtype is copied into a fresh object built
 *    from its character buffer and size.
 *  - Anything else is passed to PyUnicode_FromEncodedObject() with a NULL
 *    encoding and "strict" error handling.
 *
 * XXX Perhaps this API should become an alias of PyObject_Unicode().
 */
DL_EXPORT(PyObject *)
PyUnicode_FromObject(register PyObject *obj)
{
    PyObject *result;

    if (PyUnicode_CheckExact(obj)) {
        Py_INCREF(obj);
        result = obj;
    }
    else if (PyUnicode_Check(obj)) {
        /* Subtype instance: hand back a plain Unicode object holding the
           same data. */
        result = PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
                                       PyUnicode_GET_SIZE(obj));
    }
    else {
        result = PyUnicode_FromEncodedObject(obj, NULL, "strict");
    }
    return result;
}
/* Decode an *encoded* object into a Unicode object.
 *
 * obj      - a string object, or any object PyObject_AsCharBuffer()
 *            accepts.  Unicode objects (including subtypes) are rejected
 *            with TypeError "decoding Unicode is not supported".
 * encoding - codec name, forwarded to PyUnicode_Decode().
 * errors   - error handling name, forwarded to PyUnicode_Decode().
 *
 * Returns a new reference, or NULL with an exception set.  A zero-length
 * input yields the shared empty Unicode object without calling the codec.
 *
 * Historical note: an earlier (disabled) variant accepted Unicode objects
 * when no encoding was given and redirected them to PyObject_Unicode().
 * This API should only be used for objects representing encoded Unicode.
 */
DL_EXPORT(PyObject *)
PyUnicode_FromEncodedObject(register PyObject *obj,
                            const char *encoding,
                            const char *errors)
{
    const char *s = NULL;
    int len;

    if (obj == NULL) {
        PyErr_BadInternalCall();
        return NULL;
    }

    if (PyUnicode_Check(obj)) {
        PyErr_SetString(PyExc_TypeError,
                        "decoding Unicode is not supported");
        return NULL;
    }

    /* Coerce object to a (pointer, length) pair. */
    if (PyString_Check(obj)) {
        s = PyString_AS_STRING(obj);
        len = PyString_GET_SIZE(obj);
    }
    else if (PyObject_AsCharBuffer(obj, &s, &len)) {
        /* Overwrite the error message with something more useful in
           case of a TypeError; other exception types pass through. */
        if (PyErr_ExceptionMatches(PyExc_TypeError))
            PyErr_Format(PyExc_TypeError,
                         "coercing to Unicode: need string or buffer, "
                         "%.80s found",
                         obj->ob_type->tp_name);
        return NULL;
    }

    /* Convert to Unicode. */
    if (len == 0) {
        Py_INCREF(PY_GLOB(unicode_empty));
        return (PyObject *)PY_GLOB(unicode_empty);
    }
    return PyUnicode_Decode(s, len, encoding, errors);
}
/* Decode size bytes starting at s into a Unicode object.
 *
 * A NULL encoding selects PyUnicode_GetDefaultEncoding().  The names
 * "utf-8", "latin-1" and "ascii" (exact match) go straight to the
 * built-in decoders; every other name is resolved through the codec
 * registry, with the input wrapped in a buffer object.
 *
 * Returns a new reference, or NULL with an exception set.  A registry
 * decoder that returns a non-Unicode object triggers a TypeError.
 */
DL_EXPORT(PyObject *)
PyUnicode_Decode(const char *s,
                 int size,
                 const char *encoding,
                 const char *errors)
{
    PyObject *membuf;
    PyObject *decoded;

    if (encoding == NULL)
        encoding = PyUnicode_GetDefaultEncoding();

    /* Shortcuts for common default encodings */
    if (strcmp(encoding, "utf-8") == 0)
        return PyUnicode_DecodeUTF8(s, size, errors);
    if (strcmp(encoding, "latin-1") == 0)
        return PyUnicode_DecodeLatin1(s, size, errors);
    if (strcmp(encoding, "ascii") == 0)
        return PyUnicode_DecodeASCII(s, size, errors);

    /* Decode via the codec registry */
    membuf = PyBuffer_FromMemory((void *)s, size);
    if (membuf == NULL)
        return NULL;

    decoded = PyCodec_Decode(membuf, encoding, errors);
    if (decoded != NULL && !PyUnicode_Check(decoded)) {
        PyErr_Format(PyExc_TypeError,
                     "decoder did not return an unicode object (type=%.400s)",
                     decoded->ob_type->tp_name);
        Py_DECREF(decoded);
        decoded = NULL;
    }

    /* The temporary buffer wrapper is released on every path. */
    Py_DECREF(membuf);
    return decoded;
}
/* Encode the size code units at s using the given encoding and error
 * handling.
 *
 * The data is first copied into a temporary Unicode object, which is then
 * passed to PyUnicode_AsEncodedString().  Returns whatever that call
 * returns, or NULL if the temporary object could not be created.
 */
DL_EXPORT(PyObject *)
PyUnicode_Encode(const Py_UNICODE *s,
                 int size,
                 const char *encoding,
                 const char *errors)
{
    PyObject *encoded = NULL;
    PyObject *tmp = PyUnicode_FromUnicode(s, size);

    if (tmp != NULL) {
        encoded = PyUnicode_AsEncodedString(tmp, encoding, errors);
        Py_DECREF(tmp);
    }
    return encoded;
}
/* Encode a Unicode object and return the result as a string object.
 *
 * unicode  - must pass PyUnicode_Check(); otherwise PyErr_BadArgument()
 *            is raised and NULL returned.
 * encoding - codec name; NULL selects PyUnicode_GetDefaultEncoding().
 * errors   - error handling name.  The built-in fast paths for "utf-8",
 *            "latin-1" and "ascii" are only taken when this is NULL.
 *
 * Returns NULL with an exception set on failure, including a TypeError
 * when a registry encoder returns something that is not a string object.
 */
DL_EXPORT(PyObject *)
PyUnicode_AsEncodedString(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    PyObject *encoded;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (encoding == NULL)
        encoding = PyUnicode_GetDefaultEncoding();

    /* Shortcuts for common default encodings */
    if (errors == NULL) {
        if (strcmp(encoding, "utf-8") == 0)
            return PyUnicode_AsUTF8String(unicode);
        if (strcmp(encoding, "latin-1") == 0)
            return PyUnicode_AsLatin1String(unicode);
        if (strcmp(encoding, "ascii") == 0)
            return PyUnicode_AsASCIIString(unicode);
    }

    /* Encode via the codec registry */
    encoded = PyCodec_Encode(unicode, encoding, errors);
    if (encoded == NULL)
        return NULL;

    /* XXX Should we really enforce this ? */
    if (PyString_Check(encoded))
        return encoded;

    PyErr_Format(PyExc_TypeError,
                 "encoder did not return a string object (type=%.400s)",
                 encoded->ob_type->tp_name);
    Py_DECREF(encoded);
    return NULL;
}
/* Return the default-encoded string form of a Unicode object.
 *
 * If the object's defenc slot is already set, that pointer is returned
 * directly (no reference count change).  Otherwise the object is encoded
 * with a NULL encoding; the result is stored in defenc only when errors
 * is NULL.
 *
 * No type check is done: unicode is cast to PyUnicodeObject * as-is.
 *
 * NOTE(review): with a non-NULL errors the result is not cached, so the
 * caller appears to receive a new reference in that case but a pointer
 * owned by the object otherwise -- confirm against the callers.
 */
DL_EXPORT(PyObject *)
_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
                                  const char *errors)
{
    PyUnicodeObject *self = (PyUnicodeObject *)unicode;
    PyObject *encoded = self->defenc;

    if (encoded != NULL)
        return encoded;

    encoded = PyUnicode_AsEncodedString(unicode, NULL, errors);
    if (encoded != NULL && errors == NULL)
        self->defenc = encoded;
    return encoded;
}
/* Return the internal Py_UNICODE buffer of a Unicode object, or NULL
 * (after calling PyErr_BadArgument()) if the argument is not a Unicode
 * object.
 */
DL_EXPORT(Py_UNICODE *)
PyUnicode_AsUnicode(PyObject *unicode)
{
    if (PyUnicode_Check(unicode))
        return PyUnicode_AS_UNICODE(unicode);

    PyErr_BadArgument();
    return NULL;
}
/* Return the size of a Unicode object as reported by
 * PyUnicode_GET_SIZE(), or -1 (after calling PyErr_BadArgument()) if the
 * argument is not a Unicode object.
 */
DL_EXPORT(int)
PyUnicode_GetSize(PyObject *unicode)
{
    if (PyUnicode_Check(unicode))
        return PyUnicode_GET_SIZE(unicode);

    PyErr_BadArgument();
    return -1;
}
/* Return the name of the current default encoding.  The pointer refers
 * to the global unicode_default_encoding storage, not to a copy.
 */
DL_EXPORT(const char *)
PyUnicode_GetDefaultEncoding(void)
{
    const char *current = PY_GLOB(unicode_default_encoding);

    return current;
}
/* Set the default encoding used when no encoding is given.
 *
 * encoding - codec name; it must be resolvable by _PyCodec_Lookup().
 *
 * Returns 0 on success, -1 if the lookup fails (the exception set by the
 * lookup is left in place).
 *
 * The name is copied into the fixed-size global buffer.  strncpy() does
 * not write a terminator when the source fills the whole buffer, so the
 * copy is limited to size-1 bytes and the last byte is always set to NUL.
 * A name too long for the buffer is therefore stored truncated, but
 * PyUnicode_GetDefaultEncoding() never returns an unterminated string.
 */
DL_EXPORT(int)
PyUnicode_SetDefaultEncoding(const char *encoding)
{
    PyObject *codec;

    /* Make sure the encoding is valid. As side effect, this also
       loads the encoding into the codec registry cache. */
    codec = _PyCodec_Lookup(encoding);
    if (codec == NULL)
        return -1;
    Py_DECREF(codec);

    strncpy(PY_GLOB(unicode_default_encoding),
            encoding,
            sizeof(PY_GLOB(unicode_default_encoding)) - 1);
    PY_GLOB(unicode_default_encoding)
        [sizeof(PY_GLOB(unicode_default_encoding)) - 1] = '\0';
    return 0;
}
/* --- UTF-7 Codec -------------------------------------------------------- */
/* see RFC2152 for details */
/* Per-character UTF-7 classification for the 128 ASCII code points.
   Indicates whether a character is special, i.e. cannot be directly
   encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static const char utf7_special[128] = {
    /* 0x00 - 0x0F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2F */ 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4F */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6F */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
/* SPECIAL(c, encodeO, encodeWS): nonzero if c cannot be written directly
   in UTF-7.  Anything above 127 or classified 1 in utf7_special is always
   special; whitespace (class 2) and Set O (class 3) are special only when
   the matching flag is nonzero.  The table is indexed only after the
   (c) > 127 test has failed. */
#define SPECIAL(c, encodeO, encodeWS) \
    (((c) > 127 || utf7_special[(c)] == 1) || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* B64(n): base64 alphabet character for the low six bits of n. */
#define B64(n) ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): nonzero if c belongs to the base64 alphabet.
   Written as explicit ASCII ranges instead of isalnum(): isalnum() is
   locale-dependent and undefined for arguments outside the unsigned char
   range, which Py_UNICODE values can be.  UB64() below already assumes
   ASCII code values. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* UB64(c): six-bit value of base64 character c (inverse of B64).  The
   result is only meaningful when B64CHAR(c) holds. */
#define UB64(c) ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
                 (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* ENCODE(out, ch, bits): while at least six bits are pending in ch, emit
   the top six as one base64 character through out and drop them from the
   count.  out and bits must be modifiable lvalues; the macro expands to a
   bare while statement. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64((ch) >> ((bits)-6)); \
        bits -= 6; \
    }
/* DECODE(out, ch, bits, surrogate): while at least 16 bits are pending in
   ch, take the top 16 as one code unit.
     - If surrogate is set, the unit is dropped and the flag cleared.
     - A unit in 0xDC00..0xDFFF sets surrogate, sets errmsg and jumps to
       the utf7Error label.
     - Any other unit is stored through out.
   The expansion site must provide a variable `errmsg` and a label
   `utf7Error`.  out, bits and surrogate must be modifiable lvalues.
   The definition ends on the closing brace line: there is deliberately no
   trailing line continuation, so the next source line is not absorbed
   into the macro. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits)-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct \
               or not */ \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }
/* Apply the error handling policy named by errors to a UTF-7 decoding
 * problem described by details.
 *
 *   NULL / "strict" - raise UnicodeError, return -1
 *   "ignore"        - return 0 without touching the output
 *   "replace"       - if dest is non-NULL, store
 *                     Py_UNICODE_REPLACEMENT_CHARACTER at *dest and
 *                     advance *dest by one; return 0
 *   anything else   - raise ValueError, return -1
 */
static
int utf7_decoding_error(Py_UNICODE **dest,
                        const char *errors,
                        const char *details)
{
    const int strict = (errors == NULL) || (strcmp(errors, "strict") == 0);

    if (strict) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-7 decoding error: %.400s",
                     details);
        return -1;
    }

    if (strcmp(errors, "ignore") == 0)
        return 0;

    if (strcmp(errors, "replace") == 0) {
        if (dest != NULL) {
            Py_UNICODE *slot = *dest;

            *slot = Py_UNICODE_REPLACEMENT_CHARACTER;
            *dest = slot + 1;
        }
        return 0;
    }

    PyErr_Format(PyExc_ValueError,
                 "UTF-7 decoding error; unknown error handling code: %.400s",
                 errors);
    return -1;
}
/* Decode size bytes of UTF-7 (see RFC2152) starting at s.
 *
 * errors selects the policy applied by utf7_decoding_error() to malformed
 * input.  Returns a new Unicode object, or NULL with an exception set.
 *
 * A result of `size` code units is allocated up front and shrunk to the
 * number actually written with _PyUnicode_Resize().
 *
 * State kept across the loop:
 *   inShift   - nonzero while inside a '+'...'-' base64 run
 *   charsleft - accumulator of base64 bits not yet turned into characters
 *   bitsleft  - number of valid pending bits in charsleft
 *   surrogate - flag maintained by the DECODE macro
 */
DL_EXPORT(PyObject *)
PyUnicode_DecodeUTF7(const char *s,
                     int size,
                     const char *errors)
{
    const char *e;
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
    const char *errmsg = "";
    int inShift = 0;
    unsigned int bitsleft = 0;
    unsigned long charsleft = 0;
    int surrogate = 0;

    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0)
        return (PyObject *)unicode;

    p = unicode->str;
    e = s + size;

    while (s < e) {
        /* Read the byte as unsigned: with a signed plain char, bytes
           >= 0x80 would otherwise sign-extend into a huge Py_UNICODE
           value that is then handed to the character classification
           macros. */
        Py_UNICODE ch = (unsigned char) *s;

        if (inShift) {
            if ((ch == '-') || !B64CHAR(ch)) {
                /* End of the base64 run: flush what is pending. */
                inShift = 0;
                s++;

                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
                if (bitsleft >= 6) {
                    /* The shift sequence has a partial character in it. If
                       bitsleft < 6 then we could just classify it as padding
                       but that is not the case here */
                    errmsg = "partial character in shift sequence";
                    goto utf7Error;
                }
                /* According to RFC2152 the remaining bits should be zero. We
                   choose to signal an error/insert a replacement character
                   here so indicate the potential of a misencoded character. */

                /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
                if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
                    errmsg = "non-zero padding bits in shift sequence";
                    goto utf7Error;
                }

                if (ch == '-') {
                    /* The terminating '-' is consumed; if another '-'
                       follows, a literal '-' is emitted and shift mode is
                       re-entered without advancing s. */
                    if ((s < e) && (*(s) == '-')) {
                        *p++ = '-';
                        inShift = 1;
                    }
                } else if (SPECIAL(ch,0,0)) {
                    errmsg = "unexpected special character";
                    goto utf7Error;
                } else {
                    *p++ = ch;
                }
            } else {
                /* Another base64 digit: append its six bits. */
                charsleft = (charsleft << 6) | UB64(ch);
                bitsleft += 6;
                s++;
                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
            }
        }
        else if ( ch == '+' ) {
            s++;
            if (s < e && *s == '-') {
                /* "+-" encodes a literal '+'. */
                s++;
                *p++ = '+';
            } else
            {
                inShift = 1;
                bitsleft = 0;
            }
        }
        else if (SPECIAL(ch,0,0)) {
            errmsg = "unexpected special character";
            s++;
            goto utf7Error;
        }
        else {
            *p++ = ch;
            s++;
        }
        continue;
    utf7Error:
        /* Reached only by goto; a zero return from the handler lets the
           loop carry on with the next input byte. */
        if (utf7_decoding_error(&p, errors, errmsg))
            goto onError;
    }

    if (inShift) {
        if (utf7_decoding_error(&p, errors, "unterminated shift sequence"))
            goto onError;
    }

    if (_PyUnicode_Resize(&unicode, p - unicode->str))
        goto onError;

    return (PyObject *)unicode;

onError:
    Py_DECREF(unicode);
    return NULL;
}
DL_EXPORT(PyObject *)
PyUnicode_EncodeUTF7(const Py_UNICODE *s,
int size,
int encodeSetO,
int encodeWhiteSpace,
const char *errors)
{
PyObject *v;
/* It might be possible to tighten this worst case */
unsigned int cbAllocated = 5 * size;
int inShift = 0;
int i = 0;
unsigned int bitsleft = 0;
unsigned long charsleft = 0;
char * out;
char * start;
if (size == 0)
return PyString_FromStringAndSize(NULL, 0);
v = PyString_FromStringAndSize(NULL, cbAllocated);
if (v == NULL)
return NULL;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -