📄 encoding.c.svn-base
字号:
/* * encoding.c : implements the encoding conversion functions needed for XML * * Related specs: * rfc2044 (UTF-8 and UTF-16) F. Yergeau Alis Technologies * rfc2781 UTF-16, an encoding of ISO 10646, P. Hoffman, F. Yergeau * [ISO-10646] UTF-8 and UTF-16 in Annexes * [ISO-8859-1] ISO Latin-1 characters codes. * [UNICODE] The Unicode Consortium, "The Unicode Standard -- * Worldwide Character Encoding -- Version 1.0", Addison- * Wesley, Volume 1, 1991, Volume 2, 1992. UTF-8 is * described in Unicode Technical Report #4. * [US-ASCII] Coded Character Set--7-bit American Standard Code for * Information Interchange, ANSI X3.4-1986. * * See Copyright for the status of this software. * * daniel@veillard.com * * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org> */#define IN_LIBXML#include "libxml.h"#include <string.h>#ifdef HAVE_CTYPE_H#include <ctype.h>#endif#ifdef HAVE_STDLIB_H#include <stdlib.h>#endif#ifdef LIBXML_ICONV_ENABLED#ifdef HAVE_ERRNO_H#include <errno.h>#endif#endif#include <libxml/encoding.h>#include <libxml/xmlmemory.h>#ifdef LIBXML_HTML_ENABLED#include <libxml/HTMLparser.h>#endif#include <libxml/globals.h>#include <libxml/xmlerror.h>static xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;static xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;typedef struct _xmlCharEncodingAlias xmlCharEncodingAlias;typedef xmlCharEncodingAlias *xmlCharEncodingAliasPtr;struct _xmlCharEncodingAlias { const char *name; const char *alias;};static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL;static int xmlCharEncodingAliasesNb = 0;static int xmlCharEncodingAliasesMax = 0;#ifdef LIBXML_ICONV_ENABLED#if 0#define DEBUG_ENCODING /* Define this to get encoding traces */#endif#else#ifdef LIBXML_ISO8859X_ENABLEDstatic void xmlRegisterCharEncodingHandlersISO8859x (void);#endif#endifstatic int xmlLittleEndian = 1;/************************************************************************ * * * Conversions To/From UTF8 encoding * * * ************************************************************************//** * asciiToUTF8: * @out: a pointer to an array of bytes to store the result * @outlen: the length of @out * @in: a pointer to an array of ASCII chars * @inlen: the length of @in * * Take a block of ASCII chars in and try to convert it to an UTF-8 * block of chars out. * Returns 0 if success, or -1 otherwise * The value of @inlen after return is the number of octets consumed * if the return value is positive, else unpredictable. * The value of @outlen after return is the number of octets consumed. */static intasciiToUTF8(unsigned char* out, int *outlen, const unsigned char* in, int *inlen) { unsigned char* outstart = out; const unsigned char* base = in; const unsigned char* processed = in; unsigned char* outend = out + *outlen; const unsigned char* inend; unsigned int c; int bits; inend = in + (*inlen); while ((in < inend) && (out - outstart + 5 < *outlen)) { c= *in++; /* assertion: c is a single UTF-4 value */ if (out >= outend) break; if (c < 0x80) { *out++= c; bits= -6; } else { *outlen = out - outstart; *inlen = processed - base; return(-1); } for ( ; bits >= 0; bits-= 6) { if (out >= outend) break; *out++= ((c >> bits) & 0x3F) | 0x80; } processed = (const unsigned char*) in; } *outlen = out - outstart; *inlen = processed - base; return(0);}#ifdef LIBXML_OUTPUT_ENABLED/** * UTF8Toascii: * @out: a pointer to an array of bytes to store the result * @outlen: the length of @out * @in: a pointer to an array of UTF-8 chars * @inlen: the length of @in * * Take a block of UTF-8 chars in and try to convert it to an ASCII * block of chars out. * * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise * The value of @inlen after return is the number of octets consumed * if the return value is positive, else unpredictable. * The value of @outlen after return is the number of octets consumed. */static intUTF8Toascii(unsigned char* out, int *outlen, const unsigned char* in, int *inlen) { const unsigned char* processed = in; const unsigned char* outend; const unsigned char* outstart = out; const unsigned char* instart = in; const unsigned char* inend; unsigned int c, d; int trailing; if (in == NULL) { /* * initialization nothing to do */ *outlen = 0; *inlen = 0; return(0); } inend = in + (*inlen); outend = out + (*outlen); while (in < inend) { d = *in++; if (d < 0x80) { c= d; trailing= 0; } else if (d < 0xC0) { /* trailing byte in leading position */ *outlen = out - outstart; *inlen = processed - instart; return(-2); } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; } else if (d < 0xF0) { c= d & 0x0F; trailing= 2; } else if (d < 0xF8) { c= d & 0x07; trailing= 3; } else { /* no chance for this in Ascii */ *outlen = out - outstart; *inlen = processed - instart; return(-2); } if (inend - in < trailing) { break; } for ( ; trailing; trailing--) { if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80)) break; c <<= 6; c |= d & 0x3F; } /* assertion: c is a single UTF-4 value */ if (c < 0x80) { if (out >= outend) break; *out++ = c; } else { /* no chance for this in Ascii */ *outlen = out - outstart; *inlen = processed - instart; return(-2); } processed = in; } *outlen = out - outstart; *inlen = processed - instart; return(0);}#endif /* LIBXML_OUTPUT_ENABLED *//** * isolat1ToUTF8: * @out: a pointer to an array of bytes to store the result * @outlen: the length of @out * @in: a pointer to an array of ISO Latin 1 chars * @inlen: the length of @in * * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8 * block of chars out. * Returns 0 if success, or -1 otherwise * The value of @inlen after return is the number of octets consumed * if the return value is positive, else unpredictable. * The value of @outlen after return is the number of octets consumed. */intisolat1ToUTF8(unsigned char* out, int *outlen, const unsigned char* in, int *inlen) { unsigned char* outstart = out; const unsigned char* base = in; unsigned char* outend = out + *outlen; const unsigned char* inend; const unsigned char* instop; inend = in + (*inlen); instop = inend; while (in < inend && out < outend - 1) { if (*in >= 0x80) { *out++ = (((*in) >> 6) & 0x1F) | 0xC0; *out++ = ((*in) & 0x3F) | 0x80; ++in; } if (instop - in > outend - out) instop = in + (outend - out); while (in < instop && *in < 0x80) { *out++ = *in++; } } if (in < inend && out < outend && *in < 0x80) { *out++ = *in++; } *outlen = out - outstart; *inlen = in - base; return(0);}/** * UTF8ToUTF8: * @out: a pointer to an array of bytes to store the result * @outlen: the length of @out * @inb: a pointer to an array of UTF-8 chars * @inlenb: the length of @in in UTF-8 chars * * No op copy operation for UTF8 handling. * * Returns the number of bytes written, or -1 if lack of space. * The value of *inlen after return is the number of octets consumed * if the return value is positive, else unpredictable. */static intUTF8ToUTF8(unsigned char* out, int *outlen, const unsigned char* inb, int *inlenb){ int len; if ((out == NULL) || (inb == NULL) || (outlen == NULL) || (inlenb == NULL)) return(-1); if (*outlen > *inlenb) { len = *inlenb; } else { len = *outlen; } if (len < 0) return(-1); memcpy(out, inb, len); *outlen = len; *inlenb = len; return(0);}#ifdef LIBXML_OUTPUT_ENABLED/** * UTF8Toisolat1: * @out: a pointer to an array of bytes to store the result * @outlen: the length of @out * @in: a pointer to an array of UTF-8 chars * @inlen: the length of @in * * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1 * block of chars out. * * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise * The value of @inlen after return is the number of octets consumed * if the return value is positive, else unpredictable. * The value of @outlen after return is the number of octets consumed. */intUTF8Toisolat1(unsigned char* out, int *outlen, const unsigned char* in, int *inlen) { const unsigned char* processed = in; const unsigned char* outend; const unsigned char* outstart = out; const unsigned char* instart = in; const unsigned char* inend; unsigned int c, d; int trailing; if (in == NULL) { /* * initialization nothing to do */ *outlen = 0; *inlen = 0; return(0); } inend = in + (*inlen); outend = out + (*outlen); while (in < inend) { d = *in++; if (d < 0x80) { c= d; trailing= 0; } else if (d < 0xC0) { /* trailing byte in leading position */ *outlen = out - outstart; *inlen = processed - instart; return(-2); } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; } else if (d < 0xF0) { c= d & 0x0F; trailing= 2; } else if (d < 0xF8) { c= d & 0x07; trailing= 3; } else { /* no chance for this in IsoLat1 */ *outlen = out - outstart; *inlen = processed - instart; return(-2); } if (inend - in < trailing) { break; } for ( ; trailing; trailing--) { if (in >= inend) break; if (((d= *in++) & 0xC0) != 0x80) { *outlen = out - outstart; *inlen = processed - instart; return(-2); } c <<= 6; c |= d & 0x3F; } /* assertion: c is a single UTF-4 value */ if (c <= 0xFF) { if (out >= outend) break; *out++ = c; } else { /* no chance for this in IsoLat1 */ *outlen = out - outstart; *inlen = processed - instart; return(-2); } processed = in; } *outlen = out - outstart; *inlen = processed - instart; return(0);}#endif /* LIBXML_OUTPUT_ENABLED *//** * UTF16LEToUTF8: * @out: a pointer to an array of bytes to store the result * @outlen: the length of @out * @inb: a pointer to an array of UTF-16LE passwd as a byte array * @inlenb: the length of @in in UTF-16LE chars * * Take a block of UTF-16LE ushorts in and try to convert it to an UTF-8 * block of chars out. This function assumes the endian property * is the same between the native type of this machine and the * inputed one. * * Returns the number of bytes written, or -1 if lack of space, or -2 * if the transcoding fails (if *in is not a valid utf16 string) * The value of *inlen after return is the number of octets consumed * if the return value is positive, else unpredictable. */static intUTF16LEToUTF8(unsigned char* out, int *outlen, const unsigned char* inb, int *inlenb){ unsigned char* outstart = out; const unsigned char* processed = inb; unsigned char* outend = out + *outlen; unsigned short* in = (unsigned short*) inb; unsigned short* inend; unsigned int c, d, inlen; unsigned char *tmp; int bits; if ((*inlenb % 2) == 1) (*inlenb)--; inlen = *inlenb / 2; inend = in + inlen; while ((in < inend) && (out - outstart + 5 < *outlen)) { if (xmlLittleEndian) { c= *in++; } else { tmp = (unsigned char *) in; c = *tmp++; c = c | (((unsigned int)*tmp) << 8); in++; } if ((c & 0xFC00) == 0xD800) { /* surrogates */ if (in >= inend) { /* (in > inend) shouldn't happens */ break; } if (xmlLittleEndian) { d = *in++; } else { tmp = (unsigned char *) in; d = *tmp++; d = d | (((unsigned int)*tmp) << 8); in++; } if ((d & 0xFC00) == 0xDC00) { c &= 0x03FF; c <<= 10; c |= d & 0x03FF; c += 0x10000; } else {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -