📄 encoding.c.svn-base

📁 这是一个用于解析xml文件的类库。使用这个类库
💻 SVN-BASE
📖 第 1 页 / 共 5 页
字号:
12 3 4 5 下一页
/* * encoding.c : implements the encoding conversion functions needed for XML * * Related specs:  * rfc2044        (UTF-8 and UTF-16) F. Yergeau Alis Technologies * rfc2781        UTF-16, an encoding of ISO 10646, P. Hoffman, F. Yergeau * [ISO-10646]    UTF-8 and UTF-16 in Annexes * [ISO-8859-1]   ISO Latin-1 characters codes. * [UNICODE]      The Unicode Consortium, "The Unicode Standard -- *                Worldwide Character Encoding -- Version 1.0", Addison- *                Wesley, Volume 1, 1991, Volume 2, 1992.  UTF-8 is *                described in Unicode Technical Report #4. * [US-ASCII]     Coded Character Set--7-bit American Standard Code for *                Information Interchange, ANSI X3.4-1986. * * See Copyright for the status of this software. * * daniel@veillard.com * * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org> */#define IN_LIBXML#include "libxml.h"#include <string.h>#ifdef HAVE_CTYPE_H#include <ctype.h>#endif#ifdef HAVE_STDLIB_H#include <stdlib.h>#endif#ifdef LIBXML_ICONV_ENABLED#ifdef HAVE_ERRNO_H#include <errno.h>#endif#endif#include <libxml/encoding.h>#include <libxml/xmlmemory.h>#ifdef LIBXML_HTML_ENABLED#include <libxml/HTMLparser.h>#endif#include <libxml/globals.h>#include <libxml/xmlerror.h>static xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;static xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;typedef struct _xmlCharEncodingAlias xmlCharEncodingAlias;typedef xmlCharEncodingAlias *xmlCharEncodingAliasPtr;struct _xmlCharEncodingAlias {    const char *name;    const char *alias;};static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL;static int xmlCharEncodingAliasesNb = 0;static int xmlCharEncodingAliasesMax = 0;#ifdef LIBXML_ICONV_ENABLED#if 0#define DEBUG_ENCODING  /* Define this to get encoding traces */#endif#else#ifdef LIBXML_ISO8859X_ENABLEDstatic void xmlRegisterCharEncodingHandlersISO8859x (void);#endif#endifstatic int xmlLittleEndian = 1;/************************************************************************ *									* *		Conversions To/From UTF8 encoding			* *									* ************************************************************************//** * asciiToUTF8: * @out:  a pointer to an array of bytes to store the result * @outlen:  the length of @out * @in:  a pointer to an array of ASCII chars * @inlen:  the length of @in * * Take a block of ASCII chars in and try to convert it to an UTF-8 * block of chars out. * Returns 0 if success, or -1 otherwise * The value of @inlen after return is the number of octets consumed *     if the return value is positive, else unpredictable. * The value of @outlen after return is the number of octets consumed. */static intasciiToUTF8(unsigned char* out, int *outlen,              const unsigned char* in, int *inlen) {    unsigned char* outstart = out;    const unsigned char* base = in;    const unsigned char* processed = in;    unsigned char* outend = out + *outlen;    const unsigned char* inend;    unsigned int c;    int bits;    inend = in + (*inlen);    while ((in < inend) && (out - outstart + 5 < *outlen)) {	c= *in++;	/* assertion: c is a single UTF-4 value */        if (out >= outend)	    break;        if      (c <    0x80) {  *out++=  c;                bits= -6; }        else { 	    *outlen = out - outstart;	    *inlen = processed - base;	    return(-1);	}         for ( ; bits >= 0; bits-= 6) {            if (out >= outend)	        break;            *out++= ((c >> bits) & 0x3F) | 0x80;        }	processed = (const unsigned char*) in;    }    *outlen = out - outstart;    *inlen = processed - base;    return(0);}#ifdef LIBXML_OUTPUT_ENABLED/** * UTF8Toascii: * @out:  a pointer to an array of bytes to store the result * @outlen:  the length of @out * @in:  a pointer to an array of UTF-8 chars * @inlen:  the length of @in * * Take a block of UTF-8 chars in and try to convert it to an ASCII * block of chars out. * * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise * The value of @inlen after return is the number of octets consumed *     if the return value is positive, else unpredictable. * The value of @outlen after return is the number of octets consumed. */static intUTF8Toascii(unsigned char* out, int *outlen,              const unsigned char* in, int *inlen) {    const unsigned char* processed = in;    const unsigned char* outend;    const unsigned char* outstart = out;    const unsigned char* instart = in;    const unsigned char* inend;    unsigned int c, d;    int trailing;    if (in == NULL) {        /*	 * initialization nothing to do	 */	*outlen = 0;	*inlen = 0;	return(0);    }    inend = in + (*inlen);    outend = out + (*outlen);    while (in < inend) {	d = *in++;	if      (d < 0x80)  { c= d; trailing= 0; }	else if (d < 0xC0) {	    /* trailing byte in leading position */	    *outlen = out - outstart;	    *inlen = processed - instart;	    return(-2);        } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }	else {	    /* no chance for this in Ascii */	    *outlen = out - outstart;	    *inlen = processed - instart;	    return(-2);	}	if (inend - in < trailing) {	    break;	} 	for ( ; trailing; trailing--) {	    if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))		break;	    c <<= 6;	    c |= d & 0x3F;	}	/* assertion: c is a single UTF-4 value */	if (c < 0x80) {	    if (out >= outend)		break;	    *out++ = c;	} else {	    /* no chance for this in Ascii */	    *outlen = out - outstart;	    *inlen = processed - instart;	    return(-2);	}	processed = in;    }    *outlen = out - outstart;    *inlen = processed - instart;    return(0);}#endif /* LIBXML_OUTPUT_ENABLED *//** * isolat1ToUTF8: * @out:  a pointer to an array of bytes to store the result * @outlen:  the length of @out * @in:  a pointer to an array of ISO Latin 1 chars * @inlen:  the length of @in * * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8 * block of chars out. * Returns 0 if success, or -1 otherwise * The value of @inlen after return is the number of octets consumed *     if the return value is positive, else unpredictable. * The value of @outlen after return is the number of octets consumed. */intisolat1ToUTF8(unsigned char* out, int *outlen,              const unsigned char* in, int *inlen) {    unsigned char* outstart = out;    const unsigned char* base = in;    unsigned char* outend = out + *outlen;    const unsigned char* inend;    const unsigned char* instop;    inend = in + (*inlen);    instop = inend;        while (in < inend && out < outend - 1) {    	if (*in >= 0x80) {	    *out++ = (((*in) >>  6) & 0x1F) | 0xC0;        *out++ = ((*in) & 0x3F) | 0x80;	    ++in;	}	if (instop - in > outend - out) instop = in + (outend - out); 	while (in < instop && *in < 0x80) {	    *out++ = *in++;	}    }	    if (in < inend && out < outend && *in < 0x80) {        *out++ = *in++;    }    *outlen = out - outstart;    *inlen = in - base;    return(0);}/** * UTF8ToUTF8: * @out:  a pointer to an array of bytes to store the result * @outlen:  the length of @out * @inb:  a pointer to an array of UTF-8 chars * @inlenb:  the length of @in in UTF-8 chars * * No op copy operation for UTF8 handling. * * Returns the number of bytes written, or -1 if lack of space. *     The value of *inlen after return is the number of octets consumed *     if the return value is positive, else unpredictable. */static intUTF8ToUTF8(unsigned char* out, int *outlen,           const unsigned char* inb, int *inlenb){    int len;    if ((out == NULL) || (inb == NULL) || (outlen == NULL) || (inlenb == NULL))	return(-1);    if (*outlen > *inlenb) {	len = *inlenb;    } else {	len = *outlen;    }    if (len < 0)	return(-1);    memcpy(out, inb, len);    *outlen = len;    *inlenb = len;    return(0);}#ifdef LIBXML_OUTPUT_ENABLED/** * UTF8Toisolat1: * @out:  a pointer to an array of bytes to store the result * @outlen:  the length of @out * @in:  a pointer to an array of UTF-8 chars * @inlen:  the length of @in * * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1 * block of chars out. * * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise * The value of @inlen after return is the number of octets consumed *     if the return value is positive, else unpredictable. * The value of @outlen after return is the number of octets consumed. */intUTF8Toisolat1(unsigned char* out, int *outlen,              const unsigned char* in, int *inlen) {    const unsigned char* processed = in;    const unsigned char* outend;    const unsigned char* outstart = out;    const unsigned char* instart = in;    const unsigned char* inend;    unsigned int c, d;    int trailing;    if (in == NULL) {        /*	 * initialization nothing to do	 */	*outlen = 0;	*inlen = 0;	return(0);    }    inend = in + (*inlen);    outend = out + (*outlen);    while (in < inend) {	d = *in++;	if      (d < 0x80)  { c= d; trailing= 0; }	else if (d < 0xC0) {	    /* trailing byte in leading position */	    *outlen = out - outstart;	    *inlen = processed - instart;	    return(-2);        } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }	else {	    /* no chance for this in IsoLat1 */	    *outlen = out - outstart;	    *inlen = processed - instart;	    return(-2);	}	if (inend - in < trailing) {	    break;	} 	for ( ; trailing; trailing--) {	    if (in >= inend)		break;	    if (((d= *in++) & 0xC0) != 0x80) {		*outlen = out - outstart;		*inlen = processed - instart;		return(-2);	    }	    c <<= 6;	    c |= d & 0x3F;	}	/* assertion: c is a single UTF-4 value */	if (c <= 0xFF) {	    if (out >= outend)		break;	    *out++ = c;	} else {	    /* no chance for this in IsoLat1 */	    *outlen = out - outstart;	    *inlen = processed - instart;	    return(-2);	}	processed = in;    }    *outlen = out - outstart;    *inlen = processed - instart;    return(0);}#endif /* LIBXML_OUTPUT_ENABLED *//** * UTF16LEToUTF8: * @out:  a pointer to an array of bytes to store the result * @outlen:  the length of @out * @inb:  a pointer to an array of UTF-16LE passwd as a byte array * @inlenb:  the length of @in in UTF-16LE chars * * Take a block of UTF-16LE ushorts in and try to convert it to an UTF-8 * block of chars out. This function assumes the endian property * is the same between the native type of this machine and the * inputed one. * * Returns the number of bytes written, or -1 if lack of space, or -2 *     if the transcoding fails (if *in is not a valid utf16 string) *     The value of *inlen after return is the number of octets consumed *     if the return value is positive, else unpredictable. */static intUTF16LEToUTF8(unsigned char* out, int *outlen,            const unsigned char* inb, int *inlenb){    unsigned char* outstart = out;    const unsigned char* processed = inb;    unsigned char* outend = out + *outlen;    unsigned short* in = (unsigned short*) inb;    unsigned short* inend;    unsigned int c, d, inlen;    unsigned char *tmp;    int bits;    if ((*inlenb % 2) == 1)        (*inlenb)--;    inlen = *inlenb / 2;    inend = in + inlen;    while ((in < inend) && (out - outstart + 5 < *outlen)) {        if (xmlLittleEndian) {	    c= *in++;	} else {	    tmp = (unsigned char *) in;	    c = *tmp++;	    c = c | (((unsigned int)*tmp) << 8);	    in++;	}        if ((c & 0xFC00) == 0xD800) {    /* surrogates */	    if (in >= inend) {           /* (in > inend) shouldn't happens */		break;	    }	    if (xmlLittleEndian) {		d = *in++;	    } else {		tmp = (unsigned char *) in;		d = *tmp++;		d = d | (((unsigned int)*tmp) << 8);		in++;	    }            if ((d & 0xFC00) == 0xDC00) {                c &= 0x03FF;                c <<= 10;                c |= d & 0x03FF;                c += 0x10000;            }            else {
12 3 4 5 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -