📄 encoding.c

📁 Vovida 社区开源的 SIP 协议源码
💻 C
📖 第 1 页 / 共 2 页
字号:
12 下一页
/* * encoding.c : implements the encoding conversion functions needed for XML * * Related specs:  * rfc2044        (UTF-8 and UTF-16) F. Yergeau Alis Technologies * [ISO-10646]    UTF-8 and UTF-16 in Annexes * [ISO-8859-1]   ISO Latin-1 characters codes. * [UNICODE]      The Unicode Consortium, "The Unicode Standard -- *                Worldwide Character Encoding -- Version 1.0", Addison- *                Wesley, Volume 1, 1991, Volume 2, 1992.  UTF-8 is *                described in Unicode Technical Report #4. * [US-ASCII]     Coded Character Set--7-bit American Standard Code for *                Information Interchange, ANSI X3.4-1986. * * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org> * * See Copyright for the status of this software. * * Daniel.Veillard@w3.org */#ifdef WIN32#include "win32config.h"#else#include "config.h"#endif#include <stdio.h>#include <string.h>#ifdef HAVE_CTYPE_H#include <ctype.h>#endif#ifdef HAVE_STDLIB_H#include <stdlib.h>#endif#include <libxml/encoding.h>#include <libxml/xmlmemory.h>xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;/* * From rfc2044: encoding of the Unicode values on UTF-8: * * UCS-4 range (hex.)           UTF-8 octet sequence (binary) * 0000 0000-0000 007F   0xxxxxxx * 0000 0080-0000 07FF   110xxxxx 10xxxxxx * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx  * * I hope we won't use values > 0xFFFF anytime soon ! *//** * xmlCheckUTF8: Check utf-8 string for legality. * @utf: Pointer to putative utf-8 encoded string. * * Checks @utf for being valid utf-8. @utf is assumed to be * null-terminated. This function is not super-strict, as it will * allow longer utf-8 sequences than necessary. Note that Java is * capable of producing these sequences if provoked. Also note, this * routine checks for the 4-byte maxiumum size, but does not check for * 0x10ffff maximum value. * * Return value: true if @utf is valid. **/intxmlCheckUTF8(const unsigned char *utf){    int ix;    unsigned char c;    for (ix = 0; (c = utf[ix]);) {        if (c & 0x80) {	    if ((utf[ix + 1] & 0xc0) != 0x80)	        return(0);	    if ((c & 0xe0) == 0xe0) {	        if ((utf[ix + 2] & 0xc0) != 0x80)		    return(0);	        if ((c & 0xf0) == 0xf0) {		    if ((c & 0xf8) != 0xf0 || (utf[ix + 3] & 0xc0) != 0x80)		        return(0);		    ix += 4;		    /* 4-byte code */	        } else		  /* 3-byte code */		    ix += 3;	    } else	      /* 2-byte code */	        ix += 2;	} else	    /* 1-byte code */	    ix++;      }      return(1);}/** * isolat1ToUTF8: * @out:  a pointer to an array of bytes to store the result * @outlen:  the length of @out * @in:  a pointer to an array of ISO Latin 1 chars * @inlen:  the length of @in * * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8 * block of chars out. * Returns the number of byte written, or -1 by lack of space. */intisolat1ToUTF8(unsigned char* out, int outlen,              const unsigned char* in, int *inlen) {    unsigned char* outstart= out;    unsigned char* outend= out+outlen;    const unsigned char* inend= in+*inlen;    unsigned char c;    while (in < inend) {        c= *in++;        if (c < 0x80) {            if (out >= outend)  return(-1);            *out++ = c;        }        else {            if (out >= outend)  return(-1);            *out++ = 0xC0 | (c >> 6);            if (out >= outend)  return(-1);            *out++ = 0x80 | (0x3F & c);        }    }    return(out-outstart);}/** * UTF8Toisolat1: * @out:  a pointer to an array of bytes to store the result * @outlen:  the length of @out * @in:  a pointer to an array of UTF-8 chars * @inlen:  the length of @in * * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1 * block of chars out. * TODO: UTF8Toisolat1 need a fallback mechanism ... * * Returns the number of byte written, or -1 by lack of space, or -2 *     if the transcoding fails (for *in is not valid utf8 string or *     the result of transformation can't fit into the encoding we want) * The value of @inlen after return is the number of octets consumed *     as the return value is positive, else unpredictiable. */intUTF8Toisolat1(unsigned char* out, int outlen,              const unsigned char* in, int *inlen) {    unsigned char* outstart= out;    unsigned char* outend= out+outlen;    const unsigned char* inend= in+*inlen;    unsigned char c;    while (in < inend) {        c= *in++;        if (c < 0x80) {            if (out >= outend)  return(-1);            *out++= c;        }	else if (in == inend) {            *inlen -= 1;            break;	}	else if (((c & 0xFC) == 0xC0) && ((*in & 0xC0) == 0x80)) {	    /* a two byte utf-8 and can be encoding as isolate1 */            *out++= ((c & 0x03) << 6) | (*in++ & 0x3F);	}	else	    return(-2);	/* TODO : some should be represent as "&#x____;" */    }    return(out-outstart);}/** * UTF16LEToUTF8: * @out:  a pointer to an array of bytes to store the result * @outlen:  the length of @out * @inb:  a pointer to an array of UTF-16LE passwd as a byte array * @inlenb:  the length of @in in UTF-16LE chars * * Take a block of UTF-16LE ushorts in and try to convert it to an UTF-8 * block of chars out. This function assume the endian properity * is the same between the native type of this machine and the * inputed one. * * Returns the number of byte written, or -1 by lack of space, or -2 *     if the transcoding fails (for *in is not valid utf16 string) *     The value of *inlen after return is the number of octets consumed *     as the return value is positive, else unpredictiable. */intUTF16LEToUTF8(unsigned char* out, int outlen,            const unsigned char* inb, int *inlenb){    unsigned char* outstart= out;    unsigned char* outend= out+outlen;    unsigned short* in = (unsigned short*) inb;    unsigned short* inend;    unsigned int c, d, inlen;    unsigned char *tmp;    int bits;    if ((*inlenb % 2) == 1)        (*inlenb)--;    inlen = *inlenb / 2;    inend= in + inlen;    while (in < inend) {#ifdef BIG_ENDIAN	tmp = (unsigned char *) in;	c = *tmp++;	c = c | (((unsigned int)*tmp) << 8);	in++;#else /* BIG_ENDIAN */        c= *in++;#endif /* BIG_ENDIAN */        if ((c & 0xFC00) == 0xD800) {    /* surrogates */            if (in >= inend) {           /* (in > inend) shouldn't happens */                (*inlenb) -= 2;                break;            }#ifdef BIG_ENDIAN            tmp = (unsigned char *) in;            d = *tmp++;	    d = d | (((unsigned int)*tmp) << 8);	    in++;#else /* BIG_ENDIAN */            d = *in++;#endif /* BIG_ENDIAN */            if ((d & 0xFC00) == 0xDC00) {                c &= 0x03FF;                c <<= 10;                c |= d & 0x03FF;                c += 0x10000;            }            else	        return(-2);        }	/* assertion: c is a single UTF-4 value */        if (out >= outend)	    return(-1);        if      (c <    0x80) {  *out++=  c;                bits= -6; }        else if (c <   0x800) {  *out++= ((c >>  6) & 0x1F) | 0xC0;  bits=  0; }        else if (c < 0x10000) {  *out++= ((c >> 12) & 0x0F) | 0xE0;  bits=  6; }        else                  {  *out++= ((c >> 18) & 0x07) | 0xF0;  bits= 12; }         for ( ; bits >= 0; bits-= 6) {            if (out >= outend)	        return(-1);            *out++= ((c >> bits) & 0x3F) | 0x80;        }    }    return(out-outstart);}/** * UTF8ToUTF16LE: * @outb:  a pointer to an array of bytes to store the result * @outlen:  the length of @outb * @in:  a pointer to an array of UTF-8 chars * @inlen:  the length of @in * * Take a block of UTF-8 chars in and try to convert it to an UTF-16LE * block of chars out. * TODO: UTF8ToUTF16LE need a fallback mechanism ... * * Returns the number of byte written, or -1 by lack of space, or -2 *     if the transcoding failed.  */intUTF8ToUTF16LE(unsigned char* outb, int outlen,            const unsigned char* in, int *inlen){    unsigned short* out = (unsigned short*) outb;    unsigned short* outstart= out;    unsigned short* outend;    const unsigned char* inend= in+*inlen;    unsigned int c, d, trailing;#ifdef BIG_ENDIAN    unsigned char *tmp;    unsigned short tmp1, tmp2;#endif /* BIG_ENDIAN */    outlen /= 2; /* convert in short length */    outend = out + outlen;    while (in < inend) {      d= *in++;      if      (d < 0x80)  { c= d; trailing= 0; }      else if (d < 0xC0)          return(-2);    /* trailing byte in leading position */      else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }      else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }      else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }      else          return(-2);    /* no chance for this in UTF-16 */      if (inend - in < trailing) {          *inlen -= (inend - in);          break;      }       for ( ; trailing; trailing--) {          if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))	      return(-1);          c <<= 6;          c |= d & 0x3F;      }      /* assertion: c is a single UTF-4 value */        if (c < 0x10000) {            if (out >= outend)	        return(-1);#ifdef BIG_ENDIAN            tmp = (unsigned char *) out;            *tmp = c ;            *(tmp + 1) = c >> 8 ;            out++;#else /* BIG_ENDIAN */            *out++ = c;#endif /* BIG_ENDIAN */        }        else if (c < 0x110000) {            if (out+1 >= outend)	        return(-1);            c -= 0x10000;#ifdef BIG_ENDIAN            tmp1 = 0xD800 | (c >> 10);            tmp = (unsigned char *) out;            *tmp = tmp1;            *(tmp + 1) = tmp1 >> 8;            out++;            tmp2 = 0xDC00 | (c & 0x03FF);            tmp = (unsigned char *) out;            *tmp  = tmp2;            *(tmp + 1) = tmp2 >> 8;            out++;#else /* BIG_ENDIAN */            *out++ = 0xD800 | (c >> 10);            *out++ = 0xDC00 | (c & 0x03FF);#endif /* BIG_ENDIAN */        }        else	    return(-1);    }    return(out-outstart);}/** * UTF16BEToUTF8: * @out:  a pointer to an array of bytes to store the result * @outlen:  the length of @out * @inb:  a pointer to an array of UTF-16 passwd as a byte array * @inlenb:  the length of @in in UTF-16 chars * * Take a block of UTF-16 ushorts in and try to convert it to an UTF-8 * block of chars out. This function assume the endian properity * is the same between the native type of this machine and the * inputed one. * * Returns the number of byte written, or -1 by lack of space, or -2 *     if the transcoding fails (for *in is not valid utf16 string) * The value of *inlen after return is the number of octets consumed *     as the return value is positive, else unpredictiable. */intUTF16BEToUTF8(unsigned char* out, int outlen,            const unsigned char* inb, int *inlenb){    unsigned char* outstart= out;    unsigned char* outend= out+outlen;    unsigned short* in = (unsigned short*) inb;    unsigned short* inend;    unsigned int c, d, inlen;#ifdef BIG_ENDIAN#else /* BIG_ENDIAN */    unsigned char *tmp;#endif /* BIG_ENDIAN */        int bits;    if ((*inlenb % 2) == 1)        (*inlenb)--;    inlen = *inlenb / 2;    inend= in + inlen;    while (in < inend) {#ifdef BIG_ENDIAN            c= *in++;#else        tmp = (unsigned char *) in;	c = *tmp++;	c = c << 8;	c = c | (unsigned int) *tmp;	in++;#endif	        if ((c & 0xFC00) == 0xD800) {    /* surrogates */	    if (in >= inend) {           /* (in > inend) shouldn't happens */	        (*inlenb) -= 2;		break;	    }#ifdef BIG_ENDIAN            d= *in++;#else            tmp = (unsigned char *) in;	    d = *tmp++;	    d = d << 8;	    d = d | (unsigned int) *tmp;	    in++;#endif	                if ((d & 0xFC00) == 0xDC00) {                c &= 0x03FF;                c <<= 10;                c |= d & 0x03FF;                c += 0x10000;            }            else 	        return(-2);        }	/* assertion: c is a single UTF-4 value */        if (out >= outend) 	    return(-1);        if      (c <    0x80) {  *out++=  c;                bits= -6; }        else if (c <   0x800) {  *out++= ((c >>  6) & 0x1F) | 0xC0;  bits=  0; }        else if (c < 0x10000) {  *out++= ((c >> 12) & 0x0F) | 0xE0;  bits=  6; }        else                  {  *out++= ((c >> 18) & 0x07) | 0xF0;  bits= 12; }         for ( ; bits >= 0; bits-= 6) {            if (out >= outend) 	        return(-1);            *out++= ((c >> bits) & 0x3F) | 0x80;        }    }    return(out-outstart);}
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -