📄 encoding.c.svn-base

📁 这是一个用于解析xml文件的类库。使用这个类库
💻 SVN-BASE
📖 第 1 页 / 共 5 页
字号:
		*outlen = out - outstart;		*inlenb = processed - inb;	        return(-2);	    }        }	/* assertion: c is a single UTF-4 value */        if (out >= outend)	    break;        if      (c <    0x80) {  *out++=  c;                bits= -6; }        else if (c <   0x800) {  *out++= ((c >>  6) & 0x1F) | 0xC0;  bits=  0; }        else if (c < 0x10000) {  *out++= ((c >> 12) & 0x0F) | 0xE0;  bits=  6; }        else                  {  *out++= ((c >> 18) & 0x07) | 0xF0;  bits= 12; }         for ( ; bits >= 0; bits-= 6) {            if (out >= outend)	        break;            *out++= ((c >> bits) & 0x3F) | 0x80;        }	processed = (const unsigned char*) in;    }    *outlen = out - outstart;    *inlenb = processed - inb;    return(0);}#ifdef LIBXML_OUTPUT_ENABLED/** * UTF8ToUTF16LE: * @outb:  a pointer to an array of bytes to store the result * @outlen:  the length of @outb * @in:  a pointer to an array of UTF-8 chars * @inlen:  the length of @in * * Take a block of UTF-8 chars in and try to convert it to an UTF-16LE * block of chars out. * * Returns the number of bytes written, or -1 if lack of space, or -2 *     if the transcoding failed.  */static intUTF8ToUTF16LE(unsigned char* outb, int *outlen,            const unsigned char* in, int *inlen){    unsigned short* out = (unsigned short*) outb;    const unsigned char* processed = in;    const unsigned char *const instart = in;    unsigned short* outstart= out;    unsigned short* outend;    const unsigned char* inend= in+*inlen;    unsigned int c, d;    int trailing;    unsigned char *tmp;    unsigned short tmp1, tmp2;    /* UTF16LE encoding has no BOM */    if (in == NULL) {	*outlen = 0;	*inlen = 0;	return(0);    }    outend = out + (*outlen / 2);    while (in < inend) {      d= *in++;      if      (d < 0x80)  { c= d; trailing= 0; }      else if (d < 0xC0) {          /* trailing byte in leading position */	  *outlen = (out - outstart) * 2;	  *inlen = processed - instart;	  return(-2);      } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }      else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }      else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }      else {	/* no chance for this in UTF-16 */	*outlen = (out - outstart) * 2;	*inlen = processed - instart;	return(-2);      }      if (inend - in < trailing) {          break;      }       for ( ; trailing; trailing--) {          if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))	      break;          c <<= 6;          c |= d & 0x3F;      }      /* assertion: c is a single UTF-4 value */        if (c < 0x10000) {            if (out >= outend)	        break;	    if (xmlLittleEndian) {		*out++ = c;	    } else {		tmp = (unsigned char *) out;		*tmp = c ;		*(tmp + 1) = c >> 8 ;		out++;	    }        }        else if (c < 0x110000) {            if (out+1 >= outend)	        break;            c -= 0x10000;	    if (xmlLittleEndian) {		*out++ = 0xD800 | (c >> 10);		*out++ = 0xDC00 | (c & 0x03FF);	    } else {		tmp1 = 0xD800 | (c >> 10);		tmp = (unsigned char *) out;		*tmp = (unsigned char) tmp1;		*(tmp + 1) = tmp1 >> 8;		out++;		tmp2 = 0xDC00 | (c & 0x03FF);		tmp = (unsigned char *) out;		*tmp  = (unsigned char) tmp2;		*(tmp + 1) = tmp2 >> 8;		out++;	    }        }        else	    break;	processed = in;    }    *outlen = (out - outstart) * 2;    *inlen = processed - instart;    return(0);}/** * UTF8ToUTF16: * @outb:  a pointer to an array of bytes to store the result * @outlen:  the length of @outb * @in:  a pointer to an array of UTF-8 chars * @inlen:  the length of @in * * Take a block of UTF-8 chars in and try to convert it to an UTF-16 * block of chars out. * * Returns the number of bytes written, or -1 if lack of space, or -2 *     if the transcoding failed.  */static intUTF8ToUTF16(unsigned char* outb, int *outlen,            const unsigned char* in, int *inlen){    if (in == NULL) {	/*	 * initialization, add the Byte Order Mark for UTF-16LE	 */        if (*outlen >= 2) {	    outb[0] = 0xFF;	    outb[1] = 0xFE;	    *outlen = 2;	    *inlen = 0;#ifdef DEBUG_ENCODING            xmlGenericError(xmlGenericErrorContext,		    "Added FFFE Byte Order Mark\n");#endif	    return(2);	}	*outlen = 0;	*inlen = 0;	return(0);    }    return (UTF8ToUTF16LE(outb, outlen, in, inlen));}#endif /* LIBXML_OUTPUT_ENABLED *//** * UTF16BEToUTF8: * @out:  a pointer to an array of bytes to store the result * @outlen:  the length of @out * @inb:  a pointer to an array of UTF-16 passed as a byte array * @inlenb:  the length of @in in UTF-16 chars * * Take a block of UTF-16 ushorts in and try to convert it to an UTF-8 * block of chars out. This function assumes the endian property * is the same between the native type of this machine and the * inputed one. * * Returns the number of bytes written, or -1 if lack of space, or -2 *     if the transcoding fails (if *in is not a valid utf16 string) * The value of *inlen after return is the number of octets consumed *     if the return value is positive, else unpredictable. */static intUTF16BEToUTF8(unsigned char* out, int *outlen,            const unsigned char* inb, int *inlenb){    unsigned char* outstart = out;    const unsigned char* processed = inb;    unsigned char* outend = out + *outlen;    unsigned short* in = (unsigned short*) inb;    unsigned short* inend;    unsigned int c, d, inlen;    unsigned char *tmp;    int bits;    if ((*inlenb % 2) == 1)        (*inlenb)--;    inlen = *inlenb / 2;    inend= in + inlen;    while (in < inend) {	if (xmlLittleEndian) {	    tmp = (unsigned char *) in;	    c = *tmp++;	    c = c << 8;	    c = c | (unsigned int) *tmp;	    in++;	} else {	    c= *in++;	}         if ((c & 0xFC00) == 0xD800) {    /* surrogates */	    if (in >= inend) {           /* (in > inend) shouldn't happens */		*outlen = out - outstart;		*inlenb = processed - inb;	        return(-2);	    }	    if (xmlLittleEndian) {		tmp = (unsigned char *) in;		d = *tmp++;		d = d << 8;		d = d | (unsigned int) *tmp;		in++;	    } else {		d= *in++;	    }            if ((d & 0xFC00) == 0xDC00) {                c &= 0x03FF;                c <<= 10;                c |= d & 0x03FF;                c += 0x10000;            }            else {		*outlen = out - outstart;		*inlenb = processed - inb;	        return(-2);	    }        }	/* assertion: c is a single UTF-4 value */        if (out >= outend) 	    break;        if      (c <    0x80) {  *out++=  c;                bits= -6; }        else if (c <   0x800) {  *out++= ((c >>  6) & 0x1F) | 0xC0;  bits=  0; }        else if (c < 0x10000) {  *out++= ((c >> 12) & 0x0F) | 0xE0;  bits=  6; }        else                  {  *out++= ((c >> 18) & 0x07) | 0xF0;  bits= 12; }         for ( ; bits >= 0; bits-= 6) {            if (out >= outend) 	        break;            *out++= ((c >> bits) & 0x3F) | 0x80;        }	processed = (const unsigned char*) in;    }    *outlen = out - outstart;    *inlenb = processed - inb;    return(0);}#ifdef LIBXML_OUTPUT_ENABLED/** * UTF8ToUTF16BE: * @outb:  a pointer to an array of bytes to store the result * @outlen:  the length of @outb * @in:  a pointer to an array of UTF-8 chars * @inlen:  the length of @in * * Take a block of UTF-8 chars in and try to convert it to an UTF-16BE * block of chars out. * * Returns the number of byte written, or -1 by lack of space, or -2 *     if the transcoding failed.  */static intUTF8ToUTF16BE(unsigned char* outb, int *outlen,            const unsigned char* in, int *inlen){    unsigned short* out = (unsigned short*) outb;    const unsigned char* processed = in;    const unsigned char *const instart = in;    unsigned short* outstart= out;    unsigned short* outend;    const unsigned char* inend= in+*inlen;    unsigned int c, d;    int trailing;    unsigned char *tmp;    unsigned short tmp1, tmp2;    /* UTF-16BE has no BOM */    if (in == NULL) {	*outlen = 0;	*inlen = 0;	return(0);    }    outend = out + (*outlen / 2);    while (in < inend) {      d= *in++;      if      (d < 0x80)  { c= d; trailing= 0; }      else if (d < 0xC0)  {          /* trailing byte in leading position */	  *outlen = out - outstart;	  *inlen = processed - instart;	  return(-2);      } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }      else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }      else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }      else {          /* no chance for this in UTF-16 */	  *outlen = out - outstart;	  *inlen = processed - instart;	  return(-2);      }      if (inend - in < trailing) {          break;      }       for ( ; trailing; trailing--) {          if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))  break;          c <<= 6;          c |= d & 0x3F;      }      /* assertion: c is a single UTF-4 value */        if (c < 0x10000) {            if (out >= outend)  break;	    if (xmlLittleEndian) {		tmp = (unsigned char *) out;		*tmp = c >> 8;		*(tmp + 1) = c;		out++;	    } else {		*out++ = c;	    }        }        else if (c < 0x110000) {            if (out+1 >= outend)  break;            c -= 0x10000;	    if (xmlLittleEndian) {		tmp1 = 0xD800 | (c >> 10);		tmp = (unsigned char *) out;		*tmp = tmp1 >> 8;		*(tmp + 1) = (unsigned char) tmp1;		out++;		tmp2 = 0xDC00 | (c & 0x03FF);		tmp = (unsigned char *) out;		*tmp = tmp2 >> 8;		*(tmp + 1) = (unsigned char) tmp2;		out++;	    } else {		*out++ = 0xD800 | (c >> 10);		*out++ = 0xDC00 | (c & 0x03FF);	    }        }        else	    break;	processed = in;    }    *outlen = (out - outstart) * 2;    *inlen = processed - instart;    return(0);}#endif /* LIBXML_OUTPUT_ENABLED *//************************************************************************ *									* *		Generic encoding handling routines			* *									* ************************************************************************//** * xmlDetectCharEncoding: * @in:  a pointer to the first bytes of the XML entity, must be at least *       2 bytes long (at least 4 if encoding is UTF4 variant). * @len:  pointer to the length of the buffer * * Guess the encoding of the entity using the first bytes of the entity content * according to the non-normative appendix F of the XML-1.0 recommendation. *  * Returns one of the XML_CHAR_ENCODING_... values. */xmlCharEncodingxmlDetectCharEncoding(const unsigned char* in, int len){    if (len >= 4) {	if ((in[0] == 0x00) && (in[1] == 0x00) &&	    (in[2] == 0x00) && (in[3] == 0x3C))	    return(XML_CHAR_ENCODING_UCS4BE);	if ((in[0] == 0x3C) && (in[1] == 0x00) &&	    (in[2] == 0x00) && (in[3] == 0x00))	    return(XML_CHAR_ENCODING_UCS4LE);	if ((in[0] == 0x00) && (in[1] == 0x00) &&	    (in[2] == 0x3C) && (in[3] == 0x00))	    return(XML_CHAR_ENCODING_UCS4_2143);	if ((in[0] == 0x00) && (in[1] == 0x3C) &&	    (in[2] == 0x00) && (in[3] == 0x00))	    return(XML_CHAR_ENCODING_UCS4_3412);	if ((in[0] == 0x4C) && (in[1] == 0x6F) &&	    (in[2] == 0xA7) && (in[3] == 0x94))	    return(XML_CHAR_ENCODING_EBCDIC);	if ((in[0] == 0x3C) && (in[1] == 0x3F) &&	    (in[2] == 0x78) && (in[3] == 0x6D))	    return(XML_CHAR_ENCODING_UTF8);	/*	 * Although not part of the recommendation, we also	 * attempt an "auto-recognition" of UTF-16LE and	 * UTF-16BE encodings.	 */	if ((in[0] == 0x3C) && (in[1] == 0x00) &&	    (in[2] == 0x3F) && (in[3] == 0x00))	    return(XML_CHAR_ENCODING_UTF16LE);	if ((in[0] == 0x00) && (in[1] == 0x3C) &&	    (in[2] == 0x00) && (in[3] == 0x3F))	    return(XML_CHAR_ENCODING_UTF16BE);    }    if (len >= 3) {	/*	 * Errata on XML-1.0 June 20 2001	 * We now allow an UTF8 encoded BOM	 */	if ((in[0] == 0xEF) && (in[1] == 0xBB) &&	    (in[2] == 0xBF))	    return(XML_CHAR_ENCODING_UTF8);    }    /* For UTF-16 we can recognize by the BOM */    if (len >= 2) {	if ((in[0] == 0xFE) && (in[1] == 0xFF))	    return(XML_CHAR_ENCODING_UTF16BE);	if ((in[0] == 0xFF) && (in[1] == 0xFE))	    return(XML_CHAR_ENCODING_UTF16LE);    }    return(XML_CHAR_ENCODING_NONE);}/** * xmlCleanupEncodingAliases: * * Unregisters all aliases */voidxmlCleanupEncodingAliases(void) {
💿 文件大小 1527 K
👤 上传用户 qqpp2q
📂 所属分类其他
🏷️ 相关标签

#xml
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -