📄 parserinternals.c.svn-base

📁 这是一个用于解析xml文件的类库。使用这个类库
💻 SVN-BASE
📖 第 1 页 / 共 4 页
字号:
                        /* 4-byte code */                        ctxt->input->cur += 4;                        val = (cur[0] & 0x7) << 18;                        val |= (cur[1] & 0x3f) << 12;                        val |= (cur[2] & 0x3f) << 6;                        val |= cur[3] & 0x3f;                    } else {                        /* 3-byte code */                        ctxt->input->cur += 3;                        val = (cur[0] & 0xf) << 12;                        val |= (cur[1] & 0x3f) << 6;                        val |= cur[2] & 0x3f;                    }                    if (((val > 0xd7ff) && (val < 0xe000)) ||                        ((val > 0xfffd) && (val < 0x10000)) ||                        (val >= 0x110000)) {			xmlErrEncodingInt(ctxt, XML_ERR_INVALID_CHAR,					  "Char 0x%X out of allowed range\n",					  val);                    }                } else                    /* 2-byte code */                    ctxt->input->cur += 2;            } else                /* 1-byte code */                ctxt->input->cur++;            ctxt->nbChars++;            if (*ctxt->input->cur == 0)                xmlParserInputGrow(ctxt->input, INPUT_CHUNK);        }    } else {        /*         * Assume it's a fixed length encoding (1) with         * a compatible encoding for the ASCII set, since         * XML constructs only use < 128 chars         */        if (*(ctxt->input->cur) == '\n') {            ctxt->input->line++;            ctxt->input->col = 1;        } else            ctxt->input->col++;        ctxt->input->cur++;        ctxt->nbChars++;        if (*ctxt->input->cur == 0)            xmlParserInputGrow(ctxt->input, INPUT_CHUNK);    }    if ((*ctxt->input->cur == '%') && (!ctxt->html))        xmlParserHandlePEReference(ctxt);    if ((*ctxt->input->cur == 0) &&        (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0))        xmlPopInput(ctxt);    return;encoding_error:    /*     * If we detect an UTF8 error that probably mean that the     * input encoding didn't get properly advertised in the     * declaration header. Report the error and switch the encoding     * to ISO-Latin-1 (if you don't like this policy, just declare the     * encoding !)     */    __xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR,		   "Input is not proper UTF-8, indicate encoding !\n",		   NULL, NULL);    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {        ctxt->sax->error(ctxt->userData,                         "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",                         ctxt->input->cur[0], ctxt->input->cur[1],                         ctxt->input->cur[2], ctxt->input->cur[3]);    }    ctxt->charset = XML_CHAR_ENCODING_8859_1;    ctxt->input->cur++;    return;}/** * xmlCurrentChar: * @ctxt:  the XML parser context * @len:  pointer to the length of the char read * * The current char value, if using UTF-8 this may actually span multiple * bytes in the input buffer. Implement the end of line normalization: * 2.11 End-of-Line Handling * Wherever an external parsed entity or the literal entity value * of an internal parsed entity contains either the literal two-character * sequence "#xD#xA" or a standalone literal #xD, an XML processor * must pass to the application the single character #xA. * This behavior can conveniently be produced by normalizing all * line breaks to #xA on input, before parsing.) * * Returns the current char value and its length */intxmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {    if (ctxt->instate == XML_PARSER_EOF)	return(0);    if ((*ctxt->input->cur >= 0x20) && (*ctxt->input->cur <= 0x7F)) {	    *len = 1;	    return((int) *ctxt->input->cur);    }    if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {	/*	 * We are supposed to handle UTF8, check it's valid	 * From rfc2044: encoding of the Unicode values on UTF-8:	 *	 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)	 * 0000 0000-0000 007F   0xxxxxxx	 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx	 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx 	 *	 * Check for the 0x110000 limit too	 */	const unsigned char *cur = ctxt->input->cur;	unsigned char c;	unsigned int val;	c = *cur;	if (c & 0x80) {	    if (c == 0xC0)		goto encoding_error;	    if (cur[1] == 0)		xmlParserInputGrow(ctxt->input, INPUT_CHUNK);	    if ((cur[1] & 0xc0) != 0x80)		goto encoding_error;	    if ((c & 0xe0) == 0xe0) {		if (cur[2] == 0)		    xmlParserInputGrow(ctxt->input, INPUT_CHUNK);		if ((cur[2] & 0xc0) != 0x80)		    goto encoding_error;		if ((c & 0xf0) == 0xf0) {		    if (cur[3] == 0)			xmlParserInputGrow(ctxt->input, INPUT_CHUNK);		    if (((c & 0xf8) != 0xf0) ||			((cur[3] & 0xc0) != 0x80))			goto encoding_error;		    /* 4-byte code */		    *len = 4;		    val = (cur[0] & 0x7) << 18;		    val |= (cur[1] & 0x3f) << 12;		    val |= (cur[2] & 0x3f) << 6;		    val |= cur[3] & 0x3f;		} else {		  /* 3-byte code */		    *len = 3;		    val = (cur[0] & 0xf) << 12;		    val |= (cur[1] & 0x3f) << 6;		    val |= cur[2] & 0x3f;		}	    } else {	      /* 2-byte code */		*len = 2;		val = (cur[0] & 0x1f) << 6;		val |= cur[1] & 0x3f;	    }	    if (!IS_CHAR(val)) {	        xmlErrEncodingInt(ctxt, XML_ERR_INVALID_CHAR,				  "Char 0x%X out of allowed range\n", val);	    }    	    return(val);	} else {	    /* 1-byte code */	    *len = 1;	    if (*ctxt->input->cur == 0xD) {		if (ctxt->input->cur[1] == 0xA) {		    ctxt->nbChars++;		    ctxt->input->cur++;		}		return(0xA);	    }	    return((int) *ctxt->input->cur);	}    }    /*     * Assume it's a fixed length encoding (1) with     * a compatible encoding for the ASCII set, since     * XML constructs only use < 128 chars     */    *len = 1;    if (*ctxt->input->cur == 0xD) {	if (ctxt->input->cur[1] == 0xA) {	    ctxt->nbChars++;	    ctxt->input->cur++;	}	return(0xA);    }    return((int) *ctxt->input->cur);encoding_error:    /*     * An encoding problem may arise from a truncated input buffer     * splitting a character in the middle. In that case do not raise     * an error but return 0 to endicate an end of stream problem     */    if (ctxt->input->end - ctxt->input->cur < 4) {	*len = 0;	return(0);    }    /*     * If we detect an UTF8 error that probably mean that the     * input encoding didn't get properly advertised in the     * declaration header. Report the error and switch the encoding     * to ISO-Latin-1 (if you don't like this policy, just declare the     * encoding !)     */    __xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR,		   "Input is not proper UTF-8, indicate encoding !\n",		   NULL, NULL);    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {	ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",			ctxt->input->cur[0], ctxt->input->cur[1],			ctxt->input->cur[2], ctxt->input->cur[3]);    }    ctxt->charset = XML_CHAR_ENCODING_8859_1;     *len = 1;    return((int) *ctxt->input->cur);}/** * xmlStringCurrentChar: * @ctxt:  the XML parser context * @cur:  pointer to the beginning of the char * @len:  pointer to the length of the char read * * The current char value, if using UTF-8 this may actually span multiple * bytes in the input buffer. * * Returns the current char value and its length */intxmlStringCurrentChar(xmlParserCtxtPtr ctxt, const xmlChar * cur, int *len){    if ((ctxt == NULL) || (ctxt->charset == XML_CHAR_ENCODING_UTF8)) {        /*         * We are supposed to handle UTF8, check it's valid         * From rfc2044: encoding of the Unicode values on UTF-8:         *         * UCS-4 range (hex.)           UTF-8 octet sequence (binary)         * 0000 0000-0000 007F   0xxxxxxx         * 0000 0080-0000 07FF   110xxxxx 10xxxxxx         * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx          *         * Check for the 0x110000 limit too         */        unsigned char c;        unsigned int val;        c = *cur;        if (c & 0x80) {            if ((cur[1] & 0xc0) != 0x80)                goto encoding_error;            if ((c & 0xe0) == 0xe0) {                if ((cur[2] & 0xc0) != 0x80)                    goto encoding_error;                if ((c & 0xf0) == 0xf0) {                    if (((c & 0xf8) != 0xf0) || ((cur[3] & 0xc0) != 0x80))                        goto encoding_error;                    /* 4-byte code */                    *len = 4;                    val = (cur[0] & 0x7) << 18;                    val |= (cur[1] & 0x3f) << 12;                    val |= (cur[2] & 0x3f) << 6;                    val |= cur[3] & 0x3f;                } else {                    /* 3-byte code */                    *len = 3;                    val = (cur[0] & 0xf) << 12;                    val |= (cur[1] & 0x3f) << 6;                    val |= cur[2] & 0x3f;                }            } else {                /* 2-byte code */                *len = 2;                val = (cur[0] & 0x1f) << 6;                val |= cur[1] & 0x3f;            }            if (!IS_CHAR(val)) {	        xmlErrEncodingInt(ctxt, XML_ERR_INVALID_CHAR,				  "Char 0x%X out of allowed range\n", val);            }            return (val);        } else {            /* 1-byte code */            *len = 1;            return ((int) *cur);        }    }    /*     * Assume it's a fixed length encoding (1) with     * a compatible encoding for the ASCII set, since     * XML constructs only use < 128 chars     */    *len = 1;    return ((int) *cur);encoding_error:    /*     * If we detect an UTF8 error that probably mean that the     * input encoding didn't get properly advertised in the     * declaration header. Report the error and switch the encoding     * to ISO-Latin-1 (if you don't like this policy, just declare the     * encoding !)     */    __xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR,		   "Input is not proper UTF-8, indicate encoding !\n",		   NULL, NULL);    if ((ctxt != NULL) && (ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {	ctxt->sax->error(ctxt->userData,			 "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",			 ctxt->input->cur[0], ctxt->input->cur[1],			 ctxt->input->cur[2], ctxt->input->cur[3]);    }    *len = 1;    return ((int) *cur);}/** * xmlCopyCharMultiByte: * @out:  pointer to an array of xmlChar * @val:  the char value * * append the char value in the array  * * Returns the number of xmlChar written */intxmlCopyCharMultiByte(xmlChar *out, int val) {    /*     * We are supposed to handle UTF8, check it's valid     * From rfc2044: encoding of the Unicode values on UTF-8:     *     * UCS-4 range (hex.)           UTF-8 octet sequence (binary)     * 0000 0000-0000 007F   0xxxxxxx     * 0000 0080-0000 07FF   110xxxxx 10xxxxxx     * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx      */    if  (val >= 0x80) {	xmlChar *savedout = out;	int bits;	if (val <   0x800) { *out++= (val >>  6) | 0xC0;  bits=  0; }	else if (val < 0x10000) { *out++= (val >> 12) | 0xE0;  bits=  6;}	else if (val < 0x110000)  { *out++= (val >> 18) | 0xF0;  bits=  12; }	else {	    xmlErrEncodingInt(NULL, XML_ERR_INVALID_CHAR,		    "Internal error, xmlCopyCharMultiByte 0x%X out of bound\n",			      val);	    return(0);	}	for ( ; bits >= 0; bits-= 6)	    *out++= ((val >> bits) & 0x3F) | 0x80 ;	return (out - savedout);    }    *out = (xmlChar) val;    return 1;}/** * xmlCopyChar: * @len:  Ignored, compatibility * @out:  pointer to an array of xmlChar * @val:  the char value * * append the char value in the array  * * Returns the number of xmlChar written */intxmlCopyChar(int len ATTRIBUTE_UNUSED, xmlChar *out, int val) {    /* the len parameter is ignored */    if  (val >= 0x80) {	return(xmlCopyCharMultiByte (out, val));    }    *out = (xmlChar) val;    return 1;}/************************************************************************ *									* *		Commodity functions to switch encodings			* *									* ************************************************************************//** * xmlSwitchEncoding: * @ctxt:  the parser context * @enc:  the encoding value (number) * * change the input functions when discovering the character encoding * of a given entity. * * Returns 0 in case of success, -1 otherwise */intxmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc){    xmlCharEncodingHandlerPtr handler;    switch (enc) {	case XML_CHAR_ENCODING_ERROR:	    __xmlErrEncoding(ctxt, XML_ERR_UNKNOWN_ENCODING,	                   "encoding unknown\n", NULL, NULL);	    break;	case XML_CHAR_ENCODING_NONE:	    /* let's assume it's UTF-8 without the XML decl */	    ctxt->charset = XML_CHAR_ENCODING_UTF8;	    return(0);	case XML_CHAR_ENCODING_UTF8:	    /* default encoding, no conversion should be needed */	    ctxt->charset = XML_CHAR_ENCODING_UTF8;	    /*	     * Errata on XML-1.0 June 20 2001	     * Specific handling of the Byte Order Mark for	     * UTF-8	     */	    if ((ctxt->input != NULL) &&		(ctxt->input->cur[0] == 0xEF) &&		(ctxt->input->cur[1] == 0xBB) &&		(ctxt->input->cur[2] == 0xBF)) {		ctxt->input->cur += 3;	    }	    return(0);    case XML_CHAR_ENCODING_UTF16LE:    case XML_CHAR_ENCODING_UTF16BE:        /*The raw input characters are encoded         *in UTF-16. As we expect this function         *to be called after xmlCharEncInFunc, we expect         *ctxt->input->cur to contain UTF-8 encoded characters.         *So the raw UTF16 Byte Order Mark         *has also been converted into         *an UTF-8 BOM. Let's skip that BOM.         */        if ((ctxt->input != NULL) &&            (ctxt->input->cur[0] == 0xEF) &&            (ctxt->input->cur[1] == 0xBB) &&            (ctxt->input->cur[2] == 0xBF)) {            ctxt->input->cur += 3;        }	break ;	default:	    break;    }    handler = xmlGetCharEncodingHandler(enc);    if (handler == NULL) {	/*	 * Default handlers.	 */	switch (enc) {	    case XML_CHAR_ENCODING_ERROR:		__xmlErrEncoding(ctxt, XML_ERR_UNKNOWN_ENCODING,			       "encoding unknown\n", NULL, NULL);		break;	    case XML_CHAR_ENCODING_NONE:		/* let's assume it's UTF-8 without the XML decl */		ctxt->charset = XML_CHAR_ENCODING_UTF8;		return(0);	    case XML_CHAR_ENCODING_UTF8:	    case XML_CHAR_ENCODING_ASCII:		/* default encoding, no conversion should be needed */		ctxt->charset = XML_CHAR_ENCODING_UTF8;		return(0);	    case XML_CHAR_ENCODING_UTF16LE:		break;	    case XML_CHAR_ENCODING_UTF16BE:		break;	    case XML_CHAR_ENCODING_UCS4LE:		__xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,			       "encoding not supported %s\n",			       BAD_CAST "USC4 little endian", NULL);		break;	    case XML_CHAR_ENCODING_UCS4BE:		__xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,			       "encoding not supported %s\n",			       BAD_CAST "USC4 big endian", NULL);		break;	    case XML_CHAR_ENCODING_EBCDIC:		__xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,			       "encoding not supported %s\n",			       BAD_CAST "EBCDIC", NULL);		break;	    case XML_CHAR_ENCODING_UCS4_2143:		__xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,			       "encoding not supported %s\n",			       BAD_CAST "UCS4 2143", NULL);		break;	    case XML_CHAR_ENCODING_UCS4_3412:		__xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,			       "encoding not supported %s\n",			       BAD_CAST "UCS4 3412", NULL);		break;	    case XML_CHAR_ENCODING_UCS2:		__xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,			       "encoding not supported %s\n",			       BAD_CAST "UCS2", NULL);		break;	    case XML_CHAR_ENCODING_8859_1:	    case XML_CHAR_ENCODING_8859_2:	    case XML_CHAR_ENCODING_8859_3:	    case XML_CHAR_ENCODING_8859_4:	    case XML_CHAR_ENCODING_8859_5:	    case XML_CHAR_ENCODING_8859_6:	    case XML_CHAR_ENCODING_8859_7:	    case XML_CHAR_ENCODING_8859_8:	    case XML_CHAR_ENCODING_8859_9:
💿 文件大小 1527 K
👤 上传用户 qqpp2q
📂 所属分类其他
🏷️ 相关标签

#xml
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -