📄 parserinternals.c

📁 libxml,在UNIX/LINUX下非常重要的一个库,为XML相关应用提供方便.目前上载的是最新版本,若要取得最新版本,请参考里面的readme.
💻 C
📖 第 1 页 / 共 5 页
字号:
    if (in->base != in->buf->buffer->content) {        /*	 * the buffer has been reallocated	 */	indx = in->cur - in->base;	in->base = in->buf->buffer->content;	in->cur = &in->buf->buffer->content[indx];    }    in->end = &in->buf->buffer->content[in->buf->buffer->use];    CHECK_BUFFER(in);}/************************************************************************ *									* * 		UTF8 character input and related functions		* *									* ************************************************************************//** * xmlNextChar: * @ctxt:  the XML parser context * * Skip to the next char input char. */voidxmlNextChar(xmlParserCtxtPtr ctxt){    if ((ctxt == NULL) || (ctxt->instate == XML_PARSER_EOF) ||        (ctxt->input == NULL))        return;    if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {        if ((*ctxt->input->cur == 0) &&            (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0) &&            (ctxt->instate != XML_PARSER_COMMENT)) {            /*             * If we are at the end of the current entity and             * the context allows it, we pop consumed entities             * automatically.             * the auto closing should be blocked in other cases             */            xmlPopInput(ctxt);        } else {            const unsigned char *cur;            unsigned char c;            /*             *   2.11 End-of-Line Handling             *   the literal two-character sequence "#xD#xA" or a standalone             *   literal #xD, an XML processor must pass to the application             *   the single character #xA.             */            if (*(ctxt->input->cur) == '\n') {                ctxt->input->line++; ctxt->input->col = 1;            } else                ctxt->input->col++;            /*             * We are supposed to handle UTF8, check it's valid             * From rfc2044: encoding of the Unicode values on UTF-8:             *             * UCS-4 range (hex.)           UTF-8 octet sequence (binary)             * 0000 0000-0000 007F   0xxxxxxx             * 0000 0080-0000 07FF   110xxxxx 10xxxxxx             * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx              *             * Check for the 0x110000 limit too             */            cur = ctxt->input->cur;            c = *cur;            if (c & 0x80) {	        if (c == 0xC0)		    goto encoding_error;                if (cur[1] == 0)                    xmlParserInputGrow(ctxt->input, INPUT_CHUNK);                if ((cur[1] & 0xc0) != 0x80)                    goto encoding_error;                if ((c & 0xe0) == 0xe0) {                    unsigned int val;                    if (cur[2] == 0)                        xmlParserInputGrow(ctxt->input, INPUT_CHUNK);                    if ((cur[2] & 0xc0) != 0x80)                        goto encoding_error;                    if ((c & 0xf0) == 0xf0) {                        if (cur[3] == 0)                            xmlParserInputGrow(ctxt->input, INPUT_CHUNK);                        if (((c & 0xf8) != 0xf0) ||                            ((cur[3] & 0xc0) != 0x80))                            goto encoding_error;                        /* 4-byte code */                        ctxt->input->cur += 4;                        val = (cur[0] & 0x7) << 18;                        val |= (cur[1] & 0x3f) << 12;                        val |= (cur[2] & 0x3f) << 6;                        val |= cur[3] & 0x3f;                    } else {                        /* 3-byte code */                        ctxt->input->cur += 3;                        val = (cur[0] & 0xf) << 12;                        val |= (cur[1] & 0x3f) << 6;                        val |= cur[2] & 0x3f;                    }                    if (((val > 0xd7ff) && (val < 0xe000)) ||                        ((val > 0xfffd) && (val < 0x10000)) ||                        (val >= 0x110000)) {			xmlErrEncodingInt(ctxt, XML_ERR_INVALID_CHAR,					  "Char 0x%X out of allowed range\n",					  val);                    }                } else                    /* 2-byte code */                    ctxt->input->cur += 2;            } else                /* 1-byte code */                ctxt->input->cur++;            ctxt->nbChars++;            if (*ctxt->input->cur == 0)                xmlParserInputGrow(ctxt->input, INPUT_CHUNK);        }    } else {        /*         * Assume it's a fixed length encoding (1) with         * a compatible encoding for the ASCII set, since         * XML constructs only use < 128 chars         */        if (*(ctxt->input->cur) == '\n') {            ctxt->input->line++; ctxt->input->col = 1;        } else            ctxt->input->col++;        ctxt->input->cur++;        ctxt->nbChars++;        if (*ctxt->input->cur == 0)            xmlParserInputGrow(ctxt->input, INPUT_CHUNK);    }    if ((*ctxt->input->cur == '%') && (!ctxt->html))        xmlParserHandlePEReference(ctxt);    if ((*ctxt->input->cur == 0) &&        (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0))        xmlPopInput(ctxt);    return;encoding_error:    /*     * If we detect an UTF8 error that probably mean that the     * input encoding didn't get properly advertised in the     * declaration header. Report the error and switch the encoding     * to ISO-Latin-1 (if you don't like this policy, just declare the     * encoding !)     */    if ((ctxt == NULL) || (ctxt->input == NULL) ||        (ctxt->input->end - ctxt->input->cur < 4)) {	__xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR,		     "Input is not proper UTF-8, indicate encoding !\n",		     NULL, NULL);    } else {        char buffer[150];	snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",			ctxt->input->cur[0], ctxt->input->cur[1],			ctxt->input->cur[2], ctxt->input->cur[3]);	__xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR,		     "Input is not proper UTF-8, indicate encoding !\n%s",		     BAD_CAST buffer, NULL);    }    ctxt->charset = XML_CHAR_ENCODING_8859_1;    ctxt->input->cur++;    return;}/** * xmlCurrentChar: * @ctxt:  the XML parser context * @len:  pointer to the length of the char read * * The current char value, if using UTF-8 this may actually span multiple * bytes in the input buffer. Implement the end of line normalization: * 2.11 End-of-Line Handling * Wherever an external parsed entity or the literal entity value * of an internal parsed entity contains either the literal two-character * sequence "#xD#xA" or a standalone literal #xD, an XML processor * must pass to the application the single character #xA. * This behavior can conveniently be produced by normalizing all * line breaks to #xA on input, before parsing.) * * Returns the current char value and its length */intxmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {    if ((ctxt == NULL) || (len == NULL) || (ctxt->input == NULL)) return(0);    if (ctxt->instate == XML_PARSER_EOF)	return(0);    if ((*ctxt->input->cur >= 0x20) && (*ctxt->input->cur <= 0x7F)) {	    *len = 1;	    return((int) *ctxt->input->cur);    }    if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {	/*	 * We are supposed to handle UTF8, check it's valid	 * From rfc2044: encoding of the Unicode values on UTF-8:	 *	 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)	 * 0000 0000-0000 007F   0xxxxxxx	 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx	 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx 	 *	 * Check for the 0x110000 limit too	 */	const unsigned char *cur = ctxt->input->cur;	unsigned char c;	unsigned int val;	c = *cur;	if (c & 0x80) {	    if (c == 0xC0)		goto encoding_error;	    if (cur[1] == 0)		xmlParserInputGrow(ctxt->input, INPUT_CHUNK);	    if ((cur[1] & 0xc0) != 0x80)		goto encoding_error;	    if ((c & 0xe0) == 0xe0) {		if (cur[2] == 0)		    xmlParserInputGrow(ctxt->input, INPUT_CHUNK);		if ((cur[2] & 0xc0) != 0x80)		    goto encoding_error;		if ((c & 0xf0) == 0xf0) {		    if (cur[3] == 0)			xmlParserInputGrow(ctxt->input, INPUT_CHUNK);		    if (((c & 0xf8) != 0xf0) ||			((cur[3] & 0xc0) != 0x80))			goto encoding_error;		    /* 4-byte code */		    *len = 4;		    val = (cur[0] & 0x7) << 18;		    val |= (cur[1] & 0x3f) << 12;		    val |= (cur[2] & 0x3f) << 6;		    val |= cur[3] & 0x3f;		} else {		  /* 3-byte code */		    *len = 3;		    val = (cur[0] & 0xf) << 12;		    val |= (cur[1] & 0x3f) << 6;		    val |= cur[2] & 0x3f;		}	    } else {	      /* 2-byte code */		*len = 2;		val = (cur[0] & 0x1f) << 6;		val |= cur[1] & 0x3f;	    }	    if (!IS_CHAR(val)) {	        xmlErrEncodingInt(ctxt, XML_ERR_INVALID_CHAR,				  "Char 0x%X out of allowed range\n", val);	    }    	    return(val);	} else {	    /* 1-byte code */	    *len = 1;	    if (*ctxt->input->cur == 0xD) {		if (ctxt->input->cur[1] == 0xA) {		    ctxt->nbChars++;		    ctxt->input->cur++;		}		return(0xA);	    }	    return((int) *ctxt->input->cur);	}    }    /*     * Assume it's a fixed length encoding (1) with     * a compatible encoding for the ASCII set, since     * XML constructs only use < 128 chars     */    *len = 1;    if (*ctxt->input->cur == 0xD) {	if (ctxt->input->cur[1] == 0xA) {	    ctxt->nbChars++;	    ctxt->input->cur++;	}	return(0xA);    }    return((int) *ctxt->input->cur);encoding_error:    /*     * An encoding problem may arise from a truncated input buffer     * splitting a character in the middle. In that case do not raise     * an error but return 0 to endicate an end of stream problem     */    if (ctxt->input->end - ctxt->input->cur < 4) {	*len = 0;	return(0);    }    /*     * If we detect an UTF8 error that probably mean that the     * input encoding didn't get properly advertised in the     * declaration header. Report the error and switch the encoding     * to ISO-Latin-1 (if you don't like this policy, just declare the     * encoding !)     */    {        char buffer[150];	snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",			ctxt->input->cur[0], ctxt->input->cur[1],			ctxt->input->cur[2], ctxt->input->cur[3]);	__xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR,		     "Input is not proper UTF-8, indicate encoding !\n%s",		     BAD_CAST buffer, NULL);    }    ctxt->charset = XML_CHAR_ENCODING_8859_1;     *len = 1;    return((int) *ctxt->input->cur);}/** * xmlStringCurrentChar: * @ctxt:  the XML parser context * @cur:  pointer to the beginning of the char * @len:  pointer to the length of the char read * * The current char value, if using UTF-8 this may actually span multiple * bytes in the input buffer. * * Returns the current char value and its length */intxmlStringCurrentChar(xmlParserCtxtPtr ctxt, const xmlChar * cur, int *len){    if ((len == NULL) || (cur == NULL)) return(0);    if ((ctxt == NULL) || (ctxt->charset == XML_CHAR_ENCODING_UTF8)) {        /*         * We are supposed to handle UTF8, check it's valid         * From rfc2044: encoding of the Unicode values on UTF-8:         *         * UCS-4 range (hex.)           UTF-8 octet sequence (binary)         * 0000 0000-0000 007F   0xxxxxxx         * 0000 0080-0000 07FF   110xxxxx 10xxxxxx         * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx          *         * Check for the 0x110000 limit too         */        unsigned char c;        unsigned int val;        c = *cur;        if (c & 0x80) {            if ((cur[1] & 0xc0) != 0x80)                goto encoding_error;            if ((c & 0xe0) == 0xe0) {                if ((cur[2] & 0xc0) != 0x80)                    goto encoding_error;                if ((c & 0xf0) == 0xf0) {                    if (((c & 0xf8) != 0xf0) || ((cur[3] & 0xc0) != 0x80))                        goto encoding_error;                    /* 4-byte code */                    *len = 4;                    val = (cur[0] & 0x7) << 18;                    val |= (cur[1] & 0x3f) << 12;                    val |= (cur[2] & 0x3f) << 6;                    val |= cur[3] & 0x3f;                } else {                    /* 3-byte code */                    *len = 3;                    val = (cur[0] & 0xf) << 12;                    val |= (cur[1] & 0x3f) << 6;                    val |= cur[2] & 0x3f;                }            } else {                /* 2-byte code */                *len = 2;                val = (cur[0] & 0x1f) << 6;                val |= cur[1] & 0x3f;            }            if (!IS_CHAR(val)) {	        xmlErrEncodingInt(ctxt, XML_ERR_INVALID_CHAR,				  "Char 0x%X out of allowed range\n", val);            }            return (val);        } else {            /* 1-byte code */            *len = 1;            return ((int) *cur);        }    }    /*     * Assume it's a fixed length encoding (1) with     * a compatible encoding for the ASCII set, since     * XML constructs only use < 128 chars     */    *len = 1;    return ((int) *cur);encoding_error:    /*     * An encoding problem may arise from a truncated input buffer     * splitting a character in the middle. In that case do not raise     * an error but return 0 to endicate an end of stream problem     */    if ((ctxt == NULL) || (ctxt->input == NULL) ||        (ctxt->input->end - ctxt->input->cur < 4)) {	*len = 0;	return(0);    }    /*     * If we detect an UTF8 error that probably mean that the     * input encoding didn't get properly advertised in the     * declaration header. Report the error and switch the encoding     * to ISO-Latin-1 (if you don't like this policy, just declare the     * encoding !)     */
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -