📄 parserinternals.c
字号:
{ char buffer[150]; snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n", ctxt->input->cur[0], ctxt->input->cur[1], ctxt->input->cur[2], ctxt->input->cur[3]); __xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR, "Input is not proper UTF-8, indicate encoding !\n%s", BAD_CAST buffer, NULL); } *len = 1; return ((int) *cur);}/** * xmlCopyCharMultiByte: * @out: pointer to an array of xmlChar * @val: the char value * * append the char value in the array * * Returns the number of xmlChar written */intxmlCopyCharMultiByte(xmlChar *out, int val) { if (out == NULL) return(0); /* * We are supposed to handle UTF8, check it's valid * From rfc2044: encoding of the Unicode values on UTF-8: * * UCS-4 range (hex.) UTF-8 octet sequence (binary) * 0000 0000-0000 007F 0xxxxxxx * 0000 0080-0000 07FF 110xxxxx 10xxxxxx * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx */ if (val >= 0x80) { xmlChar *savedout = out; int bits; if (val < 0x800) { *out++= (val >> 6) | 0xC0; bits= 0; } else if (val < 0x10000) { *out++= (val >> 12) | 0xE0; bits= 6;} else if (val < 0x110000) { *out++= (val >> 18) | 0xF0; bits= 12; } else { xmlErrEncodingInt(NULL, XML_ERR_INVALID_CHAR, "Internal error, xmlCopyCharMultiByte 0x%X out of bound\n", val); return(0); } for ( ; bits >= 0; bits-= 6) *out++= ((val >> bits) & 0x3F) | 0x80 ; return (out - savedout); } *out = (xmlChar) val; return 1;}/** * xmlCopyChar: * @len: Ignored, compatibility * @out: pointer to an array of xmlChar * @val: the char value * * append the char value in the array * * Returns the number of xmlChar written */intxmlCopyChar(int len ATTRIBUTE_UNUSED, xmlChar *out, int val) { if (out == NULL) return(0); /* the len parameter is ignored */ if (val >= 0x80) { return(xmlCopyCharMultiByte (out, val)); } *out = (xmlChar) val; return 1;}/************************************************************************ * * * Commodity functions to switch encodings * * * ************************************************************************//** * xmlSwitchEncoding: * @ctxt: the parser context * @enc: the encoding value (number) * * change the input functions when discovering the character encoding * of a given entity. * * Returns 0 in case of success, -1 otherwise */intxmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc){ xmlCharEncodingHandlerPtr handler; if (ctxt == NULL) return(-1); switch (enc) { case XML_CHAR_ENCODING_ERROR: __xmlErrEncoding(ctxt, XML_ERR_UNKNOWN_ENCODING, "encoding unknown\n", NULL, NULL); break; case XML_CHAR_ENCODING_NONE: /* let's assume it's UTF-8 without the XML decl */ ctxt->charset = XML_CHAR_ENCODING_UTF8; return(0); case XML_CHAR_ENCODING_UTF8: /* default encoding, no conversion should be needed */ ctxt->charset = XML_CHAR_ENCODING_UTF8; /* * Errata on XML-1.0 June 20 2001 * Specific handling of the Byte Order Mark for * UTF-8 */ if ((ctxt->input != NULL) && (ctxt->input->cur[0] == 0xEF) && (ctxt->input->cur[1] == 0xBB) && (ctxt->input->cur[2] == 0xBF)) { ctxt->input->cur += 3; } return(0); case XML_CHAR_ENCODING_UTF16LE: case XML_CHAR_ENCODING_UTF16BE: /*The raw input characters are encoded *in UTF-16. As we expect this function *to be called after xmlCharEncInFunc, we expect *ctxt->input->cur to contain UTF-8 encoded characters. *So the raw UTF16 Byte Order Mark *has also been converted into *an UTF-8 BOM. Let's skip that BOM. */ if ((ctxt->input != NULL) && (ctxt->input->cur[0] == 0xEF) && (ctxt->input->cur[1] == 0xBB) && (ctxt->input->cur[2] == 0xBF)) { ctxt->input->cur += 3; } break ; default: break; } handler = xmlGetCharEncodingHandler(enc); if (handler == NULL) { /* * Default handlers. */ switch (enc) { case XML_CHAR_ENCODING_ERROR: __xmlErrEncoding(ctxt, XML_ERR_UNKNOWN_ENCODING, "encoding unknown\n", NULL, NULL); break; case XML_CHAR_ENCODING_NONE: /* let's assume it's UTF-8 without the XML decl */ ctxt->charset = XML_CHAR_ENCODING_UTF8; return(0); case XML_CHAR_ENCODING_UTF8: case XML_CHAR_ENCODING_ASCII: /* default encoding, no conversion should be needed */ ctxt->charset = XML_CHAR_ENCODING_UTF8; return(0); case XML_CHAR_ENCODING_UTF16LE: break; case XML_CHAR_ENCODING_UTF16BE: break; case XML_CHAR_ENCODING_UCS4LE: __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING, "encoding not supported %s\n", BAD_CAST "USC4 little endian", NULL); break; case XML_CHAR_ENCODING_UCS4BE: __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING, "encoding not supported %s\n", BAD_CAST "USC4 big endian", NULL); break; case XML_CHAR_ENCODING_EBCDIC: __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING, "encoding not supported %s\n", BAD_CAST "EBCDIC", NULL); break; case XML_CHAR_ENCODING_UCS4_2143: __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING, "encoding not supported %s\n", BAD_CAST "UCS4 2143", NULL); break; case XML_CHAR_ENCODING_UCS4_3412: __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING, "encoding not supported %s\n", BAD_CAST "UCS4 3412", NULL); break; case XML_CHAR_ENCODING_UCS2: __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING, "encoding not supported %s\n", BAD_CAST "UCS2", NULL); break; case XML_CHAR_ENCODING_8859_1: case XML_CHAR_ENCODING_8859_2: case XML_CHAR_ENCODING_8859_3: case XML_CHAR_ENCODING_8859_4: case XML_CHAR_ENCODING_8859_5: case XML_CHAR_ENCODING_8859_6: case XML_CHAR_ENCODING_8859_7: case XML_CHAR_ENCODING_8859_8: case XML_CHAR_ENCODING_8859_9: /* * We used to keep the internal content in the * document encoding however this turns being unmaintainable * So xmlGetCharEncodingHandler() will return non-null * values for this now. */ if ((ctxt->inputNr == 1) && (ctxt->encoding == NULL) && (ctxt->input->encoding != NULL)) { ctxt->encoding = xmlStrdup(ctxt->input->encoding); } ctxt->charset = enc; return(0); case XML_CHAR_ENCODING_2022_JP: __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING, "encoding not supported %s\n", BAD_CAST "ISO-2022-JP", NULL); break; case XML_CHAR_ENCODING_SHIFT_JIS: __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING, "encoding not supported %s\n", BAD_CAST "Shift_JIS", NULL); break; case XML_CHAR_ENCODING_EUC_JP: __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING, "encoding not supported %s\n", BAD_CAST "EUC-JP", NULL); break; } } if (handler == NULL) return(-1); ctxt->charset = XML_CHAR_ENCODING_UTF8; return(xmlSwitchToEncoding(ctxt, handler));}/** * xmlSwitchInputEncoding: * @ctxt: the parser context * @input: the input stream * @handler: the encoding handler * * change the input functions when discovering the character encoding * of a given entity. * * Returns 0 in case of success, -1 otherwise */intxmlSwitchInputEncoding(xmlParserCtxtPtr ctxt, xmlParserInputPtr input, xmlCharEncodingHandlerPtr handler){ int nbchars; if (handler == NULL) return (-1); if (input == NULL) return (-1); if (input->buf != NULL) { if (input->buf->encoder != NULL) { /* * Check in case the auto encoding detetection triggered * in already. */ if (input->buf->encoder == handler) return (0); /* * "UTF-16" can be used for both LE and BE if ((!xmlStrncmp(BAD_CAST input->buf->encoder->name, BAD_CAST "UTF-16", 6)) && (!xmlStrncmp(BAD_CAST handler->name, BAD_CAST "UTF-16", 6))) { return(0); } */ /* * Note: this is a bit dangerous, but that's what it * takes to use nearly compatible signature for different * encodings. */ xmlCharEncCloseFunc(input->buf->encoder); input->buf->encoder = handler; return (0); } input->buf->encoder = handler; /* * Is there already some content down the pipe to convert ? */ if ((input->buf->buffer != NULL) && (input->buf->buffer->use > 0)) { int processed; unsigned int use; /* * Specific handling of the Byte Order Mark for * UTF-16 */ if ((handler->name != NULL) && (!strcmp(handler->name, "UTF-16LE") || !strcmp(handler->name, "UTF-16")) && (input->cur[0] == 0xFF) && (input->cur[1] == 0xFE)) { input->cur += 2; } if ((handler->name != NULL) && (!strcmp(handler->name, "UTF-16BE")) && (input->cur[0] == 0xFE) && (input->cur[1] == 0xFF)) { input->cur += 2; } /* * Errata on XML-1.0 June 20 2001 * Specific handling of the Byte Order Mark for * UTF-8 */ if ((handler->name != NULL) && (!strcmp(handler->name, "UTF-8")) && (input->cur[0] == 0xEF) && (input->cur[1] == 0xBB) && (input->cur[2] == 0xBF)) { input->cur += 3; } /* * Shrink the current input buffer. * Move it as the raw buffer and create a new input buffer */ processed = input->cur - input->base; xmlBufferShrink(input->buf->buffer, processed); input->buf->raw = input->buf->buffer; input->buf->buffer = xmlBufferCreate(); input->buf->rawconsumed = processed; use = input->buf->raw->use; if (ctxt->html) { /* * convert as much as possible of the buffer */ nbchars = xmlCharEncInFunc(input->buf->encoder, input->buf->buffer, input->buf->raw); } else { /* * convert just enough to get * '<?xml version="1.0" encoding="xxx"?>' * parsed with the autodetected encoding * into the parser reading buffer. */ nbchars = xmlCharEncFirstLine(input->buf->encoder, input->buf->buffer, input->buf->raw); } if (nbchars < 0) { xmlErrInternal(ctxt, "switching encoding: encoder error\n", NULL); return (-1); } input->buf->rawconsumed += use - input->buf->raw->use; input->base = input->cur = input->buf->buffer->content; input->end = &input->base[input->buf->buffer->use]; } return (0); } else { if ((input->length == 0) || (input->buf == NULL)) { /* * When parsing a static memory array one must know the * size to be able to convert the buffer. */ xmlErrInternal(ctxt, "switching encoding : no input\n", NULL); return (-1); } else { int processed; /* * Shrink the current input buffer. * Move it as the raw buffer and create a new input buffer */ processed = input->cur - input->base; input->buf->raw = xmlBufferCreate(); xmlBufferAdd(input->buf->raw, input->cur, input->length - processed); input->buf->buffer = xmlBufferCreate(); /* * convert as much as possible of the raw input * to the parser reading buffer. */ nbchars = xmlCharEncInFunc(input->buf->encoder, input->buf->buffer, input->buf->raw); if (nbchars < 0) { xmlErrInternal(ctxt, "switching encoding: encoder error\n", NULL); return (-1); } /* * Conversion succeeded, get rid of the old buffer */ if ((input->free != NULL) && (input->base != NULL)) input->free((xmlChar *) input->base); input->base = input->cur = input->buf->buffer->content; input->end = &input->base[input->buf->buffer->use]; } } return (0);}/** * xmlSwitchToEncoding: * @ctxt: the parser context * @handler: the encoding handler
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -