📄 parserinternals.c
字号:
{
char buffer[150];
snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
ctxt->input->cur[0], ctxt->input->cur[1],
ctxt->input->cur[2], ctxt->input->cur[3]);
__xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR,
"Input is not proper UTF-8, indicate encoding !\n%s",
BAD_CAST buffer, NULL);
}
*len = 1;
return ((int) *cur);
}
/**
* xmlCopyCharMultiByte:
* @out: pointer to an array of xmlChar
* @val: the char value
*
* append the char value in the array
*
* Returns the number of xmlChar written
*/
int
xmlCopyCharMultiByte(xmlChar *out, int val) {
if (out == NULL) return(0);
/*
* We are supposed to handle UTF8, check it's valid
* From rfc2044: encoding of the Unicode values on UTF-8:
*
* UCS-4 range (hex.) UTF-8 octet sequence (binary)
* 0000 0000-0000 007F 0xxxxxxx
* 0000 0080-0000 07FF 110xxxxx 10xxxxxx
* 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
*/
if (val >= 0x80) {
xmlChar *savedout = out;
int bits;
if (val < 0x800) { *out++= (val >> 6) | 0xC0; bits= 0; }
else if (val < 0x10000) { *out++= (val >> 12) | 0xE0; bits= 6;}
else if (val < 0x110000) { *out++= (val >> 18) | 0xF0; bits= 12; }
else {
xmlErrEncodingInt(NULL, XML_ERR_INVALID_CHAR,
"Internal error, xmlCopyCharMultiByte 0x%X out of bound\n",
val);
return(0);
}
for ( ; bits >= 0; bits-= 6)
*out++= ((val >> bits) & 0x3F) | 0x80 ;
return (out - savedout);
}
*out = (xmlChar) val;
return 1;
}
/**
* xmlCopyChar:
* @len: Ignored, compatibility
* @out: pointer to an array of xmlChar
* @val: the char value
*
* append the char value in the array
*
* Returns the number of xmlChar written
*/
int
xmlCopyChar(int len ATTRIBUTE_UNUSED, xmlChar *out, int val) {
if (out == NULL) return(0);
/* the len parameter is ignored */
if (val >= 0x80) {
return(xmlCopyCharMultiByte (out, val));
}
*out = (xmlChar) val;
return 1;
}
/************************************************************************
* *
* Commodity functions to switch encodings *
* *
************************************************************************/
/**
* xmlSwitchEncoding:
* @ctxt: the parser context
* @enc: the encoding value (number)
*
* change the input functions when discovering the character encoding
* of a given entity.
*
* Returns 0 in case of success, -1 otherwise
*/
int
xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc)
{
xmlCharEncodingHandlerPtr handler;
if (ctxt == NULL) return(-1);
switch (enc) {
case XML_CHAR_ENCODING_ERROR:
__xmlErrEncoding(ctxt, XML_ERR_UNKNOWN_ENCODING,
"encoding unknown\n", NULL, NULL);
break;
case XML_CHAR_ENCODING_NONE:
/* let's assume it's UTF-8 without the XML decl */
ctxt->charset = XML_CHAR_ENCODING_UTF8;
return(0);
case XML_CHAR_ENCODING_UTF8:
/* default encoding, no conversion should be needed */
ctxt->charset = XML_CHAR_ENCODING_UTF8;
/*
* Errata on XML-1.0 June 20 2001
* Specific handling of the Byte Order Mark for
* UTF-8
*/
if ((ctxt->input != NULL) &&
(ctxt->input->cur[0] == 0xEF) &&
(ctxt->input->cur[1] == 0xBB) &&
(ctxt->input->cur[2] == 0xBF)) {
ctxt->input->cur += 3;
}
return(0);
case XML_CHAR_ENCODING_UTF16LE:
case XML_CHAR_ENCODING_UTF16BE:
/*The raw input characters are encoded
*in UTF-16. As we expect this function
*to be called after xmlCharEncInFunc, we expect
*ctxt->input->cur to contain UTF-8 encoded characters.
*So the raw UTF16 Byte Order Mark
*has also been converted into
*an UTF-8 BOM. Let's skip that BOM.
*/
if ((ctxt->input != NULL) &&
(ctxt->input->cur[0] == 0xEF) &&
(ctxt->input->cur[1] == 0xBB) &&
(ctxt->input->cur[2] == 0xBF)) {
ctxt->input->cur += 3;
}
break ;
default:
break;
}
handler = xmlGetCharEncodingHandler(enc);
if (handler == NULL) {
/*
* Default handlers.
*/
switch (enc) {
case XML_CHAR_ENCODING_ERROR:
__xmlErrEncoding(ctxt, XML_ERR_UNKNOWN_ENCODING,
"encoding unknown\n", NULL, NULL);
break;
case XML_CHAR_ENCODING_NONE:
/* let's assume it's UTF-8 without the XML decl */
ctxt->charset = XML_CHAR_ENCODING_UTF8;
return(0);
case XML_CHAR_ENCODING_UTF8:
case XML_CHAR_ENCODING_ASCII:
/* default encoding, no conversion should be needed */
ctxt->charset = XML_CHAR_ENCODING_UTF8;
return(0);
case XML_CHAR_ENCODING_UTF16LE:
break;
case XML_CHAR_ENCODING_UTF16BE:
break;
case XML_CHAR_ENCODING_UCS4LE:
__xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
"encoding not supported %s\n",
BAD_CAST "USC4 little endian", NULL);
break;
case XML_CHAR_ENCODING_UCS4BE:
__xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
"encoding not supported %s\n",
BAD_CAST "USC4 big endian", NULL);
break;
case XML_CHAR_ENCODING_EBCDIC:
__xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
"encoding not supported %s\n",
BAD_CAST "EBCDIC", NULL);
break;
case XML_CHAR_ENCODING_UCS4_2143:
__xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
"encoding not supported %s\n",
BAD_CAST "UCS4 2143", NULL);
break;
case XML_CHAR_ENCODING_UCS4_3412:
__xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
"encoding not supported %s\n",
BAD_CAST "UCS4 3412", NULL);
break;
case XML_CHAR_ENCODING_UCS2:
__xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
"encoding not supported %s\n",
BAD_CAST "UCS2", NULL);
break;
case XML_CHAR_ENCODING_8859_1:
case XML_CHAR_ENCODING_8859_2:
case XML_CHAR_ENCODING_8859_3:
case XML_CHAR_ENCODING_8859_4:
case XML_CHAR_ENCODING_8859_5:
case XML_CHAR_ENCODING_8859_6:
case XML_CHAR_ENCODING_8859_7:
case XML_CHAR_ENCODING_8859_8:
case XML_CHAR_ENCODING_8859_9:
/*
* We used to keep the internal content in the
* document encoding however this turns being unmaintainable
* So xmlGetCharEncodingHandler() will return non-null
* values for this now.
*/
if ((ctxt->inputNr == 1) &&
(ctxt->encoding == NULL) &&
(ctxt->input->encoding != NULL)) {
ctxt->encoding = xmlStrdup(ctxt->input->encoding);
}
ctxt->charset = enc;
return(0);
case XML_CHAR_ENCODING_2022_JP:
__xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
"encoding not supported %s\n",
BAD_CAST "ISO-2022-JP", NULL);
break;
case XML_CHAR_ENCODING_SHIFT_JIS:
__xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
"encoding not supported %s\n",
BAD_CAST "Shift_JIS", NULL);
break;
case XML_CHAR_ENCODING_EUC_JP:
__xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
"encoding not supported %s\n",
BAD_CAST "EUC-JP", NULL);
break;
}
}
if (handler == NULL)
return(-1);
ctxt->charset = XML_CHAR_ENCODING_UTF8;
return(xmlSwitchToEncoding(ctxt, handler));
}
/**
* xmlSwitchInputEncoding:
* @ctxt: the parser context
* @input: the input stream
* @handler: the encoding handler
*
* change the input functions when discovering the character encoding
* of a given entity.
*
* Returns 0 in case of success, -1 otherwise
*/
int
xmlSwitchInputEncoding(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
xmlCharEncodingHandlerPtr handler)
{
int nbchars;
if (handler == NULL)
return (-1);
if (input == NULL)
return (-1);
if (input->buf != NULL) {
if (input->buf->encoder != NULL) {
/*
* Check in case the auto encoding detetection triggered
* in already.
*/
if (input->buf->encoder == handler)
return (0);
/*
* "UTF-16" can be used for both LE and BE
if ((!xmlStrncmp(BAD_CAST input->buf->encoder->name,
BAD_CAST "UTF-16", 6)) &&
(!xmlStrncmp(BAD_CAST handler->name,
BAD_CAST "UTF-16", 6))) {
return(0);
}
*/
/*
* Note: this is a bit dangerous, but that's what it
* takes to use nearly compatible signature for different
* encodings.
*/
xmlCharEncCloseFunc(input->buf->encoder);
input->buf->encoder = handler;
return (0);
}
input->buf->encoder = handler;
/*
* Is there already some content down the pipe to convert ?
*/
if ((input->buf->buffer != NULL) && (input->buf->buffer->use > 0)) {
int processed;
unsigned int use;
/*
* Specific handling of the Byte Order Mark for
* UTF-16
*/
if ((handler->name != NULL) &&
(!strcmp(handler->name, "UTF-16LE") ||
!strcmp(handler->name, "UTF-16")) &&
(input->cur[0] == 0xFF) && (input->cur[1] == 0xFE)) {
input->cur += 2;
}
if ((handler->name != NULL) &&
(!strcmp(handler->name, "UTF-16BE")) &&
(input->cur[0] == 0xFE) && (input->cur[1] == 0xFF)) {
input->cur += 2;
}
/*
* Errata on XML-1.0 June 20 2001
* Specific handling of the Byte Order Mark for
* UTF-8
*/
if ((handler->name != NULL) &&
(!strcmp(handler->name, "UTF-8")) &&
(input->cur[0] == 0xEF) &&
(input->cur[1] == 0xBB) && (input->cur[2] == 0xBF)) {
input->cur += 3;
}
/*
* Shrink the current input buffer.
* Move it as the raw buffer and create a new input buffer
*/
processed = input->cur - input->base;
xmlBufferShrink(input->buf->buffer, processed);
input->buf->raw = input->buf->buffer;
input->buf->buffer = xmlBufferCreate();
input->buf->rawconsumed = processed;
use = input->buf->raw->use;
if (ctxt->html) {
/*
* convert as much as possible of the buffer
*/
nbchars = xmlCharEncInFunc(input->buf->encoder,
input->buf->buffer,
input->buf->raw);
} else {
/*
* convert just enough to get
* '<?xml version="1.0" encoding="xxx"?>'
* parsed with the autodetected encoding
* into the parser reading buffer.
*/
nbchars = xmlCharEncFirstLine(input->buf->encoder,
input->buf->buffer,
input->buf->raw);
}
if (nbchars < 0) {
xmlErrInternal(ctxt,
"switching encoding: encoder error\n",
NULL);
return (-1);
}
input->buf->rawconsumed += use - input->buf->raw->use;
input->base = input->cur = input->buf->buffer->content;
input->end = &input->base[input->buf->buffer->use];
}
return (0);
} else {
if ((input->length == 0) || (input->buf == NULL)) {
/*
* When parsing a static memory array one must know the
* size to be able to convert the buffer.
*/
xmlErrInternal(ctxt, "switching encoding : no input\n", NULL);
return (-1);
} else {
int processed;
/*
* Shrink the current input buffer.
* Move it as the raw buffer and create a new input buffer
*/
processed = input->cur - input->base;
input->buf->raw = xmlBufferCreate();
xmlBufferAdd(input->buf->raw, input->cur,
input->length - processed);
input->buf->buffer = xmlBufferCreate();
/*
* convert as much as possible of the raw input
* to the parser reading buffer.
*/
nbchars = xmlCharEncInFunc(input->buf->encoder,
input->buf->buffer,
input->buf->raw);
if (nbchars < 0) {
xmlErrInternal(ctxt,
"switching encoding: encoder error\n",
NULL);
return (-1);
}
/*
* Conversion succeeded, get rid of the old buffer
*/
if ((input->free != NULL) && (input->base != NULL))
input->free((xmlChar *) input->base);
input->base = input->cur = input->buf->buffer->content;
input->end = &input->base[input->buf->buffer->use];
}
}
return (0);
}
/**
* xmlSwitchToEncoding:
* @ctxt: the parser context
* @handler: the encoding handler
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -