📄 parserinternals.c.svn-base
字号:
/* 4-byte code */ ctxt->input->cur += 4; val = (cur[0] & 0x7) << 18; val |= (cur[1] & 0x3f) << 12; val |= (cur[2] & 0x3f) << 6; val |= cur[3] & 0x3f; } else { /* 3-byte code */ ctxt->input->cur += 3; val = (cur[0] & 0xf) << 12; val |= (cur[1] & 0x3f) << 6; val |= cur[2] & 0x3f; } if (((val > 0xd7ff) && (val < 0xe000)) || ((val > 0xfffd) && (val < 0x10000)) || (val >= 0x110000)) { xmlErrEncodingInt(ctxt, XML_ERR_INVALID_CHAR, "Char 0x%X out of allowed range\n", val); } } else /* 2-byte code */ ctxt->input->cur += 2; } else /* 1-byte code */ ctxt->input->cur++; ctxt->nbChars++; if (*ctxt->input->cur == 0) xmlParserInputGrow(ctxt->input, INPUT_CHUNK); } } else { /* * Assume it's a fixed length encoding (1) with * a compatible encoding for the ASCII set, since * XML constructs only use < 128 chars */ if (*(ctxt->input->cur) == '\n') { ctxt->input->line++; ctxt->input->col = 1; } else ctxt->input->col++; ctxt->input->cur++; ctxt->nbChars++; if (*ctxt->input->cur == 0) xmlParserInputGrow(ctxt->input, INPUT_CHUNK); } if ((*ctxt->input->cur == '%') && (!ctxt->html)) xmlParserHandlePEReference(ctxt); if ((*ctxt->input->cur == 0) && (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) xmlPopInput(ctxt); return;encoding_error: /* * If we detect an UTF8 error that probably mean that the * input encoding didn't get properly advertised in the * declaration header. Report the error and switch the encoding * to ISO-Latin-1 (if you don't like this policy, just declare the * encoding !) */ __xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR, "Input is not proper UTF-8, indicate encoding !\n", NULL, NULL); if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) { ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n", ctxt->input->cur[0], ctxt->input->cur[1], ctxt->input->cur[2], ctxt->input->cur[3]); } ctxt->charset = XML_CHAR_ENCODING_8859_1; ctxt->input->cur++; return;}/** * xmlCurrentChar: * @ctxt: the XML parser context * @len: pointer to the length of the char read * * The current char value, if using UTF-8 this may actually span multiple * bytes in the input buffer. Implement the end of line normalization: * 2.11 End-of-Line Handling * Wherever an external parsed entity or the literal entity value * of an internal parsed entity contains either the literal two-character * sequence "#xD#xA" or a standalone literal #xD, an XML processor * must pass to the application the single character #xA. * This behavior can conveniently be produced by normalizing all * line breaks to #xA on input, before parsing.) * * Returns the current char value and its length */intxmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) { if (ctxt->instate == XML_PARSER_EOF) return(0); if ((*ctxt->input->cur >= 0x20) && (*ctxt->input->cur <= 0x7F)) { *len = 1; return((int) *ctxt->input->cur); } if (ctxt->charset == XML_CHAR_ENCODING_UTF8) { /* * We are supposed to handle UTF8, check it's valid * From rfc2044: encoding of the Unicode values on UTF-8: * * UCS-4 range (hex.) UTF-8 octet sequence (binary) * 0000 0000-0000 007F 0xxxxxxx * 0000 0080-0000 07FF 110xxxxx 10xxxxxx * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx * * Check for the 0x110000 limit too */ const unsigned char *cur = ctxt->input->cur; unsigned char c; unsigned int val; c = *cur; if (c & 0x80) { if (c == 0xC0) goto encoding_error; if (cur[1] == 0) xmlParserInputGrow(ctxt->input, INPUT_CHUNK); if ((cur[1] & 0xc0) != 0x80) goto encoding_error; if ((c & 0xe0) == 0xe0) { if (cur[2] == 0) xmlParserInputGrow(ctxt->input, INPUT_CHUNK); if ((cur[2] & 0xc0) != 0x80) goto encoding_error; if ((c & 0xf0) == 0xf0) { if (cur[3] == 0) xmlParserInputGrow(ctxt->input, INPUT_CHUNK); if (((c & 0xf8) != 0xf0) || ((cur[3] & 0xc0) != 0x80)) goto encoding_error; /* 4-byte code */ *len = 4; val = (cur[0] & 0x7) << 18; val |= (cur[1] & 0x3f) << 12; val |= (cur[2] & 0x3f) << 6; val |= cur[3] & 0x3f; } else { /* 3-byte code */ *len = 3; val = (cur[0] & 0xf) << 12; val |= (cur[1] & 0x3f) << 6; val |= cur[2] & 0x3f; } } else { /* 2-byte code */ *len = 2; val = (cur[0] & 0x1f) << 6; val |= cur[1] & 0x3f; } if (!IS_CHAR(val)) { xmlErrEncodingInt(ctxt, XML_ERR_INVALID_CHAR, "Char 0x%X out of allowed range\n", val); } return(val); } else { /* 1-byte code */ *len = 1; if (*ctxt->input->cur == 0xD) { if (ctxt->input->cur[1] == 0xA) { ctxt->nbChars++; ctxt->input->cur++; } return(0xA); } return((int) *ctxt->input->cur); } } /* * Assume it's a fixed length encoding (1) with * a compatible encoding for the ASCII set, since * XML constructs only use < 128 chars */ *len = 1; if (*ctxt->input->cur == 0xD) { if (ctxt->input->cur[1] == 0xA) { ctxt->nbChars++; ctxt->input->cur++; } return(0xA); } return((int) *ctxt->input->cur);encoding_error: /* * An encoding problem may arise from a truncated input buffer * splitting a character in the middle. In that case do not raise * an error but return 0 to endicate an end of stream problem */ if (ctxt->input->end - ctxt->input->cur < 4) { *len = 0; return(0); } /* * If we detect an UTF8 error that probably mean that the * input encoding didn't get properly advertised in the * declaration header. Report the error and switch the encoding * to ISO-Latin-1 (if you don't like this policy, just declare the * encoding !) */ __xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR, "Input is not proper UTF-8, indicate encoding !\n", NULL, NULL); if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) { ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n", ctxt->input->cur[0], ctxt->input->cur[1], ctxt->input->cur[2], ctxt->input->cur[3]); } ctxt->charset = XML_CHAR_ENCODING_8859_1; *len = 1; return((int) *ctxt->input->cur);}/** * xmlStringCurrentChar: * @ctxt: the XML parser context * @cur: pointer to the beginning of the char * @len: pointer to the length of the char read * * The current char value, if using UTF-8 this may actually span multiple * bytes in the input buffer. * * Returns the current char value and its length */intxmlStringCurrentChar(xmlParserCtxtPtr ctxt, const xmlChar * cur, int *len){ if ((ctxt == NULL) || (ctxt->charset == XML_CHAR_ENCODING_UTF8)) { /* * We are supposed to handle UTF8, check it's valid * From rfc2044: encoding of the Unicode values on UTF-8: * * UCS-4 range (hex.) UTF-8 octet sequence (binary) * 0000 0000-0000 007F 0xxxxxxx * 0000 0080-0000 07FF 110xxxxx 10xxxxxx * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx * * Check for the 0x110000 limit too */ unsigned char c; unsigned int val; c = *cur; if (c & 0x80) { if ((cur[1] & 0xc0) != 0x80) goto encoding_error; if ((c & 0xe0) == 0xe0) { if ((cur[2] & 0xc0) != 0x80) goto encoding_error; if ((c & 0xf0) == 0xf0) { if (((c & 0xf8) != 0xf0) || ((cur[3] & 0xc0) != 0x80)) goto encoding_error; /* 4-byte code */ *len = 4; val = (cur[0] & 0x7) << 18; val |= (cur[1] & 0x3f) << 12; val |= (cur[2] & 0x3f) << 6; val |= cur[3] & 0x3f; } else { /* 3-byte code */ *len = 3; val = (cur[0] & 0xf) << 12; val |= (cur[1] & 0x3f) << 6; val |= cur[2] & 0x3f; } } else { /* 2-byte code */ *len = 2; val = (cur[0] & 0x1f) << 6; val |= cur[1] & 0x3f; } if (!IS_CHAR(val)) { xmlErrEncodingInt(ctxt, XML_ERR_INVALID_CHAR, "Char 0x%X out of allowed range\n", val); } return (val); } else { /* 1-byte code */ *len = 1; return ((int) *cur); } } /* * Assume it's a fixed length encoding (1) with * a compatible encoding for the ASCII set, since * XML constructs only use < 128 chars */ *len = 1; return ((int) *cur);encoding_error: /* * If we detect an UTF8 error that probably mean that the * input encoding didn't get properly advertised in the * declaration header. Report the error and switch the encoding * to ISO-Latin-1 (if you don't like this policy, just declare the * encoding !) */ __xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR, "Input is not proper UTF-8, indicate encoding !\n", NULL, NULL); if ((ctxt != NULL) && (ctxt->sax != NULL) && (ctxt->sax->error != NULL)) { ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n", ctxt->input->cur[0], ctxt->input->cur[1], ctxt->input->cur[2], ctxt->input->cur[3]); } *len = 1; return ((int) *cur);}/** * xmlCopyCharMultiByte: * @out: pointer to an array of xmlChar * @val: the char value * * append the char value in the array * * Returns the number of xmlChar written */intxmlCopyCharMultiByte(xmlChar *out, int val) { /* * We are supposed to handle UTF8, check it's valid * From rfc2044: encoding of the Unicode values on UTF-8: * * UCS-4 range (hex.) UTF-8 octet sequence (binary) * 0000 0000-0000 007F 0xxxxxxx * 0000 0080-0000 07FF 110xxxxx 10xxxxxx * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx */ if (val >= 0x80) { xmlChar *savedout = out; int bits; if (val < 0x800) { *out++= (val >> 6) | 0xC0; bits= 0; } else if (val < 0x10000) { *out++= (val >> 12) | 0xE0; bits= 6;} else if (val < 0x110000) { *out++= (val >> 18) | 0xF0; bits= 12; } else { xmlErrEncodingInt(NULL, XML_ERR_INVALID_CHAR, "Internal error, xmlCopyCharMultiByte 0x%X out of bound\n", val); return(0); } for ( ; bits >= 0; bits-= 6) *out++= ((val >> bits) & 0x3F) | 0x80 ; return (out - savedout); } *out = (xmlChar) val; return 1;}/** * xmlCopyChar: * @len: Ignored, compatibility * @out: pointer to an array of xmlChar * @val: the char value * * append the char value in the array * * Returns the number of xmlChar written */intxmlCopyChar(int len ATTRIBUTE_UNUSED, xmlChar *out, int val) { /* the len parameter is ignored */ if (val >= 0x80) { return(xmlCopyCharMultiByte (out, val)); } *out = (xmlChar) val; return 1;}/************************************************************************ * * * Commodity functions to switch encodings * * * ************************************************************************//** * xmlSwitchEncoding: * @ctxt: the parser context * @enc: the encoding value (number) * * change the input functions when discovering the character encoding * of a given entity. * * Returns 0 in case of success, -1 otherwise */intxmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc){ xmlCharEncodingHandlerPtr handler; switch (enc) { case XML_CHAR_ENCODING_ERROR: __xmlErrEncoding(ctxt, XML_ERR_UNKNOWN_ENCODING, "encoding unknown\n", NULL, NULL); break; case XML_CHAR_ENCODING_NONE: /* let's assume it's UTF-8 without the XML decl */ ctxt->charset = XML_CHAR_ENCODING_UTF8; return(0); case XML_CHAR_ENCODING_UTF8: /* default encoding, no conversion should be needed */ ctxt->charset = XML_CHAR_ENCODING_UTF8; /* * Errata on XML-1.0 June 20 2001 * Specific handling of the Byte Order Mark for * UTF-8 */ if ((ctxt->input != NULL) && (ctxt->input->cur[0] == 0xEF) && (ctxt->input->cur[1] == 0xBB) && (ctxt->input->cur[2] == 0xBF)) { ctxt->input->cur += 3; } return(0); case XML_CHAR_ENCODING_UTF16LE: case XML_CHAR_ENCODING_UTF16BE: /*The raw input characters are encoded *in UTF-16. As we expect this function *to be called after xmlCharEncInFunc, we expect *ctxt->input->cur to contain UTF-8 encoded characters. *So the raw UTF16 Byte Order Mark *has also been converted into *an UTF-8 BOM. Let's skip that BOM. */ if ((ctxt->input != NULL) && (ctxt->input->cur[0] == 0xEF) && (ctxt->input->cur[1] == 0xBB) && (ctxt->input->cur[2] == 0xBF)) { ctxt->input->cur += 3; } break ; default: break; } handler = xmlGetCharEncodingHandler(enc); if (handler == NULL) { /* * Default handlers. */ switch (enc) { case XML_CHAR_ENCODING_ERROR: __xmlErrEncoding(ctxt, XML_ERR_UNKNOWN_ENCODING, "encoding unknown\n", NULL, NULL); break; case XML_CHAR_ENCODING_NONE: /* let's assume it's UTF-8 without the XML decl */ ctxt->charset = XML_CHAR_ENCODING_UTF8; return(0); case XML_CHAR_ENCODING_UTF8: case XML_CHAR_ENCODING_ASCII: /* default encoding, no conversion should be needed */ ctxt->charset = XML_CHAR_ENCODING_UTF8; return(0); case XML_CHAR_ENCODING_UTF16LE: break; case XML_CHAR_ENCODING_UTF16BE: break; case XML_CHAR_ENCODING_UCS4LE: __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING, "encoding not supported %s\n", BAD_CAST "USC4 little endian", NULL); break; case XML_CHAR_ENCODING_UCS4BE: __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING, "encoding not supported %s\n", BAD_CAST "USC4 big endian", NULL); break; case XML_CHAR_ENCODING_EBCDIC: __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING, "encoding not supported %s\n", BAD_CAST "EBCDIC", NULL); break; case XML_CHAR_ENCODING_UCS4_2143: __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING, "encoding not supported %s\n", BAD_CAST "UCS4 2143", NULL); break; case XML_CHAR_ENCODING_UCS4_3412: __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING, "encoding not supported %s\n", BAD_CAST "UCS4 3412", NULL); break; case XML_CHAR_ENCODING_UCS2: __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING, "encoding not supported %s\n", BAD_CAST "UCS2", NULL); break; case XML_CHAR_ENCODING_8859_1: case XML_CHAR_ENCODING_8859_2: case XML_CHAR_ENCODING_8859_3: case XML_CHAR_ENCODING_8859_4: case XML_CHAR_ENCODING_8859_5: case XML_CHAR_ENCODING_8859_6: case XML_CHAR_ENCODING_8859_7: case XML_CHAR_ENCODING_8859_8: case XML_CHAR_ENCODING_8859_9:
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -