📄 parserinternals.c
字号:
if (in->base != in->buf->buffer->content) { /* * the buffer has been reallocated */ indx = in->cur - in->base; in->base = in->buf->buffer->content; in->cur = &in->buf->buffer->content[indx]; } in->end = &in->buf->buffer->content[in->buf->buffer->use]; CHECK_BUFFER(in);}/************************************************************************ * * * UTF8 character input and related functions * * * ************************************************************************//** * xmlNextChar: * @ctxt: the XML parser context * * Skip to the next char input char. */voidxmlNextChar(xmlParserCtxtPtr ctxt){ if ((ctxt == NULL) || (ctxt->instate == XML_PARSER_EOF) || (ctxt->input == NULL)) return; if (ctxt->charset == XML_CHAR_ENCODING_UTF8) { if ((*ctxt->input->cur == 0) && (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0) && (ctxt->instate != XML_PARSER_COMMENT)) { /* * If we are at the end of the current entity and * the context allows it, we pop consumed entities * automatically. * the auto closing should be blocked in other cases */ xmlPopInput(ctxt); } else { const unsigned char *cur; unsigned char c; /* * 2.11 End-of-Line Handling * the literal two-character sequence "#xD#xA" or a standalone * literal #xD, an XML processor must pass to the application * the single character #xA. */ if (*(ctxt->input->cur) == '\n') { ctxt->input->line++; ctxt->input->col = 1; } else ctxt->input->col++; /* * We are supposed to handle UTF8, check it's valid * From rfc2044: encoding of the Unicode values on UTF-8: * * UCS-4 range (hex.) UTF-8 octet sequence (binary) * 0000 0000-0000 007F 0xxxxxxx * 0000 0080-0000 07FF 110xxxxx 10xxxxxx * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx * * Check for the 0x110000 limit too */ cur = ctxt->input->cur; c = *cur; if (c & 0x80) { if (c == 0xC0) goto encoding_error; if (cur[1] == 0) xmlParserInputGrow(ctxt->input, INPUT_CHUNK); if ((cur[1] & 0xc0) != 0x80) goto encoding_error; if ((c & 0xe0) == 0xe0) { unsigned int val; if (cur[2] == 0) xmlParserInputGrow(ctxt->input, INPUT_CHUNK); if ((cur[2] & 0xc0) != 0x80) goto encoding_error; if ((c & 0xf0) == 0xf0) { if (cur[3] == 0) xmlParserInputGrow(ctxt->input, INPUT_CHUNK); if (((c & 0xf8) != 0xf0) || ((cur[3] & 0xc0) != 0x80)) goto encoding_error; /* 4-byte code */ ctxt->input->cur += 4; val = (cur[0] & 0x7) << 18; val |= (cur[1] & 0x3f) << 12; val |= (cur[2] & 0x3f) << 6; val |= cur[3] & 0x3f; } else { /* 3-byte code */ ctxt->input->cur += 3; val = (cur[0] & 0xf) << 12; val |= (cur[1] & 0x3f) << 6; val |= cur[2] & 0x3f; } if (((val > 0xd7ff) && (val < 0xe000)) || ((val > 0xfffd) && (val < 0x10000)) || (val >= 0x110000)) { xmlErrEncodingInt(ctxt, XML_ERR_INVALID_CHAR, "Char 0x%X out of allowed range\n", val); } } else /* 2-byte code */ ctxt->input->cur += 2; } else /* 1-byte code */ ctxt->input->cur++; ctxt->nbChars++; if (*ctxt->input->cur == 0) xmlParserInputGrow(ctxt->input, INPUT_CHUNK); } } else { /* * Assume it's a fixed length encoding (1) with * a compatible encoding for the ASCII set, since * XML constructs only use < 128 chars */ if (*(ctxt->input->cur) == '\n') { ctxt->input->line++; ctxt->input->col = 1; } else ctxt->input->col++; ctxt->input->cur++; ctxt->nbChars++; if (*ctxt->input->cur == 0) xmlParserInputGrow(ctxt->input, INPUT_CHUNK); } if ((*ctxt->input->cur == '%') && (!ctxt->html)) xmlParserHandlePEReference(ctxt); if ((*ctxt->input->cur == 0) && (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) xmlPopInput(ctxt); return;encoding_error: /* * If we detect an UTF8 error that probably mean that the * input encoding didn't get properly advertised in the * declaration header. Report the error and switch the encoding * to ISO-Latin-1 (if you don't like this policy, just declare the * encoding !) */ if ((ctxt == NULL) || (ctxt->input == NULL) || (ctxt->input->end - ctxt->input->cur < 4)) { __xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR, "Input is not proper UTF-8, indicate encoding !\n", NULL, NULL); } else { char buffer[150]; snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n", ctxt->input->cur[0], ctxt->input->cur[1], ctxt->input->cur[2], ctxt->input->cur[3]); __xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR, "Input is not proper UTF-8, indicate encoding !\n%s", BAD_CAST buffer, NULL); } ctxt->charset = XML_CHAR_ENCODING_8859_1; ctxt->input->cur++; return;}/** * xmlCurrentChar: * @ctxt: the XML parser context * @len: pointer to the length of the char read * * The current char value, if using UTF-8 this may actually span multiple * bytes in the input buffer. Implement the end of line normalization: * 2.11 End-of-Line Handling * Wherever an external parsed entity or the literal entity value * of an internal parsed entity contains either the literal two-character * sequence "#xD#xA" or a standalone literal #xD, an XML processor * must pass to the application the single character #xA. * This behavior can conveniently be produced by normalizing all * line breaks to #xA on input, before parsing.) * * Returns the current char value and its length */intxmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) { if ((ctxt == NULL) || (len == NULL) || (ctxt->input == NULL)) return(0); if (ctxt->instate == XML_PARSER_EOF) return(0); if ((*ctxt->input->cur >= 0x20) && (*ctxt->input->cur <= 0x7F)) { *len = 1; return((int) *ctxt->input->cur); } if (ctxt->charset == XML_CHAR_ENCODING_UTF8) { /* * We are supposed to handle UTF8, check it's valid * From rfc2044: encoding of the Unicode values on UTF-8: * * UCS-4 range (hex.) UTF-8 octet sequence (binary) * 0000 0000-0000 007F 0xxxxxxx * 0000 0080-0000 07FF 110xxxxx 10xxxxxx * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx * * Check for the 0x110000 limit too */ const unsigned char *cur = ctxt->input->cur; unsigned char c; unsigned int val; c = *cur; if (c & 0x80) { if (c == 0xC0) goto encoding_error; if (cur[1] == 0) xmlParserInputGrow(ctxt->input, INPUT_CHUNK); if ((cur[1] & 0xc0) != 0x80) goto encoding_error; if ((c & 0xe0) == 0xe0) { if (cur[2] == 0) xmlParserInputGrow(ctxt->input, INPUT_CHUNK); if ((cur[2] & 0xc0) != 0x80) goto encoding_error; if ((c & 0xf0) == 0xf0) { if (cur[3] == 0) xmlParserInputGrow(ctxt->input, INPUT_CHUNK); if (((c & 0xf8) != 0xf0) || ((cur[3] & 0xc0) != 0x80)) goto encoding_error; /* 4-byte code */ *len = 4; val = (cur[0] & 0x7) << 18; val |= (cur[1] & 0x3f) << 12; val |= (cur[2] & 0x3f) << 6; val |= cur[3] & 0x3f; } else { /* 3-byte code */ *len = 3; val = (cur[0] & 0xf) << 12; val |= (cur[1] & 0x3f) << 6; val |= cur[2] & 0x3f; } } else { /* 2-byte code */ *len = 2; val = (cur[0] & 0x1f) << 6; val |= cur[1] & 0x3f; } if (!IS_CHAR(val)) { xmlErrEncodingInt(ctxt, XML_ERR_INVALID_CHAR, "Char 0x%X out of allowed range\n", val); } return(val); } else { /* 1-byte code */ *len = 1; if (*ctxt->input->cur == 0xD) { if (ctxt->input->cur[1] == 0xA) { ctxt->nbChars++; ctxt->input->cur++; } return(0xA); } return((int) *ctxt->input->cur); } } /* * Assume it's a fixed length encoding (1) with * a compatible encoding for the ASCII set, since * XML constructs only use < 128 chars */ *len = 1; if (*ctxt->input->cur == 0xD) { if (ctxt->input->cur[1] == 0xA) { ctxt->nbChars++; ctxt->input->cur++; } return(0xA); } return((int) *ctxt->input->cur);encoding_error: /* * An encoding problem may arise from a truncated input buffer * splitting a character in the middle. In that case do not raise * an error but return 0 to endicate an end of stream problem */ if (ctxt->input->end - ctxt->input->cur < 4) { *len = 0; return(0); } /* * If we detect an UTF8 error that probably mean that the * input encoding didn't get properly advertised in the * declaration header. Report the error and switch the encoding * to ISO-Latin-1 (if you don't like this policy, just declare the * encoding !) */ { char buffer[150]; snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n", ctxt->input->cur[0], ctxt->input->cur[1], ctxt->input->cur[2], ctxt->input->cur[3]); __xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR, "Input is not proper UTF-8, indicate encoding !\n%s", BAD_CAST buffer, NULL); } ctxt->charset = XML_CHAR_ENCODING_8859_1; *len = 1; return((int) *ctxt->input->cur);}/** * xmlStringCurrentChar: * @ctxt: the XML parser context * @cur: pointer to the beginning of the char * @len: pointer to the length of the char read * * The current char value, if using UTF-8 this may actually span multiple * bytes in the input buffer. * * Returns the current char value and its length */intxmlStringCurrentChar(xmlParserCtxtPtr ctxt, const xmlChar * cur, int *len){ if ((len == NULL) || (cur == NULL)) return(0); if ((ctxt == NULL) || (ctxt->charset == XML_CHAR_ENCODING_UTF8)) { /* * We are supposed to handle UTF8, check it's valid * From rfc2044: encoding of the Unicode values on UTF-8: * * UCS-4 range (hex.) UTF-8 octet sequence (binary) * 0000 0000-0000 007F 0xxxxxxx * 0000 0080-0000 07FF 110xxxxx 10xxxxxx * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx * * Check for the 0x110000 limit too */ unsigned char c; unsigned int val; c = *cur; if (c & 0x80) { if ((cur[1] & 0xc0) != 0x80) goto encoding_error; if ((c & 0xe0) == 0xe0) { if ((cur[2] & 0xc0) != 0x80) goto encoding_error; if ((c & 0xf0) == 0xf0) { if (((c & 0xf8) != 0xf0) || ((cur[3] & 0xc0) != 0x80)) goto encoding_error; /* 4-byte code */ *len = 4; val = (cur[0] & 0x7) << 18; val |= (cur[1] & 0x3f) << 12; val |= (cur[2] & 0x3f) << 6; val |= cur[3] & 0x3f; } else { /* 3-byte code */ *len = 3; val = (cur[0] & 0xf) << 12; val |= (cur[1] & 0x3f) << 6; val |= cur[2] & 0x3f; } } else { /* 2-byte code */ *len = 2; val = (cur[0] & 0x1f) << 6; val |= cur[1] & 0x3f; } if (!IS_CHAR(val)) { xmlErrEncodingInt(ctxt, XML_ERR_INVALID_CHAR, "Char 0x%X out of allowed range\n", val); } return (val); } else { /* 1-byte code */ *len = 1; return ((int) *cur); } } /* * Assume it's a fixed length encoding (1) with * a compatible encoding for the ASCII set, since * XML constructs only use < 128 chars */ *len = 1; return ((int) *cur);encoding_error: /* * An encoding problem may arise from a truncated input buffer * splitting a character in the middle. In that case do not raise * an error but return 0 to endicate an end of stream problem */ if ((ctxt == NULL) || (ctxt->input == NULL) || (ctxt->input->end - ctxt->input->cur < 4)) { *len = 0; return(0); } /* * If we detect an UTF8 error that probably mean that the * input encoding didn't get properly advertised in the * declaration header. Report the error and switch the encoding * to ISO-Latin-1 (if you don't like this policy, just declare the * encoding !) */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -