📄 htmlparser.c
字号:
/*
* HTMLparser.c : an HTML 4.0 non-verifying parser
*
* See Copyright for the status of this software.
*
* daniel@veillard.com
*/
#define IN_LIBXML
#include "libxml.h"
#ifdef LIBXML_HTML_ENABLED
#include <string.h>
#ifdef HAVE_CTYPE_H
#include <ctype.h>
#endif
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif
#ifdef HAVE_SYS_STAT_H
#include <sys/stat.h>
#endif
#ifdef HAVE_FCNTL_H
#include <fcntl.h>
#endif
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_ZLIB_H
#include <zlib.h>
#endif
#include <libxml/xmlmemory.h>
#include <libxml/tree.h>
#include <libxml/parser.h>
#include <libxml/parserInternals.h>
#include <libxml/xmlerror.h>
#include <libxml/HTMLparser.h>
#include <libxml/HTMLtree.h>
#include <libxml/entities.h>
#include <libxml/encoding.h>
#include <libxml/valid.h>
#include <libxml/xmlIO.h>
#include <libxml/globals.h>
#include <libxml/uri.h>
#define HTML_MAX_NAMELEN 1000
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
#define HTML_PARSER_BUFFER_SIZE 100
/* #define DEBUG */
/* #define DEBUG_PUSH */
/*
 * File-local flag, initialised to 1.
 * NOTE(review): the code that reads or writes it is outside the part of the
 * file reviewed here -- confirm its meaning against those users.
 */
static int htmlOmittedDefaultValue = 1;
/*
 * Prototype for htmlDecodeEntities(); it is not declared static here, so it
 * has external linkage. Its definition is not visible in this part of the
 * file.
 */
xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
xmlChar end, xmlChar end2, xmlChar end3);
/* Forward declaration of the file-local comment parsing routine. */
static void htmlParseComment(htmlParserCtxtPtr ctxt);
/************************************************************************
* *
* Some factorized error routines *
* *
************************************************************************/
/**
 * htmlErrMemory:
 * @ctxt:  an HTML parser context, may be NULL
 * @extra:  optional extra information appended to the message, may be NULL
 *
 * Report a memory allocation failure.
 *
 * When a context is supplied it is marked as failed: errNo is set to
 * XML_ERR_NO_MEMORY, the parser state is switched to XML_PARSER_EOF and
 * SAX callbacks are disabled. If the context is already in that stopped
 * state (SAX disabled and state XML_PARSER_EOF) nothing is reported.
 */
static void
htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
{
    if (ctxt != NULL) {
        /* Parser already stopped with SAX disabled: stay silent. */
        if ((ctxt->instate == XML_PARSER_EOF) && (ctxt->disableSAX != 0))
            return;

        ctxt->errNo = XML_ERR_NO_MEMORY;
        ctxt->instate = XML_PARSER_EOF;
        ctxt->disableSAX = 1;
    }

    if (extra == NULL) {
        /* No detail available: emit the generic message. */
        __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
                        XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
                        NULL, NULL, 0, 0, "Memory allocation failed\n");
        return;
    }

    __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
                    XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
                    NULL, NULL, 0, 0,
                    "Memory allocation failed : %s\n", extra);
}
/**
 * htmlParseErr:
 * @ctxt:  an HTML parser context, may be NULL
 * @error:  the error number
 * @msg:  the error message, passed on as the message format
 * @str1:  first string argument for @msg
 * @str2:  second string argument for @msg
 *
 * Report an HTML parsing error carrying up to two string arguments.
 *
 * The error is raised in the XML_FROM_HTML domain at XML_ERR_ERROR level.
 * When a context is supplied, its errNo is set to @error before the error
 * is raised and its wellFormed flag is cleared afterwards. Nothing is
 * reported if the context is already stopped (SAX disabled and state
 * XML_PARSER_EOF).
 */
static void
htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
             const char *msg, const xmlChar *str1, const xmlChar *str2)
{
    if (ctxt != NULL) {
        /* Parser already stopped with SAX disabled: stay silent. */
        if ((ctxt->instate == XML_PARSER_EOF) && (ctxt->disableSAX != 0))
            return;
        ctxt->errNo = error;
    }

    __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
                    XML_ERR_ERROR, NULL, 0,
                    (const char *) str1, (const char *) str2,
                    NULL, 0, 0,
                    msg, str1, str2);

    if (ctxt == NULL)
        return;
    ctxt->wellFormed = 0;
}
/**
 * htmlParseErrInt:
 * @ctxt:  an HTML parser context, may be NULL
 * @error:  the error number
 * @msg:  the error message, passed on as the message format
 * @val:  integer argument for @msg
 *
 * Report an HTML parsing error carrying one integer argument.
 *
 * The error is raised in the XML_FROM_HTML domain at XML_ERR_ERROR level.
 * When a context is supplied, its errNo is set to @error before the error
 * is raised and its wellFormed flag is cleared afterwards. Nothing is
 * reported if the context is already stopped (SAX disabled and state
 * XML_PARSER_EOF).
 */
static void
htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
                const char *msg, int val)
{
    if (ctxt != NULL) {
        /* Parser already stopped with SAX disabled: stay silent. */
        if ((ctxt->instate == XML_PARSER_EOF) && (ctxt->disableSAX != 0))
            return;
        ctxt->errNo = error;
    }

    __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
                    XML_ERR_ERROR, NULL, 0, NULL, NULL,
                    NULL, val, 0, msg, val);

    if (ctxt == NULL)
        return;
    ctxt->wellFormed = 0;
}
/************************************************************************
* *
* Parser stacks related functions and macros *
* *
************************************************************************/
/**
 * htmlnamePush:
 * @ctxt:  an HTML parser context
 * @value:  the element name
 *
 * Pushes a new element name on top of the name stack, doubling the
 * capacity of the stack when it is full. The pointer is stored as is; the
 * string is not copied.
 *
 * If growing the stack fails, the memory error is reported through
 * htmlErrMemory() and the existing stack (table, capacity and content) is
 * left untouched, so it can still be popped and freed by the caller.
 *
 * Returns 0 in case of error, the index in the stack otherwise. Note that
 * 0 is also the index returned by the first successful push, so the return
 * value alone does not distinguish the two cases.
 */
static int
htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
{
    if (ctxt->nameNr >= ctxt->nameMax) {
        int newMax = ctxt->nameMax * 2;
        const xmlChar **newTab;

        /*
         * Reallocate through a temporary: on failure the old block is
         * still valid and must remain reachable from the context, and
         * nameMax must keep describing the table actually allocated.
         */
        newTab = (const xmlChar * *)
                 xmlRealloc((xmlChar * *) ctxt->nameTab,
                            newMax * sizeof(ctxt->nameTab[0]));
        if (newTab == NULL) {
            htmlErrMemory(ctxt, NULL);
            return (0);
        }
        ctxt->nameTab = newTab;
        ctxt->nameMax = newMax;
    }
    ctxt->nameTab[ctxt->nameNr] = value;
    ctxt->name = value;
    return (ctxt->nameNr++);
}
/**
 * htmlnamePop:
 * @ctxt:  an HTML parser context
 *
 * Pops the top element name from the name stack. The context's current
 * name is updated to the new top of the stack, or to NULL when the stack
 * becomes empty, and the vacated slot is cleared.
 *
 * Returns the name just removed, or NULL if the stack was already empty
 */
static const xmlChar *
htmlnamePop(htmlParserCtxtPtr ctxt)
{
    const xmlChar *popped;
    int top;

    if (ctxt->nameNr <= 0)
        return (NULL);

    /* nameNr was at least 1 here, so the new top index is never negative. */
    top = --ctxt->nameNr;

    popped = ctxt->nameTab[top];
    ctxt->name = (top > 0) ? ctxt->nameTab[top - 1] : NULL;
    ctxt->nameTab[top] = NULL;
    return (popped);
}
/*
* Macros for accessing the content. Those should be used only by the parser,
* and not exported.
*
* Dirty macros, i.e. one needs to make assumptions about the context to use them
*
* CUR_PTR return the current pointer to the xmlChar to be parsed.
* CUR returns the current xmlChar value, i.e. a 8 bit value if compiled
* in ISO-Latin or UTF-8, and the current 16 bit value if compiled
* in UNICODE mode. This should be used internally by the parser
* only to compare to ASCII values otherwise it would break when
* running with UTF-8 encoding.
* NXT(n) returns the n'th next xmlChar. Same as CUR, it should be used only
* to compare on ASCII based substrings.
* UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR
* it should be used only to compare on ASCII based substring.
* SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
* strings without newlines within the parser.
*
* Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
*
* CURRENT Returns the current char value, with the full decoding of
* UTF-8 if we are using this mode. It returns an int.
* NEXT Skip to the next character, this does the proper decoding
* in UTF-8 mode. It also pop-up unfinished entities on the fly.
* NEXTL(l) Skip the current unicode character of l xmlChars long.
* COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
*/
/*
 * Every macro below expands to code that references a variable named
 * `ctxt`, which must therefore be in scope wherever the macro is used.
 */
/* UPPER: toupper() applied to the byte at the current input position. */
#define UPPER (toupper(*ctxt->input->cur))
/*
 * SKIP(val): advance nbChars, the input cursor and the column counter by
 * val. The line counter is not touched. Expands to a comma expression.
 */
#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val)
/* NXT(val): the byte at offset val from the current input position. */
#define NXT(val) ctxt->input->cur[(val)]
/* UPP(val): toupper() applied to the byte at offset val. */
#define UPP(val) (toupper(ctxt->input->cur[(val)]))
/* CUR_PTR: the current input pointer itself. */
#define CUR_PTR ctxt->input->cur
/*
 * SHRINK: call xmlParserInputShrink() when the cursor is more than
 * 2 * INPUT_CHUNK past the buffer base and fewer than 2 * INPUT_CHUNK
 * bytes remain before the buffer end.
 * NOTE(review): expands to a bare `if` statement (no do/while wrapper), so
 * it must not be used as the body of an if that has an else branch.
 */
#define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
(ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
xmlParserInputShrink(ctxt->input)
/*
 * GROW: when ctxt->progressive is 0 and fewer than INPUT_CHUNK bytes remain
 * before the buffer end, call xmlParserInputGrow() for INPUT_CHUNK more.
 * NOTE(review): bare `if` expansion, same caveat as SHRINK.
 */
#define GROW if ((ctxt->progressive == 0) && \
(ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \
xmlParserInputGrow(ctxt->input, INPUT_CHUNK)
/* CURRENT: the byte at the current input position, as an int. */
#define CURRENT ((int) (*ctxt->input->cur))
/* SKIP_BLANKS: call htmlSkipBlankChars() on the context. */
#define SKIP_BLANKS htmlSkipBlankChars(ctxt)
/* Imported from XML */
/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
/* CUR: same expansion as CURRENT; the token-aware variant above is disabled. */
#define CUR ((int) (*ctxt->input->cur))
/* NEXT: call xmlNextChar() on the context. */
#define NEXT xmlNextChar(ctxt)
/* RAW: -1 when ctxt->token is non-zero, otherwise the current input byte. */
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
/* NXT and CUR_PTR are defined a second time here, identically to above. */
#define NXT(val) ctxt->input->cur[(val)]
#define CUR_PTR ctxt->input->cur
/*
 * NEXTL(l): step over the current character, which is l bytes long.
 * If the current byte is '\n' the line counter is incremented and the
 * column reset to 1, otherwise the column is incremented. The pending
 * token is cleared, the cursor moves by l and nbChars grows by one.
 * NOTE(review): `l` is not parenthesised in the expansion.
 */
#define NEXTL(l) do { \
if (*(ctxt->input->cur) == '\n') { \
ctxt->input->line++; ctxt->input->col = 1; \
} else ctxt->input->col++; \
ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++; \
} while (0)
/************
\
if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
************/
/* CUR_CHAR(l): htmlCurrentChar() on the context; l receives the length. */
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
/* CUR_SCHAR(s, l): xmlStringCurrentChar() on string s; l receives the length. */
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)
/*
 * COPY_BUF(l,b,i,v): append character v to buffer b at index i. When l is 1
 * the value is stored as a single xmlChar and i is incremented; otherwise
 * xmlCopyChar() writes it and i is advanced by its return value.
 * NOTE(review): expands to a bare if/else (no do/while wrapper) and the
 * arguments are not parenthesised.
 */
#define COPY_BUF(l,b,i,v) \
if (l == 1) b[i++] = (xmlChar) v; \
else i += xmlCopyChar(l,&b[i],v)
/**
* htmlCurrentChar:
* @ctxt: the HTML parser context
* @len: pointer to the length of the char read
*
* The current char value, if using UTF-8 this may actually span multiple
* bytes in the input buffer. Implement the end of line normalization:
* 2.11 End-of-Line Handling
* If the encoding is unspecified, in the case we find an ISO-Latin-1
* char, then the encoding converter is plugged in automatically.
*
* Returns the current char value and its length
*/
static int
htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
if (ctxt->instate == XML_PARSER_EOF)
return(0);
if (ctxt->token != 0) {
*len = 0;
return(ctxt->token);
}
if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
/*
* We are supposed to handle UTF8, check it's valid
* From rfc2044: encoding of the Unicode values on UTF-8:
*
* UCS-4 range (hex.) UTF-8 octet sequence (binary)
* 0000 0000-0000 007F 0xxxxxxx
* 0000 0080-0000 07FF 110xxxxx 10xxxxxx
* 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
*
* Check for the 0x110000 limit too
*/
const unsigned char *cur = ctxt->input->cur;
unsigned char c;
unsigned int val;
c = *cur;
if (c & 0x80) {
if (cur[1] == 0)
xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
if ((cur[1] & 0xc0) != 0x80)
goto encoding_error;
if ((c & 0xe0) == 0xe0) {
if (cur[2] == 0)
xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -