📄 htmlparser.c.svn-base
字号:
/* * HTMLparser.c : an HTML 4.0 non-verifying parser * * See Copyright for the status of this software. * * daniel@veillard.com */#define IN_LIBXML#include "libxml.h"#ifdef LIBXML_HTML_ENABLED#include <string.h>#ifdef HAVE_CTYPE_H#include <ctype.h>#endif#ifdef HAVE_STDLIB_H#include <stdlib.h>#endif#ifdef HAVE_SYS_STAT_H#include <sys/stat.h>#endif#ifdef HAVE_FCNTL_H#include <fcntl.h>#endif#ifdef HAVE_UNISTD_H#include <unistd.h>#endif#ifdef HAVE_ZLIB_H#include <zlib.h>#endif#include <libxml/xmlmemory.h>#include <libxml/tree.h>#include <libxml/parser.h>#include <libxml/parserInternals.h>#include <libxml/xmlerror.h>#include <libxml/HTMLparser.h>#include <libxml/HTMLtree.h>#include <libxml/entities.h>#include <libxml/encoding.h>#include <libxml/valid.h>#include <libxml/xmlIO.h>#include <libxml/globals.h>#include <libxml/uri.h>#define HTML_MAX_NAMELEN 1000#define HTML_PARSER_BIG_BUFFER_SIZE 1000#define HTML_PARSER_BUFFER_SIZE 100/* #define DEBUG *//* #define DEBUG_PUSH */static int htmlOmittedDefaultValue = 1;xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len, xmlChar end, xmlChar end2, xmlChar end3);static void htmlParseComment(htmlParserCtxtPtr ctxt);/************************************************************************ * * * Some factorized error routines * * * ************************************************************************//** * htmlErrMemory: * @ctxt: an HTML parser context * @extra: extra informations * * Handle a redefinition of attribute error */static voidhtmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra){ if ((ctxt != NULL) && (ctxt->disableSAX != 0) && (ctxt->instate == XML_PARSER_EOF)) return; if (ctxt != NULL) { ctxt->errNo = XML_ERR_NO_MEMORY; ctxt->instate = XML_PARSER_EOF; ctxt->disableSAX = 1; } if (extra) __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER, XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra, NULL, NULL, 0, 0, "Memory allocation failed : %s\n", extra); else __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER, XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL, NULL, NULL, 0, 0, "Memory allocation failed\n");}/** * htmlParseErr: * @ctxt: an HTML parser context * @error: the error number * @msg: the error message * @str1: string infor * @str2: string infor * * Handle a fatal parser error, i.e. violating Well-Formedness constraints */static voidhtmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error, const char *msg, const xmlChar *str1, const xmlChar *str2){ if ((ctxt != NULL) && (ctxt->disableSAX != 0) && (ctxt->instate == XML_PARSER_EOF)) return; ctxt->errNo = error; __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error, XML_ERR_ERROR, NULL, 0, (const char *) str1, (const char *) str2, NULL, 0, 0, msg, str1, str2); ctxt->wellFormed = 0;}/** * htmlParseErrInt: * @ctxt: an HTML parser context * @error: the error number * @msg: the error message * @val: integer info * * Handle a fatal parser error, i.e. violating Well-Formedness constraints */static voidhtmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error, const char *msg, int val){ if ((ctxt != NULL) && (ctxt->disableSAX != 0) && (ctxt->instate == XML_PARSER_EOF)) return; ctxt->errNo = error; __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error, XML_ERR_ERROR, NULL, 0, NULL, NULL, NULL, val, 0, msg, val); ctxt->wellFormed = 0;}/************************************************************************ * * * Parser stacks related functions and macros * * * ************************************************************************//** * htmlnamePush: * @ctxt: an HTML parser context * @value: the element name * * Pushes a new element name on top of the name stack * * Returns 0 in case of error, the index in the stack otherwise */static inthtmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value){ if (ctxt->nameNr >= ctxt->nameMax) { ctxt->nameMax *= 2; ctxt->nameTab = (const xmlChar * *) xmlRealloc((xmlChar * *)ctxt->nameTab, ctxt->nameMax * sizeof(ctxt->nameTab[0])); if (ctxt->nameTab == NULL) { htmlErrMemory(ctxt, NULL); return (0); } } ctxt->nameTab[ctxt->nameNr] = value; ctxt->name = value; return (ctxt->nameNr++);}/** * htmlnamePop: * @ctxt: an HTML parser context * * Pops the top element name from the name stack * * Returns the name just removed */static const xmlChar *htmlnamePop(htmlParserCtxtPtr ctxt){ const xmlChar *ret; if (ctxt->nameNr <= 0) return (0); ctxt->nameNr--; if (ctxt->nameNr < 0) return (0); if (ctxt->nameNr > 0) ctxt->name = ctxt->nameTab[ctxt->nameNr - 1]; else ctxt->name = NULL; ret = ctxt->nameTab[ctxt->nameNr]; ctxt->nameTab[ctxt->nameNr] = 0; return (ret);}/* * Macros for accessing the content. Those should be used only by the parser, * and not exported. * * Dirty macros, i.e. one need to make assumption on the context to use them * * CUR_PTR return the current pointer to the xmlChar to be parsed. * CUR returns the current xmlChar value, i.e. a 8 bit value if compiled * in ISO-Latin or UTF-8, and the current 16 bit value if compiled * in UNICODE mode. This should be used internally by the parser * only to compare to ASCII values otherwise it would break when * running with UTF-8 encoding. * NXT(n) returns the n'th next xmlChar. Same as CUR is should be used only * to compare on ASCII based substring. * UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR * it should be used only to compare on ASCII based substring. * SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined * strings without newlines within the parser. * * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding * * CURRENT Returns the current char value, with the full decoding of * UTF-8 if we are using this mode. It returns an int. * NEXT Skip to the next character, this does the proper decoding * in UTF-8 mode. It also pop-up unfinished entities on the fly. * NEXTL(l) Skip the current unicode character of l xmlChars long. * COPY(to) copy one char to *to, increment CUR_PTR and to accordingly */#define UPPER (toupper(*ctxt->input->cur))#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val)#define NXT(val) ctxt->input->cur[(val)]#define UPP(val) (toupper(ctxt->input->cur[(val)]))#define CUR_PTR ctxt->input->cur#define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \ (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \ xmlParserInputShrink(ctxt->input)#define GROW if ((ctxt->progressive == 0) && \ (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \ xmlParserInputGrow(ctxt->input, INPUT_CHUNK)#define CURRENT ((int) (*ctxt->input->cur))#define SKIP_BLANKS htmlSkipBlankChars(ctxt)/* Inported from XML *//* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */#define CUR ((int) (*ctxt->input->cur))#define NEXT xmlNextChar(ctxt)#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))#define NXT(val) ctxt->input->cur[(val)]#define CUR_PTR ctxt->input->cur#define NEXTL(l) do { \ if (*(ctxt->input->cur) == '\n') { \ ctxt->input->line++; ctxt->input->col = 1; \ } else ctxt->input->col++; \ ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++; \ } while (0) /************ \ if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \ if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt); ************/#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)#define COPY_BUF(l,b,i,v) \ if (l == 1) b[i++] = (xmlChar) v; \ else i += xmlCopyChar(l,&b[i],v)/** * htmlCurrentChar: * @ctxt: the HTML parser context * @len: pointer to the length of the char read * * The current char value, if using UTF-8 this may actually span multiple * bytes in the input buffer. Implement the end of line normalization: * 2.11 End-of-Line Handling * If the encoding is unspecified, in the case we find an ISO-Latin-1 * char, then the encoding converter is plugged in automatically. * * Returns the current char value and its length */static inthtmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) { if (ctxt->instate == XML_PARSER_EOF) return(0); if (ctxt->token != 0) { *len = 0; return(ctxt->token); } if (ctxt->charset == XML_CHAR_ENCODING_UTF8) { /* * We are supposed to handle UTF8, check it's valid * From rfc2044: encoding of the Unicode values on UTF-8: * * UCS-4 range (hex.) UTF-8 octet sequence (binary) * 0000 0000-0000 007F 0xxxxxxx * 0000 0080-0000 07FF 110xxxxx 10xxxxxx * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx * * Check for the 0x110000 limit too */ const unsigned char *cur = ctxt->input->cur; unsigned char c; unsigned int val; c = *cur; if (c & 0x80) { if (cur[1] == 0) xmlParserInputGrow(ctxt->input, INPUT_CHUNK); if ((cur[1] & 0xc0) != 0x80) goto encoding_error; if ((c & 0xe0) == 0xe0) { if (cur[2] == 0) xmlParserInputGrow(ctxt->input, INPUT_CHUNK); if ((cur[2] & 0xc0) != 0x80) goto encoding_error; if ((c & 0xf0) == 0xf0) { if (cur[3] == 0) xmlParserInputGrow(ctxt->input, INPUT_CHUNK); if (((c & 0xf8) != 0xf0) || ((cur[3] & 0xc0) != 0x80)) goto encoding_error; /* 4-byte code */ *len = 4; val = (cur[0] & 0x7) << 18; val |= (cur[1] & 0x3f) << 12; val |= (cur[2] & 0x3f) << 6;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -