📄 htmlparser.c
字号:
/* * HTMLparser.c : an HTML 4.0 non-verifying parser * * See Copyright for the status of this software. * * Daniel.Veillard@w3.org */#ifdef WIN32#include "win32config.h"#else#include "config.h"#endif#include "xmlversion.h"#ifdef LIBXML_HTML_ENABLED#include <stdio.h>#include <string.h> /* for memset() only */#ifdef HAVE_CTYPE_H#include <ctype.h>#endif#ifdef HAVE_STDLIB_H#include <stdlib.h>#endif#ifdef HAVE_SYS_STAT_H#include <sys/stat.h>#endif#ifdef HAVE_FCNTL_H#include <fcntl.h>#endif#ifdef HAVE_UNISTD_H#include <unistd.h>#endif#ifdef HAVE_ZLIB_H#include <zlib.h>#endif#include <libxml/xmlmemory.h>#include <libxml/tree.h>#include <libxml/HTMLparser.h>#include <libxml/entities.h>#include <libxml/encoding.h>#include <libxml/valid.h>#include <libxml/parserInternals.h>#include <libxml/xmlIO.h>#include "xml-error.h"#define HTML_MAX_NAMELEN 1000#define INPUT_CHUNK 50#define HTML_PARSER_BIG_BUFFER_SIZE 1024#define HTML_PARSER_BUFFER_SIZE 100/* #define DEBUG *//* #define DEBUG_PUSH *//************************************************************************ * * * Parser stacks related functions and macros * * * ************************************************************************//* * Generic function for accessing stacks in the Parser Context */#define PUSH_AND_POP(scope, type, name) \scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) { \ if (ctxt->name##Nr >= ctxt->name##Max) { \ ctxt->name##Max *= 2; \ ctxt->name##Tab = (void *) xmlRealloc(ctxt->name##Tab, \ ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \ if (ctxt->name##Tab == NULL) { \ fprintf(stderr, "realloc failed !\n"); \ return(0); \ } \ } \ ctxt->name##Tab[ctxt->name##Nr] = value; \ ctxt->name = value; \ return(ctxt->name##Nr++); \} \scope type html##name##Pop(htmlParserCtxtPtr ctxt) { \ type ret; \ if (ctxt->name##Nr < 0) return(0); \ ctxt->name##Nr--; \ if (ctxt->name##Nr < 0) return(0); \ if (ctxt->name##Nr > 0) \ ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \ else \ ctxt->name = NULL; \ 
ret = ctxt->name##Tab[ctxt->name##Nr]; \ ctxt->name##Tab[ctxt->name##Nr] = 0; \ return(ret); \} \PUSH_AND_POP(extern, xmlNodePtr, node)PUSH_AND_POP(extern, xmlChar*, name)/* * Macros for accessing the content. Those should be used only by the parser, * and not exported. * * Dirty macros, i.e. one need to make assumption on the context to use them * * CUR_PTR return the current pointer to the xmlChar to be parsed. * CUR returns the current xmlChar value, i.e. a 8 bit value if compiled * in ISO-Latin or UTF-8, and the current 16 bit value if compiled * in UNICODE mode. This should be used internally by the parser * only to compare to ASCII values otherwise it would break when * running with UTF-8 encoding. * NXT(n) returns the n'th next xmlChar. Same as CUR is should be used only * to compare on ASCII based substring. * UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR * it should be used only to compare on ASCII based substring. * SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined * strings within the parser. * * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding * * CURRENT Returns the current char value, with the full decoding of * UTF-8 if we are using this mode. It returns an int. * NEXT Skip to the next character, this does the proper decoding * in UTF-8 mode. It also pop-up unfinished entities on the fly. 
* COPY(to) copy one char to *to, increment CUR_PTR and to accordingly */#define CUR ((int) (*ctxt->input->cur)) #define UPPER (toupper(*ctxt->input->cur))#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val)#define NXT(val) ctxt->input->cur[(val)]#define UPP(val) (toupper(ctxt->input->cur[(val)]))#define CUR_PTR ctxt->input->cur#define SHRINK xmlParserInputShrink(ctxt->input)#define GROW xmlParserInputGrow(ctxt->input, INPUT_CHUNK)#define CURRENT ((int) (*ctxt->input->cur))#define NEXT htmlNextChar(ctxt);#define SKIP_BLANKS htmlSkipBlankChars(ctxt);/** * htmlNextChar: * @ctxt: the HTML parser context * * Skip to the next char input char. */voidhtmlNextChar(htmlParserCtxtPtr ctxt) { if ((*ctxt->input->cur == 0) && (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) { xmlPopInput(ctxt); } else { if (*(ctxt->input->cur) == '\n') { ctxt->input->line++; ctxt->input->col = 1; } else ctxt->input->col++; ctxt->input->cur++; ctxt->nbChars++; if (*ctxt->input->cur == 0) xmlParserInputGrow(ctxt->input, INPUT_CHUNK); }}/** * htmlSkipBlankChars: * @ctxt: the HTML parser context * * skip all blanks character found at that point in the input streams. 
* * Returns the number of space chars skipped */inthtmlSkipBlankChars(xmlParserCtxtPtr ctxt) { int res = 0; while (IS_BLANK(*(ctxt->input->cur))) { if ((*ctxt->input->cur == 0) && (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) { xmlPopInput(ctxt); } else { if (*(ctxt->input->cur) == '\n') { ctxt->input->line++; ctxt->input->col = 1; } else ctxt->input->col++; ctxt->input->cur++; ctxt->nbChars++; if (*ctxt->input->cur == 0) xmlParserInputGrow(ctxt->input, INPUT_CHUNK); } res++; } return(res);}/************************************************************************ * * * The list of HTML elements and their properties * * * ************************************************************************//* * Start Tag: 1 means the start tag can be ommited * End Tag: 1 means the end tag can be ommited * 2 means it's forbidden (empty elements) * Depr: this element is deprecated * DTD: 1 means that this element is valid only in the Loose DTD * 2 means that this element is valid only in the Frameset DTD * * Name,Start Tag,End Tag, Empty, Depr., DTD, Description */htmlElemDesc html40ElementTable[] = {{ "a", 0, 0, 0, 0, 0, "anchor " },{ "abbr", 0, 0, 0, 0, 0, "abbreviated form" },{ "acronym", 0, 0, 0, 0, 0, "" },{ "address", 0, 0, 0, 0, 0, "information on author " },{ "applet", 0, 0, 0, 1, 1, "java applet " },{ "area", 0, 2, 1, 0, 0, "client-side image map area " },{ "b", 0, 0, 0, 0, 0, "bold text style" },{ "base", 0, 2, 1, 0, 0, "document base uri " },{ "basefont", 0, 2, 1, 1, 1, "base font size " },{ "bdo", 0, 0, 0, 0, 0, "i18n bidi over-ride " },{ "big", 0, 0, 0, 0, 0, "large text style" },{ "blockquote", 0, 0, 0, 0, 0, "long quotation " },{ "body", 1, 1, 0, 0, 0, "document body " },{ "br", 0, 2, 1, 0, 0, "forced line break " },{ "button", 0, 0, 0, 0, 0, "push button " },{ "caption", 0, 0, 0, 0, 0, "table caption " },{ "center", 0, 0, 0, 1, 1, "shorthand for div align=center " },{ "cite", 0, 0, 0, 0, 0, "citation" },{ "code", 0, 0, 0, 0, 0, "computer code fragment" },{ 
"col", 0, 2, 1, 0, 0, "table column " },{ "colgroup", 0, 1, 0, 0, 0, "table column group " },{ "dd", 0, 1, 0, 0, 0, "definition description " },{ "del", 0, 0, 0, 0, 0, "deleted text " },{ "dfn", 0, 0, 0, 0, 0, "instance definition" },{ "dir", 0, 0, 0, 1, 1, "directory list" },{ "div", 0, 0, 0, 0, 0, "generic language/style container"},{ "dl", 0, 0, 0, 0, 0, "definition list " },{ "dt", 0, 1, 0, 0, 0, "definition term " },{ "em", 0, 0, 0, 0, 0, "emphasis" },{ "fieldset", 0, 0, 0, 0, 0, "form control group " },{ "font", 0, 0, 0, 1, 1, "local change to font " },{ "form", 0, 0, 0, 0, 0, "interactive form " },{ "frame", 0, 2, 1, 0, 2, "subwindow " },{ "frameset", 0, 0, 0, 0, 2, "window subdivision" },{ "h1", 0, 0, 0, 0, 0, "heading " },{ "h2", 0, 0, 0, 0, 0, "heading " },{ "h3", 0, 0, 0, 0, 0, "heading " },{ "h4", 0, 0, 0, 0, 0, "heading " },{ "h5", 0, 0, 0, 0, 0, "heading " },{ "h6", 0, 0, 0, 0, 0, "heading " },{ "head", 1, 1, 0, 0, 0, "document head " },{ "hr", 0, 2, 1, 0, 0, "horizontal rule " },{ "html", 1, 1, 0, 0, 0, "document root element " },{ "i", 0, 0, 0, 0, 0, "italic text style" },{ "iframe", 0, 0, 0, 0, 1, "inline subwindow " },{ "img", 0, 2, 1, 0, 0, "embedded image " },{ "input", 0, 2, 1, 0, 0, "form control " },{ "ins", 0, 0, 0, 0, 0, "inserted text" },{ "isindex", 0, 2, 1, 1, 1, "single line prompt " },{ "kbd", 0, 0, 0, 0, 0, "text to be entered by the user" },{ "label", 0, 0, 0, 0, 0, "form field label text " },{ "legend", 0, 0, 0, 0, 0, "fieldset legend " },{ "li", 0, 1, 0, 0, 0, "list item " },{ "link", 0, 2, 1, 0, 0, "a media-independent link " },{ "map", 0, 0, 0, 0, 0, "client-side image map " },{ "menu", 0, 0, 0, 1, 1, "menu list " },{ "meta", 0, 2, 1, 0, 0, "generic metainformation " },{ "noframes", 0, 0, 0, 0, 2, "alternate content container for non frame-based rendering " },{ "noscript", 0, 0, 0, 0, 0, "alternate content container for non script-based rendering " },{ "object", 0, 0, 0, 0, 0, "generic embedded object " },{ "ol", 0, 0, 0, 0, 0, 
"ordered list " },{ "optgroup", 0, 0, 0, 0, 0, "option group " },{ "option", 0, 1, 0, 0, 0, "selectable choice " },{ "p", 0, 1, 0, 0, 0, "paragraph " },{ "param", 0, 2, 1, 0, 0, "named property value " },{ "pre", 0, 0, 0, 0, 0, "preformatted text " },{ "q", 0, 0, 0, 0, 0, "short inline quotation " },{ "s", 0, 0, 0, 1, 1, "strike-through text style" },{ "samp", 0, 0, 0, 0, 0, "sample program output, scripts, etc." },{ "script", 0, 0, 0, 0, 0, "script statements " },{ "select", 0, 0, 0, 0, 0, "option selector " },{ "small", 0, 0, 0, 0, 0, "small text style" },{ "span", 0, 0, 0, 0, 0, "generic language/style container " },{ "strike", 0, 0, 0, 1, 1, "strike-through text" },{ "strong", 0, 0, 0, 0, 0, "strong emphasis" },{ "style", 0, 0, 0, 0, 0, "style info " },{ "sub", 0, 0, 0, 0, 0, "subscript" },{ "sup", 0, 0, 0, 0, 0, "superscript " },{ "table", 0, 0, 0, 0, 0, " " },{ "tbody", 1, 1, 0, 0, 0, "table body " },{ "td", 0, 1, 0, 0, 0, "table data cell" },{ "textarea", 0, 0, 0, 0, 0, "multi-line text field " },{ "tfoot", 0, 1, 0, 0, 0, "table footer " },{ "th", 0, 1, 0, 0, 0, "table header cell" },{ "thead", 0, 1, 0, 0, 0, "table header " },{ "title", 0, 0, 0, 0, 0, "document title " },{ "tr", 0, 1, 0, 0, 0, "table row " },{ "tt", 0, 0, 0, 0, 0, "teletype or monospaced text style" },{ "u", 0, 0, 0, 1, 1, "underlined text style" },{ "ul", 0, 0, 0, 0, 0, "unordered list " },{ "var", 0, 0, 0, 0, 0, "instance of a variable or program argument" },};/* * start tags that imply the end of a current element * any tag of each line implies the end of the current element if the type of * that element is in the same line */char *htmlEquEnd[] = {"dt", "dd", "li", "option", NULL,"h1", "h2", "h3", "h4", "h5", "h6", NULL,"ol", "menu", "dir", "address", "pre", "listing", "xmp", NULL,NULL};/* * acording the HTML DTD, HR should be added to the 2nd line above, as it * is not allowed within a H1, H2, H3, etc. 
But we should tolerate that case
 * because many documents contain rules in headings...
 */

/*
 * start tags that imply the end of current element
 *
 * The table is a flat list of NULL-terminated groups, one group per line,
 * closed by an extra NULL.
 * NOTE(review): presumably the first name of a group is the start tag being
 * opened and the remaining names are the open elements it closes; confirm
 * against the code that searches this table.
 */
char *htmlStartClose[] = {
"form",         "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
                "dl", "ul", "ol", "menu", "dir", "address", "pre",
                "listing", "xmp", "head", NULL,
"head",         "p", NULL,
"title",        "p", NULL,
"body",         "head", "style", "link", "title", "p", NULL,
"li",           "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
                "pre", "listing", "xmp", "head", "li", NULL,
"hr",           "p", "head", NULL,
"h1",           "p", "head", NULL,
"h2",           "p", "head", NULL,
"h3",           "p", "head", NULL,
"h4",           "p", "head", NULL,
"h5",           "p", "head", NULL,
"h6",           "p", "head", NULL,
"dir",          "p", "head", NULL,
"address",      "p", "head", "ul", NULL,
"pre",          "p", "head", "ul", NULL,
"listing",      "p", "head", NULL,
"xmp",          "p", "head", NULL,
"blockquote",   "p", "head", NULL,
"dl",           "p", "dt", "menu", "dir", "address", "pre", "listing",
                "xmp", "head", NULL,
"dt",           "p", "menu", "dir", "address", "pre", "listing", "xmp",
                "head", "dd", NULL,
"dd",           "p", "menu", "dir", "address", "pre", "listing", "xmp",
                "head", "dt", NULL,
"ul",           "p", "head", "ol", "menu", "dir", "address", "pre",
                "listing", "xmp", NULL,
"ol",           "p", "head", "ul", NULL,
"menu",         "p", "head", "ul", NULL,
"p",            "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
"div",          "p", "head", NULL,
"noscript",     "p", "head", NULL,
"center",       "font", "b", "i", "p", "head", NULL,
"a",            "a", NULL,
"caption",      "p", NULL,
"colgroup",     "caption", "colgroup", "col", "p", NULL,
"col",          "caption", "col", "p", NULL,
"table",        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
                "listing", "xmp", "a", NULL,
"th",           "th", "td", NULL,
"td",           "th", "td", "p", NULL,
"tr",           "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
"thead",        "caption", "col", "colgroup", NULL,
"tfoot",        "th", "td", "tr", "caption", "col", "colgroup", "thead",
                "tbody", "p", NULL,
"tbody",        "th", "td", "tr", "caption", "col", "colgroup", "thead",
                "tfoot", "tbody", "p", NULL,
"optgroup",     "option", NULL,
"fieldset",     "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
                "pre", "listing", "xmp", "a", NULL,
NULL
};

/*
 * Pointers into htmlStartClose plus a flag recording whether they have been
 * set up. Both start out zeroed.
 * NOTE(review): presumably one slot per group of the table above; the fixed
 * size of 100 must stay larger than the number of groups - confirm against
 * the initialisation code.
 */
static char** htmlStartCloseIndex[100];
static int htmlStartCloseIndexinitialized = 0;

/************************************************************************
 *                                                                      *
 *              functions to handle HTML specific data                  *
 *                                                                      *
 ************************************************************************/

/**
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -