⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 htmlparser.c

📁 SIP(Session Initiation Protocol)是由IETF定义
💻 C
📖 第 1 页 / 共 5 页
字号:
/* * HTMLparser.c : an HTML 4.0 non-verifying parser * * See Copyright for the status of this software. * * Daniel.Veillard@w3.org */#ifdef WIN32#include "win32config.h"#else#include "config.h"#endif#include "xmlversion.h"#ifdef LIBXML_HTML_ENABLED#include <stdio.h>#include <string.h> /* for memset() only */#ifdef HAVE_CTYPE_H#include <ctype.h>#endif#ifdef HAVE_STDLIB_H#include <stdlib.h>#endif#ifdef HAVE_SYS_STAT_H#include <sys/stat.h>#endif#ifdef HAVE_FCNTL_H#include <fcntl.h>#endif#ifdef HAVE_UNISTD_H#include <unistd.h>#endif#ifdef HAVE_ZLIB_H#include <zlib.h>#endif#include <libxml/xmlmemory.h>#include <libxml/tree.h>#include <libxml/HTMLparser.h>#include <libxml/entities.h>#include <libxml/encoding.h>#include <libxml/valid.h>#include <libxml/parserInternals.h>#include <libxml/xmlIO.h>#include "xml-error.h"#define HTML_MAX_NAMELEN 1000#define INPUT_CHUNK     50#define HTML_PARSER_BIG_BUFFER_SIZE 1024#define HTML_PARSER_BUFFER_SIZE 100/* #define DEBUG *//* #define DEBUG_PUSH *//************************************************************************ *									* * 		Parser stacks related functions and macros		* *									* ************************************************************************//* * Generic function for accessing stacks in the Parser Context */#define PUSH_AND_POP(scope, type, name)					\scope int html##name##Push(htmlParserCtxtPtr ctxt, type value) {	\    if (ctxt->name##Nr >= ctxt->name##Max) {				\	ctxt->name##Max *= 2;						\        ctxt->name##Tab = (void *) xmlRealloc(ctxt->name##Tab,		\	             ctxt->name##Max * sizeof(ctxt->name##Tab[0]));	\        if (ctxt->name##Tab == NULL) {					\	    fprintf(stderr, "realloc failed !\n");			\	    return(0);							\	}								\    }									\    ctxt->name##Tab[ctxt->name##Nr] = value;				\    ctxt->name = value;							\    return(ctxt->name##Nr++);						\}									\scope type html##name##Pop(htmlParserCtxtPtr ctxt) {			\    type ret;								\    if (ctxt->name##Nr < 0) return(0);					\    
ctxt->name##Nr--;							\    if (ctxt->name##Nr < 0) return(0);					\    if (ctxt->name##Nr > 0)						\	ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1];		\    else								\        ctxt->name = NULL;						\    ret = ctxt->name##Tab[ctxt->name##Nr];				\    ctxt->name##Tab[ctxt->name##Nr] = 0;				\    return(ret);							\}									\PUSH_AND_POP(extern, xmlNodePtr, node)PUSH_AND_POP(extern, xmlChar*, name)/* * Macros for accessing the content. Those should be used only by the parser, * and not exported. * * Dirty macros, i.e. one need to make assumption on the context to use them * *   CUR_PTR return the current pointer to the xmlChar to be parsed. *   CUR     returns the current xmlChar value, i.e. a 8 bit value if compiled *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled *           in UNICODE mode. This should be used internally by the parser *           only to compare to ASCII values otherwise it would break when *           running with UTF-8 encoding. *   NXT(n)  returns the n'th next xmlChar. Same as CUR is should be used only *           to compare on ASCII based substring. *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR *           it should be used only to compare on ASCII based substring. *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined *           strings within the parser. * * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding * *   CURRENT Returns the current char value, with the full decoding of *           UTF-8 if we are using this mode. It returns an int. *   NEXT    Skip to the next character, this does the proper decoding *           in UTF-8 mode. It also pop-up unfinished entities on the fly. 
*   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly */#define CUR ((int) (*ctxt->input->cur))    #define UPPER (toupper(*ctxt->input->cur))#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val)#define NXT(val) ctxt->input->cur[(val)]#define UPP(val) (toupper(ctxt->input->cur[(val)]))#define CUR_PTR ctxt->input->cur#define SHRINK  xmlParserInputShrink(ctxt->input)#define GROW  xmlParserInputGrow(ctxt->input, INPUT_CHUNK)#define CURRENT ((int) (*ctxt->input->cur))#define NEXT htmlNextChar(ctxt);#define SKIP_BLANKS htmlSkipBlankChars(ctxt);/** * htmlNextChar: * @ctxt:  the HTML parser context * * Skip to the next char input char. */voidhtmlNextChar(htmlParserCtxtPtr ctxt) {    if ((*ctxt->input->cur == 0) &&        (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {	    xmlPopInput(ctxt);    } else {        if (*(ctxt->input->cur) == '\n') {	    ctxt->input->line++; ctxt->input->col = 1;	} else ctxt->input->col++;	ctxt->input->cur++;	ctxt->nbChars++;        if (*ctxt->input->cur == 0)	    xmlParserInputGrow(ctxt->input, INPUT_CHUNK);    }}/** * htmlSkipBlankChars: * @ctxt:  the HTML parser context * * skip all blanks character found at that point in the input streams. 
* * Returns the number of space chars skipped */inthtmlSkipBlankChars(xmlParserCtxtPtr ctxt) {    int res = 0;    while (IS_BLANK(*(ctxt->input->cur))) {	if ((*ctxt->input->cur == 0) &&	    (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {		xmlPopInput(ctxt);	} else {	    if (*(ctxt->input->cur) == '\n') {		ctxt->input->line++; ctxt->input->col = 1;	    } else ctxt->input->col++;	    ctxt->input->cur++;	    ctxt->nbChars++;	    if (*ctxt->input->cur == 0)		xmlParserInputGrow(ctxt->input, INPUT_CHUNK);	}	res++;    }    return(res);}/************************************************************************ *									* * 		The list of HTML elements and their properties		* *									* ************************************************************************//* *  Start Tag: 1 means the start tag can be ommited *  End Tag:   1 means the end tag can be ommited *             2 means it's forbidden (empty elements) *  Depr:      this element is deprecated *  DTD:       1 means that this element is valid only in the Loose DTD *             2 means that this element is valid only in the Frameset DTD * * Name,Start Tag,End Tag,  Empty,  Depr.,    DTD, Description */htmlElemDesc  html40ElementTable[] = {{ "a",		0,	0,	0,	0,	0, "anchor " },{ "abbr",	0,	0,	0,	0,	0, "abbreviated form" },{ "acronym",	0,	0,	0,	0,	0, "" },{ "address",	0,	0,	0,	0,	0, "information on author " },{ "applet",	0,	0,	0,	1,	1, "java applet " },{ "area",	0,	2,	1,	0,	0, "client-side image map area " },{ "b",		0,	0,	0,	0,	0, "bold text style" },{ "base",	0,	2,	1,	0,	0, "document base uri " },{ "basefont",	0,	2,	1,	1,	1, "base font size " },{ "bdo",	0,	0,	0,	0,	0, "i18n bidi over-ride " },{ "big",	0,	0,	0,	0,	0, "large text style" },{ "blockquote",	0,	0,	0,	0,	0, "long quotation " },{ "body",	1,	1,	0,	0,	0, "document body " },{ "br",		0,	2,	1,	0,	0, "forced line break " },{ "button",	0,	0,	0,	0,	0, "push button " },{ "caption",	0,	0,	0,	0,	0, "table caption " },{ "center",	0,	0,	0,	1,	1, "shorthand for div 
align=center " },{ "cite",	0,	0,	0,	0,	0, "citation" },{ "code",	0,	0,	0,	0,	0, "computer code fragment" },{ "col",	0,	2,	1,	0,	0, "table column " },{ "colgroup",	0,	1,	0,	0,	0, "table column group " },{ "dd",		0,	1,	0,	0,	0, "definition description " },{ "del",	0,	0,	0,	0,	0, "deleted text " },{ "dfn",	0,	0,	0,	0,	0, "instance definition" },{ "dir",	0,	0,	0,	1,	1, "directory list" },{ "div",	0,	0,	0,	0,	0, "generic language/style container"},{ "dl",		0,	0,	0,	0,	0, "definition list " },{ "dt",		0,	1,	0,	0,	0, "definition term " },{ "em",		0,	0,	0,	0,	0, "emphasis" },{ "fieldset",	0,	0,	0,	0,	0, "form control group " },{ "font",	0,	0,	0,	1,	1, "local change to font " },{ "form",	0,	0,	0,	0,	0, "interactive form " },{ "frame",	0,	2,	1,	0,	2, "subwindow " },{ "frameset",	0,	0,	0,	0,	2, "window subdivision" },{ "h1",		0,	0,	0,	0,	0, "heading " },{ "h2",		0,	0,	0,	0,	0, "heading " },{ "h3",		0,	0,	0,	0,	0, "heading " },{ "h4",		0,	0,	0,	0,	0, "heading " },{ "h5",		0,	0,	0,	0,	0, "heading " },{ "h6",		0,	0,	0,	0,	0, "heading " },{ "head",	1,	1,	0,	0,	0, "document head " },{ "hr",		0,	2,	1,	0,	0, "horizontal rule " },{ "html",	1,	1,	0,	0,	0, "document root element " },{ "i",		0,	0,	0,	0,	0, "italic text style" },{ "iframe",	0,	0,	0,	0,	1, "inline subwindow " },{ "img",	0,	2,	1,	0,	0, "embedded image " },{ "input",	0,	2,	1,	0,	0, "form control " },{ "ins",	0,	0,	0,	0,	0, "inserted text" },{ "isindex",	0,	2,	1,	1,	1, "single line prompt " },{ "kbd",	0,	0,	0,	0,	0, "text to be entered by the user" },{ "label",	0,	0,	0,	0,	0, "form field label text " },{ "legend",	0,	0,	0,	0,	0, "fieldset legend " },{ "li",		0,	1,	0,	0,	0, "list item " },{ "link",	0,	2,	1,	0,	0, "a media-independent link " },{ "map",	0,	0,	0,	0,	0, "client-side image map " },{ "menu",	0,	0,	0,	1,	1, "menu list " },{ "meta",	0,	2,	1,	0,	0, "generic metainformation " },{ "noframes",	0,	0,	0,	0,	2, "alternate content container for non frame-based rendering " },{ "noscript",	0,	0,	0,	0,	0, "alternate content 
container for non script-based rendering " },{ "object",	0,	0,	0,	0,	0, "generic embedded object " },{ "ol",		0,	0,	0,	0,	0, "ordered list " },{ "optgroup",	0,	0,	0,	0,	0, "option group " },{ "option",	0,	1,	0,	0,	0, "selectable choice " },{ "p",		0,	1,	0,	0,	0, "paragraph " },{ "param",	0,	2,	1,	0,	0, "named property value " },{ "pre",	0,	0,	0,	0,	0, "preformatted text " },{ "q",		0,	0,	0,	0,	0, "short inline quotation " },{ "s",		0,	0,	0,	1,	1, "strike-through text style" },{ "samp",	0,	0,	0,	0,	0, "sample program output, scripts, etc." },{ "script",	0,	0,	0,	0,	0, "script statements " },{ "select",	0,	0,	0,	0,	0, "option selector " },{ "small",	0,	0,	0,	0,	0, "small text style" },{ "span",	0,	0,	0,	0,	0, "generic language/style container " },{ "strike",	0,	0,	0,	1,	1, "strike-through text" },{ "strong",	0,	0,	0,	0,	0, "strong emphasis" },{ "style",	0,	0,	0,	0,	0, "style info " },{ "sub",	0,	0,	0,	0,	0, "subscript" },{ "sup",	0,	0,	0,	0,	0, "superscript " },{ "table",	0,	0,	0,	0,	0, "&#160;" },{ "tbody",	1,	1,	0,	0,	0, "table body " },{ "td",		0,	1,	0,	0,	0, "table data cell" },{ "textarea",	0,	0,	0,	0,	0, "multi-line text field " },{ "tfoot",	0,	1,	0,	0,	0, "table footer " },{ "th",		0,	1,	0,	0,	0, "table header cell" },{ "thead",	0,	1,	0,	0,	0, "table header " },{ "title",	0,	0,	0,	0,	0, "document title " },{ "tr",		0,	1,	0,	0,	0, "table row " },{ "tt",		0,	0,	0,	0,	0, "teletype or monospaced text style" },{ "u",		0,	0,	0,	1,	1, "underlined text style" },{ "ul",		0,	0,	0,	0,	0, "unordered list " },{ "var",	0,	0,	0,	0,	0, "instance of a variable or program argument" },};/* * start tags that imply the end of a current element * any tag of each line implies the end of the current element if the type of * that element is in the same line */char *htmlEquEnd[] = {"dt", "dd", "li", "option", NULL,"h1", "h2", "h3", "h4", "h5", "h6", NULL,"ol", "menu", "dir", "address", "pre", "listing", "xmp", NULL,NULL};/* * acording the HTML DTD, HR should be added to the 2nd line 
above, as it
 * is not allowed within a H1, H2, H3, etc. But we should tolerate that case
 * because many documents contain rules in headings...
 */

/*
 * start tags that imply the end of current element
 *
 * The array is a flat sequence of NULL-terminated groups, closed by one
 * extra NULL. NOTE(review): judging from the layout, the first string of a
 * group is the start tag being opened and the remaining strings are the
 * current elements it closes -- confirm against the lookup code, which is
 * not visible in this part of the file.
 */
char *htmlStartClose[] = {
"form",		"form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
		"dl", "ul", "ol", "menu", "dir", "address", "pre",
		"listing", "xmp", "head", NULL,
"head",		"p", NULL,
"title",	"p", NULL,
"body",		"head", "style", "link", "title", "p", NULL,
"li",		"p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
		"pre", "listing", "xmp", "head", "li", NULL,
"hr",		"p", "head", NULL,
"h1",		"p", "head", NULL,
"h2",		"p", "head", NULL,
"h3",		"p", "head", NULL,
"h4",		"p", "head", NULL,
"h5",		"p", "head", NULL,
"h6",		"p", "head", NULL,
"dir",		"p", "head", NULL,
"address",	"p", "head", "ul", NULL,
"pre",		"p", "head", "ul", NULL,
"listing",	"p", "head", NULL,
"xmp",		"p", "head", NULL,
"blockquote",	"p", "head", NULL,
"dl",		"p", "dt", "menu", "dir", "address", "pre", "listing",
		"xmp", "head", NULL,
"dt",		"p", "menu", "dir", "address", "pre", "listing", "xmp",
                "head", "dd", NULL,
"dd",		"p", "menu", "dir", "address", "pre", "listing", "xmp",
                "head", "dt", NULL,
"ul",		"p", "head", "ol", "menu", "dir", "address", "pre",
		"listing", "xmp", NULL,
"ol",		"p", "head", "ul", NULL,
"menu",		"p", "head", "ul", NULL,
"p",		"p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
"div",		"p", "head", NULL,
"noscript",	"p", "head", NULL,
"center",	"font", "b", "i", "p", "head", NULL,
"a",		"a", NULL,
"caption",	"p", NULL,
"colgroup",	"caption", "colgroup", "col", "p", NULL,
"col",		"caption", "col", "p", NULL,
"table",	"p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
		"listing", "xmp", "a", NULL,
"th",		"th", "td", NULL,
"td",		"th", "td", "p", NULL,
"tr",		"th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
"thead",	"caption", "col", "colgroup", NULL,
"tfoot",	"th", "td", "tr", "caption", "col", "colgroup", "thead",
		"tbody", "p", NULL,
"tbody",	"th", "td", "tr", "caption", "col", "colgroup", "thead",
		"tfoot", "tbody", "p", NULL,
"optgroup",	"option", NULL,
"fieldset",	"legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
		"pre", "listing", "xmp", "a", NULL,
NULL
};

/*
 * NOTE(review): presumably a lookup index into htmlStartClose together with
 * a flag telling whether it has been filled in yet; the code that fills and
 * reads them is not visible here -- confirm. The index has a fixed capacity
 * of 100 entries.
 */
static char** htmlStartCloseIndex[100];
static int htmlStartCloseIndexinitialized = 0;

/************************************************************************
 *									*
 * 		functions to handle HTML specific data			*
 *									*
 ************************************************************************/

/**

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -