parser.c

来自「一个很有名的浏览器」· C语言代码 · 共 1,631 行 · 第 1/3 页
1,631 行
/* HTML parser *//* $Id: parser.c,v 1.513.2.10 2005/05/01 22:47:55 jonas Exp $ */#ifdef HAVE_CONFIG_H#include "config.h"#endif#include <errno.h>#include <stdarg.h>#include <stdio.h>#include <stdlib.h>#include <string.h>#include "elinks.h"#include "main.h"#include "bfu/listmenu.h"#include "bfu/menu.h"#include "document/css/apply.h"#include "document/css/css.h"#include "document/css/stylesheet.h"#include "document/html/frames.h"#include "document/html/parser/link.h"#include "document/html/parser/stack.h"#include "document/html/parser/parse.h"#include "document/html/parser.h"#include "document/html/renderer.h"#include "document/html/tables.h"#include "document/renderer.h"#include "intl/charsets.h"#include "osdep/ascii.h"#include "protocol/date.h"#include "protocol/header.h"#include "protocol/uri.h"#include "sched/session.h"#include "sched/task.h"#include "terminal/draw.h"#include "terminal/terminal.h"#include "util/color.h"#include "util/conv.h"#include "util/error.h"#include "util/memdebug.h"#include "util/memlist.h"#include "util/memory.h"#include "util/string.h"#include "viewer/text/form.h"#include "viewer/text/link.h"#include "viewer/text/view.h"/* Unsafe macros */#include "document/html/internal.h"/* TODO: This needs rewrite. Yes, no kidding. */static struct {	int n;	unsigned char *s;} roman_tbl[] = {	{1000,	"m"},	{999,	"im"},	{990,	"xm"},	{900,	"cm"},	{500,	"d"},	{499,	"id"},	{490,	"xd"},	{400,	"cd"},	{100,	"c"},	{99,	"ic"},	{90,	"xc"},	{50,	"l"},	{49,	"il"},	{40,	"xl"},	{10,	"x"},	{9,	"ix"},	{5,	"v"},	{4,	"iv"},	{1,	"i"},	{0,	NULL}};static voidroman(unsigned char *p, unsigned n){	int i = 0;	if (n >= 4000) {		strcpy(p, "---");		return;	}	if (!n) {		strcpy(p, "o");		return;	}	p[0] = 0;	while (n) {		while (roman_tbl[i].n <= n) {			n -= roman_tbl[i].n;			strcat(p, roman_tbl[i].s);		}		i++;		assertm(!(n && !roman_tbl[i].n),			"BUG in roman number convertor");		if_assert_failed break;	}}static intget_color(unsigned char *a, unsigned char *c, color_t *rgb){	unsigned char *at;	int r;	if (!use_document_fg_colors(global_doc_opts))		return -1;	at = get_attr_val(a, c);	if (!at) return -1;	r = decode_color(at, strlen(at), rgb);	mem_free(at);	return r;}intget_bgcolor(unsigned char *a, color_t *rgb){	if (!use_document_bg_colors(global_doc_opts))		return -1;	return get_color(a, "bgcolor", rgb);}unsigned char *get_target(unsigned char *a){	unsigned char *v = get_attr_val(a, "target");	if (v) {		if (!strcasecmp(v, "_self")) {			mem_free(v);			v = stracpy(global_doc_opts->framename);		}	}	return v;}struct html_context html_context = {#ifdef CONFIG_CSS	INIT_CSS_STYLESHEET(html_context.css_styles, import_css_stylesheet),#endif};voidln_break(int n, void (*line_break)(struct part *), struct part *part){	if (!n || html_top.invisible) return;	while (n > html_context.line_breax) {		html_context.line_breax++;		line_break(part);	}	html_context.position = 0;	html_context.putsp = -1;}voidput_chrs(unsigned char *start, int len,	 void (*put_chars)(struct part *, unsigned char *, int), struct part *part){	if (html_is_preformatted())		html_context.putsp = 0;	if (!len || html_top.invisible)		return;	if (html_context.putsp == 1) {		put_chars(part, " ", 1);		html_context.position++;		html_context.putsp = -1;	}	if (html_context.putsp == -1) {		html_context.putsp = 0;		if (isspace(start[0])) {			start++, len--;			if (!len) {				if (!html_is_preformatted())					html_context.putsp = -1;				return;			}		}	}	if (isspace(start[len - 1]) && !html_is_preformatted())		html_context.putsp = -1;	html_context.was_br = 0;	put_chars(part, start, len);	html_context.position += len;	html_context.line_breax = 0;	if (html_context.was_li > 0)		html_context.was_li--;}voidset_fragment_identifier(unsigned char *attr_name, unsigned char *attr){	unsigned char *id_attr = get_attr_val(attr_name, attr);	if (id_attr) {		html_context.special_f(html_context.part, SP_TAG, id_attr);		mem_free(id_attr);	}}voidadd_fragment_identifier(struct part *part, unsigned char *attr){	html_context.special_f(part, SP_TAG, attr);}#ifdef CONFIG_CSSvoidimport_css_stylesheet(struct css_stylesheet *css, struct uri *base_uri,		      unsigned char *url, int len){	unsigned char *import_url;	struct uri *uri;	assert(base_uri);	if (!global_doc_opts->css_enable	    || !global_doc_opts->css_import)		return;	url = memacpy(url, len);	if (!url) return;	/* HTML <head> urls should already be fine but we can.t detect them. */	import_url = join_urls(base_uri, url);	mem_free(url);	if (!import_url) return;	uri = get_uri(import_url, URI_BASE);	mem_free(import_url);	if (!uri) return;	/* Request the imported stylesheet as part of the document ... */	html_context.special_f(html_context.part, SP_STYLESHEET, uri);	/* ... and then attempt to import from the cache. */	import_css(css, uri);	done_uri(uri);}#endifvoidhtml_span(unsigned char *a){}voidhtml_bold(unsigned char *a){	format.style.attr |= AT_BOLD;}voidhtml_italic(unsigned char *a){	format.style.attr |= AT_ITALIC;}voidhtml_underline(unsigned char *a){	format.style.attr |= AT_UNDERLINE;}voidhtml_fixed(unsigned char *a){	format.style.attr |= AT_FIXED;}voidhtml_subscript(unsigned char *a){	format.style.attr |= AT_SUBSCRIPT;}voidhtml_superscript(unsigned char *a){	format.style.attr |= AT_SUPERSCRIPT;}/* Extract the extra information that is available for elements which can * receive focus. Call this from each element which supports tabindex or * accesskey. *//* Note that in ELinks, we support those attributes (I mean, we call this * function) while processing any focusable element (otherwise it'd have zero * tabindex, thus messing up navigation between links), thus we support these * attributes even near tags where we're not supposed to (like IFRAME, FRAME or * LINK). I think this doesn't make any harm ;). --pasky */voidhtml_focusable(unsigned char *a){	unsigned char *accesskey;	int tabindex;	format.accesskey = 0;	format.tabindex = 0x80000000;	if (!a) return;	accesskey = get_attr_val(a, "accesskey");	if (accesskey) {		if (*accesskey) {			accesskey[0] = toupper(accesskey[0]);			format.accesskey = read_key(accesskey);		}		mem_free(accesskey);	}	tabindex = get_num(a, "tabindex");	if (0 < tabindex && tabindex < 32767) {		format.tabindex = (tabindex & 0x7fff) << 16;	}	mem_free_if(format.onclick); format.onclick = get_attr_val(a, "onclick");	mem_free_if(format.ondblclick); format.ondblclick = get_attr_val(a, "ondblclick");	mem_free_if(format.onmouseover); format.onmouseover = get_attr_val(a, "onmouseover");	mem_free_if(format.onhover); format.onhover = get_attr_val(a, "onhover");	mem_free_if(format.onfocus); format.onfocus = get_attr_val(a, "onfocus");	mem_free_if(format.onmouseout); format.onmouseout = get_attr_val(a, "onmouseout");	mem_free_if(format.onblur); format.onblur = get_attr_val(a, "onblur");}voidhtml_font(unsigned char *a){	unsigned char *al = get_attr_val(a, "size");	if (al) {		int p = 0;		unsigned s;		unsigned char *nn = al;		unsigned char *end;		if (*al == '+') p = 1, nn++;		else if (*al == '-') p = -1, nn++;		errno = 0;		s = strtoul(nn, (char **) &end, 10);		if (!errno && *nn && !*end) {			if (s > 7) s = 7;			if (!p) format.fontsize = s;			else format.fontsize += p * s;			if (format.fontsize < 1) format.fontsize = 1;			else if (format.fontsize > 7) format.fontsize = 7;		}		mem_free(al);	}	get_color(a, "color", &format.style.fg);}voidhtml_body(unsigned char *a){	get_color(a, "text", &format.style.fg);	get_color(a, "link", &format.clink);	get_color(a, "vlink", &format.vlink);	get_bgcolor(a, &format.style.bg);#ifdef CONFIG_CSS	/* If there are any CSS twaks regarding bgcolor, make sure we will get	 * it _and_ prefer it over bgcolor attribute. */	if (global_doc_opts->css_enable)		css_apply(&html_top, &html_context.css_styles,		          &html_context.stack);#endif	if (par_format.bgcolor != format.style.bg) {		/* Modify the root HTML element - format_html_part() will take		 * this from there. */		struct html_element *e = html_context.stack.prev;		e->parattr.bgcolor = e->attr.style.bg = par_format.bgcolor = format.style.bg;	}	if (html_context.has_link_lines	    && par_format.bgcolor	    && !search_html_stack("BODY")) {		html_context.special_f(html_context.part, SP_COLOR_LINK_LINES);	}}voidhtml_skip(unsigned char *a){	html_top.invisible = 1;	html_top.type = ELEMENT_DONT_KILL;}#ifdef CONFIG_ECMASCRIPTintdo_html_script(unsigned char *a, unsigned char *html, unsigned char *eof, unsigned char **end, struct part *part){	/* TODO: <noscript> processing. Well, same considerations apply as to	 * CSS property display: none processing. */	/* TODO: Charsets for external scripts. */	unsigned char *type, *language, *src;	int in_comment = 0;	html_skip(a);	/* We try to process nested <script> if we didn't process the parent	 * one. That's why's all the fuzz. */	type = get_attr_val(a, "type");	if (type && strcasecmp(type, "text/javascript")) {		mem_free(type);not_processed:		/* Permit nested scripts and retreat. */		html_top.invisible++;		return 1;	}	if (type) mem_free(type);	/* Check that the script content is ecmascript. The value of the	 * language attribute can be JavaScript with optional version digits	 * postfixed (like: ``JavaScript1.1''). */	language = get_attr_val(a, "language");	if (language) {		int languagelen = strlen(language);		if (languagelen < 10		    || (languagelen > 10 && !isdigit(language[10]))		    || strncasecmp(language, "javascript", 10)) {			mem_free(language);			goto not_processed;		}		mem_free(language);	}	if (part->document && (src = get_attr_val(a, "src"))) {		/* External reference. */		unsigned char *import_url;		struct uri *uri;		if (!get_opt_bool("ecmascript.enable")) {			mem_free(src);			goto not_processed;		}		/* HTML <head> urls should already be fine but we can.t detect them. */		import_url = join_urls(html_context.base_href, src);		mem_free(src);		if (!import_url) goto imported;		uri = get_uri(import_url, URI_BASE);		if (!uri) goto imported;		/* Request the imported script as part of the document ... */		html_context.special_f(html_context.part, SP_SCRIPT, uri);		done_uri(uri);		/* Create URL reference onload snippet. */		insert_in_string(&import_url, 0, "^", 1);		add_to_string_list(&part->document->onload_snippets,		                   import_url, -1);imported:		/* Retreat. Do not permit nested scripts, tho'. */		if (import_url) mem_free(import_url);		return 1;	}	/* Positive, grab the rest and interpret it. */	/* First position to the real script start. */	while (html < eof && *html <= ' ') html++;	if (eof - html > 4 && !strncmp(html, "<!--", 4)) {		in_comment = 1;		/* We either skip to the end of line or to -->. */		for (; *html != '\n' && *html != '\r' && eof - html >= 3; html++) {			if (!strncmp(html, "-->", 3)) {				/* This means the document is probably broken.				 * We will now try to process the rest of				 * <script> contents, which is however likely				 * to be empty. Should we try to process the				 * comment too? Currently it seems safer but				 * less tolerant to broken pages, if there are				 * any like this. */				html += 3;				in_comment = 0;				break;			}		}	}	*end = html;	/* Now look ahead for the script end. The <script> contents is raw	 * CDATA, so we just look for the ending tag and need not care for	 * any quote marks counting etc - YET, we are more tolerant and permit	 * </script> stuff inside of the script if the whole <script> element	 * contents is wrapped in a comment. See i.e. Mozilla bug 26857 for fun	 * reading regarding this. */	for (; *end < eof; (*end)++) {		unsigned char *name;		int namelen;		if (in_comment) {			/* TODO: If we ever get some standards-quirk mode			 * distinction, this should be disabled in the			 * standards mode (and we should just look for CDATA			 * end, which is "</"). --pasky */			if (eof - *end >= 3 && !strncmp(*end, "-->", 3)) {				/* Next iteration will jump passed the ending '>' */				(*end) += 2;				in_comment = 0;			}			continue;			/* XXX: Scan for another comment? That's admittelly			 * already stretching things a little bit to an			 * extreme ;-). */		}		if (**end != '<')			continue;		/* We want to land before the closing element, that's why we		 * don't pass @end also as the appropriate parse_element()
parser.c - 源码说明

本页面展示了「一个很有名的浏览器」中的 parser.c 源码文件，采用 C语言编程语言编写，共 1,631 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与浏览器相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?