parse.c

来自「一个很有名的浏览器」· C语言代码 · 共 1,095 行 · 第 1/2 页
1,095 行
/* HTML core parser routines *//* $Id: parse.c,v 1.102.2.7 2005/04/05 21:08:41 jonas Exp $ */#ifdef HAVE_CONFIG_H#include "config.h"#endif#include <errno.h>#include <stdarg.h>#include <stdio.h>#include <stdlib.h>#include <string.h>#include "elinks.h"#include "document/css/apply.h"#include "document/css/parser.h"#include "document/html/parser/forms.h"#include "document/html/parser/link.h"#include "document/html/parser/parse.h"#include "document/html/parser/stack.h"#include "document/html/parser.h"#include "document/html/tables.h"#include "document/options.h"#include "intl/charsets.h"#include "util/conv.h"#include "util/error.h"#include "util/fastfind.h"#include "util/memdebug.h"#include "util/memory.h"#include "util/string.h"/* Unsafe macros */#include "document/html/internal.h"#define end_of_tag(c) ((c) == '>' || (c) == '<')static inline intatchr(register unsigned char c){	return (c < 127 && (c > '>' || (c > ' ' && c != '=' && !end_of_tag(c))));}/* This function eats one html element. *//* - e is pointer to the begining of the element (*e must be '<') * - eof is pointer to the end of scanned area * - parsed element name is stored in name, it's length is namelen * - first attribute is stored in attr * - end points to first character behind the html element *//* It returns -1 when it failed (returned values in pointers are invalid) and * 0 for success. 
*/intparse_element(register unsigned char *e, unsigned char *eof,	      unsigned char **name, int *namelen,	      unsigned char **attr, unsigned char **end){#define next_char() if (++e == eof) return -1;	assert(e && eof);	if (e >= eof || *e != '<') return -1;	next_char();	if (name) *name = e;	if (*e == '/') next_char();	if (!isident(*e)) return -1;	while (isident(*e)) next_char();	if (!isspace(*e) && !end_of_tag(*e) && *e != '/' && *e != ':' && *e != '=')		return -1;	if (name && namelen) *namelen = e - *name;	while (isspace(*e) || *e == '/' || *e == ':') next_char();	/* Skip bad attribute */	while (!atchr(*e) && !end_of_tag(*e) && !isspace(*e)) next_char();	if (attr) *attr = e;next_attr:	while (isspace(*e)) next_char();	/* Skip bad attribute */	while (!atchr(*e) && !end_of_tag(*e) && !isspace(*e)) next_char();	if (end_of_tag(*e)) goto end;	while (atchr(*e)) next_char();	while (isspace(*e)) next_char();	if (*e != '=') {		if (end_of_tag(*e)) goto end;		goto next_attr;	}	next_char();	while (isspace(*e)) next_char();	if (isquote(*e)) {		unsigned char quote = *e;/* quoted_value: */		next_char();		while (*e != quote) next_char();		next_char();		/* The following apparently handles the case of <foo		 * id="a""b">, however that is very rare and probably not		 * conforming. More frequent (and mishandling it more fatal) is		 * probably the typo of <foo id="a""> - we can handle it as		 * long as this is commented out. 
--pasky */		/* if (*e == quote) goto quoted_value; */	} else {		while (!isspace(*e) && !end_of_tag(*e)) next_char();	}	while (isspace(*e)) next_char();	if (!end_of_tag(*e)) goto next_attr;end:	if (end) *end = e + (*e == '>');	return 0;}#define realloc_chrs(x, l) mem_align_alloc(x, l, (l) + 1, unsigned char, 0xFF)#define add_chr(s, l, c)						\	do {								\		if (!realloc_chrs(&(s), l)) return NULL;		\		(s)[(l)++] = (c);					\	} while (0)unsigned char *get_attr_value(register unsigned char *e, unsigned char *name,	       enum html_attr_flags flags){	unsigned char *n;	unsigned char *name_start;	unsigned char *attr = NULL;	int attrlen = 0;	int found;next_attr:	skip_space(e);	if (end_of_tag(*e) || !atchr(*e)) goto parse_error;	n = name;	name_start = e;	while (atchr(*n) && atchr(*e) && toupper(*e) == toupper(*n)) e++, n++;	found = !*n && !atchr(*e);	if (found && (flags & HTML_ATTR_TEST)) return name_start;	while (atchr(*e)) e++;	skip_space(e);	if (*e != '=') {		if (found) goto found_endattr;		goto next_attr;	}	e++;	skip_space(e);	if (found) {		if (!isquote(*e)) {			while (!isspace(*e) && !end_of_tag(*e)) {				if (!*e) goto parse_error;				add_chr(attr, attrlen, *e);				e++;			}		} else {			unsigned char quote = *e;/* parse_quoted_value: */			while (*(++e) != quote) {				if (*e == ASCII_CR) continue;				if (!*e) goto parse_error;				if (*e != ASCII_TAB && *e != ASCII_LF)					add_chr(attr, attrlen, *e);				else if (!(flags & HTML_ATTR_EAT_NL))					add_chr(attr, attrlen, ' ');			}			e++;			/* The following apparently handles the case of <foo			 * id="a""b">, however that is very rare and probably			 * not conforming. More frequent (and mishandling it			 * more fatal) is probably the typo of <foo id="a""> -			 * we can handle it as long as this is commented out.			 
* --pasky */#if 0			if (*e == quote) {				add_chr(attr, attrlen, *e);				goto parse_quoted_value;			}#endif		}found_endattr:		add_chr(attr, attrlen, '\0');		attrlen--;		if (/* Unused: !(flags & HTML_ATTR_NO_CONV) && */		    memchr(attr, '&', attrlen)) {			unsigned char *saved_attr = attr;			attr = convert_string(NULL, saved_attr, attrlen, CSM_QUERY, NULL, NULL, NULL);			mem_free(saved_attr);		}		set_mem_comment(trim_chars(attr, ' ', NULL), name, strlen(name));		return attr;	} else {		if (!isquote(*e)) {			while (!isspace(*e) && !end_of_tag(*e)) {				if (!*e) goto parse_error;				e++;			}		} else {			unsigned char quote = *e;			do {				while (*(++e) != quote)					if (!*e) goto parse_error;				e++;			} while (/* See above. *e == quote */ 0);		}	}	goto next_attr;parse_error:	mem_free_if(attr);	return NULL;}#undef add_chr/* Extract numerical value of attribute @name. * It will return a positive integer value on success, * or -1 on error. */intget_num(unsigned char *a, unsigned char *name){	unsigned char *al = get_attr_val(a, name);	int result = -1;	if (al) {		unsigned char *end;		long num;		errno = 0;		num = strtol(al, (char **) &end, 10);		if (!errno && *al && !*end && num >= 0 && num <= INT_MAX)			result = (int) num;		mem_free(al);	}	return result;}/* Parse 'width[%],....'-like attribute @name of element @a.  If @limited is * set, it will limit the width value to the current usable width. Note that * @limited must be set to be able to parse percentage widths. *//* The function returns width in characters or -1 in case of error. */intget_width(unsigned char *a, unsigned char *name, int limited){	unsigned char *value = get_attr_val(a, name);	unsigned char *str = value;	unsigned char *end;	int percentage = 0;	int len;	long width;	if (!value) return -1;	/* Skip spaces at start of string if any. */	skip_space(str);	/* Search for end of string or ',' character (ie. 
in "100,200") */	for (len = 0; str[len] && str[len] != ','; len++);	/* Go back, and skip spaces after width if any. */	while (len && isspace(str[len - 1])) len--;	if (!len) { mem_free(value); return -1; } /* Nothing to parse. */	/* Is this a percentage ? */	if (str[len - 1] == '%') len--, percentage = 1;	/* Skip spaces between width number and percentage if any. */	while (len && isspace(str[len - 1])) len--;	if (!len) { mem_free(value); return -1; } /* Nothing to parse. */	/* Shorten the string a bit, so strtoul() will work on useful	 * part of it. */	str[len] = '\0';	/* Convert to number if possible. */	errno = 0;	width = strtoul((char *) str, (char **) &end, 10);	/* @end points into the @value string so check @end position	 * before freeing @value. */	if (errno || *end || width >= INT_MAX) {		/* Not a valid number. */		mem_free(value);		return -1;	}	mem_free(value);#define WIDTH_PIXELS2CHARS(width) ((width) + (HTML_CHAR_WIDTH - 1) / 2) / HTML_CHAR_WIDTH;	if (limited) {		int maxwidth = get_html_max_width();		if (percentage) {			/* Value is a percentage. */			width = width * maxwidth / 100;		} else {			/* Value is a number of pixels, makes an approximation. */			width = WIDTH_PIXELS2CHARS(width);		}		if (width > maxwidth)			width = maxwidth;	} else {		if (percentage) {			/* No sense, we need @limited and @maxwidth for percentage. */			return -1;		} else {			/* Value is a number of pixels, makes an approximation,			 * no limit here */			width = WIDTH_PIXELS2CHARS(width);		}	}#undef WIDTH_PIXELS2CHARS	if (width < 0)		width = 0;	return width;}unsigned char *skip_comment(unsigned char *html, unsigned char *eof){	int comm = html + 4 <= eof && html[2] == '-' && html[3] == '-';	html += comm ? 
4 : 2;	while (html < eof) {		if (!comm && html[0] == '>') return html + 1;		if (comm && html + 2 <= eof && html[0] == '-' && html[1] == '-') {			html += 2;			while (html < eof && *html == '-') html++;			while (html < eof && isspace(*html)) html++;			if (html >= eof) return eof;			if (*html == '>') return html + 1;			continue;		}		html++;	}	return eof;}/* These should be exported properly by specific HTML parser modules * implementing them. But for now... */void html_address(unsigned char *);void html_base(unsigned char *);void html_blockquote(unsigned char *);void html_body(unsigned char *);void html_bold(unsigned char *);void html_br(unsigned char *);void html_center(unsigned char *);void html_dd(unsigned char *);void html_dl(unsigned char *);void html_dt(unsigned char *);void html_fixed(unsigned char *);void html_font(unsigned char *);void html_frame(unsigned char *);void html_frameset(unsigned char *);void html_h1(unsigned char *);void html_h2(unsigned char *);void html_h3(unsigned char *);void html_h4(unsigned char *);void html_h5(unsigned char *);void html_h6(unsigned char *);void html_head(unsigned char *);void html_html(unsigned char *);void html_hr(unsigned char *);void html_italic(unsigned char *);void html_li(unsigned char *);void html_linebrk(unsigned char *);void html_noframes(unsigned char *);void html_ol(unsigned char *);void html_p(unsigned char *);void html_pre(unsigned char *);void html_script(unsigned char *);void html_skip(unsigned char *);void html_span(unsigned char *);void html_style(unsigned char *);void html_subscript(unsigned char *);void html_superscript(unsigned char *);void html_table(unsigned char *);void html_td(unsigned char *);void html_th(unsigned char *);void html_title(unsigned char *);void html_tr(unsigned char *);void html_ul(unsigned char *);void html_underline(unsigned char *);void html_xmp(unsigned char *);struct element_info {	/* Element name, uppercase. */	unsigned char *name;	/* Element handler. 
This does the relevant arguments processing and	 * formatting (by calling renderer hooks). Note that in a few cases,	 * this is just a placeholder and the element is given special care	 * in start_element() (which is also where we call these handlers). */	void (*func)(unsigned char *);	/* Basically something like how many line-breaks to put before	 * (and sometimes after) an element. Also, for various element closing	 * precedence heuristics, a value of zero signifies an inline element	 * and a non-zero value indicates a block element. */	int linebreak;	/* 0 - normal pair tags	 * 1 - normal non-pair tags	 * 2 - pair tags which cannot be nested (e.g., you cannot have <a><a>)	 * 3 - similiar to 2 but a little stricter, seems to be a	 *     <li>-specific hack */	int nopair;};#define NUMBER_OF_TAGS 65static struct element_info elements[] = {	{"A",		html_a,		0, 2},	{"ABBR",	html_italic,	0, 0},	{"ADDRESS",	html_address,	2, 0},	{"APPLET",	html_applet,	1, 1},	{"B",		html_bold,	0, 0},	{"BASE",	html_base,	0, 1},	{"BASEFONT",	html_font,	0, 1},	{"BLOCKQUOTE",	html_blockquote,2, 0},	{"BODY",	html_body,	0, 0},	{"BR",		html_br,	1, 1},	{"BUTTON",	html_button,	0, 0},	{"CAPTION",	html_center,	1, 0},	{"CENTER",	html_center,	1, 0},	{"CODE",	html_fixed,	0, 0},	{"DD",		html_dd,	1, 1},	{"DFN",		html_bold,	0, 0},	{"DIR",		html_ul,	2, 0},	{"DIV",		html_linebrk,	1, 0},	{"DL",		html_dl,	2, 0},	{"DT",		html_dt,	1, 1},	{"EM",		html_italic,	0, 0},	{"EMBED",	html_embed,	0, 1},	{"FIXED",	html_fixed,	0, 0},	{"FONT",	html_font,	0, 0},	{"FORM",	html_form,	1, 0},	{"FRAME",	html_frame,	1, 1},	{"FRAMESET",	html_frameset,	1, 0},	{"H1",		html_h1,	2, 2},	{"H2",		html_h2,	2, 2},	{"H3",		html_h3,	2, 2},	{"H4",		html_h4,	2, 2},	{"H5",		html_h5,	2, 2},	{"H6",		html_h6,	2, 2},	{"HEAD",	html_head,	0, 0},	{"HR",		html_hr,	2, 1},	{"HTML",	html_html,	0, 0},	{"I",		html_italic,	0, 0},	{"IFRAME",	html_iframe,	1, 1},	{"IMG",		html_img,	0, 1},	{"INPUT",	html_input,	0, 1},	{"LI",		html_li,	1, 3},	{"LINK",	html_link,	1, 
1},	{"LISTING",	html_pre,	2, 0},	{"MENU",	html_ul,	2, 0},	{"NOFRAMES",	html_noframes,	0, 0},	{"OBJECT",	html_object,	1, 1},	{"OL",		html_ol,	2, 0},	{"OPTION",	html_option,	1, 1},	{"P",		html_p,		2, 2},	{"PRE",		html_pre,	2, 0},	{"Q",		html_italic,	0, 0},	{"S",		html_underline,	0, 0},	{"SCRIPT",	html_script,	0, 0},	{"SELECT",	html_select,	0, 0},	{"SPAN",	html_span,	0, 0},	{"STRIKE",	html_underline,	0, 0},	{"STRONG",	html_bold,	0, 0},	{"STYLE",	html_style,	0, 0},	{"SUB",		html_subscript, 0, 0},	{"SUP",		html_superscript,0,0},	{"TABLE",	html_table,	2, 0},	{"TD",		html_td,	0, 0},	{"TEXTAREA",	html_textarea,	0, 1},	{"TH",		html_th,	0, 0},	{"TITLE",	html_title,	0, 0},	{"TR",		html_tr,	1, 0},	{"U",		html_underline,	0, 0},	{"UL",		html_ul,	2, 0},	{"XMP",		html_xmp,	2, 0},	{NULL,		NULL, 0, 0},};#ifndef USE_FASTFINDstatic intcompar(const void *a, const void *b){	return strcasecmp(((struct element_info *) a)->name,			  ((struct element_info *) b)->name);}#elsestatic struct element_info *internal_pointer;/* Reset internal list pointer */static voidtags_list_reset(void){	internal_pointer = elements;}/* Returns a pointer to a struct that contains
parse.c - 源码说明

本页面展示了「一个很有名的浏览器」中的 parse.c 源码文件，采用 C语言编程语言编写，共 1,095 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与浏览器相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码：Ctrl + C
搜索代码：Ctrl + F
全屏模式：F11
增大字号：Ctrl + =
减小字号：Ctrl + -
显示快捷键：?