📄 parse.c

📁 elinks 和 lynx 是最重要的两个文本浏览器, 在 Linux 下非常实用; elinks 也是 Gentoo 安装过程中默认使用的浏览器。这是 elinks 的源代码
💻 C
📖 第 1 页 / 共 3 页
字号:
unsigned char *skip_comment(unsigned char *html, unsigned char *eof){	if (html + 4 <= eof && html[2] == '-' && html[3] == '-') {		html += 4;		while (html < eof) {			if (html + 2 <= eof && html[0] == '-' && html[1] == '-') {				html += 2;				while (html < eof && *html == '-') html++;				while (html < eof && isspace(*html)) html++;				if (html >= eof) return eof;				if (*html == '>') return html + 1;				continue;			}			html++;		}	} else {		html += 2;		while (html < eof) {			if (html[0] == '>') return html + 1;			html++;		}	}	return eof;}enum element_type {	ELEMENT_TYPE_NESTABLE,	ELEMENT_TYPE_NON_NESTABLE,	ELEMENT_TYPE_NON_PAIRABLE,	ELEMENT_TYPE_LI,};struct element_info {	/* Element name, uppercase. */	unsigned char *name;	/* Element handler. This does the relevant arguments processing and	 * formatting (by calling renderer hooks). Note that in a few cases,	 * this is just a placeholder and the element is given special care	 * in start_element() (which is also where we call these handlers). */	element_handler_T *func;	/* How many line-breaks to ensure we have before and after an element.	 * Value of 1 means the element will be on a line on its own, value	 * of 2 means that it will also have empty lines before and after.	 * Note that this does not add up - it just ensures that there is	 * at least so many linebreaks, but does not add more if that is the	 * case. Therefore, something like e.g. </pre></p> will add only two	 * linebreaks, not four. */	/* In some stack killing logic, we use some weird heuristic based on	 * whether an element is block or inline. That is determined from	 * whether this attribute is zero on non-zero. 
*/	int linebreak;	enum element_type type;};static struct element_info elements[] = {        {"A",           html_a,           0, ELEMENT_TYPE_NON_NESTABLE},        {"ABBR",        html_italic,      0, ELEMENT_TYPE_NESTABLE    },        {"ADDRESS",     html_address,     2, ELEMENT_TYPE_NESTABLE    },        {"APPLET",      html_applet,      1, ELEMENT_TYPE_NON_PAIRABLE},        {"B",           html_bold,        0, ELEMENT_TYPE_NESTABLE    },        {"BASE",        html_base,        0, ELEMENT_TYPE_NON_PAIRABLE},        {"BASEFONT",    html_font,        0, ELEMENT_TYPE_NON_PAIRABLE},        {"BLOCKQUOTE",  html_blockquote,  2, ELEMENT_TYPE_NESTABLE    },        {"BODY",        html_body,        0, ELEMENT_TYPE_NESTABLE    },        {"BR",          html_br,          1, ELEMENT_TYPE_NON_PAIRABLE},        {"BUTTON",      html_button,      0, ELEMENT_TYPE_NESTABLE    },        {"CAPTION",     html_center,      1, ELEMENT_TYPE_NESTABLE    },        {"CENTER",      html_center,      1, ELEMENT_TYPE_NESTABLE    },        {"CODE",        html_fixed,       0, ELEMENT_TYPE_NESTABLE    },        {"DD",          html_dd,          1, ELEMENT_TYPE_NON_PAIRABLE},        {"DFN",         html_bold,        0, ELEMENT_TYPE_NESTABLE    },        {"DIR",         html_ul,          2, ELEMENT_TYPE_NESTABLE    },        {"DIV",         html_linebrk,     1, ELEMENT_TYPE_NESTABLE    },        {"DL",          html_dl,          2, ELEMENT_TYPE_NESTABLE    },        {"DT",          html_dt,          1, ELEMENT_TYPE_NON_PAIRABLE},        {"EM",          html_italic,      0, ELEMENT_TYPE_NESTABLE    },        {"EMBED",       html_embed,       0, ELEMENT_TYPE_NON_PAIRABLE},        {"FIXED",       html_fixed,       0, ELEMENT_TYPE_NESTABLE    },        {"FONT",        html_font,        0, ELEMENT_TYPE_NESTABLE    },        {"FORM",        html_form,        1, ELEMENT_TYPE_NESTABLE    },        {"FRAME",       html_frame,       1, ELEMENT_TYPE_NON_PAIRABLE},        {"FRAMESET",    html_frameset,    
1, ELEMENT_TYPE_NESTABLE    },        {"H1",          html_h1,          2, ELEMENT_TYPE_NON_NESTABLE},        {"H2",          html_h2,          2, ELEMENT_TYPE_NON_NESTABLE},        {"H3",          html_h3,          2, ELEMENT_TYPE_NON_NESTABLE},        {"H4",          html_h4,          2, ELEMENT_TYPE_NON_NESTABLE},        {"H5",          html_h5,          2, ELEMENT_TYPE_NON_NESTABLE},        {"H6",          html_h6,          2, ELEMENT_TYPE_NON_NESTABLE},        {"HEAD",        html_head,        0, ELEMENT_TYPE_NESTABLE    },        {"HR",          html_hr,          2, ELEMENT_TYPE_NON_PAIRABLE},        {"HTML",        html_html,        0, ELEMENT_TYPE_NESTABLE    },        {"I",           html_italic,      0, ELEMENT_TYPE_NESTABLE    },        {"IFRAME",      html_iframe,      1, ELEMENT_TYPE_NON_PAIRABLE},        {"IMG",         html_img,         0, ELEMENT_TYPE_NON_PAIRABLE},        {"INPUT",       html_input,       0, ELEMENT_TYPE_NON_PAIRABLE},        {"LI",          html_li,          1, ELEMENT_TYPE_LI          },        {"LINK",        html_link,        1, ELEMENT_TYPE_NON_PAIRABLE},        {"LISTING",     html_pre,         2, ELEMENT_TYPE_NESTABLE    },        {"MENU",        html_ul,          2, ELEMENT_TYPE_NESTABLE    },        {"META",        html_meta,        0, ELEMENT_TYPE_NON_PAIRABLE},        {"NOFRAMES",    html_noframes,    0, ELEMENT_TYPE_NESTABLE    },        {"NOSCRIPT",    html_noscript,    0, ELEMENT_TYPE_NESTABLE    },        {"OBJECT",      html_object,      1, ELEMENT_TYPE_NON_PAIRABLE},        {"OL",          html_ol,          2, ELEMENT_TYPE_NESTABLE    },        {"OPTION",      html_option,      1, ELEMENT_TYPE_NON_PAIRABLE},        {"P",           html_p,           2, ELEMENT_TYPE_NON_NESTABLE},        {"PRE",         html_pre,         2, ELEMENT_TYPE_NESTABLE    },        {"Q",           html_italic,      0, ELEMENT_TYPE_NESTABLE    },        {"S",           html_underline,   0, ELEMENT_TYPE_NESTABLE    },        {"SCRIPT",      
html_script,      0, ELEMENT_TYPE_NESTABLE    },        {"SELECT",      html_select,      0, ELEMENT_TYPE_NESTABLE    },        {"SPAN",        html_span,        0, ELEMENT_TYPE_NESTABLE    },        {"STRIKE",      html_underline,   0, ELEMENT_TYPE_NESTABLE    },        {"STRONG",      html_bold,        0, ELEMENT_TYPE_NESTABLE    },        {"STYLE",       html_style,       0, ELEMENT_TYPE_NESTABLE    },        {"SUB",         html_subscript,   0, ELEMENT_TYPE_NESTABLE    },        {"SUP",         html_superscript, 0, ELEMENT_TYPE_NESTABLE    },        {"TABLE",       html_table,       2, ELEMENT_TYPE_NESTABLE    },        {"TD",          html_td,          0, ELEMENT_TYPE_NESTABLE    },        {"TEXTAREA",    html_textarea,    0, ELEMENT_TYPE_NON_PAIRABLE},        {"TH",          html_th,          0, ELEMENT_TYPE_NESTABLE    },        {"TITLE",       html_title,       0, ELEMENT_TYPE_NESTABLE    },        {"TR",          html_tr,          1, ELEMENT_TYPE_NESTABLE    },        {"TT",          html_tt,          0, ELEMENT_TYPE_NON_NESTABLE},        {"U",           html_underline,   0, ELEMENT_TYPE_NESTABLE    },        {"UL",          html_ul,          2, ELEMENT_TYPE_NESTABLE    },        {"XMP",         html_xmp,         2, ELEMENT_TYPE_NESTABLE    },        {NULL,          NULL,             0, ELEMENT_TYPE_NESTABLE    },};#define NUMBER_OF_TAGS (sizeof_array(elements) - 1)#ifndef USE_FASTFINDstatic intcompar(const void *a, const void *b){	return strcasecmp(((struct element_info *) a)->name,			  ((struct element_info *) b)->name);}#elsestatic struct element_info *internal_pointer;/* Reset internal list pointer */static voidtags_list_reset(void){	internal_pointer = elements;}/* Returns a pointer to a struct that contains * current key and data pointers and increment * internal pointer. * It returns NULL when key is NULL. 
*/static struct fastfind_key_value *tags_list_next(void){	static struct fastfind_key_value kv;	if (!internal_pointer->name) return NULL;	kv.key = internal_pointer->name;	kv.data = internal_pointer;	internal_pointer++;	return &kv;}static struct fastfind_index ff_tags_index	= INIT_FASTFIND_INDEX("tags_lookup", tags_list_reset, tags_list_next);#endif /* USE_FASTFIND */voidinit_tags_lookup(void){#ifdef USE_FASTFIND	fastfind_index(&ff_tags_index, FF_COMPRESS);#endif}voidfree_tags_lookup(void){#ifdef USE_FASTFIND	fastfind_done(&ff_tags_index);#endif}static unsigned char *process_element(unsigned char *name, int namelen, int endingtag,                unsigned char *html, unsigned char *prev_html,                unsigned char *eof, unsigned char *attr,                struct html_context *html_context);voidparse_html(unsigned char *html, unsigned char *eof,	   struct part *part, unsigned char *head,	   struct html_context *html_context){	unsigned char *base_pos = html;	int noupdate = 0;	html_context->putsp = HTML_SPACE_SUPPRESS;	html_context->line_breax = html_context->table_level ? 
2 : 1;	html_context->position = 0;	html_context->was_br = 0;	html_context->was_li = 0;	html_context->was_body = 0;/*	html_context->was_body_background = 0; */	html_context->part = part;	html_context->eoff = eof;	if (head) process_head(html_context, head);main_loop:	while (html < eof) {		unsigned char *name, *attr, *end;		int namelen, endingtag;		int dotcounter = 0;		if (!noupdate) {			html_context->part = part;			html_context->eoff = eof;			base_pos = html;		} else {			noupdate = 0;		}		if (isspace(*html) && !html_is_preformatted()) {			unsigned char *h = html;			while (h < eof && isspace(*h))				h++;			if (h + 1 < eof && h[0] == '<' && h[1] == '/') {				if (!parse_element(h, eof, &name, &namelen, &attr, &end)) {					put_chrs(html_context, base_pos, html - base_pos);					base_pos = html = h;					html_context->putsp = HTML_SPACE_ADD;					goto element;				}			}			html++;			if (!(html_context->position + (html - base_pos - 1)))				goto skip_w; /* ??? */			if (*(html - 1) == ' ') {	/* Do not replace with isspace() ! 
--Zas */				/* BIG performance win; not sure if it doesn't cause any bug */				if (html < eof && !isspace(*html)) {					noupdate = 1;					continue;				}				put_chrs(html_context, base_pos, html - base_pos);			} else {				put_chrs(html_context, base_pos, html - base_pos - 1);				put_chrs(html_context, " ", 1);			}skip_w:			while (html < eof && isspace(*html))				html++;			continue;		}		if (html_is_preformatted()) {			html_context->putsp = HTML_SPACE_NORMAL;			if (*html == ASCII_TAB) {				put_chrs(html_context, base_pos, html - base_pos);				put_chrs(html_context, "        ",				         8 - (html_context->position % 8));				html++;				continue;			} else if (*html == ASCII_CR || *html == ASCII_LF) {				put_chrs(html_context, base_pos, html - base_pos);				if (html - base_pos == 0 && html_context->line_breax > 0)					html_context->line_breax--;next_break:				if (*html == ASCII_CR && html < eof - 1				    && html[1] == ASCII_LF)					html++;				ln_break(html_context, 1);				html++;				if (*html == ASCII_CR || *html == ASCII_LF) {					html_context->line_breax = 0;					goto next_break;				}				continue;			} else if (html + 5 < eof && *html == '&') {				/* Really nasty hack to make &#13; handling in				 * <pre>-tags lynx-compatible. It works around				 * the entity handling done in the renderer,				 * since checking #13 value there would require				 * something along the lines of NBSP_CHAR or				 * checking for '\n's in AT_PREFORMATTED text. */				/* See bug 52 and 387 for more info. 
*/				int length = html - base_pos;				int newlines = 0;				while ((html + 5 < eof && html[0] == '&' && html[1] == '#')				       && (!memcmp(html + 2, "13;", 3)					   || (html + 6 < eof && !strncasecmp(html + 2, "x0a;", 4)))) {					newlines++;					html += 5 + (html[4] != ';');				}				if (newlines) {					put_chrs(html_context, base_pos, length);					ln_break(html_context, newlines);					continue;				}			}		}		while (*html < ' ') {			if (html - base_pos)				put_chrs(html_context, base_pos, html - base_pos);			dotcounter++;			base_pos = ++html;			if (*html >= ' ' || isspace(*html) || html >= eof) {				unsigned char *dots = fmem_alloc(dotcounter);				if (dots) {					memset(dots, '.', dotcounter);					put_chrs(html_context, dots, dotcounter);					fmem_free(dots);				}				goto main_loop;			}		}		if (html + 2 <= eof && html[0] == '<' && (html[1] == '!' || html[1] == '?')		    && !html_context->was_xmp) {			put_chrs(html_context, base_pos, html - base_pos);			html = skip_comment(html, eof);			continue;		}		if (*html != '<' || parse_element(html, eof, &name, &namelen, &attr, &end)) {			html++;			noupdate = 1;			continue;
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -