📄 parse.c

📁 elinks和lynx是最重要的两个文本浏览器, 在linux下非常实用, elinks也是gentoo安装过程中默认使用的浏览器, 这是elinks源代码
💻 C
📖 第 1 页 / 共 3 页
字号:
12 3 下一页
/* HTML core parser routines */#ifdef HAVE_CONFIG_H#include "config.h"#endif#include <errno.h>#include <stdarg.h>#include <stdio.h>#include <stdlib.h>#include <string.h>#include "elinks.h"#include "document/css/apply.h"#include "document/css/parser.h"#include "document/html/parser/forms.h"#include "document/html/parser/general.h"#include "document/html/parser/link.h"#include "document/html/parser/parse.h"#include "document/html/parser/stack.h"#include "document/html/parser.h"#include "document/options.h"#include "intl/charsets.h"#include "util/conv.h"#include "util/error.h"#include "util/fastfind.h"#include "util/memdebug.h"#include "util/memory.h"#include "util/string.h"/* Unsafe macros */#include "document/html/internal.h"#define end_of_tag(c) ((c) == '>' || (c) == '<')static inline intatchr(register unsigned char c){	return (c < 127 && (c > '>' || (c > ' ' && c != '=' && !end_of_tag(c))));}/* This function eats one html element. *//* - e is pointer to the begining of the element (*e must be '<') * - eof is pointer to the end of scanned area * - parsed element name is stored in name, it's length is namelen * - first attribute is stored in attr * - end points to first character behind the html element *//* It returns -1 when it failed (returned values in pointers are invalid) and * 0 for success. 
*/intparse_element(register unsigned char *e, unsigned char *eof,	      unsigned char **name, int *namelen,	      unsigned char **attr, unsigned char **end){#define next_char() if (++e == eof) return -1;	assert(e && eof);	if (e >= eof || *e != '<') return -1;	next_char();	if (name) *name = e;	if (*e == '/') next_char();	if (!isident(*e)) return -1;	while (isident(*e)) next_char();	if (!isspace(*e) && !end_of_tag(*e) && *e != '/' && *e != ':' && *e != '=')		return -1;	if (name && namelen) *namelen = e - *name;	while (isspace(*e) || *e == '/' || *e == ':') next_char();	/* Skip bad attribute */	while (!atchr(*e) && !end_of_tag(*e) && !isspace(*e)) next_char();	if (attr) *attr = e;next_attr:	while (isspace(*e)) next_char();	/* Skip bad attribute */	while (!atchr(*e) && !end_of_tag(*e) && !isspace(*e)) next_char();	if (end_of_tag(*e)) goto end;	while (atchr(*e)) next_char();	while (isspace(*e)) next_char();	if (*e != '=') {		if (end_of_tag(*e)) goto end;		goto next_attr;	}	next_char();	while (isspace(*e)) next_char();	if (isquote(*e)) {		unsigned char quote = *e;/* quoted_value: */		next_char();		while (*e != quote) next_char();		next_char();		/* The following apparently handles the case of <foo		 * id="a""b">, however that is very rare and probably not		 * conforming. More frequent (and mishandling it more fatal) is		 * probably the typo of <foo id="a""> - we can handle it as		 * long as this is commented out. 
--pasky */		/* if (*e == quote) goto quoted_value; */	} else {		while (!isspace(*e) && !end_of_tag(*e)) next_char();	}	while (isspace(*e)) next_char();	if (!end_of_tag(*e)) goto next_attr;end:	if (end) *end = e + (*e == '>');	return 0;}#define realloc_chrs(x, l) mem_align_alloc(x, l, (l) + 1, unsigned char, 0xFF)#define add_chr(s, l, c)						\	do {								\		if (!realloc_chrs(&(s), l)) return NULL;		\		(s)[(l)++] = (c);					\	} while (0)unsigned char *get_attr_value(register unsigned char *e, unsigned char *name,	       struct document_options *options, enum html_attr_flags flags){	unsigned char *n;	unsigned char *name_start;	unsigned char *attr = NULL;	int attrlen = 0;	int found;next_attr:	skip_space(e);	if (end_of_tag(*e) || !atchr(*e)) goto parse_error;	n = name;	name_start = e;	while (atchr(*n) && atchr(*e) && toupper(*e) == toupper(*n)) e++, n++;	found = !*n && !atchr(*e);	if (found && (flags & HTML_ATTR_TEST)) return name_start;	while (atchr(*e)) e++;	skip_space(e);	if (*e != '=') {		if (found) goto found_endattr;		goto next_attr;	}	e++;	skip_space(e);	if (found) {		if (!isquote(*e)) {			while (!isspace(*e) && !end_of_tag(*e)) {				if (!*e) goto parse_error;				add_chr(attr, attrlen, *e);				e++;			}		} else {			unsigned char quote = *e;/* parse_quoted_value: */			while (*(++e) != quote) {				if (*e == ASCII_CR) continue;				if (!*e) goto parse_error;				if (*e != ASCII_TAB && *e != ASCII_LF)					add_chr(attr, attrlen, *e);				else if (!(flags & HTML_ATTR_EAT_NL))					add_chr(attr, attrlen, ' ');			}			e++;			/* The following apparently handles the case of <foo			 * id="a""b">, however that is very rare and probably			 * not conforming. More frequent (and mishandling it			 * more fatal) is probably the typo of <foo id="a""> -			 * we can handle it as long as this is commented out.			 
* --pasky */#if 0			if (*e == quote) {				add_chr(attr, attrlen, *e);				goto parse_quoted_value;			}#endif		}found_endattr:		add_chr(attr, attrlen, '\0');		attrlen--;		if (/* Unused: !(flags & HTML_ATTR_NO_CONV) && */		    memchr(attr, '&', attrlen)) {			unsigned char *saved_attr = attr;			attr = convert_string(NULL, saved_attr, attrlen,			                      options->cp, CSM_QUERY,			                      NULL, NULL, NULL);			mem_free(saved_attr);		}		set_mem_comment(trim_chars(attr, ' ', NULL), name, strlen(name));		return attr;	} else {		if (!isquote(*e)) {			while (!isspace(*e) && !end_of_tag(*e)) {				if (!*e) goto parse_error;				e++;			}		} else {			unsigned char quote = *e;			do {				while (*(++e) != quote)					if (!*e) goto parse_error;				e++;			} while (/* See above. *e == quote */ 0);		}	}	goto next_attr;parse_error:	mem_free_if(attr);	return NULL;}#undef add_chr/* Extract numerical value of attribute @name. * It will return a positive integer value on success, * or -1 on error. */intget_num(unsigned char *a, unsigned char *name, struct document_options *options){	unsigned char *al = get_attr_val(a, name, options);	int result = -1;	if (al) {		unsigned char *end;		long num;		errno = 0;		num = strtol(al, (char **) &end, 10);		if (!errno && *al && !*end && num >= 0 && num <= INT_MAX)			result = (int) num;		mem_free(al);	}	return result;}/* Parse 'width[%],....'-like attribute @name of element @a.  If @limited is * set, it will limit the width value to the current usable width. Note that * @limited must be set to be able to parse percentage widths. *//* The function returns width in characters or -1 in case of error. */intget_width(unsigned char *a, unsigned char *name, int limited,          struct html_context *html_context){	unsigned char *value = get_attr_val(a, name, html_context->options);	unsigned char *str = value;	unsigned char *end;	int percentage = 0;	int len;	long width;	if (!value) return -1;	/* Skip spaces at start of string if any. 
*/	skip_space(str);	/* Search for end of string or ',' character (ie. in "100,200") */	for (len = 0; str[len] && str[len] != ','; len++);	/* Go back, and skip spaces after width if any. */	while (len && isspace(str[len - 1])) len--;	if (!len) { mem_free(value); return -1; } /* Nothing to parse. */	/* Is this a percentage ? */	if (str[len - 1] == '%') len--, percentage = 1;	/* Skip spaces between width number and percentage if any. */	while (len && isspace(str[len - 1])) len--;	if (!len) { mem_free(value); return -1; } /* Nothing to parse. */	/* Shorten the string a bit, so strtoul() will work on useful	 * part of it. */	str[len] = '\0';	/* Convert to number if possible. */	errno = 0;	width = strtoul((char *) str, (char **) &end, 10);	/* @end points into the @value string so check @end position	 * before freeing @value. */	if (errno || *end || width >= INT_MAX) {		/* Not a valid number. */		mem_free(value);		return -1;	}	mem_free(value);#define WIDTH_PIXELS2CHARS(width) ((width) + (HTML_CHAR_WIDTH - 1) / 2) / HTML_CHAR_WIDTH;	if (limited) {		int maxwidth = get_html_max_width();		if (percentage) {			/* Value is a percentage. */			width = width * maxwidth / 100;		} else {			/* Value is a number of pixels, makes an approximation. */			width = WIDTH_PIXELS2CHARS(width);		}		if (width > maxwidth)			width = maxwidth;	} else {		if (percentage) {			/* No sense, we need @limited and @maxwidth for percentage. */			return -1;		} else {			/* Value is a number of pixels, makes an approximation,			 * no limit here */			width = WIDTH_PIXELS2CHARS(width);		}	}#undef WIDTH_PIXELS2CHARS	if (width < 0)		width = 0;	return width;}
12 3 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -