📄 parse.c
字号:
/* HTML core parser routines *//* $Id: parse.c,v 1.102.2.7 2005/04/05 21:08:41 jonas Exp $ */#ifdef HAVE_CONFIG_H#include "config.h"#endif#include <errno.h>#include <stdarg.h>#include <stdio.h>#include <stdlib.h>#include <string.h>#include "elinks.h"#include "document/css/apply.h"#include "document/css/parser.h"#include "document/html/parser/forms.h"#include "document/html/parser/link.h"#include "document/html/parser/parse.h"#include "document/html/parser/stack.h"#include "document/html/parser.h"#include "document/html/tables.h"#include "document/options.h"#include "intl/charsets.h"#include "util/conv.h"#include "util/error.h"#include "util/fastfind.h"#include "util/memdebug.h"#include "util/memory.h"#include "util/string.h"/* Unsafe macros */#include "document/html/internal.h"#define end_of_tag(c) ((c) == '>' || (c) == '<')static inline intatchr(register unsigned char c){ return (c < 127 && (c > '>' || (c > ' ' && c != '=' && !end_of_tag(c))));}/* This function eats one html element. *//* - e is pointer to the begining of the element (*e must be '<') * - eof is pointer to the end of scanned area * - parsed element name is stored in name, it's length is namelen * - first attribute is stored in attr * - end points to first character behind the html element *//* It returns -1 when it failed (returned values in pointers are invalid) and * 0 for success. */intparse_element(register unsigned char *e, unsigned char *eof, unsigned char **name, int *namelen, unsigned char **attr, unsigned char **end){#define next_char() if (++e == eof) return -1; assert(e && eof); if (e >= eof || *e != '<') return -1; next_char(); if (name) *name = e; if (*e == '/') next_char(); if (!isident(*e)) return -1; while (isident(*e)) next_char(); if (!isspace(*e) && !end_of_tag(*e) && *e != '/' && *e != ':' && *e != '=') return -1; if (name && namelen) *namelen = e - *name; while (isspace(*e) || *e == '/' || *e == ':') next_char(); /* Skip bad attribute */ while (!atchr(*e) && !end_of_tag(*e) && !isspace(*e)) next_char(); if (attr) *attr = e;next_attr: while (isspace(*e)) next_char(); /* Skip bad attribute */ while (!atchr(*e) && !end_of_tag(*e) && !isspace(*e)) next_char(); if (end_of_tag(*e)) goto end; while (atchr(*e)) next_char(); while (isspace(*e)) next_char(); if (*e != '=') { if (end_of_tag(*e)) goto end; goto next_attr; } next_char(); while (isspace(*e)) next_char(); if (isquote(*e)) { unsigned char quote = *e;/* quoted_value: */ next_char(); while (*e != quote) next_char(); next_char(); /* The following apparently handles the case of <foo * id="a""b">, however that is very rare and probably not * conforming. More frequent (and mishandling it more fatal) is * probably the typo of <foo id="a""> - we can handle it as * long as this is commented out. --pasky */ /* if (*e == quote) goto quoted_value; */ } else { while (!isspace(*e) && !end_of_tag(*e)) next_char(); } while (isspace(*e)) next_char(); if (!end_of_tag(*e)) goto next_attr;end: if (end) *end = e + (*e == '>'); return 0;}#define realloc_chrs(x, l) mem_align_alloc(x, l, (l) + 1, unsigned char, 0xFF)#define add_chr(s, l, c) \ do { \ if (!realloc_chrs(&(s), l)) return NULL; \ (s)[(l)++] = (c); \ } while (0)unsigned char *get_attr_value(register unsigned char *e, unsigned char *name, enum html_attr_flags flags){ unsigned char *n; unsigned char *name_start; unsigned char *attr = NULL; int attrlen = 0; int found;next_attr: skip_space(e); if (end_of_tag(*e) || !atchr(*e)) goto parse_error; n = name; name_start = e; while (atchr(*n) && atchr(*e) && toupper(*e) == toupper(*n)) e++, n++; found = !*n && !atchr(*e); if (found && (flags & HTML_ATTR_TEST)) return name_start; while (atchr(*e)) e++; skip_space(e); if (*e != '=') { if (found) goto found_endattr; goto next_attr; } e++; skip_space(e); if (found) { if (!isquote(*e)) { while (!isspace(*e) && !end_of_tag(*e)) { if (!*e) goto parse_error; add_chr(attr, attrlen, *e); e++; } } else { unsigned char quote = *e;/* parse_quoted_value: */ while (*(++e) != quote) { if (*e == ASCII_CR) continue; if (!*e) goto parse_error; if (*e != ASCII_TAB && *e != ASCII_LF) add_chr(attr, attrlen, *e); else if (!(flags & HTML_ATTR_EAT_NL)) add_chr(attr, attrlen, ' '); } e++; /* The following apparently handles the case of <foo * id="a""b">, however that is very rare and probably * not conforming. More frequent (and mishandling it * more fatal) is probably the typo of <foo id="a""> - * we can handle it as long as this is commented out. * --pasky */#if 0 if (*e == quote) { add_chr(attr, attrlen, *e); goto parse_quoted_value; }#endif }found_endattr: add_chr(attr, attrlen, '\0'); attrlen--; if (/* Unused: !(flags & HTML_ATTR_NO_CONV) && */ memchr(attr, '&', attrlen)) { unsigned char *saved_attr = attr; attr = convert_string(NULL, saved_attr, attrlen, CSM_QUERY, NULL, NULL, NULL); mem_free(saved_attr); } set_mem_comment(trim_chars(attr, ' ', NULL), name, strlen(name)); return attr; } else { if (!isquote(*e)) { while (!isspace(*e) && !end_of_tag(*e)) { if (!*e) goto parse_error; e++; } } else { unsigned char quote = *e; do { while (*(++e) != quote) if (!*e) goto parse_error; e++; } while (/* See above. *e == quote */ 0); } } goto next_attr;parse_error: mem_free_if(attr); return NULL;}#undef add_chr/* Extract numerical value of attribute @name. * It will return a positive integer value on success, * or -1 on error. */intget_num(unsigned char *a, unsigned char *name){ unsigned char *al = get_attr_val(a, name); int result = -1; if (al) { unsigned char *end; long num; errno = 0; num = strtol(al, (char **) &end, 10); if (!errno && *al && !*end && num >= 0 && num <= INT_MAX) result = (int) num; mem_free(al); } return result;}/* Parse 'width[%],....'-like attribute @name of element @a. If @limited is * set, it will limit the width value to the current usable width. Note that * @limited must be set to be able to parse percentage widths. *//* The function returns width in characters or -1 in case of error. */intget_width(unsigned char *a, unsigned char *name, int limited){ unsigned char *value = get_attr_val(a, name); unsigned char *str = value; unsigned char *end; int percentage = 0; int len; long width; if (!value) return -1; /* Skip spaces at start of string if any. */ skip_space(str); /* Search for end of string or ',' character (ie. in "100,200") */ for (len = 0; str[len] && str[len] != ','; len++); /* Go back, and skip spaces after width if any. */ while (len && isspace(str[len - 1])) len--; if (!len) { mem_free(value); return -1; } /* Nothing to parse. */ /* Is this a percentage ? */ if (str[len - 1] == '%') len--, percentage = 1; /* Skip spaces between width number and percentage if any. */ while (len && isspace(str[len - 1])) len--; if (!len) { mem_free(value); return -1; } /* Nothing to parse. */ /* Shorten the string a bit, so strtoul() will work on useful * part of it. */ str[len] = '\0'; /* Convert to number if possible. */ errno = 0; width = strtoul((char *) str, (char **) &end, 10); /* @end points into the @value string so check @end position * before freeing @value. */ if (errno || *end || width >= INT_MAX) { /* Not a valid number. */ mem_free(value); return -1; } mem_free(value);#define WIDTH_PIXELS2CHARS(width) ((width) + (HTML_CHAR_WIDTH - 1) / 2) / HTML_CHAR_WIDTH; if (limited) { int maxwidth = get_html_max_width(); if (percentage) { /* Value is a percentage. */ width = width * maxwidth / 100; } else { /* Value is a number of pixels, makes an approximation. */ width = WIDTH_PIXELS2CHARS(width); } if (width > maxwidth) width = maxwidth; } else { if (percentage) { /* No sense, we need @limited and @maxwidth for percentage. */ return -1; } else { /* Value is a number of pixels, makes an approximation, * no limit here */ width = WIDTH_PIXELS2CHARS(width); } }#undef WIDTH_PIXELS2CHARS if (width < 0) width = 0; return width;}unsigned char *skip_comment(unsigned char *html, unsigned char *eof){ int comm = html + 4 <= eof && html[2] == '-' && html[3] == '-'; html += comm ? 4 : 2; while (html < eof) { if (!comm && html[0] == '>') return html + 1; if (comm && html + 2 <= eof && html[0] == '-' && html[1] == '-') { html += 2; while (html < eof && *html == '-') html++; while (html < eof && isspace(*html)) html++; if (html >= eof) return eof; if (*html == '>') return html + 1; continue; } html++; } return eof;}/* These should be exported properly by specific HTML parser modules * implementing them. But for now... */void html_address(unsigned char *);void html_base(unsigned char *);void html_blockquote(unsigned char *);void html_body(unsigned char *);void html_bold(unsigned char *);void html_br(unsigned char *);void html_center(unsigned char *);void html_dd(unsigned char *);void html_dl(unsigned char *);void html_dt(unsigned char *);void html_fixed(unsigned char *);void html_font(unsigned char *);void html_frame(unsigned char *);void html_frameset(unsigned char *);void html_h1(unsigned char *);void html_h2(unsigned char *);void html_h3(unsigned char *);void html_h4(unsigned char *);void html_h5(unsigned char *);void html_h6(unsigned char *);void html_head(unsigned char *);void html_html(unsigned char *);void html_hr(unsigned char *);void html_italic(unsigned char *);void html_li(unsigned char *);void html_linebrk(unsigned char *);void html_noframes(unsigned char *);void html_ol(unsigned char *);void html_p(unsigned char *);void html_pre(unsigned char *);void html_script(unsigned char *);void html_skip(unsigned char *);void html_span(unsigned char *);void html_style(unsigned char *);void html_subscript(unsigned char *);void html_superscript(unsigned char *);void html_table(unsigned char *);void html_td(unsigned char *);void html_th(unsigned char *);void html_title(unsigned char *);void html_tr(unsigned char *);void html_ul(unsigned char *);void html_underline(unsigned char *);void html_xmp(unsigned char *);struct element_info { /* Element name, uppercase. */ unsigned char *name; /* Element handler. This does the relevant arguments processing and * formatting (by calling renderer hooks). Note that in a few cases, * this is just a placeholder and the element is given special care * in start_element() (which is also where we call these handlers). */ void (*func)(unsigned char *); /* Basically something like how many line-breaks to put before * (and sometimes after) an element. Also, for various element closing * precedence heuristics, a value of zero signifies an inline element * and a non-zero value indicates a block element. */ int linebreak; /* 0 - normal pair tags * 1 - normal non-pair tags * 2 - pair tags which cannot be nested (e.g., you cannot have <a><a>) * 3 - similiar to 2 but a little stricter, seems to be a * <li>-specific hack */ int nopair;};#define NUMBER_OF_TAGS 65static struct element_info elements[] = { {"A", html_a, 0, 2}, {"ABBR", html_italic, 0, 0}, {"ADDRESS", html_address, 2, 0}, {"APPLET", html_applet, 1, 1}, {"B", html_bold, 0, 0}, {"BASE", html_base, 0, 1}, {"BASEFONT", html_font, 0, 1}, {"BLOCKQUOTE", html_blockquote,2, 0}, {"BODY", html_body, 0, 0}, {"BR", html_br, 1, 1}, {"BUTTON", html_button, 0, 0}, {"CAPTION", html_center, 1, 0}, {"CENTER", html_center, 1, 0}, {"CODE", html_fixed, 0, 0}, {"DD", html_dd, 1, 1}, {"DFN", html_bold, 0, 0}, {"DIR", html_ul, 2, 0}, {"DIV", html_linebrk, 1, 0}, {"DL", html_dl, 2, 0}, {"DT", html_dt, 1, 1}, {"EM", html_italic, 0, 0}, {"EMBED", html_embed, 0, 1}, {"FIXED", html_fixed, 0, 0}, {"FONT", html_font, 0, 0}, {"FORM", html_form, 1, 0}, {"FRAME", html_frame, 1, 1}, {"FRAMESET", html_frameset, 1, 0}, {"H1", html_h1, 2, 2}, {"H2", html_h2, 2, 2}, {"H3", html_h3, 2, 2}, {"H4", html_h4, 2, 2}, {"H5", html_h5, 2, 2}, {"H6", html_h6, 2, 2}, {"HEAD", html_head, 0, 0}, {"HR", html_hr, 2, 1}, {"HTML", html_html, 0, 0}, {"I", html_italic, 0, 0}, {"IFRAME", html_iframe, 1, 1}, {"IMG", html_img, 0, 1}, {"INPUT", html_input, 0, 1}, {"LI", html_li, 1, 3}, {"LINK", html_link, 1, 1}, {"LISTING", html_pre, 2, 0}, {"MENU", html_ul, 2, 0}, {"NOFRAMES", html_noframes, 0, 0}, {"OBJECT", html_object, 1, 1}, {"OL", html_ol, 2, 0}, {"OPTION", html_option, 1, 1}, {"P", html_p, 2, 2}, {"PRE", html_pre, 2, 0}, {"Q", html_italic, 0, 0}, {"S", html_underline, 0, 0}, {"SCRIPT", html_script, 0, 0}, {"SELECT", html_select, 0, 0}, {"SPAN", html_span, 0, 0}, {"STRIKE", html_underline, 0, 0}, {"STRONG", html_bold, 0, 0}, {"STYLE", html_style, 0, 0}, {"SUB", html_subscript, 0, 0}, {"SUP", html_superscript,0,0}, {"TABLE", html_table, 2, 0}, {"TD", html_td, 0, 0}, {"TEXTAREA", html_textarea, 0, 1}, {"TH", html_th, 0, 0}, {"TITLE", html_title, 0, 0}, {"TR", html_tr, 1, 0}, {"U", html_underline, 0, 0}, {"UL", html_ul, 2, 0}, {"XMP", html_xmp, 2, 0}, {NULL, NULL, 0, 0},};#ifndef USE_FASTFINDstatic intcompar(const void *a, const void *b){ return strcasecmp(((struct element_info *) a)->name, ((struct element_info *) b)->name);}#elsestatic struct element_info *internal_pointer;/* Reset internal list pointer */static voidtags_list_reset(void){ internal_pointer = elements;}/* Returns a pointer to a struct that contains
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -