📄 parser.c
字号:
/* HTML parser *//* $Id: parser.c,v 1.513.2.10 2005/05/01 22:47:55 jonas Exp $ */#ifdef HAVE_CONFIG_H#include "config.h"#endif#include <errno.h>#include <stdarg.h>#include <stdio.h>#include <stdlib.h>#include <string.h>#include "elinks.h"#include "main.h"#include "bfu/listmenu.h"#include "bfu/menu.h"#include "document/css/apply.h"#include "document/css/css.h"#include "document/css/stylesheet.h"#include "document/html/frames.h"#include "document/html/parser/link.h"#include "document/html/parser/stack.h"#include "document/html/parser/parse.h"#include "document/html/parser.h"#include "document/html/renderer.h"#include "document/html/tables.h"#include "document/renderer.h"#include "intl/charsets.h"#include "osdep/ascii.h"#include "protocol/date.h"#include "protocol/header.h"#include "protocol/uri.h"#include "sched/session.h"#include "sched/task.h"#include "terminal/draw.h"#include "terminal/terminal.h"#include "util/color.h"#include "util/conv.h"#include "util/error.h"#include "util/memdebug.h"#include "util/memlist.h"#include "util/memory.h"#include "util/string.h"#include "viewer/text/form.h"#include "viewer/text/link.h"#include "viewer/text/view.h"/* Unsafe macros */#include "document/html/internal.h"/* TODO: This needs rewrite. Yes, no kidding. */static struct { int n; unsigned char *s;} roman_tbl[] = { {1000, "m"}, {999, "im"}, {990, "xm"}, {900, "cm"}, {500, "d"}, {499, "id"}, {490, "xd"}, {400, "cd"}, {100, "c"}, {99, "ic"}, {90, "xc"}, {50, "l"}, {49, "il"}, {40, "xl"}, {10, "x"}, {9, "ix"}, {5, "v"}, {4, "iv"}, {1, "i"}, {0, NULL}};static voidroman(unsigned char *p, unsigned n){ int i = 0; if (n >= 4000) { strcpy(p, "---"); return; } if (!n) { strcpy(p, "o"); return; } p[0] = 0; while (n) { while (roman_tbl[i].n <= n) { n -= roman_tbl[i].n; strcat(p, roman_tbl[i].s); } i++; assertm(!(n && !roman_tbl[i].n), "BUG in roman number convertor"); if_assert_failed break; }}static intget_color(unsigned char *a, unsigned char *c, color_t *rgb){ unsigned char *at; int r; if (!use_document_fg_colors(global_doc_opts)) return -1; at = get_attr_val(a, c); if (!at) return -1; r = decode_color(at, strlen(at), rgb); mem_free(at); return r;}intget_bgcolor(unsigned char *a, color_t *rgb){ if (!use_document_bg_colors(global_doc_opts)) return -1; return get_color(a, "bgcolor", rgb);}unsigned char *get_target(unsigned char *a){ unsigned char *v = get_attr_val(a, "target"); if (v) { if (!strcasecmp(v, "_self")) { mem_free(v); v = stracpy(global_doc_opts->framename); } } return v;}struct html_context html_context = {#ifdef CONFIG_CSS INIT_CSS_STYLESHEET(html_context.css_styles, import_css_stylesheet),#endif};voidln_break(int n, void (*line_break)(struct part *), struct part *part){ if (!n || html_top.invisible) return; while (n > html_context.line_breax) { html_context.line_breax++; line_break(part); } html_context.position = 0; html_context.putsp = -1;}voidput_chrs(unsigned char *start, int len, void (*put_chars)(struct part *, unsigned char *, int), struct part *part){ if (html_is_preformatted()) html_context.putsp = 0; if (!len || html_top.invisible) return; if (html_context.putsp == 1) { put_chars(part, " ", 1); html_context.position++; html_context.putsp = -1; } if (html_context.putsp == -1) { html_context.putsp = 0; if (isspace(start[0])) { start++, len--; if (!len) { if (!html_is_preformatted()) html_context.putsp = -1; return; } } } if (isspace(start[len - 1]) && !html_is_preformatted()) html_context.putsp = -1; html_context.was_br = 0; put_chars(part, start, len); html_context.position += len; html_context.line_breax = 0; if (html_context.was_li > 0) html_context.was_li--;}voidset_fragment_identifier(unsigned char *attr_name, unsigned char *attr){ unsigned char *id_attr = get_attr_val(attr_name, attr); if (id_attr) { html_context.special_f(html_context.part, SP_TAG, id_attr); mem_free(id_attr); }}voidadd_fragment_identifier(struct part *part, unsigned char *attr){ html_context.special_f(part, SP_TAG, attr);}#ifdef CONFIG_CSSvoidimport_css_stylesheet(struct css_stylesheet *css, struct uri *base_uri, unsigned char *url, int len){ unsigned char *import_url; struct uri *uri; assert(base_uri); if (!global_doc_opts->css_enable || !global_doc_opts->css_import) return; url = memacpy(url, len); if (!url) return; /* HTML <head> urls should already be fine but we can.t detect them. */ import_url = join_urls(base_uri, url); mem_free(url); if (!import_url) return; uri = get_uri(import_url, URI_BASE); mem_free(import_url); if (!uri) return; /* Request the imported stylesheet as part of the document ... */ html_context.special_f(html_context.part, SP_STYLESHEET, uri); /* ... and then attempt to import from the cache. */ import_css(css, uri); done_uri(uri);}#endifvoidhtml_span(unsigned char *a){}voidhtml_bold(unsigned char *a){ format.style.attr |= AT_BOLD;}voidhtml_italic(unsigned char *a){ format.style.attr |= AT_ITALIC;}voidhtml_underline(unsigned char *a){ format.style.attr |= AT_UNDERLINE;}voidhtml_fixed(unsigned char *a){ format.style.attr |= AT_FIXED;}voidhtml_subscript(unsigned char *a){ format.style.attr |= AT_SUBSCRIPT;}voidhtml_superscript(unsigned char *a){ format.style.attr |= AT_SUPERSCRIPT;}/* Extract the extra information that is available for elements which can * receive focus. Call this from each element which supports tabindex or * accesskey. *//* Note that in ELinks, we support those attributes (I mean, we call this * function) while processing any focusable element (otherwise it'd have zero * tabindex, thus messing up navigation between links), thus we support these * attributes even near tags where we're not supposed to (like IFRAME, FRAME or * LINK). I think this doesn't make any harm ;). --pasky */voidhtml_focusable(unsigned char *a){ unsigned char *accesskey; int tabindex; format.accesskey = 0; format.tabindex = 0x80000000; if (!a) return; accesskey = get_attr_val(a, "accesskey"); if (accesskey) { if (*accesskey) { accesskey[0] = toupper(accesskey[0]); format.accesskey = read_key(accesskey); } mem_free(accesskey); } tabindex = get_num(a, "tabindex"); if (0 < tabindex && tabindex < 32767) { format.tabindex = (tabindex & 0x7fff) << 16; } mem_free_if(format.onclick); format.onclick = get_attr_val(a, "onclick"); mem_free_if(format.ondblclick); format.ondblclick = get_attr_val(a, "ondblclick"); mem_free_if(format.onmouseover); format.onmouseover = get_attr_val(a, "onmouseover"); mem_free_if(format.onhover); format.onhover = get_attr_val(a, "onhover"); mem_free_if(format.onfocus); format.onfocus = get_attr_val(a, "onfocus"); mem_free_if(format.onmouseout); format.onmouseout = get_attr_val(a, "onmouseout"); mem_free_if(format.onblur); format.onblur = get_attr_val(a, "onblur");}voidhtml_font(unsigned char *a){ unsigned char *al = get_attr_val(a, "size"); if (al) { int p = 0; unsigned s; unsigned char *nn = al; unsigned char *end; if (*al == '+') p = 1, nn++; else if (*al == '-') p = -1, nn++; errno = 0; s = strtoul(nn, (char **) &end, 10); if (!errno && *nn && !*end) { if (s > 7) s = 7; if (!p) format.fontsize = s; else format.fontsize += p * s; if (format.fontsize < 1) format.fontsize = 1; else if (format.fontsize > 7) format.fontsize = 7; } mem_free(al); } get_color(a, "color", &format.style.fg);}voidhtml_body(unsigned char *a){ get_color(a, "text", &format.style.fg); get_color(a, "link", &format.clink); get_color(a, "vlink", &format.vlink); get_bgcolor(a, &format.style.bg);#ifdef CONFIG_CSS /* If there are any CSS twaks regarding bgcolor, make sure we will get * it _and_ prefer it over bgcolor attribute. */ if (global_doc_opts->css_enable) css_apply(&html_top, &html_context.css_styles, &html_context.stack);#endif if (par_format.bgcolor != format.style.bg) { /* Modify the root HTML element - format_html_part() will take * this from there. */ struct html_element *e = html_context.stack.prev; e->parattr.bgcolor = e->attr.style.bg = par_format.bgcolor = format.style.bg; } if (html_context.has_link_lines && par_format.bgcolor && !search_html_stack("BODY")) { html_context.special_f(html_context.part, SP_COLOR_LINK_LINES); }}voidhtml_skip(unsigned char *a){ html_top.invisible = 1; html_top.type = ELEMENT_DONT_KILL;}#ifdef CONFIG_ECMASCRIPTintdo_html_script(unsigned char *a, unsigned char *html, unsigned char *eof, unsigned char **end, struct part *part){ /* TODO: <noscript> processing. Well, same considerations apply as to * CSS property display: none processing. */ /* TODO: Charsets for external scripts. */ unsigned char *type, *language, *src; int in_comment = 0; html_skip(a); /* We try to process nested <script> if we didn't process the parent * one. That's why's all the fuzz. */ type = get_attr_val(a, "type"); if (type && strcasecmp(type, "text/javascript")) { mem_free(type);not_processed: /* Permit nested scripts and retreat. */ html_top.invisible++; return 1; } if (type) mem_free(type); /* Check that the script content is ecmascript. The value of the * language attribute can be JavaScript with optional version digits * postfixed (like: ``JavaScript1.1''). */ language = get_attr_val(a, "language"); if (language) { int languagelen = strlen(language); if (languagelen < 10 || (languagelen > 10 && !isdigit(language[10])) || strncasecmp(language, "javascript", 10)) { mem_free(language); goto not_processed; } mem_free(language); } if (part->document && (src = get_attr_val(a, "src"))) { /* External reference. */ unsigned char *import_url; struct uri *uri; if (!get_opt_bool("ecmascript.enable")) { mem_free(src); goto not_processed; } /* HTML <head> urls should already be fine but we can.t detect them. */ import_url = join_urls(html_context.base_href, src); mem_free(src); if (!import_url) goto imported; uri = get_uri(import_url, URI_BASE); if (!uri) goto imported; /* Request the imported script as part of the document ... */ html_context.special_f(html_context.part, SP_SCRIPT, uri); done_uri(uri); /* Create URL reference onload snippet. */ insert_in_string(&import_url, 0, "^", 1); add_to_string_list(&part->document->onload_snippets, import_url, -1);imported: /* Retreat. Do not permit nested scripts, tho'. */ if (import_url) mem_free(import_url); return 1; } /* Positive, grab the rest and interpret it. */ /* First position to the real script start. */ while (html < eof && *html <= ' ') html++; if (eof - html > 4 && !strncmp(html, "<!--", 4)) { in_comment = 1; /* We either skip to the end of line or to -->. */ for (; *html != '\n' && *html != '\r' && eof - html >= 3; html++) { if (!strncmp(html, "-->", 3)) { /* This means the document is probably broken. * We will now try to process the rest of * <script> contents, which is however likely * to be empty. Should we try to process the * comment too? Currently it seems safer but * less tolerant to broken pages, if there are * any like this. */ html += 3; in_comment = 0; break; } } } *end = html; /* Now look ahead for the script end. The <script> contents is raw * CDATA, so we just look for the ending tag and need not care for * any quote marks counting etc - YET, we are more tolerant and permit * </script> stuff inside of the script if the whole <script> element * contents is wrapped in a comment. See i.e. Mozilla bug 26857 for fun * reading regarding this. */ for (; *end < eof; (*end)++) { unsigned char *name; int namelen; if (in_comment) { /* TODO: If we ever get some standards-quirk mode * distinction, this should be disabled in the * standards mode (and we should just look for CDATA * end, which is "</"). --pasky */ if (eof - *end >= 3 && !strncmp(*end, "-->", 3)) { /* Next iteration will jump passed the ending '>' */ (*end) += 2; in_comment = 0; } continue; /* XXX: Scan for another comment? That's admittelly * already stretching things a little bit to an * extreme ;-). */ } if (**end != '<') continue; /* We want to land before the closing element, that's why we * don't pass @end also as the appropriate parse_element()
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -