📄 parse.c
字号:
/* HTML core parser routines */#ifdef HAVE_CONFIG_H#include "config.h"#endif#include <errno.h>#include <stdarg.h>#include <stdio.h>#include <stdlib.h>#include <string.h>#include "elinks.h"#include "document/css/apply.h"#include "document/css/parser.h"#include "document/html/parser/forms.h"#include "document/html/parser/general.h"#include "document/html/parser/link.h"#include "document/html/parser/parse.h"#include "document/html/parser/stack.h"#include "document/html/parser.h"#include "document/options.h"#include "intl/charsets.h"#include "util/conv.h"#include "util/error.h"#include "util/fastfind.h"#include "util/memdebug.h"#include "util/memory.h"#include "util/string.h"/* Unsafe macros */#include "document/html/internal.h"#define end_of_tag(c) ((c) == '>' || (c) == '<')static inline intatchr(register unsigned char c){ return (c < 127 && (c > '>' || (c > ' ' && c != '=' && !end_of_tag(c))));}/* This function eats one html element. *//* - e is pointer to the begining of the element (*e must be '<') * - eof is pointer to the end of scanned area * - parsed element name is stored in name, it's length is namelen * - first attribute is stored in attr * - end points to first character behind the html element *//* It returns -1 when it failed (returned values in pointers are invalid) and * 0 for success. */intparse_element(register unsigned char *e, unsigned char *eof, unsigned char **name, int *namelen, unsigned char **attr, unsigned char **end){#define next_char() if (++e == eof) return -1; assert(e && eof); if (e >= eof || *e != '<') return -1; next_char(); if (name) *name = e; if (*e == '/') next_char(); if (!isident(*e)) return -1; while (isident(*e)) next_char(); if (!isspace(*e) && !end_of_tag(*e) && *e != '/' && *e != ':' && *e != '=') return -1; if (name && namelen) *namelen = e - *name; while (isspace(*e) || *e == '/' || *e == ':') next_char(); /* Skip bad attribute */ while (!atchr(*e) && !end_of_tag(*e) && !isspace(*e)) next_char(); if (attr) *attr = e;next_attr: while (isspace(*e)) next_char(); /* Skip bad attribute */ while (!atchr(*e) && !end_of_tag(*e) && !isspace(*e)) next_char(); if (end_of_tag(*e)) goto end; while (atchr(*e)) next_char(); while (isspace(*e)) next_char(); if (*e != '=') { if (end_of_tag(*e)) goto end; goto next_attr; } next_char(); while (isspace(*e)) next_char(); if (isquote(*e)) { unsigned char quote = *e;/* quoted_value: */ next_char(); while (*e != quote) next_char(); next_char(); /* The following apparently handles the case of <foo * id="a""b">, however that is very rare and probably not * conforming. More frequent (and mishandling it more fatal) is * probably the typo of <foo id="a""> - we can handle it as * long as this is commented out. --pasky */ /* if (*e == quote) goto quoted_value; */ } else { while (!isspace(*e) && !end_of_tag(*e)) next_char(); } while (isspace(*e)) next_char(); if (!end_of_tag(*e)) goto next_attr;end: if (end) *end = e + (*e == '>'); return 0;}#define realloc_chrs(x, l) mem_align_alloc(x, l, (l) + 1, unsigned char, 0xFF)#define add_chr(s, l, c) \ do { \ if (!realloc_chrs(&(s), l)) return NULL; \ (s)[(l)++] = (c); \ } while (0)unsigned char *get_attr_value(register unsigned char *e, unsigned char *name, struct document_options *options, enum html_attr_flags flags){ unsigned char *n; unsigned char *name_start; unsigned char *attr = NULL; int attrlen = 0; int found;next_attr: skip_space(e); if (end_of_tag(*e) || !atchr(*e)) goto parse_error; n = name; name_start = e; while (atchr(*n) && atchr(*e) && toupper(*e) == toupper(*n)) e++, n++; found = !*n && !atchr(*e); if (found && (flags & HTML_ATTR_TEST)) return name_start; while (atchr(*e)) e++; skip_space(e); if (*e != '=') { if (found) goto found_endattr; goto next_attr; } e++; skip_space(e); if (found) { if (!isquote(*e)) { while (!isspace(*e) && !end_of_tag(*e)) { if (!*e) goto parse_error; add_chr(attr, attrlen, *e); e++; } } else { unsigned char quote = *e;/* parse_quoted_value: */ while (*(++e) != quote) { if (*e == ASCII_CR) continue; if (!*e) goto parse_error; if (*e != ASCII_TAB && *e != ASCII_LF) add_chr(attr, attrlen, *e); else if (!(flags & HTML_ATTR_EAT_NL)) add_chr(attr, attrlen, ' '); } e++; /* The following apparently handles the case of <foo * id="a""b">, however that is very rare and probably * not conforming. More frequent (and mishandling it * more fatal) is probably the typo of <foo id="a""> - * we can handle it as long as this is commented out. * --pasky */#if 0 if (*e == quote) { add_chr(attr, attrlen, *e); goto parse_quoted_value; }#endif }found_endattr: add_chr(attr, attrlen, '\0'); attrlen--; if (/* Unused: !(flags & HTML_ATTR_NO_CONV) && */ memchr(attr, '&', attrlen)) { unsigned char *saved_attr = attr; attr = convert_string(NULL, saved_attr, attrlen, options->cp, CSM_QUERY, NULL, NULL, NULL); mem_free(saved_attr); } set_mem_comment(trim_chars(attr, ' ', NULL), name, strlen(name)); return attr; } else { if (!isquote(*e)) { while (!isspace(*e) && !end_of_tag(*e)) { if (!*e) goto parse_error; e++; } } else { unsigned char quote = *e; do { while (*(++e) != quote) if (!*e) goto parse_error; e++; } while (/* See above. *e == quote */ 0); } } goto next_attr;parse_error: mem_free_if(attr); return NULL;}#undef add_chr/* Extract numerical value of attribute @name. * It will return a positive integer value on success, * or -1 on error. */intget_num(unsigned char *a, unsigned char *name, struct document_options *options){ unsigned char *al = get_attr_val(a, name, options); int result = -1; if (al) { unsigned char *end; long num; errno = 0; num = strtol(al, (char **) &end, 10); if (!errno && *al && !*end && num >= 0 && num <= INT_MAX) result = (int) num; mem_free(al); } return result;}/* Parse 'width[%],....'-like attribute @name of element @a. If @limited is * set, it will limit the width value to the current usable width. Note that * @limited must be set to be able to parse percentage widths. *//* The function returns width in characters or -1 in case of error. */intget_width(unsigned char *a, unsigned char *name, int limited, struct html_context *html_context){ unsigned char *value = get_attr_val(a, name, html_context->options); unsigned char *str = value; unsigned char *end; int percentage = 0; int len; long width; if (!value) return -1; /* Skip spaces at start of string if any. */ skip_space(str); /* Search for end of string or ',' character (ie. in "100,200") */ for (len = 0; str[len] && str[len] != ','; len++); /* Go back, and skip spaces after width if any. */ while (len && isspace(str[len - 1])) len--; if (!len) { mem_free(value); return -1; } /* Nothing to parse. */ /* Is this a percentage ? */ if (str[len - 1] == '%') len--, percentage = 1; /* Skip spaces between width number and percentage if any. */ while (len && isspace(str[len - 1])) len--; if (!len) { mem_free(value); return -1; } /* Nothing to parse. */ /* Shorten the string a bit, so strtoul() will work on useful * part of it. */ str[len] = '\0'; /* Convert to number if possible. */ errno = 0; width = strtoul((char *) str, (char **) &end, 10); /* @end points into the @value string so check @end position * before freeing @value. */ if (errno || *end || width >= INT_MAX) { /* Not a valid number. */ mem_free(value); return -1; } mem_free(value);#define WIDTH_PIXELS2CHARS(width) ((width) + (HTML_CHAR_WIDTH - 1) / 2) / HTML_CHAR_WIDTH; if (limited) { int maxwidth = get_html_max_width(); if (percentage) { /* Value is a percentage. */ width = width * maxwidth / 100; } else { /* Value is a number of pixels, makes an approximation. */ width = WIDTH_PIXELS2CHARS(width); } if (width > maxwidth) width = maxwidth; } else { if (percentage) { /* No sense, we need @limited and @maxwidth for percentage. */ return -1; } else { /* Value is a number of pixels, makes an approximation, * no limit here */ width = WIDTH_PIXELS2CHARS(width); } }#undef WIDTH_PIXELS2CHARS if (width < 0) width = 0; return width;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -