📄 html.c
字号:
/***************************************************************************/
/*    This code is part of WWW grabber called pavuk                        */
/*    Copyright (c) 1997 - 2001 Stefan Ondrejicka                          */
/*    Distributed under GPL 2 or later                                     */
/***************************************************************************/

#include "config.h"

#include <assert.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <string.h>
#include <stdlib.h>
#include <utime.h>

#include "url.h"
#include "doc.h"
#include "html.h"
#include "htmlparser.h"
#include "gui_api.h"
#include "mime.h"
#include "errcode.h"
#include "uexit.h"

/**********************************************************************/
/* locate attribute NAME inside TAG (shared by the getter & replacer) */
/*                                                                    */
/* The name is matched case-insensitively and only at the start of    */
/* the string or right after a white space or ';' character; it must  */
/* be followed by white space or '='. Quoted strings which belong to  */
/* other attributes are skipped as a whole.                           */
/*                                                                    */
/* On success TRUE is returned and                                    */
/*   *name_end  - points right behind the attribute name              */
/*                (may be passed as NULL when not needed)             */
/*   *val_start - points to first character of the value              */
/*                (behind the opening quote if there is one)          */
/*   *val_end   - points to the closing quote, or for unquoted and    */
/*                unterminated values to the first delimiter          */
/* When the attribute is not found, FALSE is returned and the output  */
/* parameters are left untouched.                                     */
/**********************************************************************/
static bool_t html_attrib_span_locate(char *tag, char *name,
  char **name_end, char **val_start, char **val_end)
{
  size_t nlen = strlen(name);
  bool_t was_sep = TRUE;
  char *p;

  for(p = tag; *p; p++)
  {
    if(was_sep && !strncasecmp(name, p, nlen) &&
      (tl_ascii_isspace(*(p + nlen)) || (*(p + nlen) == '=')))
    {
      char *vs = p + nlen;
      char *ve;

      if(name_end)
        *name_end = vs;

      /* skip the '=' and any white space around it */
      while(*vs && (tl_ascii_isspace(*vs) || (*vs == '=')))
        vs++;

      if(*vs == '\"' || *vs == '\'')
      {
        /* quoted value: ends at the matching quote; when the  */
        /* closing quote is missing, fall back to first blank  */
        /* or '>' character                                    */
        ve = strchr(vs + 1, *vs);
        if(!ve)
          ve = vs + strcspn(vs, " \t\r\n>");
        vs++;
      }
      else
      {
        ve = vs + strcspn(vs, " \t\r\n\"\'>");
      }

      *val_start = vs;
      *val_end = ve;
      return TRUE;
    }

    was_sep = (tl_ascii_isspace(*p) != 0) || (*p == ';');

    /* jump over quoted value of some other attribute */
    if(*p == '\"' || *p == '\'')
    {
      p = strchr(p + 1, *p);
      if(!p)
        break;
    }
  }

  return FALSE;
}

/*****************************************/
/* get requested attribute from HTML tag */
/*                                       */
/* tag         - text of the tag         */
/* link_attrib - name of the attribute   */
/*                                       */
/* Returns newly allocated copy of the   */
/* attribute value with surrounding      */
/* white space stripped and all TAB, CR  */
/* and LF characters removed, or NULL    */
/* when the attribute is not present     */
/* (or when any argument is NULL).       */
/* The caller owns the returned string.  */
/*****************************************/
char *html_get_attrib_from_tag(char *tag, char *link_attrib)
{
  char *vstart;
  char *vend;
  char *retval;

  if(!tag || !link_attrib)
    return NULL;

  if(!html_attrib_span_locate(tag, link_attrib, NULL, &vstart, &vend))
    return NULL;

  /* strip leading/trailing spaces */
  while(tl_ascii_isspace(*vstart))
    vstart++;
  while(vend > vstart && tl_ascii_isspace(*(vend - 1)))
    vend--;

  /* to workaround broken tags which are missing closing */
  /* quotes and contain leading space characters         */
  if(vstart > vend)
    vend = vstart + strcspn(vstart, "> ");

  retval = tl_strndup(vstart, vend - vstart);
  if(retval)
    omit_chars(retval, "\t\n\r");

  return retval;
}

/********************************************************/
/* overwrite content of specified attribute in HTML tag */
/*                                                      */
/* tag         - text of the tag, modified in place     */
/* link_attrib - name of the attribute to overwrite     */
/* urlin       - new value for the attribute            */
/* pare        - when zero the new value is enclosed in */
/*               double quotes, otherwise it is written */
/*               bare                                   */
/*                                                      */
/* Nothing is done when the attribute is not found or   */
/* when any pointer argument is NULL.                   */
/*                                                      */
/* The attribute is located with the same rules as in   */
/* html_get_attrib_from_tag(), including ';' acting as  */
/* separator, so a value that can be read can also be   */
/* rewritten.                                           */
/*                                                      */
/* WARNING: the tag is rewritten in place using         */
/* strcpy()/strcat() and no capacity check is possible  */
/* here; the caller has to provide buffer large enough  */
/* to hold the tag after the value was replaced.        */
/********************************************************/
void html_replace_url_in_stack(char *tag, char *link_attrib, char *urlin,
  int pare)
{
  char *name_end;
  char *vstart;
  char *vend;
  char *tail;

  if(!tag || !link_attrib || !urlin)
    return;

  if(!html_attrib_span_locate(tag, link_attrib, &name_end, &vstart, &vend))
    return;

  /* remember everything what follows the old value; */
  /* the old closing quote (if any) is dropped       */
  if(*vend == '\'' || *vend == '\"')
    tail = tl_strdup(vend + 1);
  else
    tail = tl_strdup(vend);

  if(!tail)
    return;

  /* rebuild the tag right behind the attribute name */
  strcpy(name_end, pare ? "=" : "=\"");
  strcat(name_end, urlin);
  if(!pare)
    strcat(name_end, "\"");
  strcat(name_end, tail);

  _free(tail);
}

/******************************************/
/* look if tag contains specified element */
/*                                        */
/* The element name is matched without    */
/* regard to case, at start of the tag or */
/* after white space or ';', and must be  */
/* followed by white space, '=' or end of */
/* the string. Quoted strings are skipped.*/
/*                                        */
/* Returns TRUE when found, else FALSE    */
/* (also when any argument is NULL).      */
/******************************************/
int html_tag_co_elem(char *tag, char *elem)
{
  size_t elen;
  bool_t was_sep = TRUE;
  char *p;

  if(!tag || !elem)
    return FALSE;

  elen = strlen(elem);

  for(p = tag; *p; p++)
  {
    if(was_sep && !strncasecmp(elem, p, elen))
    {
      /* safe to look behind the name - all elen characters matched */
      char next = *(p + elen);

      if(!next || (next == '=') || tl_ascii_isspace(next))
        return TRUE;
    }

    was_sep = (tl_ascii_isspace(*p) != 0) || (*p == ';');

    /* jump over quoted attribute value */
    if(*p == '\"' || *p == '\'')
    {
      p = strchr(p + 1, *p);
      if(!p)
        break;
    }
  }

  return FALSE;
}

/**********************************************************/
/* determine base URL for document looking at request URL */
/*                                                        */
/* urlp  - URL of the document                            */
/* base  - receives newly allocated string: the URL cut   */
/*         at its last '?' and, unless tl_is_dirname()    */
/*         accepts it, also cut right behind last '/'     */
/* baset - receives the string from url_to_urlstr() cut   */
/*         at its last '#'                                */
/*                                                        */
/* Both strings have to be released by the caller.        */
/*                                                        */
/* NOTE(review): *base is duplicated before the '#' is    */
/* cut from *baset, so a fragment - if url_to_urlstr()    */
/* can return one here - would stay in *base. Confirm.    */
/**********************************************************/
static void html_get_init_base_url(url * urlp, char **base, char **baset)
{
  char *cut;

  *baset = url_to_urlstr(urlp, FALSE);
  *base = tl_strdup(*baset);

  cut = strrchr(*baset, '#');
  if(cut)
    *cut = '\0';

  DEBUG_HTML("BASE URL - %s\n", *base);

  cut = strrchr(*base, '?');
  if(cut)
    *cut = '\0';

  if(!tl_is_dirname(*base))
  {
    /* keep only the directory part, including trailing '/' */
    cut = strrchr(*base, '/');
    if(cut)
      *(cut + 1) = '\0';
  }
}

/********************************************************************/
/* determine base URL for document looking on request URL && server */
/* response header fields Content-Location: & Content-Base: & Base: */
/*                                                                  */
/* docp  - document; its doc_url and mime members are used          */
/* base  - receives newly allocated base URL string                 */
/* baset - receives newly allocated string as set by                */
/*         html_get_init_base_url() for the request URL; it is      */
/*         not changed by the response header                       */
/*                                                                  */
/* First of the three header fields which is present wins. When it  */
/* resolves to URL with unsupported protocol, the raw header value  */
/* is used as base and a message is printed.                        */
/********************************************************************/
static void html_get_base_url(doc * docp, char **base, char **baset)
{
  char *hdr;
  char *abs_url;
  char *scratch = NULL;
  url *urlp;

  html_get_init_base_url(docp->doc_url, base, baset);

  /* get possible base URL from server response header */
  if(!docp->mime)
    return;

  hdr = get_mime_param_val_str("Content-Location:", docp->mime);
  if(!hdr)
    hdr = get_mime_param_val_str("Content-Base:", docp->mime);
  if(!hdr)
    hdr = get_mime_param_val_str("Base:", docp->mime);
  if(!hdr)
    return;

  abs_url = url_to_absolute_url(*base, *baset, docp->doc_url, hdr);
  urlp = url_parse(abs_url);

  assert(urlp->type != URLT_FROMPARENT);

  if(!prottable[urlp->type].supported)
  {
    xprintf(1, gettext("Unsupported BASE URL - %s (probably bad handled)\n"),
      hdr);
    _free(*base);
    *base = tl_strdup(hdr);
  }
  else
  {
    /* only the new base is wanted, second result is thrown away */
    _free(*base);
    html_get_init_base_url(urlp, base, &scratch);
    _free(scratch);
  }

  free_deep_url(urlp);
  _free(urlp);
  _free(abs_url);
  /* header value is owned by us - release it on both paths */
  _free(hdr);
}

/*******************************************************/
/* parse HTML document and extract URLs from it and if */
/* requested, also adjust content of document          */
/*******************************************************/
dllist *html_process_document(doc * html_doc, dllist ** formlist)
{
  char *base, *baset;
  html_parser_t *hp;
  html_extract_info_t einfo;
  html_rewrite_info_t rinfo;
  html_robots_info_t oinfo;
  int rewrite;
  int purestyle;
  int purescript;
  int follow = TRUE;

  /** call the -follow_cmd script **/
  if(priv_cfg.condition.follow_cmd)
  {
    int rv = uexit_follow_cmd(html_doc);
    if(rv == 0)
      follow = FALSE;
  }

  purestyle = (html_doc->doc_url->status & URL_STYLE);
  purescript = (html_doc->doc_url->status & URL_ISSCRIPT);

  einfo.prev_a = NULL;
  einfo.urls = NULL;
  einfo.no_limits = (cfg.mode == MODE_FTPDIR) || (cfg.dump_urlfd >= 0);
  einfo.only_inline = (cfg.mode == MODE_SINGLE) || cfg.singlepage;
  einfo.enable_js = cfg.enable_js;

  rinfo.einfo = &einfo;
  rinfo.all_to_local = cfg.all_to_local;
  rinfo.selected_to_local = cfg.sel_to_local;
  rinfo.all_to_remote = cfg.all_to_remote;

  oinfo.index = TRUE;
  oinfo.follow = TRUE;
  oinfo.images = TRUE;

  rewrite = cfg.rewrite_links && cfg.mode != MODE_FTPDIR;

  hp = html_parser_init(html_link_tags, html_link_tags_num(), rewrite, purestyle, purescript);

  /** urls in script are relative to HTML document     **/
  /** where it is called not relative to script itself **/
  if(purescript && html_doc->doc_url->parent_url)
    html_get_init_base_url((url *) html_doc->doc_url->parent_url->data, &base, &baset);
  else
    html_get_base_url(html_doc, &base, &baset);

  html_parser_set_base(hp, base, baset);
  html_parser_set_document(hp, html_doc->doc_url, html_doc->contents, html_doc->size);

  html_parser_add_tag_func(hp, html_parser_parse_tag, NULL);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -