/* doc.c */
/***************************************************************************/
/*    This code is part of WWW grabber called pavuk                        */
/*    Copyright (c) 1997 - 2001 Stefan Ondrejicka                          */
/*    Distributed under GPL 2 or later                                     */
/***************************************************************************/

#include "config.h"

#include <unistd.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/stat.h>
#ifdef HAVE_SYS_PARAM_H
#include <sys/param.h>
#endif
#ifdef HAVE_FSTATVFS
#ifdef HAVE_SYS_STATVFS_H
#include <sys/statvfs.h>
#endif
#else
#ifdef HAVE_FSTATFS
#ifdef HAVE_SYS_STATFS_H
#include <sys/statfs.h>
#endif
#ifdef HAVE_SYS_VFS_H
#include <sys/vfs.h>
#endif
#ifdef HAVE_SYS_MOUNT_H
#include <sys/mount.h>
#endif
#endif
#endif
#include <sys/time.h>
#include <time.h>
#include <utime.h>

#include "url.h"
#include "doc.h"
#include "tools.h"
#include "mime.h"
#include "http.h"
#include "ftp.h"
#include "gopher.h"
#include "decode.h"
#include "abstract.h"
#include "mode.h"
#include "times.h"
#include "dinfo.h"
#include "errcode.h"
#include "log.h"
#include "gui_api.h"
#include "html.h"

#ifdef I_FACE
static void doc_set_info(doc *);
#endif
static void show_progress(doc *, ssize_t, int);
static double compute_speed_rate(time_t, ssize_t);

/*
 * Reset the per-transfer fields of docu before a download starts.
 *
 * Besides clearing sizes, flags and timers, this
 *   - decides whether the transfer is HTTP-style (plain HTTP/HTTPS, or
 *     FTP/gopher routed through an HTTP proxy),
 *   - picks the proxy host/port for such transfers when docu does not
 *     already carry one,
 *   - when cfg.dumpfd is set, either forces in-memory loading
 *     (cfg.dump_after) or duplicates the dump descriptor into
 *     docu->s_sock for online storing.
 *
 * load is stored in docu->load (and overridden to TRUE when dumping
 * after the transfer).
 *
 * Returns 0 on success, -1 with docu->errcode = ERR_STORE_DOC when
 * bufio_dupfd() fails.
 */
int doc_download_init(doc * docu, int load)
{
  docu->remove_lock = TRUE;
  docu->lock_fn = NULL;
  docu->is_parsable = cfg.enable_js &&
    (docu->doc_url->status & URL_ISSCRIPT);
  docu->contents = NULL;
  docu->mime = NULL;
  docu->type_str = NULL;
  docu->save_online = FALSE;
  docu->size = 0;
  docu->current_size = 0;
  docu->totsz = -1;
  docu->origsize = 0;
  docu->rest_pos = 0;
  docu->stime = time(NULL);
  docu->s_sock = NULL;
  docu->is_http11 = FALSE;
  docu->is_chunked = FALSE;
  docu->is_persistent = FALSE;
  docu->read_chunksize = FALSE;
  docu->read_trailer = FALSE;
  docu->doreget = FALSE;
  docu->origtime = docu->dtime;
  docu->adj_sz = 0;
  docu->load = load;
  docu->http_proxy_10 = FALSE;
  docu->ftp_data_con_finished = FALSE;
  docu->num_auth = cfg.auth_reuse_nonce ? 1 : 0;
  docu->num_proxy_auth = cfg.auth_reuse_proxy_nonce ? 1 : 0;
  docu->is_http_transfer =
    docu->doc_url->type == URLT_HTTP ||
    docu->doc_url->type == URLT_HTTPS ||
    (docu->doc_url->type == URLT_FTP &&
    priv_cfg.ftp_proxy && cfg.ftp_via_http && !cfg.ftp_dirtyp) ||
    (docu->doc_url->type == URLT_GOPHER &&
    priv_cfg.gopher_proxy && cfg.gopher_via_http);

  /*** just default value, later will be assigned properly ***/
  docu->request_type = HTTP_REQ_UNKNOWN;

  /* choose a proxy only if the caller did not preset one */
  if(docu->is_http_transfer && !docu->http_proxy)
  {
    char *proxy = NULL;
    unsigned short port = 0;

    switch (docu->doc_url->type)
    {
    case URLT_HTTP:
      {
        http_proxy *pr = NULL;

        LOCK_PROXY;
        pr = http_proxy_get();
        if(pr)
        {
          http_proxy_check(pr, docu);
          proxy = tl_strdup(pr->addr);
          port = pr->port;
          docu->http_proxy_10 = (pr->is_10 != 0);
        }
        UNLOCK_PROXY;
      }
      break;
#ifdef USE_SSL
    case URLT_HTTPS:
      if(priv_cfg.ssl_proxy)
      {
        proxy = tl_strdup(priv_cfg.ssl_proxy);
        port = cfg.ssl_proxy_port;
      }
      break;
#endif
    case URLT_FTP:
      if(priv_cfg.ftp_proxy)
      {
        proxy = tl_strdup(priv_cfg.ftp_proxy);
        port = cfg.ftp_proxy_port;
      }
      break;
    case URLT_GOPHER:
      if(priv_cfg.gopher_proxy)
      {
        proxy = tl_strdup(priv_cfg.gopher_proxy);
        port = cfg.gopher_proxy_port;
      }
      break;
    default:
      proxy = NULL;
      port = 0;
      break;
    }
    docu->http_proxy = proxy;
    docu->http_proxy_port = port;
  }

  if(cfg.dumpfd >= 0)
  {
    docu->remove_lock = FALSE;
    if(cfg.dump_after)
    {
      /* document is kept in memory and dumped after the transfer */
      docu->load = TRUE;
      docu->save_online = FALSE;
      docu->s_sock = NULL;
    }
    else
    {
      /* data is written to a duplicate of the dump descriptor as it arrives */
      docu->save_online = TRUE;
      docu->s_sock = bufio_dupfd(cfg.dumpfd);
      if(!docu->s_sock)
      {
        xperror("bufio_dupfd()");
        docu->errcode = ERR_STORE_DOC;
        return -1;
      }
    }
  }

  gettimeofday(&docu->hr_start_time, NULL);
  timerclear(&docu->redirect_time);
  timerclear(&docu->dns_time);
  timerclear(&docu->connect_time);
  timerclear(&docu->first_byte_time);
  timerclear(&docu->end_time);

  return 0;
}

/*
 * Check the configured limits after another len bytes of docu were
 * received (totallen bytes so far in this transfer).
 *
 * Checks, in order: minimal transfer rate, maximal run time, per-file
 * size quota, total transfer quota and (where fstatfs()/fstatvfs() is
 * available) free space on the filesystem holding docu->s_sock.
 * len is added to cfg.trans_size for non-file, non-redirected URLs.
 *
 * Every violated limit marks the connection as not reusable and sets
 * docu->errcode; a later check overwrites the result of an earlier one.
 *
 * Returns 0 when no limit was hit, 1 when the last limit hit was the
 * per-file quota, -1 for any other limit.
 */
static int doc_check_quotas(doc * docu, ssize_t len, ssize_t totallen)
{
  int retcode = 0;

/*
 * Expands to two unbraced if statements; use it only as a full
 * statement inside a braced block.
 */
#define KILL_PERSISTANT_CONNECTION \
  if(docu->doc_url->type == URLT_FTP || docu->doc_url->type == URLT_FTPS) \
    docu->ftp_fatal_err = TRUE; \
  if(docu->is_http11) \
    docu->is_persistent = FALSE;

  if(cfg.minrate > 0.0 &&
    (docu->doc_url->type != URLT_FILE &&
    !(docu->doc_url->status & URL_REDIRECT)))
  {
    time_t _tm = doc_etime(docu, FALSE);
    double _rt = compute_speed_rate(_tm, totallen);

    if(_rt < (cfg.minrate * 1024.0))
    {
      KILL_PERSISTANT_CONNECTION;
      docu->errcode = ERR_LOW_TRANSFER_RATE;
      retcode = -1;
    }
  }

  if(cfg.max_time > 0.0)
  {
    if((cfg.start_time + (int) (60.0 * cfg.max_time)) < time(NULL))
    {
      KILL_PERSISTANT_CONNECTION;
      docu->errcode = ERR_QUOTA_TIME;
      retcode = -1;
    }
  }

  if(docu->doc_url->type != URLT_FILE &&
    !(docu->doc_url->status & URL_REDIRECT))
    cfg.trans_size += len;

  if(cfg.file_quota &&
    ((cfg.file_quota * 1024) <= totallen) &&
    (docu->doc_url->type != URLT_FILE) &&
    !(docu->doc_url->status & URL_REDIRECT))
  {
    KILL_PERSISTANT_CONNECTION;
    docu->errcode = ERR_QUOTA_FILE;
    retcode = 1;
  }

  if(cfg.trans_quota && ((cfg.trans_quota * 1024) <= cfg.trans_size))
  {
    KILL_PERSISTANT_CONNECTION;
    docu->errcode = ERR_QUOTA_TRANS;
    retcode = -1;
  }

#if defined HAVE_FSTATFS || defined HAVE_FSTATVFS
  if((cfg.dumpfd < 0) && cfg.fs_quota &&
    (docu->doc_url->type != URLT_FILE) &&
    !(docu->doc_url->status & URL_REDIRECT) && docu->s_sock)
  {
#ifdef HAVE_FSTATVFS
    struct statvfs fss;

    if(fstatvfs(bufio_getfd(docu->s_sock), &fss))
      xperror("fstatvfs");
#else
    struct statfs fss;

    if(fstatfs(bufio_getfd(docu->s_sock), &fss))
      xperror("fstatfs");
#endif
    else
    {
#ifdef HAVE_FSTATVFS
      /* statvfs counts f_bavail in f_frsize units, not f_bsize */
      unsigned long blksz = fss.f_frsize ? fss.f_frsize : fss.f_bsize;
#else
      unsigned long blksz = fss.f_bsize;
#endif
      /*
       * Free space in kB.  Computed in floating point because the
       * byte count does not fit a 32-bit integer on large volumes.
       */
      double freespace = ((double) blksz * (double) fss.f_bavail) / 1024.0;

      if(freespace < (double) cfg.fs_quota)
      {
        KILL_PERSISTANT_CONNECTION;
        docu->errcode = ERR_QUOTA_FS;
        retcode = -1;
      }
    }
  }
#endif

  return retcode;
}

/*
 * Write all len bytes of buf to descriptor fd, continuing after short
 * writes and retrying when write() is interrupted (EINTR).
 *
 * Returns 0 when everything was written, -1 otherwise; after a failed
 * write() errno is the one set by that call.
 */
static int doc_write_all(int fd, const char *buf, ssize_t len)
{
  while(len > 0)
  {
    ssize_t wr = write(fd, buf, (size_t) len);

    if(wr < 0)
    {
      if(errno == EINTR)
        continue;
      return -1;
    }
    if(wr == 0)
      return -1;                /* no progress; do not spin forever */

    buf += wr;
    len -= wr;
  }

  return 0;
}

/*
 * Read the document body from docu->datasock until end of data, an
 * error, a quota hit, or until docu->totsz bytes were received.
 *
 * Received data is written to docu->s_sock when docu->save_online is
 * set, and appended to docu->contents when the document has to be kept
 * in memory (docu->load, docu->is_parsable, FTP directory, gopher
 * selector '1' or 'h').  docu->contents, when present, is
 * NUL-terminated before return and docu->size holds its length.
 *
 * Returns 0 on success, -1 on failure with docu->errcode set.
 */
static int doc_transfer_data(doc * docu)
{
  char *buf;
  int bufsize;
  ssize_t len, totallen = 0;
  int retcode = 0;

  if(docu->report_size)
    gui_set_status(gettext("Transfering data"));

  show_progress(docu, docu->adj_sz, FALSE);

  /* cfg.bufsize is in kB; fall back to 1 kB for non-positive values */
  bufsize = (cfg.bufsize > 0 ? cfg.bufsize : 1) * 1024;
  buf = _malloc(bufsize);

#ifdef SO_RCVBUF
#ifndef __QNX__
  if(bufio_is_sock(docu->datasock))
  {
    if(setsockopt(bufio_getfd(docu->datasock),
        SOL_SOCKET, SO_RCVBUF, (char *) &bufsize, sizeof(bufsize)))
    {
      xperror(gettext("setsockopt: SO_RCVBUF failed"));
    }
  }
#endif
#endif

  if(docu->save_online)
  {
    DEBUG_USER("Storing to file: %s\n", url_to_filename(docu->doc_url, TRUE));
  }

  /* when dumping online, the response header goes out before the body */
  if(docu->mime && cfg.dump_resp && cfg.dumpfd >= 0 && !cfg.dump_after)
    bufio_write(docu->s_sock, docu->mime, strlen(docu->mime));

  while((len = abs_read_data(docu, docu->datasock, buf, bufsize)) > 0)
  {
    if(docu->save_online)
    {
      if(doc_write_all(bufio_getfd(docu->s_sock), buf, len))
      {
        docu->errcode = ERR_STORE_DOC;
        xperror(gettext("storing document"));
        retcode = -1;
        if(docu->doc_url->type == URLT_FTP ||
          docu->doc_url->type == URLT_FTPS)
          docu->ftp_fatal_err = TRUE;
        if(docu->is_http11)
          docu->is_persistent = FALSE;
        break;
      }
    }

    totallen += len;
    docu->current_size += len;

    /* throttle: sleep when the rate so far exceeds cfg.maxrate */
    if(cfg.maxrate > 0.0 &&
      (docu->doc_url->type != URLT_FILE &&
      !(docu->doc_url->status & URL_REDIRECT)))
    {
      time_t _tm = doc_etime(docu, FALSE);
      double _rt = compute_speed_rate(_tm, totallen);

      if(_rt > (cfg.maxrate * 1024.0))
      {
        tl_msleep((time_t) (1000.0 * ((double) totallen) /
            (cfg.maxrate * 1024.0)) - _tm);
      }
    }

    docu->size = totallen;
    show_progress(docu, docu->adj_sz, FALSE);

    if(docu->load || docu->is_parsable ||
      ((docu->doc_url->type == URLT_FTP ||
      docu->doc_url->type == URLT_FTPS) && docu->doc_url->p.ftp.dir) ||
      (docu->doc_url->type == URLT_GOPHER &&
      (docu->doc_url->p.gopher.selector[0] == '1' ||
      docu->doc_url->p.gopher.selector[0] == 'h')))
    {
      /* one extra byte is reserved for the terminating NUL */
      docu->contents = _realloc(docu->contents, totallen + 1);
      memmove(docu->contents + totallen - len, buf, len);
    }

    retcode = doc_check_quotas(docu, len, totallen);
    if(retcode)
    {
      /* 1 == per-file quota reached: stop reading, not a hard error here */
      if(retcode == 1)
        retcode = 0;
      break;
    }

    if(docu->totsz > 0 && docu->totsz <= docu->current_size)
      break;
  }

  show_progress(docu, docu->adj_sz, TRUE);

  if(cfg.dumpfd >= 0 && !cfg.dump_after)
  {
    bufio_close(docu->s_sock);
    docu->s_sock = NULL;
    docu->save_online = FALSE;
  }

  if(cfg.progres && docu->report_size
#ifdef I_FACE
    && !cfg.xi_face
#endif
    )
  {
    xprintf(0, "\n");
  }

  /* read error, or fewer/more bytes than the announced total size */
  if(len < 0 ||
    ((docu->totsz > 0) && (docu->totsz != (docu->size + docu->rest_pos))))
  {
    xperror(gettext("Document transfer data"));
    if((docu->doc_url->type == URLT_HTTP ||
      docu->doc_url->type == URLT_HTTPS) &&
      (!(docu->doc_url->status & URL_REDIRECT)))
    {
      docu->errcode = ERR_HTTP_TRUNC;
    }
    else if((docu->doc_url->type == URLT_FTP ||
      docu->doc_url->type == URLT_FTPS) &&
      (!(docu->doc_url->status & URL_REDIRECT)))
    {
      docu->errcode = ERR_FTP_TRUNC;
    }
    else if(!docu->errcode)
      docu->errcode = ERR_READ;
    docu->remove_lock = FALSE;
    retcode = -1;
  }

  if(docu->report_size)
    gui_set_status(gettext("Data transfer done"));

  if((docu->doc_url->type == URLT_FTP ||
    docu->doc_url->type == URLT_FTPS) && docu->errcode == ERR_FTP_TRUNC)
  {
    docu->remove_lock = FALSE;
    retcode = -1;
  }

  /*** if transfer was not from beginning, reread  ***/
  /*** document content to memory from local file ***/
  /*** to be sure we will process whole document  ***/
  if(!retcode && docu->rest_pos &&
    (docu->load || docu->is_parsable) && (cfg.dumpfd < 0))
  {
    _free(docu->contents);
    /* do not depend on _free() resetting the pointer before _realloc() */
    docu->contents = NULL;
    totallen = 0;
    lseek(bufio_getfd(docu->s_sock), 0, SEEK_SET);
    bufio_reset(docu->s_sock);
    while((len = bufio_read(docu->s_sock, buf, bufsize)) > 0)
    {
      totallen += len;
      docu->contents = _realloc(docu->contents, totallen + 1);
      memmove(docu->contents + totallen - len, buf, len);
    }
  }

  if(docu->contents)
    *(docu->contents + totallen) = '\0';

  _free(buf);

  docu->size = totallen;

  return retcode;
}

/*
 * Look at the local file that corresponds to docu->doc_url and decide
 * whether the document can be taken from it.
 *
 * Outside sync/mirror mode:
 *   - a readable non-directory local file marks a non-file URL with
 *     URL_REDIRECT;
 *   - a local directory containing a non-directory priv_cfg.index_name
 *     redirects the URL to the same path with a trailing "/";
 *   - for file or redirected URLs that are not to be loaded, the local
 *     file is classified as parsable or as a plain "File redirect".
 * In sync/mirror mode only the size and mtime of the existing local
 * file are recorded in docu->origsize / docu->origtime.
 *
 * Returns 0 when the caller should go on processing.  Returns -1 when
 * processing is finished here; *rv then holds the result to pass on
 * (0 for a file redirect, -1 with docu->errcode set otherwise).
 */
static int doc_check_doc_file(doc * docu, int *rv)
{
  char *fn;
  struct stat estat;

  fn = url_to_filename(docu->doc_url, TRUE);

  if(cfg.mode != MODE_SYNC && cfg.mode != MODE_MIRROR)
  {
    if(docu->doc_url->type != URLT_FILE && (access(fn, R_OK) != -1))
    {
      if(!stat(fn, &estat))
      {
        if(!S_ISDIR(estat.st_mode))
        {
          docu->doc_url->status |= URL_REDIRECT;
        }
        else
        {
          char *pom;
          char *savepath = url_get_path(docu->doc_url);
          int index_found;

          /* does the directory hold an index file? */
          pom = tl_str_concat(NULL, fn, "/", priv_cfg.index_name, NULL);
          index_found = !stat(pom, &estat);
          _free(pom);
          pom = NULL;

          if(index_found && !S_ISDIR(estat.st_mode))
          {
            url *newurl = url_dup_url(docu->doc_url);

            if(newurl->type != URLT_FILE)
              pom = tl_str_concat(NULL, savepath, "/", NULL);

            if(newurl->type == URLT_FTP || newurl->type == URLT_FTPS)
              newurl->p.ftp.dir = TRUE;

            url_set_path(newurl, pom);
            _free(pom);

            if(url_redirect_to(docu->doc_url, newurl, FALSE))
              docu->errcode = ERR_HTTP_CYCLIC;
            else
              docu->errcode = ERR_HTTP_REDIR;

            *rv = -1;
            return -1;
          }

          fn = url_to_filename(docu->doc_url, TRUE);
        }
      }
    }

    if((docu->doc_url->type == URLT_FILE ||
      (docu->doc_url->status & URL_REDIRECT)) && !docu->load)
    {
      if(!stat(fn, &estat))
      {
        if(S_ISDIR(estat.st_mode))
        {
          docu->errcode = ERR_DIR_URL;
          *rv = -1;
          return -1;
        }
      }
      else
      {
        docu->errcode = ERR_FILE_OPEN;
        *rv = -1;
        return -1;
      }

      if((!cfg.ftp_html &&
        strcmp(tl_get_basename(fn), priv_cfg.index_name) &&
        (docu->doc_url->type == URLT_FTP ||
        docu->doc_url->type == URLT_FTPS) &&
        !docu->doc_url->p.ftp.dir) || !file_is_html(fn))
      {
        docu->is_parsable = FALSE;
        docu->save_online = TRUE;
        docu->size = estat.st_size;
#ifdef I_FACE
        if(cfg.xi_face)
          doc_set_info(docu);
#endif
        xprintf(1, gettext("File redirect\n"));
        *rv = 0;
        return -1;
      }
      else
      {
        if(!strcasecmp("css", tl_get_extension(fn)))
          docu->doc_url->status |= URL_STYLE;
        docu->is_parsable = TRUE;
      }
    }
  }
  else
  {
    if(!stat(fn, &estat))
    {
      docu->origsize = estat.st_size;
      /* pro: somehow it must have been forgotten to set the time as well... */
      docu->origtime = estat.st_mtime;
    }
  }

  return 0;
}

static int doc_open_existing_in_file(doc * docu, int b_lock, int *rv)
{
  char *inname;
  struct stat estat;

  if((cfg.dumpfd < 0) && (inname = url_to_in_filename(docu->doc_url)))
  {
    if(!stat(inname, &estat) && !S_ISDIR(estat.st_mode))
    {
      if(doc_lock(docu, b_lock))
      {
        docu->errcode = ERR_STORE_DOC;
        _free(inname);
        *rv = -1;
        return -1;
      }
      docu->rest_pos = estat.st_size - cfg.rollback;
      if(docu->rest_pos)
      {
        xprintf(1, gettext("Trying to resume from position %d\n"), docu->rest_pos);
        docu->origtime = estat.st_mtime;
        docu->stime = estat.st_mtime;
        docu->doreget = TRUE;
        docu->remove_lock = FALSE;
      }
    }
    _free(inname);
/* NOTE: this listing ends here; the remainder of doc_open_existing_in_file() is not included. */