📄 recurse.c
字号:
/***************************************************************************/
/*    This code is part of WWW grabber called pavuk                        */
/*    Copyright (c) 1997 - 2001 Stefan Ondrejicka                          */
/*    Distributed under GPL 2 or later                                     */
/***************************************************************************/

#include "config.h"

#include <assert.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <errno.h>
#include <netdb.h>
#include <errno.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <dirent.h>
#include <limits.h>
#include <time.h>
#include <signal.h>

#include "url.h"
#include "doc.h"
#include "tools.h"
#include "html.h"
#include "http.h"
#include "ftp.h"
#include "myssl.h"
#include "abstract.h"
#include "recurse.h"
#include "mime.h"
#include "robots.h"
#include "mode.h"
#include "times.h"
#include "stats.h"
#include "errcode.h"
#include "cookie.h"
#include "log.h"
#include "gui_api.h"
#include "form.h"
#include "ainterface.h"
#include "gcinfo.h"

static void dump_ftp_list(dllist *);
static void dump_urls_list(dllist *);

/*
 * Common exit sequence of process_document(): clean up the document,
 * release the URL string and return the current error code.
 * Expands to several statements and relies on the local names `docu'
 * and `pstr', so it must only be used inside a braced block.
 */
#define SETNEXTURL \
  doc_cleanup(docu); \
  _free(pstr); \
  return docu->errcode;

#ifdef HAVE_MT
/*
 * Interrupt handler for the multithreaded build (presumably installed
 * for SIGINT - the installation is not visible in this part of the file).
 * With the GUI compiled in and no processing in progress the program
 * exits; otherwise the global stop/break flags are raised.
 *
 * NOTE(review): if this really runs in signal context, exit() is not
 * async-signal-safe; consider _exit() after confirming that no atexit
 * handlers are required here.
 */
static void _sigintthr(int nr)
{
#ifdef I_FACE
  if(!cfg.processing)
  {
    exit(0);
  }
#endif
  errno = EINTR;
  cfg.stop = TRUE;
  cfg.rbreak = TRUE;
}

/*
 * Handler which terminates the calling thread (presumably installed
 * for SIGQUIT - the installation is not visible in this part of the file).
 */
static void _sigquitthr(int nr)
{
  pthread_exit(NULL);
}
#endif

/*
 * Put a URL back at the end of the global URL queue (cfg.urlstack) and
 * increase the total URL counter. In the multithreaded build the queue
 * semaphore is raised as well.
 */
static void reschedule_url(url * urlp)
{
  DEBUG_MISC(gettext("Rescheduling locked URL as no. %d\n"), cfg.total_cnt);

  LOCK_CFG_URLSTACK;
  cfg.urlstack = dllist_append(cfg.urlstack, (dllist_t) urlp);
#ifdef HAVE_MT
  mt_semaphore_up(&cfg.urlstack_sem);
#endif
  cfg.total_cnt++;
  UNLOCK_CFG_URLSTACK;
}

/*
 * Return a malloc()ed copy of `s' in which every single quote is replaced
 * by the four characters '\'' so the result can be embedded between
 * single quotes in a shell command line.
 * Returns NULL when `s' is NULL or when the allocation fails.
 * The caller releases the result with free().
 */
static char *shell_squote_escape(const char *s)
{
  const char *src;
  char *out;
  char *dst;
  size_t len = 0;

  if(!s)
    return NULL;

  for(src = s; *src; src++)
    len += (*src == '\'') ? 4 : 1;

  out = malloc(len + 1);
  if(!out)
    return NULL;

  for(src = s, dst = out; *src; src++)
  {
    if(*src == '\'')
    {
      /* close the quote, emit an escaped quote, reopen the quote */
      memcpy(dst, "'\\''", 4);
      dst += 4;
    }
    else
    {
      *dst++ = *src;
    }
  }
  *dst = '\0';

  return out;
}

/*
 * Run the user configured post-processing command (priv_cfg.post_cmd)
 * for a document. The command line has the form
 *
 *   <post_cmd> '<local filename>' <1|0> '<URL>'
 *
 * where the middle argument is 1 when the document is parsable.
 *
 * The filename and the URL are derived from remote data, therefore any
 * single quote inside them is escaped before the command line is built;
 * without that a crafted URL could terminate the quoting and inject
 * further commands (assumes tl_system() passes the string to a shell).
 * When a string cannot be prepared the command is not run at all.
 */
static void run_post_command(doc * docp)
{
  char *urlstr;
  char *quoted_url;
  char *quoted_file;
  char *cmd;

  DEBUG_MISC(gettext("Running post-processing command\n"));

  urlstr = url_to_urlstr(docp->doc_url, TRUE);
  quoted_url = shell_squote_escape(urlstr);
  _free(urlstr);

  quoted_file = shell_squote_escape(url_to_filename(docp->doc_url, FALSE));

  if(quoted_url && quoted_file)
  {
    cmd = tl_str_concat(NULL, priv_cfg.post_cmd, " \'",
      quoted_file, docp->is_parsable ? "\' 1 \'" : "\' 0 \'",
      quoted_url, "\'", NULL);

    tl_system(cmd);
    _free(cmd);
  }

  free(quoted_url);
  free(quoted_file);
}

/*
 * Build a form request for HTML form number `nform' of document `docp'
 * and append it to the starting URLs.
 *
 * The field list is made from copies of the fields given in `ui'
 * (supplied on the command line) plus the default successful fields of
 * the HTML form whose names are not already present. Action, encoding
 * and method are taken from the parsed form, the local name from `ui'.
 * Nothing happens when the form text can not be found or parsed.
 */
static void add_matching_form(doc * docp, int nform, url_info * ui)
{
  char *ftext;
  int flen;
  form_info *fi;
  dllist *ptr, *fields, *sfields;
  url_info *nui;

  if(!(ftext = form_get_text(nform, docp->contents, docp->size, &flen)))
  {
    return;
  }

  fi = form_parse(ftext, flen);
  if(!fi)
    return;

  /* copy all fields supplied on cmdln */
  fields = NULL;
  for(ptr = ui->fields; ptr; ptr = ptr->next)
  {
    fields = dllist_prepend(fields,
      (dllist_t) form_field_duplicate((form_field *) ptr->data));
  }

  /* copy all suitable fields from HTML form */
  sfields = NULL;
  form_get_default_successful(NULL, fi->infos, &sfields);
  for(; sfields; sfields = dllist_remove_entry(sfields, sfields))
  {
    form_field *ff = (form_field *) sfields->data;

    if(dllist_find2(fields, (dllist_t) ff, form_field_compare_name))
    {
      /* a field of this name is already present - drop the default one */
      _free(ff->name);
      _free(ff->value);
      _free(ff);
    }
    else
    {
      fields = dllist_prepend(fields, (dllist_t) ff);
    }
  }

  nui = url_info_new(fi->action);
  nui->type = URLI_FORM;
  nui->fields = fields;
  nui->encoding = fi->encoding;
  nui->method = fi->method;
  nui->localname = tl_strdup(ui->localname);

  form_free(fi);

  append_starting_url(nui, docp->doc_url);

  url_info_free(nui);
}

/*
 * Walk over the form action URLs found in a document (`formlist', one
 * string per form, in document order) and for every HTTP or HTTPS action
 * that is equal to the URL string of an entry in priv_cfg.formdata
 * add the corresponding form request.
 */
static void add_matching_forms(doc * docp, dllist * formlist)
{
  dllist *fptr, *uptr;
  int nform;

  for(fptr = formlist, nform = 0; fptr; fptr = fptr->next, nform++)
  {
    url *urlp;
    int is_http;

    /* the action URL is parsed only to find out its type */
    urlp = url_parse((char *) fptr->data);
    assert(urlp->type != URLT_FROMPARENT);

    is_http = (urlp->type == URLT_HTTP) || (urlp->type == URLT_HTTPS);

    free_deep_url(urlp);
    _free(urlp);

    if(!is_http)
      continue;

    for(uptr = priv_cfg.formdata; uptr; uptr = uptr->next)
    {
      url_info *ui = (url_info *) uptr->data;

      if(!strcmp(ui->urlstr, (char *) fptr->data))
      {
        add_matching_form(docp, nform, ui);
      }
    }
  }
}

int process_document(doc * docu, int check_lim)
{
  url *urlr;
  int nreget = 0, nredir = 0, pokus = 0;
  time_t atm;
  char cpom[64];
  char *pstr = NULL;
  int store_stat;
  struct stat estat;

  urlr =
docu->doc_url; docu->check_limits = check_lim; _Xt_Serve; if(docu->check_limits) docu->check_limits = (urlr->parent_url != NULL); while(!cfg.stop && !cfg.rbreak) { _free(docu->ftp_pasv_host); docu->errcode = ERR_NOERROR; docu->mime = NULL; docu->type_str = NULL; docu->doc_url = urlr; docu->dtime = 0L; docu->contents = NULL; docu->is_chunked = FALSE; docu->read_chunksize = FALSE; docu->read_trailer = FALSE; docu->ftp_fatal_err = FALSE; pstr = url_to_urlstr(urlr, FALSE); if(pokus) xprintf(1, gettext("retry no. %d\n"), pokus);#ifdef HAVE_MT xprintf(1, gettext("URL[%2d]: %5d(%d) of %5d %s\n"), docu->threadnr + 1, docu->doc_nr, cfg.fail_cnt, cfg.total_cnt, pstr);#else xprintf(1, gettext("URL: %5d(%d) of %5d %s\n"), docu->doc_nr, cfg.fail_cnt, cfg.total_cnt, pstr);#endif#ifdef I_FACE if(cfg.xi_face) { gui_set_doccounter(); gui_set_url(pstr); gui_set_status(gettext("Starting download")); }#endif /*** to be able to revisit moved documents ***/ /*** especially for authorization purposes ***/ if((urlr->status & URL_PROCESSED) && urlr->moved_to && nredir) { urlr->status &= ~URL_PROCESSED; } if(docu->check_limits) { cond_info_t condp; condp.level = 2; condp.urlnr = docu->doc_nr; condp.size = 0; condp.time = 0L; condp.mimet = NULL; condp.full_tag = NULL; condp.params = NULL; condp.html_doc = NULL; condp.html_doc_offset = 0; condp.tag = NULL; condp.attrib = NULL; if(urlr->status & URL_PROCESSED) { xprintf(1, gettext("Already processed\n")); docu->errcode = ERR_PROCESSED; SETNEXTURL; } if(urlr->status & URL_USER_DISABLED) { xprintf(1, gettext("Disallowed by user\n")); docu->errcode = ERR_UDISABLED; SETNEXTURL; } if(!prottable[urlr->type].supported || (urlr->parent_url && (urlr->type == URLT_FTP || urlr->type == URLT_FTPS) && urlr->p.ftp.dir && !cfg.condition.ftpdir) || (urlr->parent_url && !url_append_condition(urlr, &condp))) { xprintf(1, gettext("Disallowed by rules\n")); urlr->status |= URL_REJECTED; docu->errcode = ERR_RDISABLED; SETNEXTURL; } gui_set_status(gettext("Checking 
\"robots.txt\"")); if(!robots_check(urlr)) { xprintf(1, gettext("Disallowed by \"robots.txt\"\n")); urlr->status |= URL_REJECTED; docu->errcode = ERR_RDISABLED; SETNEXTURL; } } if(cfg.mode == MODE_FTPDIR && (urlr->type != URLT_FTP && urlr->type != URLT_FTPS)) { xprintf(1, gettext("This URL type is not supported with ftpdir mode\n")); urlr->status |= URL_REJECTED; docu->errcode = ERR_RDISABLED; SETNEXTURL; } _Xt_Serve; if(cfg.mode == MODE_SYNC) { char *pp = url_to_filename(urlr, TRUE); if(!stat(pp, &estat) && !S_ISDIR(estat.st_mode)) { atm = time(NULL) - 86400 * cfg.ddays; /* pro: We do not want the message "No transfer - file not expired" if the server's clock is ahead of our clock. If no parameter cfg.ddays is given, then we do not compare the file modification times. */ if(cfg.ddays == 0 || estat.st_mtime < atm) docu->dtime = estat.st_mtime; else { xprintf(1, gettext("No transfer - file not expired\n")); urlr->status |= URL_REJECTED; docu->errcode = ERR_RDISABLED; SETNEXTURL; } urlr->status |= URL_ISLOCAL; docu->origsize = estat.st_size; } } if(cfg.show_time) { atm = time(NULL); LOCK_TIME; strftime(cpom, sizeof(cpom), "%H:%M:%S", localtime(&atm)); UNLOCK_TIME; xprintf(1, gettext("Starting time : %s\n"), cpom); }#ifdef I_FACE if(cfg.stop || cfg.rbreak) { _free(pstr); break; }#endif _Xt_Serve; if((urlr->type == URLT_FTP || urlr->type == URLT_FTP) && urlr->extension && ((ftp_url_extension *) urlr->extension)->type == FTP_TYPE_L && ((ftp_url_extension *) urlr->extension)->slink) { if(cfg.retrieve_slink) { /** need to kill extension, because we must **/ /** guess the file type beside the symlink **/ ftp_url_ext_free(urlr->extension); urlr->extension = NULL; } else { ftp_make_symlink(urlr); urlr->status |= URL_PROCESSED; docu->errcode = ERR_NOERROR; SETNEXTURL; } } gui_set_status(gettext("Starting download")); if(doc_download(docu, FALSE, FALSE)) { if(cfg.show_time) { atm = time(NULL); LOCK_TIME; strftime(cpom, sizeof(cpom), "%H:%M:%S", localtime(&atm)); UNLOCK_TIME; 
xprintf(1, gettext("Ending time : %s\n"), cpom); } _Xt_Serve; doc_remove_lock(docu); _free(docu->contents); report_error(docu, gettext("download")); DEBUG_USER("Error status code - (%d)\n"); if((nreget < cfg.nreget && (docu->errcode == ERR_HTTP_TRUNC || docu->errcode == ERR_FTP_TRUNC || docu->errcode == ERR_LOW_TRANSFER_RATE || docu->errcode == ERR_HTTP_FAILREGET || docu->errcode == ERR_HTTP_TIMEOUT || docu->errcode == ERR_HTTP_GW_TIMEOUT)) || (nredir < cfg.nredir && docu->errcode == ERR_HTTP_REDIR) || (docu->errcode == ERR_HTTP_AUTH) || (docu->errcode == ERR_HTTP_PROXY_AUTH)) { if(docu->errcode == ERR_HTTP_REDIR) { urlr->status |= URL_PROCESSED; if((urlr->moved_to->status & URL_PROCESSED) && (!urlr->moved_to->moved_to)) { SETNEXTURL; } else {#ifdef I_FACE if(cfg.xi_face) gui_tree_set_icon_for_doc(docu);#endif urlr = urlr->moved_to; } } if(docu->errcode == ERR_HTTP_TRUNC) { urlr->status |= URL_TRUNCATED; _free(docu->etag); docu->etag = get_mime_param_val_str("ETag:", docu->mime); if(!docu->etag) docu->etag = get_mime_param_val_str("Content-Location:", docu->mime); if(!docu->etag) docu->etag = get_mime_param_val_str("Last-Modified", docu->mime); } if(docu->errcode == ERR_HTTP_AUTH) { docu->doc_url->status |= URL_PROCESSED; docu->doc_url->status |= URL_ERR_REC; SETNEXTURL; } if(docu->errcode == ERR_HTTP_PROXY_AUTH) { docu->doc_url->status |= URL_PROCESSED; docu->doc_url->status |= URL_ERR_REC; SETNEXTURL; } _free(docu->mime); _free(docu->type_str); nreget += (docu->errcode == ERR_HTTP_TRUNC || docu->errcode == ERR_FTP_TRUNC) && cfg.mode != MODE_SREGET; nredir += (docu->errcode == ERR_HTTP_REDIR); _free(pstr); continue; } if(docu->errcode == ERR_FTP_UNKNOWN || docu->errcode == ERR_FTP_CONNECT || docu->errcode == ERR_FTP_DATACON || docu->errcode == ERR_FTPS_CONNECT || docu->errcode == ERR_FTPS_DATASSLCONNECT || docu->errcode == ERR_HTTP_UNKNOWN || docu->errcode == ERR_HTTP_CONNECT || docu->errcode == ERR_HTTP_SNDREQ || docu->errcode == ERR_HTTP_SNDREQDATA || 
docu->errcode == ERR_HTTP_RCVRESP || docu->errcode == ERR_HTTP_SERV || docu->errcode == ERR_HTTP_TIMEOUT || docu->errcode == ERR_HTTP_PROXY_CONN || docu->errcode == ERR_HTTPS_CONNECT || docu->errcode == ERR_READ || docu->errcode == ERR_ZERO_SIZE || docu->errcode == ERR_GOPHER_CONNECT || docu->errcode == ERR_PROXY_CONNECT || docu->errcode == ERR_HTTP_SERV) { urlr->status |= URL_ERR_REC; pokus++; /*** retry only when allowed ***/ if(pokus >= cfg.nretry) { urlr->status |= URL_PROCESSED; SETNEXTURL; } _free(pstr); _free(docu->mime); _free(docu->type_str); continue; } else if(docu->errcode == ERR_LOCKED) { if(!cfg.urlstack) { xprintf(1, gettext("last document locked -> sleeping for 5 seconds\n")); tl_sleep(5); } reschedule_url(urlr); SETNEXTURL; } else if(docu->errcode == ERR_BIGGER || docu->errcode == ERR_SMALLER || docu->errcode == ERR_NOMIMET || docu->errcode == ERR_OUTTIME || docu->errcode == ERR_SCRIPT_DISABLED) { urlr->status |= URL_PROCESSED; urlr->status |= URL_ERR_REC; SETNEXTURL; } else { /*** remove improper documents if required ***/ if((cfg.remove_old && (cfg.mode == MODE_SYNC || cfg.mode == MODE_MIRROR)) && (((docu->errcode == ERR_FTP_GET || docu->errcode == ERR_FTP_BDIR || docu->errcode == ERR_FTP_NODIR) && docu->ftp_respc == 550) || docu->errcode == ERR_HTTP_NFOUND || docu->errcode == ERR_HTTP_GONE)) { doc_remove(docu->doc_url); } urlr->status |= URL_ERR_UNREC; urlr->status |= URL_PROCESSED; SETNEXTURL; } } _Xt_Serve; if(urlr->status & URL_TRUNCATED) urlr->status &= ~URL_TRUNCATED; if(urlr->status & URL_ERR_REC) urlr->status &= ~URL_ERR_REC; if(cfg.show_time) { atm = time(NULL); LOCK_TIME; strftime(cpom, sizeof(cpom), "%H:%M:%S", localtime(&atm)); UNLOCK_TIME; xprintf(1, gettext("Ending time : %s\n"), cpom); } report_error(docu, gettext("download")); _Xt_Serve; if(docu->contents) { if(docu->is_parsable) { dllist *formlist = NULL; dllist *urls; gui_set_status(gettext("Relocating and scanning HTML document")); urls = html_process_document(docu, 
priv_cfg.formdata ? &formlist : NULL); _Xt_Serve; if(urls && cfg.dump_urlfd >= 0) { dump_urls_list(urls); } if(priv_cfg.formdata && formlist) { add_matching_forms(docu, formlist); while(formlist) { if(formlist->data) free((void *) formlist->data); formlist = dllist_remove_entry(formlist, formlist); } } if(cfg.mode != MODE_SREGET && cfg.mode != MODE_FTPDIR && !(docu->doc_url->status & URL_NORECURSE)) { gui_tree_add_start(); cat_links_to_url_list(urls); gui_tree_add_end(); } else if(cfg.mode == MODE_FTPDIR) { dump_ftp_list(urls); } else { for(; urls; urls = dllist_remove_entry(urls, urls)) { free_deep_url((url *) urls->data); if(urls->data) free((url *)urls->data); } } _Xt_Serve; } store_stat = 0; if(cfg.dumpfd >= 0 && cfg.dump_after) { bufio *fd; gui_set_status(gettext("Dumping processed document")); LOCK_DUMPFD; fd = bufio_dupfd(cfg.dumpfd); if(docu->mime && cfg.dump_resp) bufio_write(fd, docu->mime, strlen(docu->mime));
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -