⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 config.c

📁 网络爬虫程序
💻 C
📖 第 1 页 / 共 5 页
字号:
/***************************************************************************/
/*    This code is part of WWW grabber called pavuk                        */
/*    Copyright (c) 1997 - 2001 Stefan Ondrejicka                          */
/*    Distributed under GPL 2 or later                                     */
/***************************************************************************/

#include "config.h"

#include <unistd.h>
#include <stdio.h>
#include <netdb.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <limits.h>
#include <errno.h>

#include "config.h"
#include "tools.h"
#include "ftp.h"
#include "http.h"
#include "gopher.h"
#include "authinfo.h"
#include "times.h"
#include "html.h"
#include "lfname.h"
#include "re.h"
#include "mopt.h"
#include "jstrans.h"
#include "tag_pattern.h"

/* Declared ahead of options.h, which is included right below. */
static void cfg_version_info(void);
static int cfg_load_scenario(const char *);

#include "options.h"

/* One entry of the URL scheduling strategy table. */
struct strategie_mapt
{
  strategie id;                 /* strategy identifier */
  char *name;                   /* keyword matched by get_strategie_by_str() */
  char *label;                  /* label, passed through gettext() on lookup */
};

/*
 * Strategy table.  get_strategie_str() and get_strategie_label() index it
 * directly with the strategy id, so this assumes every entry sits at the
 * position equal to its id value -- TODO confirm against the declaration
 * of the strategie enum.
 */
static struct strategie_mapt strategie_map[] = {
  {SSTRAT_DO_SIRKY, "level", gettext_nop("Level order")},
  {SSTRAT_DO_SIRKY_I, "leveli", gettext_nop("Level order, inline first")},
  {SSTRAT_DO_HLBKY, "pre", gettext_nop("Pre order")},
  {SSTRAT_DO_HLBKY_I, "prei", gettext_nop("Pre order, inline first")},
};

/*
 * Look up a strategy by its keyword (case-insensitive).
 *
 * str - keyword to search for; must not be NULL
 *
 * Returns the id of the matching table entry, or SSTRAT_LAST when no entry
 * matches.
 */
static strategie get_strategie_by_str(char *str)
{
  int i;

  /* Bounded by the table itself so the scan can never run past its end. */
  for(i = 0; i < NUM_ELEM(strategie_map); i++)
  {
    if(!strcasecmp(str, strategie_map[i].name))
      return strategie_map[i].id;
  }

  return SSTRAT_LAST;
}

/*
 * Return the keyword of strategy `id'.  The id is used as a table index
 * without a range check.
 */
static char *get_strategie_str(strategie id)
{
  return strategie_map[id].name;
}

/*
 * Return the gettext()-translated label of strategy `id'.  The id is used
 * as a table index without a range check.
 */
char *get_strategie_label(strategie id)
{
  return gettext(strategie_map[id].label);
}

/*
 * SSL protocol version keywords.  Ids start at 1 and get_ssl_version_str()
 * relies on entry N being stored at index N - 1.
 */
static const struct
{
  char *name;
  int id;
} _ssl_versions[] =
{
  {"ssl23", 1},
  {"ssl2",  2},
  {"ssl3",  3},
  {"tls1",  4}
};

/*
 * Look up an SSL version id by its keyword (case-insensitive).
 *
 * str - keyword to search for; must not be NULL
 *
 * Returns the numeric id from _ssl_versions, or -1 when the keyword is not
 * known.  The result is a plain int (the type of the id field) so that the
 * -1 sentinel is not converted through an unrelated enum type.
 */
static int get_ssl_version_by_str(char *str)
{
  int i;

  for(i = 0; i < NUM_ELEM(_ssl_versions); i++)
  {
    if(!strcasecmp(str, _ssl_versions[i].name))
      return _ssl_versions[i].id;
  }

  return -1;
}

/*
 * Return the keyword of SSL version `id'.  Ids are 1-based, hence the
 * `id - 1' index; no range check is performed.
 */
static char *get_ssl_version_str(int id)
{
  return _ssl_versions[id - 1].name;
}

/**********************************/
/* show program usage information */
/**********************************/
/*
 * Clears cfg.bgmode, prints the usage banner followed by every non-NULL
 * help text found in params[], flushes stdout and terminates the process
 * with PAVUK_EXIT_OK.
 */
void usage(void)
{
  int i;

  cfg.bgmode = FALSE;

  xprintf(0,
    gettext("Usage:  %s  [options]  [any number of URLS]\npavuk-%s %s\n"),
    cfg.prg_path, VERSION, HOSTTYPE);

  for(i = 0; i < NUM_ELEM(params); i++)
  {
    /*
     * NOTE(review): the help text is handed to xprintf() as its format
     * argument.  This is only safe as long as neither the help strings nor
     * their translations contain unintended conversion specifications --
     * confirm before switching to an explicit "%s" format, since the
     * strings may rely on "%%" escapes.
     */
    if(params[i].help)
      xprintf(0, gettext(params[i].help));
  }

  fflush(stdout);
  exit(PAVUK_EXIT_OK);
}

/*
 * Prints the one-line package/version banner and a hint pointing at
 * "--help", then terminates the process with PAVUK_EXIT_CFG_ERR.
 */
void usage_short(void)
{
  xprintf(0, "%s %s %s %s\n", PACKAGE, VERSION, REVISION, HOSTTYPE);
  xprintf(0, gettext("Type \"%s --help\" for long help\n"), cfg.prg_path);
  exit(PAVUK_EXIT_CFG_ERR);
}

/*
 * Prints the package/version banner followed by one line for every
 * optional feature selected at compile time (each line is guarded by the
 * corresponding preprocessor symbol), then terminates the process with
 * PAVUK_EXIT_OK.
 */
static void cfg_version_info(void)
{
  xprintf(0, "%s %s %s %s\n", PACKAGE, VERSION, REVISION, HOSTTYPE);
  xprintf(0, gettext("Optional features available :\n"));
#ifdef DEBUG
  xprintf(0, gettext(" - Debug mode\n"));
#endif

#ifdef GETTEXT_NLS
  xprintf(0, gettext(" - GNU gettext internationalization of messages\n"));
#endif

#ifdef HAVE_FLOCK
  xprintf(0, gettext(" - flock() document locking\n"));
#endif

#ifdef HAVE_FCNTL_LOCK
  xprintf(0, gettext(" - fcntl() document locking\n"));
#endif

#ifdef I_FACE
#ifdef GTK_FACE
  xprintf(0, gettext(" - Gtk GUI interface\n"));
#endif
#ifdef WITH_TREE
  xprintf(0, gettext(" - URL tree preview\n"));
#endif
#endif

#ifdef USE_SSL
  xprintf(0, gettext(" - HTTP and FTP over SSL\n"));
  /* Pick the name of the SSL library this build was configured for. */
#if defined(USE_SSL_IMPL_OPENSSL) && defined(OPENSSL)
#define __SSLIMP "OpenSSL"
#elif defined(USE_SSL_IMPL_OPENSSL)
#define __SSLIMP "SSLeay"
#elif defined(USE_SSL_IMPL_NSS)
#define __SSLIMP "NSS3"
#else
#define __SSLIMP "unknown"
#endif
  xprintf(0, gettext(" - SSL layer implemented with %s library\n"), __SSLIMP);
#endif

#ifdef SOCKS
  xprintf(0, gettext(" - Socks proxy support\n"));
#endif

#ifdef HAVE_FSTATFS
  xprintf(0, gettext(" - filesystem free space checking\n"));
#endif

#ifdef HAVE_REGEX
  xprintf(0,
    gettext
    (" - optional regex patterns in -fnrules and -*rpattern options\n"));
#endif

#ifdef HAVE_POSIX_REGEX
  xprintf(0, gettext(" - POSIX regexp\n"));
#endif

#ifdef HAVE_V8_REGEX
  xprintf(0, gettext(" - Bell V8 regexp\n"));
#endif

#ifdef HAVE_BSD_REGEX
  xprintf(0, gettext(" - BSD regexp\n"));
#endif

#ifdef HAVE_GNU_REGEX
  xprintf(0, gettext(" - GNU regexp\n"));
#endif

#ifdef HAVE_PCRE_REGEX
  xprintf(0, gettext(" - PCRE regexp\n"));
#endif

#ifdef HAVE_BDB_18x
  xprintf(0,
    gettext(" - support for loading files from Netscape browser cache\n"));
#endif

#ifdef HAVE_TERMIOS
  xprintf(0,
    gettext
    (" - support for detecting whether pavuk is running as background job\n"));
#endif

#ifdef HAVE_MT
  xprintf(0, gettext(" - multithreading support\n"));
#endif

#ifdef ENABLE_NTLM
  xprintf(0, gettext(" - NTLM authorization support\n"));
#endif

#ifdef HAVE_MOZJS
  xprintf(0, gettext(" - JavaScript bindings\n"));
#endif

#ifdef HAVE_INET6
  xprintf(0, gettext(" - IPv6 support\n"));
#endif

  exit(PAVUK_EXIT_OK);
}

/*
 * Set or clear the LINK_DISABLED flag on entries of html_link_tags.
 *
 * tagstr  - either the word "all" (case-insensitive), which touches every
 *           attribute of every tag, or a list that is split with
 *           strtokc_r(): a tag name up to ',' followed by an attribute name
 *           up to ';', repeated.  When the attribute part is missing or
 *           empty, all attributes of the tag are affected.
 * disable - non-zero sets LINK_DISABLED, zero clears it
 *
 * A message is printed for every tag/attribute pair that is not found in
 * html_link_tags.
 *
 * Returns -1 on every path.
 */
static int htmltag_set_disabled(char *tagstr, int disable)
{
  int i, j;
  bool_t tfound, afound;
  char *tag;
  char *attrib;
  char *pom, *strtokbuf;

  if(!strcasecmp(tagstr, "all"))
  {
    for(i = 0; i < html_link_tags_num(); i++)
    {
      for(j = 0; html_link_tags[i].attribs[j].attrib; j++)
      {
        if(disable)
          html_link_tags[i].attribs[j].stat |= LINK_DISABLED;
        else
          html_link_tags[i].attribs[j].stat &= ~LINK_DISABLED;
      }
    }
    return -1;
  }

  /* Tokenize a private copy; the caller's string is left untouched. */
  pom = tl_strdup(tagstr);
  tag = strtokc_r(pom, ',', &strtokbuf);
  attrib = strtokc_r(NULL, ';', &strtokbuf);

  while(tag)
  {
    tfound = FALSE;
    afound = FALSE;

    for(i = 0; i < html_link_tags_num(); i++)
    {
      if(!strcasecmp(html_link_tags[i].tag, tag))
      {
        tfound = TRUE;
        for(j = 0; html_link_tags[i].attribs[j].attrib; j++)
        {
          if(attrib && *attrib)
          {
            /* A specific attribute was requested: change only that one. */
            if(!strcasecmp(html_link_tags[i].attribs[j].attrib, attrib))
            {
              afound = TRUE;
              if(disable)
                html_link_tags[i].attribs[j].stat |= LINK_DISABLED;
              else
                html_link_tags[i].attribs[j].stat &= ~LINK_DISABLED;
              break;
            }
          }
          else
          {
            /* No attribute given: change every attribute of the tag. */
            afound = TRUE;
            if(disable)
              html_link_tags[i].attribs[j].stat |= LINK_DISABLED;
            else
              html_link_tags[i].attribs[j].stat &= ~LINK_DISABLED;
          }
        }
        /* Only the first table entry with a matching tag name is used. */
        break;
      }
    }

    if(!(tfound && afound))
    {
      xprintf(0, gettext("HTML tag not supported : %s.%s\n"), tag,
        attrib ? attrib : "(null)");
    }

    tag = strtokc_r(NULL, ',', &strtokbuf);
    attrib = strtokc_r(NULL, ';', &strtokbuf);
  }

  _free(pom);
  return -1;
}

/*
 * Reset the configuration value described by `cpar' to its default.
 *
 * Nothing is done for parameters whose type carries PARAM_UNSUPPORTED or
 * PARAM_FOREIGN.  For the remaining types the storage at cpar->val_adr
 * (and cpar->mval_adr where the type uses it) is reset according to the
 * parameter type; list-like values are released element by element first.
 *
 * NOTE: only the first part of this function is visible in this listing;
 * the remaining cases and the end of the switch follow after the last case
 * shown here.
 */
static void cfg_set_to_default(cfg_param_t * cpar)
{
  char **p;
  int x, j;

  if(cpar->type & PARAM_UNSUPPORTED)
    return;

  if(cpar->type & PARAM_FOREIGN)
    return;

  switch (cpar->type)
  {
  case PARAM_NUM:
    *((long *) cpar->val_adr) = (long) cpar->default_val;
    break;
  case PARAM_PBOOL:
    *((bool_t *) cpar->val_adr) = (bool_t) (long) cpar->default_val;
    break;
  case PARAM_NBOOL:
    *((bool_t *) cpar->val_adr) = (bool_t) (long) cpar->default_val;
    break;
  case PARAM_PORT_RANGE:
    *((long *) cpar->val_adr) = (long) cpar->default_val;
    *((long *) cpar->mval_adr) = (long) cpar->mdefault_val;
    break;
  case PARAM_PATH:
  case PARAM_STR:
  case PARAM_PASS:
    _free(*((char **) cpar->val_adr));
    *((char **) cpar->val_adr) = (char *) cpar->default_val;
    break;
  case PARAM_STRLIST:
    /* Release every string of the list, then the list array itself. */
    for(p = *((char ***) cpar->val_adr); p && *p; p++)
      _free(*p);
    _free(*(char ***) cpar->val_adr);
    *((char ***) cpar->val_adr) = (char **) cpar->default_val;
    if(cpar->mval_adr)
      *((bool_t *) cpar->mval_adr) = (bool_t) (long) cpar->mdefault_val;
    break;
  case PARAM_CONN:
    _free(*((char **) cpar->val_adr));
    *((char **) cpar->val_adr) = (char *) cpar->default_val;
    if(cpar->mval_adr)
      *((long *) cpar->mval_adr) = (long) cpar->mdefault_val;
    break;
  case PARAM_AUTHSCH:
    *((long *) cpar->val_adr) = (long) cpar->default_val;
    break;
  case PARAM_MODE:
    *((long *) cpar->val_adr) = (long) cpar->default_val;
    break;
  case PARAM_TIME:
    /* Time values are always reset to 0; default_val is not consulted. */
    *((time_t *) cpar->val_adr) = (time_t) 0;
    break;
  case PARAM_HTMLTAG:
    /* Default state: no tag attribute is disabled. */
    for(x = 0; x < html_link_tags_num(); x++)
      for(j = 0; html_link_tags[x].attribs[j].attrib; j++)
        html_link_tags[x].attribs[j].stat &= ~LINK_DISABLED;
    break;
  case PARAM_TWO_QSTR:
    /*
     * NOTE(review): unlike PARAM_STR above, the previous strings are not
     * passed to _free() before being overwritten -- confirm who owns them.
     */
    *((char **) cpar->val_adr) = (char *) cpar->default_val;
    *((char **) cpar->mval_adr) = (char *) cpar->mdefault_val;
    break;
  case PARAM_DOUBLE:
    /* For doubles default_val is a pointer to the default value. */
    *((double *) cpar->val_adr) = *(double *) cpar->default_val;
    break;
  case PARAM_LFNAME:
    while(cfg.lfnames)
    {
      lfname_free((lfname *) cfg.lfnames->data);
      cfg.lfnames = dllist_remove_entry(cfg.lfnames, cfg.lfnames);
    }
    break;
  case PARAM_RE:
#ifdef HAVE_REGEX
    {
      /* Detach the list from the config first, then free it node by node. */
      dllist *ptr = *((dllist **) cpar->val_adr);
      *((dllist **) cpar->val_adr) = NULL;
      while(ptr)
      {
        re_free((re_entry *) ptr->data);
        ptr = dllist_remove_entry(ptr, ptr);
      }
    }
#endif
    break;
  case PARAM_USTRAT:
    *((strategie *) cpar->val_adr) = (strategie) cpar->default_val;
    break;
  case PARAM_SSLVER:
    *((long *) cpar->val_adr) = (long) cpar->default_val;
    break;
  case PARAM_HTTPHDR:
    {
      dllist *ptr = *(dllist **) cpar->val_adr;
      *(dllist **) cpar->val_adr = NULL;
      while(ptr)
      {
        httphdr_free((httphdr *)ptr->data);
        ptr = dllist_remove_entry(ptr, ptr);
      }
    }
    break;
  case PARAM_DEBUGL:
    *((long *) cpar->val_adr) = (long) cpar->default_val;
    break;
  case PARAM_REQUEST:
    {
      dllist *ptr = *(dllist **) cpar->val_adr;
      *(dllist **) cpar->val_adr = NULL;
      while(ptr)
      {
        url_info_free((url_info *) ptr->data);
        ptr = dllist_remove_entry(ptr, ptr);
      }
    }
    break;
  case PARAM_TRANSPARENT:
    {
      if(cpar->val_adr)
      {
        /*
         * NOTE(review): the proxy is freed but the pointer stored at
         * val_adr is not cleared in the code visible here -- confirm that
         * nothing reads or frees it again afterwards.
         */
        http_proxy *pr = *((http_proxy **) cpar->val_adr);
        if(pr)
          http_proxy_free(pr);
      }
    }
    break;
  case PARAM_PROXY:
    {
      dllist *ptr = *((dllist **) cpar->val_adr);
      *((dllist **) cpar->val_adr) = NULL;
      while(ptr)
      {
        http_proxy *pr = (http_proxy *) ptr->data;
        http_proxy_unref(pr);
        ptr = dllist_remove_entry(ptr, ptr);
      }
    }
    break;
  case PARAM_FUNC:
    break;
  case PARAM_JSTRANS:
#ifdef HAVE_REGEX
    while(cfg.js_transform)
    {
      js_transform_free((js_transform_t *) cfg.js_transform->data);
      cfg.js_transform =
        dllist_remove_entry(cfg.js_transform, cfg.js_transform);
    }
#endif
    break;
  case PARAM_NUMLIST:
    {
      /* Only the list nodes are removed; node data is not freed here. */
      dllist *ptr = *((dllist **) cpar->val_adr);
      *((dllist **) cpar->val_adr) = NULL;
      while(ptr)
        ptr = dllist_remove_entry(ptr, ptr);
      if(cpar->mval_adr)
        *((bool_t *) cpar->mval_adr) = (bool_t) (long) cpar->mdefault_val;
    }
    break;
  case PARAM_FTPHS:
    {
      dllist *ptr = *((dllist **) cpar->val_adr);
      *((dllist **) cpar->val_adr) = NULL;
      for(; ptr; ptr = dllist_remove_entry(ptr, ptr))
        ftp_handshake_info_free((ftp_handshake_info *)ptr->data);
    }
    break;
  case PARAM_TAGPAT:
    {
      dllist *ptr = *((dllist **) cpar->val_adr);
      *((dllist **) cpar->val_adr) = NULL;
      for(; ptr; ptr = dllist_remove_entry(ptr, ptr))
        tag_pattern_free((tag_pattern_t *)ptr->data);
    }
    break;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -