⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 condition.c

📁 网络爬虫程序
💻 C
📖 第 1 页 / 共 3 页
字号:
/***************************************************************************/
/*    This code is part of WWW grabber called pavuk                        */
/*    Copyright (c) 1997 - 2001 Stefan Ondrejicka                          */
/*    Distributed under GPL 2 or later                                     */
/***************************************************************************/

#include "config.h"

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#ifdef HAVE_ARPA_INET_H
#include <arpa/inet.h>
#endif
#include <netdb.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>

#ifdef HAVE_FNMATCH
#include <fnmatch.h>
#else
#include "fnmatch.h"
#endif

#include "url.h"
#include "condition.h"
#include "tools.h"
#include "uexit.h"
#include "re.h"
#include "dns.h"
#include "debugl.h"
#include "jsbind.h"
#include "tag_pattern.h"
#include "dlhash_tools.h"

/*
 * Condition check functions. Every checker has the same signature so that
 * it can be stored in cond_type_info[] and invoked through a pointer.
 */
static int cond_unsupported(url *, cond_info_t *);
static int cond_lmax(url *, cond_info_t *);
static int cond_dmax(url *, cond_info_t *);
static int cond_noftp(url *, cond_info_t *);
static int cond_nhttp(url *, cond_info_t *);
static int cond_nossl(url *, cond_info_t *);
static int cond_nogopher(url *, cond_info_t *);
static int cond_noftps(url *, cond_info_t *);
static int cond_nocgi(url *, cond_info_t *);
static int cond_asite(url *, cond_info_t *);
static int cond_dsite(url *, cond_info_t *);
static int cond_adomain(url *, cond_info_t *);
static int cond_ddomain(url *, cond_info_t *);
static int cond_aprefix(url *, cond_info_t *);
static int cond_dprefix(url *, cond_info_t *);
static int cond_asfx(url *, cond_info_t *);
static int cond_dsfx(url *, cond_info_t *);
static int cond_pattern(url *, cond_info_t *);
static int cond_rpattern(url *, cond_info_t *);
static int cond_skip_pattern(url *, cond_info_t *);
static int cond_skip_rpattern(url *, cond_info_t *);
static int cond_url_pattern(url *, cond_info_t *);
static int cond_url_rpattern(url *, cond_info_t *);
static int cond_skip_url_pattern(url *, cond_info_t *);
static int cond_skip_url_rpattern(url *, cond_info_t *);
static int cond_dont_leave_site(url *, cond_info_t *);
static int cond_dont_leave_dir(url *, cond_info_t *);
static int cond_user_condition(url *, cond_info_t *);
static int cond_aip_pattern(url *, cond_info_t *);
static int cond_dip_pattern(url *, cond_info_t *);
static int cond_site_level(url *, cond_info_t *);
static int cond_dont_leave_site_enter_dir(url *, cond_info_t *);
static int cond_leave_level(url *, cond_info_t *);
static int cond_aport(url *, cond_info_t *);
static int cond_dport(url *, cond_info_t *);
static int cond_max_size(url *, cond_info_t *);
static int cond_min_size(url *, cond_info_t *);
static int cond_amime_type(url *, cond_info_t *);
static int cond_dmime_type(url *, cond_info_t *);
static int cond_newer_than(url *, cond_info_t *);
static int cond_older_than(url *, cond_info_t *);
static int cond_tag_pattern(url *, cond_info_t *);
static int cond_tag_rpattern(url *, cond_info_t *);

/*
 * Check-level bits. url_append_condition_default() computes
 * (1 << condp->level) and tests it against the "level" mask of each table
 * entry, so CLn selects the entries evaluated when condp->level == n.
 */
#define CL0 (1 << 0)
#define CL1 (1 << 1)
#define CL2 (1 << 2)
#define CL3 (1 << 3)
#define CLALL (CL0 | CL1 | CL2 | CL3)

/*
 * Description of one condition type.
 *   type      - condition identifier, stored into condp->reason
 *   name      - textual name used for lookup by name and in debug output
 *   cond_func - checker; a zero result is treated as "condition failed"
 *   standard  - TRUE when the checker is run by the generic loop in
 *               url_append_condition_default()
 *   level     - mask of CLx bits at which the checker applies
 */
struct cond_type_info_t
{
  cond_type_t type;
  char *name;
  int (*cond_func) (url *, cond_info_t *);
  bool_t standard;
  int level;
};

/*
 * NOTE(review): DLMSG() and url_append_condition_default() index this table
 * with CONDT_* values, so the entry order presumably has to match the order
 * of the cond_type_t enumeration - confirm against condition.h before
 * reordering or inserting entries.
 */
static const struct cond_type_info_t cond_type_info[] = {
  {CONDT_UNSUP, "unsupported", cond_unsupported, TRUE, CLALL},
  {CONDT_NOFTP, "-noftp", cond_noftp, TRUE, CL0 | CL2},
  {CONDT_NOHTTP, "-nohttp", cond_nhttp, TRUE, CL0 | CL2},
  {CONDT_NOSSL, "-nossl", cond_nossl, TRUE, CL0 | CL2},
  {CONDT_NOGOPHER, "-nogopher", cond_nogopher, TRUE, CL0 | CL2},
  {CONDT_NOFTPS, "-noftps", cond_noftps, TRUE, CL0 | CL2},
  {CONDT_NOCGI, "-nocgi", cond_nocgi, TRUE, CL0 | CL2},
  {CONDT_LMAX, "-lmax", cond_lmax, TRUE, CL0 | CL2},
  {CONDT_DMAX, "-dmax", cond_dmax, TRUE, CL1 | CL2},
  {CONDT_ASITE, "-asite", cond_asite, TRUE, CL0 | CL2},
  {CONDT_DSITE, "-dsite", cond_dsite, TRUE, CL0 | CL2},
  {CONDT_ADOMAIN, "-adomain", cond_adomain, TRUE, CL0 | CL2},
  {CONDT_DDOMAIN, "-ddomain", cond_ddomain, TRUE, CL0 | CL2},
  {CONDT_APREFIX, "-aprefix", cond_aprefix, TRUE, CL0 | CL2},
  {CONDT_DPREFIX, "-dprefix", cond_dprefix, TRUE, CL0 | CL2},
  {CONDT_ASFX, "-asfx", cond_asfx, TRUE, CL0 | CL2},
  {CONDT_DSFX, "-dsfx", cond_dsfx, TRUE, CL0 | CL2},
  {CONDT_DONT_LEAVE_SITE, "-dont_leave_site", cond_dont_leave_site, TRUE,
      CL0 | CL2},
  {CONDT_DONT_LEAVE_DIR, "-dont_leave_dir", cond_dont_leave_dir, TRUE,
      CL0 | CL2},
  {CONDT_SITE_LEVEL, "-site_level", cond_site_level, TRUE, CL0 | CL2},
  {CONDT_LEAVE_LEVEL, "-leave_level", cond_leave_level, TRUE, CL0 | CL2},
  {CONDT_DONT_LEAVE_SITE_ENTER_DIR, "-dont_leave_site_enter_dir",
      cond_dont_leave_site_enter_dir, TRUE, CL0 | CL2},
  {CONDT_APORTS, "-aport", cond_aport, TRUE, CL0 | CL2},
  {CONDT_DPORTS, "-dport", cond_dport, TRUE, CL0 | CL2},
  {CONDT_MAX_SIZE, "-max_size", cond_max_size, TRUE, CL3},
  {CONDT_MIN_SIZE, "-min_size", cond_min_size, TRUE, CL3},
  {CONDT_AMIME_TYPE, "-amimet", cond_amime_type, TRUE, CL3},
  {CONDT_DMIME_TYPE, "-dmimet", cond_dmime_type, TRUE, CL3},
  {CONDT_NEWER_THAN, "-newer_than", cond_newer_than, TRUE, CL3},
  {CONDT_OLDER_THAN, "-older_than", cond_older_than, TRUE, CL3},
  {CONDT_AIP_PATTERN, "-aip_pattern", cond_aip_pattern, TRUE, CL0 | CL2},
  {CONDT_DIP_PATTERN, "-dip_pattern", cond_dip_pattern, TRUE, CL0 | CL2},
  {CONDT_PATTERN, "-pattern", cond_pattern, FALSE, CL0 | CL2},
  {CONDT_RPATTERN, "-rpattern", cond_rpattern, FALSE, CL0 | CL2},
  {CONDT_SKIP_PATTERN, "-skip_pattern", cond_skip_pattern, FALSE, CL0 | CL2},
  {CONDT_SKIP_RPATTERN, "-skip_rpattern", cond_skip_rpattern, FALSE,
      CL0 | CL2},
  {CONDT_URL_PATTERN, "-url_pattern", cond_url_pattern, FALSE, CL0 | CL2},
  {CONDT_URL_RPATTERN, "-url_rpattern", cond_url_rpattern, FALSE, CL0 | CL2},
  {CONDT_SKIP_URL_PATTERN, "-skip_url_pattern", cond_skip_url_pattern, FALSE,
      CL0 | CL2},
  {CONDT_SKIP_URL_RPATTERN, "-skip_url_rpattern", cond_skip_url_rpattern,
      FALSE, CL0 | CL2},
  {CONDT_TAG_PATTERN, "-tag_pattern", cond_tag_pattern, TRUE, CL0},
  {CONDT_TAG_RPATTERN, "-tag_rpattern", cond_tag_rpattern, TRUE, CL0},
  {CONDT_USER_CONDITION, "-user_condition", cond_user_condition, FALSE,
      CL1 | CL3},
};

/********************************************************/
/* find the table entry whose name equals "name"        */
/* (case-insensitive); NULL when there is none or when  */
/* name is NULL                                         */
/********************************************************/
static const struct cond_type_info_t *cond_type_info_find(char *name)
{
  int i;

  /* strcasecmp() must not be handed a NULL pointer */
  if(!name)
    return NULL;

  for(i = 0; i < NUM_ELEM(cond_type_info); i++)
  {
    if(!strcasecmp(cond_type_info[i].name, name))
      return &(cond_type_info[i]);
  }

  return NULL;
}

/********************************************************/
/* evaluate the single condition called "name" on urlp  */
/* returns 1 when its checker returns TRUE, 0 when it   */
/* returns anything else, -1 when the name is unknown   */
/* condp->reason is set to the condition type before    */
/* the checker is called                                */
/********************************************************/
int url_append_one_condition(char *name, url * urlp, cond_info_t * condp)
{
  const struct cond_type_info_t *cond;

  if((cond = cond_type_info_find(name)))
  {
    condp->reason = cond->type;
    return (cond->cond_func(urlp, condp) == TRUE);
  }
  else
    return -1;
}

/*
 * Report that condition table entry "i" rejected the URL and remember its
 * type in condp->reason. Expands to a braced block which relies on the
 * names "urlp" and "condp" being in scope at the point of use.
 */
#define DLMSG(i) \
  { \
    char *urlstr = url_to_urlstr(urlp, FALSE); \
    DEBUG_LIMITS("Failed URL condition (%s) -> %s\n", cond_type_info[i].name, \
      urlstr); \
    _free(urlstr); \
    condp->reason = cond_type_info[i].type; \
  }

#ifdef HAVE_REGEX
/********************************************************/
/* evaluate the four wildcard/regex pattern pairs       */
/* when both members of a pair are configured the URL   */
/* is rejected only if both checkers fail and the       */
/* wildcard member is reported as the reason; when one  */
/* member is configured only its checker decides        */
/* returns FALSE on the first rejection, TRUE otherwise */
/********************************************************/
static int url_append_condition_patterns_default(url * urlp,
  cond_info_t * condp)
{
  int pm1, pm2, pm3, pm4;

  /* -pattern / -rpattern */
  pm1 = cond_pattern(urlp, condp);
  pm3 = cond_rpattern(urlp, condp);
  if(priv_cfg.condition.pattern && priv_cfg.condition.rpattern)
  {
    if(!pm1 && !pm3)
    {
      DLMSG(CONDT_PATTERN);
      return FALSE;
    }
  }
  else if(priv_cfg.condition.pattern)
  {
    if(!pm1)
    {
      DLMSG(CONDT_PATTERN);
      return FALSE;
    }
  }
  else if(priv_cfg.condition.rpattern)
  {
    if(!pm3)
    {
      DLMSG(CONDT_RPATTERN);
      return FALSE;
    }
  }

  /* -skip_pattern / -skip_rpattern */
  pm2 = cond_skip_pattern(urlp, condp);
  pm4 = cond_skip_rpattern(urlp, condp);
  if(priv_cfg.condition.skip_pattern && priv_cfg.condition.rskip_pattern)
  {
    if(!pm2 && !pm4)
    {
      DLMSG(CONDT_SKIP_PATTERN);
      return FALSE;
    }
  }
  else if(priv_cfg.condition.skip_pattern)
  {
    if(!pm2)
    {
      DLMSG(CONDT_SKIP_PATTERN);
      return FALSE;
    }
  }
  else if(priv_cfg.condition.rskip_pattern)
  {
    if(!pm4)
    {
      DLMSG(CONDT_SKIP_RPATTERN);
      return FALSE;
    }
  }

  /* -url_pattern / -url_rpattern */
  pm1 = cond_url_pattern(urlp, condp);
  pm3 = cond_url_rpattern(urlp, condp);
  if(priv_cfg.condition.url_pattern && priv_cfg.condition.rurl_pattern)
  {
    if(!pm1 && !pm3)
    {
      DLMSG(CONDT_URL_PATTERN);
      return FALSE;
    }
  }
  else if(priv_cfg.condition.url_pattern)
  {
    if(!pm1)
    {
      DLMSG(CONDT_URL_PATTERN);
      return FALSE;
    }
  }
  else if(priv_cfg.condition.rurl_pattern)
  {
    if(!pm3)
    {
      DLMSG(CONDT_URL_RPATTERN);
      return FALSE;
    }
  }

  /* -skip_url_pattern / -skip_url_rpattern */
  pm2 = cond_skip_url_pattern(urlp, condp);
  pm4 = cond_skip_url_rpattern(urlp, condp);
  if(priv_cfg.condition.skip_url_pattern &&
    priv_cfg.condition.rskip_url_pattern)
  {
    if(!pm2 && !pm4)
    {
      DLMSG(CONDT_SKIP_URL_PATTERN);
      return FALSE;
    }
  }
  else if(priv_cfg.condition.skip_url_pattern)
  {
    if(!pm2)
    {
      DLMSG(CONDT_SKIP_URL_PATTERN);
      return FALSE;
    }
  }
  else if(priv_cfg.condition.rskip_url_pattern)
  {
    if(!pm4)
    {
      DLMSG(CONDT_SKIP_URL_RPATTERN);
      return FALSE;
    }
  }

  return TRUE;
}
#else
/********************************************************/
/* evaluate the four wildcard pattern conditions        */
/* (build without regular expression support)           */
/* returns FALSE on the first rejection, TRUE otherwise */
/********************************************************/
/*
 * NOTE(review): this variant reads cfg.condition.* whereas the HAVE_REGEX
 * variant reads priv_cfg.condition.* - confirm the difference is intended.
 */
static int url_append_condition_patterns_default(url * urlp,
  cond_info_t * condp)
{
  int pm1;

  pm1 = cond_pattern(urlp, condp);
  if(cfg.condition.pattern && !pm1)
  {
    DLMSG(CONDT_PATTERN);
    return FALSE;
  }

  pm1 = cond_skip_pattern(urlp, condp);
  if(cfg.condition.skip_pattern && !pm1)
  {
    DLMSG(CONDT_SKIP_PATTERN);
    return FALSE;
  }

  pm1 = cond_url_pattern(urlp, condp);
  if(cfg.condition.url_pattern && !pm1)
  {
    DLMSG(CONDT_URL_PATTERN);
    return FALSE;
  }

  pm1 = cond_skip_url_pattern(urlp, condp);
  if(cfg.condition.skip_url_pattern && !pm1)
  {
    DLMSG(CONDT_SKIP_URL_PATTERN);
    return FALSE;
  }

  return TRUE;
}
#endif

/********************************************************/
/* check whether URL matches given limiting conditions  */
/* following default pavuk rules                        */
/* returns TRUE when the URL passes, FALSE otherwise;   */
/* on failure condp->reason holds the failed condition  */
/********************************************************/
static int url_append_condition_default(url * urlp, cond_info_t * condp)
{
  int i;
  int level;

  level = (1 << condp->level);

  /* inline objects bypass every check unless limit_inlines is set */
  if(!cfg.condition.limit_inlines && (urlp->status & URL_INLINE_OBJ))
    return TRUE;

  /* run every "standard" checker registered for the current level */
  for(i = 0; i < NUM_ELEM(cond_type_info); i++)
  {
    if(cond_type_info[i].standard &&
      (cond_type_info[i].level & level) &&
      !cond_type_info[i].cond_func(urlp, condp))
    {
      DLMSG(i);
      return FALSE;
    }
  }

  /* the pattern conditions are non-standard and are evaluated as a group */
  if((cond_type_info[CONDT_PATTERN].level & level))
  {
    if(!url_append_condition_patterns_default(urlp, condp))
      return FALSE;
  }

  if((cond_type_info[CONDT_USER_CONDITION].level & level))
  {
    if(!cond_user_condition(urlp, condp))
    {
      DLMSG(CONDT_USER_CONDITION);
      return FALSE;
    }
  }

  return TRUE;
}

/********************************************************/
/* check whether URL matches given limiting conditions  */
/* with HAVE_MOZJS the script hook decides unless it    */
/* returns a negative value, in which case the default  */
/* rules are applied                                    */
/********************************************************/
int url_append_condition(url * urlp, cond_info_t * condp)
{
#ifdef HAVE_MOZJS
  int rv;

  if(!cond_unsupported(urlp, condp))
    return FALSE;

  rv = pjs_run_cond_check_func(urlp, condp);

  return (rv < 0) ? url_append_condition_default(urlp, condp) : rv;
#else
  return url_append_condition_default(urlp, condp);
#endif
}

/********************************************************/
/* check whether site ends (case-insensitively) with    */
/* one of the domains from the NULL terminated array    */
/* a NULL array matches nothing                         */
/********************************************************/
static bool_t domain_condition(char *site, char **l)
{
  char **p;
  size_t sl = strlen(site);

  for(p = l; p && *p; p++)
  {
    size_t dl = strlen(*p);

    /* a domain longer than the site name cannot be its tail */
    if(dl <= sl && !strcasecmp(*p, site + sl - dl))
      return TRUE;
  }

  return FALSE;
}

/********************************************************/
/* check whether up has suffix sfx (case-sensitive)     */
/********************************************************/
static bool_t cmp_sfx(char *up, char *sfx)
{
  size_t slen = strlen(sfx);
  size_t nlen = strlen(up);

  if(nlen < slen)
    return FALSE;

  return !strcmp(sfx, up + nlen - slen);
}

/********************************************************/
/* check whether url full path ends with one of the     */
/* suffixes from the NULL terminated array              */
/* a NULL array matches nothing                         */
/********************************************************/
static bool_t sfx_condition(url * urlr, char **l)
{
  char **pp;
  char *p = url_get_full_path(urlr);
  int rv = FALSE;

  for(pp = l; pp && *pp; pp++)
  {
    if(cmp_sfx(p, *pp))
    {
      rv = TRUE;
      break;
    }
  }

  /* the path string returned by url_get_full_path() is released here */
  _free(p);

  return rv;
}

/********************************************************/
/* check whether url full path starts with one of the   */
/* prefixes from the NULL terminated array              */
/* a NULL array matches nothing                         */
/********************************************************/
static bool_t prefix_condition(url * urlr, char **l)
{
  char **pp;
  char *p = url_get_full_path(urlr);
  int rv = FALSE;

  for(pp = l; pp && *pp; pp++)
  {
    if(!strncmp(*pp, p, strlen(*pp)))
    {
      rv = TRUE;
      break;
    }
  }

  /* the path string returned by url_get_full_path() is released here */
  _free(p);

  return rv;
}

/********************************************************/
/* test string against the wildcard patterns from the   */
/* NULL terminated list                                 */
/* returns FALSE as soon as one pattern matches or when */
/* the list is NULL, TRUE when the list exists and no   */
/* pattern matches                                      */
/********************************************************/
/*
 * NOTE(review): fnmatch() returns 0 on a match, so this function reports
 * FALSE for a matching string. The original banner read "check whether
 * string is matched by at least one wildcard pattern from list", which
 * suggests the opposite polarity. The callers are not visible here, so the
 * behavior is left unchanged - confirm which polarity they expect.
 */
static bool_t cmp_pattern(char *str, char **pattern)
{
  char **pp;

  for(pp = pattern; pp && *pp; pp++)
  {
    if(!fnmatch(*pp, str, 0))
      return FALSE;
  }

  return (pattern != NULL);
}

/*
 * Walks a dllist of wildcard patterns and returns FALSE as soon as one of
 * them matches str.
 * NOTE(review): the rest of this definition lies outside the reviewed
 * excerpt; the visible part is reproduced unchanged.
 */
static bool_t cmp_dlpattern(char *str, dllist * pattern)
{
  for(; pattern; pattern = pattern->next)
  {
    if(!fnmatch((char *) pattern->data, str, 0))
      return FALSE;
  }

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -