⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 condition.c

📁 网络爬虫程序
💻 C
📖 第 1 页 / 共 3 页
字号:
  return (pattern != NULL);}static char **str_list_to_array(dllist * sl){  char **rv;  int i;  rv = _malloc(sizeof(char *) * (dllist_count(sl) + 1));  for(i = 0; sl; i++, sl = sl->next)    rv[i] = (char *)sl->data;  rv[i] = NULL;  return rv;}#ifdef HAVE_REGEX/********************************************************//* check wheter string is mattached by at least one *//* regular pattern from list        *//********************************************************/static bool_t cmp_rpattern(char *str, dllist * pattern){  dllist *pp;  for(pp = pattern; pp; pp = pp->next)  {    if(re_pmatch((re_entry *) pp->data, str))    {      return FALSE;    }  }  return (pattern != NULL);}static dllist *str_list_to_re_list(dllist * sl){  dllist *rv = NULL;  for(; sl; sl = sl->next)  {    re_entry *re = re_make((char *) sl->data);    if(re)      rv = dllist_append(rv, (dllist_t)re);  }  return rv;}/********************************************************//* check wheter site of URL matches one of listed IP  *//* adress regular patterns        *//*              *//* TRUE  = match          *//* FALSE = don't match          *//* -1    = error resolving hostname     *//********************************************************/static int check_ip_list(url * urlp, dllist * iplist){  int retv = -1;  char *site = url_get_site(urlp);  if(site && iplist)  {    char *ip = NULL;    int rv, f;    int is_valid = TRUE;    char raddr[64];    _h_errno_ = 0;    memset(&raddr, '\0', sizeof(raddr));    if(dns_gethostbyname(site, &rv, raddr, &f))      is_valid = FALSE;#ifdef HAVE_INET6    if(is_valid)    {      char buf[64];      inet_ntop(f, raddr, buf, sizeof(buf));      ip = tl_strdup(buf);    }#else    if(is_valid)    {      struct in_addr ia;      memcpy(&ia, raddr, TL_MIN(rv, sizeof(ia)));      LOCK_INETNTOA;      ip = tl_strdup(inet_ntoa(ia));      UNLOCK_INETNTOA;    }#endif    if(is_valid)      retv = !cmp_rpattern(ip, iplist);    _free(ip);  }  return retv;}#endif/********************************************************//* below are functions for implementing particular  *//* limiting options         *//********************************************************/static int cond_unsupported(url * urlp, cond_info_t * condp){  /*     0 - file    x     1 - directory   x     2 - CSO index   x     3 - error     4 - macbinhex   x     5 - dosbin    x     6 - uuencoded   x     7 - index     8 - telnet     9 - bin     x     + - redundant server     T - t3270     g - GIF     x     I - image   x     h - HTML    x     i - info     w - WWW address     s - sound   x     : - image   x     ; - movie   x     < - sound   x   */  if((urlp->type == URLT_GOPHER) &&    strchr("0124569gIhs:;<", urlp->p.gopher.selector[0]))    return TRUE;  return prottable[urlp->type].supported;}static int cond_lmax(url * urlp, cond_info_t * condp){  if(condp->params)  {    int n = _atoi((char *) condp->params->data);    return (urlp->level - ((urlp->status & URL_INLINE_OBJ) ? 1 : 0)) <= n;  }  else if(cfg.condition.max_levels)    return (urlp->level - ((urlp->status & URL_INLINE_OBJ) ? 1 : 0))      <= cfg.condition.max_levels;  else    return TRUE;}static int cond_dmax(url * urlp, cond_info_t * condp){  int n;  if(condp->params)    n = _atoi((char *) condp->params->data);  else    n = cfg.condition.max_documents;  if(n)    return !((!condp->urlnr &&        (cfg.total_cnt + 1) > n) || (condp->urlnr && condp->urlnr > n));  else    return TRUE;}static int cond_noftp(url * urlp, cond_info_t * condp){  return cfg.condition.ftp ? TRUE : (urlp->type != URLT_FTP);}static int cond_nhttp(url * urlp, cond_info_t * condp){  return cfg.condition.http ? TRUE : (urlp->type != URLT_HTTP);}static int cond_nossl(url * urlp, cond_info_t * condp){#ifdef USE_SSL  return cfg.condition.https ? TRUE : (urlp->type != URLT_HTTPS);#else  return TRUE;#endif}static int cond_nogopher(url * urlp, cond_info_t * condp){  return cfg.condition.gopher ? TRUE : (urlp->type != URLT_GOPHER);}static int cond_noftps(url * urlp, cond_info_t * condp){#ifdef USE_SSL  return cfg.condition.ftps ? TRUE : (urlp->type != URLT_FTPS);#else  return TRUE;#endif}static int cond_nocgi(url * urlp, cond_info_t * condp){  if((urlp->type == URLT_HTTP || urlp->type == URLT_HTTPS) &&    !cfg.condition.cgi)    return (urlp->p.http.searchstr == NULL);  else    return TRUE;}static int cond_asite(url * urlp, cond_info_t * condp){  char *site = url_get_site(urlp);  if(!site)    return TRUE;  if(condp->params)    return is_in_dllist(site, condp->params);  else if(priv_cfg.condition.sites && priv_cfg.condition.sites[0] &&    site && priv_cfg.condition.allow_site)    return is_in_list(site, priv_cfg.condition.sites);  else    return TRUE;}static int cond_dsite(url * urlp, cond_info_t * condp){  char *site = url_get_site(urlp);  if(!site)    return TRUE;  if(condp->params)    return !is_in_dllist(site, condp->params);  if(priv_cfg.condition.sites && priv_cfg.condition.sites[0] && site &&    !priv_cfg.condition.allow_site)    return !is_in_list(site, priv_cfg.condition.sites);  else    return TRUE;}static int cond_adomain(url * urlp, cond_info_t * condp){  char *site = url_get_site(urlp);  if(!site)    return TRUE;  if(condp->params)  {    char **sa = str_list_to_array(condp->params);    int rv = domain_condition(site, sa);    _free(sa);    return rv;  }  else if(priv_cfg.condition.domains && priv_cfg.condition.domains[0] &&    site && priv_cfg.condition.allow_domain)    return domain_condition(site, priv_cfg.condition.domains);  else    return TRUE;}static int cond_ddomain(url * urlp, cond_info_t * condp){  char *site = url_get_site(urlp);  if(!site)    return TRUE;  if(condp->params)  {    char **sa = str_list_to_array(condp->params);    int rv = !domain_condition(site, sa);    _free(sa);    return rv;  }  else if(priv_cfg.condition.domains && priv_cfg.condition.domains[0] &&    site && !priv_cfg.condition.allow_domain)    return !domain_condition(site, priv_cfg.condition.domains);  else    return TRUE;}static int cond_aprefix(url * urlp, cond_info_t * condp){  if(condp->params)  {    char **sa = str_list_to_array(condp->params);    int rv = prefix_condition(urlp, sa);    _free(sa);    return rv;  }  else if(priv_cfg.condition.dir_prefix &&    priv_cfg.condition.dir_prefix[0] && (urlp->type != URLT_FILE) &&    priv_cfg.condition.allow_prefix)    return prefix_condition(urlp, priv_cfg.condition.dir_prefix);  else    return TRUE;}static int cond_dprefix(url * urlp, cond_info_t * condp){  if(condp->params)  {    char **sa = str_list_to_array(condp->params);    int rv = !prefix_condition(urlp, sa);    _free(sa);    return rv;  }  else if(priv_cfg.condition.dir_prefix &&    priv_cfg.condition.dir_prefix[0] && (urlp->type != URLT_FILE) &&    !priv_cfg.condition.allow_prefix)    return !prefix_condition(urlp, priv_cfg.condition.dir_prefix);  else    return TRUE;}static int cond_asfx(url * urlp, cond_info_t * condp){  if(condp->params)  {    char **sa = str_list_to_array(condp->params);    int rv = sfx_condition(urlp, sa);    _free(sa);    return rv;  }  else if(priv_cfg.condition.sufix && priv_cfg.condition.sufix[0] &&    (urlp->type != URLT_FILE) && priv_cfg.condition.allow_sufix)    return sfx_condition(urlp, priv_cfg.condition.sufix);  else    return TRUE;}static int cond_dsfx(url * urlp, cond_info_t * condp){  if(condp->params)  {    char **sa = str_list_to_array(condp->params);    int rv = !sfx_condition(urlp, sa);    _free(sa);    return rv;  }  else if(priv_cfg.condition.sufix && priv_cfg.condition.sufix[0] &&    (urlp->type != URLT_FILE) && !priv_cfg.condition.allow_sufix)    return !sfx_condition(urlp, priv_cfg.condition.sufix);  else    return TRUE;}static int cond_pattern(url * urlp, cond_info_t * condp){  int rv = TRUE;  if(condp->params)  {    char *p = url_get_full_path(urlp);    rv = !cmp_dlpattern(p, condp->params);    _free(p);  }  else if(urlp->type != URLT_FILE && priv_cfg.condition.pattern)  {    char *p = url_get_full_path(urlp);    rv = !cmp_pattern(p, priv_cfg.condition.pattern);    _free(p);  }  return rv;}static int cond_rpattern(url * urlp, cond_info_t * condp){  int rv = TRUE;#ifdef HAVE_REGEX  if(condp->params)  {    dllist *pl = str_list_to_re_list(condp->params);    if(pl)    {      char *p = url_get_full_path(urlp);      rv = !cmp_rpattern(p, pl);      _free(p);    }    for(; pl; pl = dllist_remove_entry(pl, pl))      re_free((re_entry *) pl->data);  }  else if(urlp->type != URLT_FILE && priv_cfg.condition.rpattern)  {    char *p = url_get_full_path(urlp);    rv = !cmp_rpattern(p, priv_cfg.condition.rpattern);    _free(p);  }#endif  return rv;}static int cond_skip_pattern(url * urlp, cond_info_t * condp){  int rv = TRUE;  if(condp->params)  {    char *p = url_get_full_path(urlp);    rv = cmp_dlpattern(p, condp->params);    _free(p);  }  else if(urlp->type != URLT_FILE && priv_cfg.condition.skip_pattern)  {    char *p = url_get_full_path(urlp);    rv = cmp_pattern(p, priv_cfg.condition.skip_pattern);    _free(p);  }  return rv;}static int cond_skip_rpattern(url * urlp, cond_info_t * condp){  int rv = TRUE;#ifdef HAVE_REGEX  if(condp->params)  {    dllist *pl = str_list_to_re_list(condp->params);    if(pl)    {      char *p = url_get_full_path(urlp);      rv = cmp_rpattern(p, pl);      _free(p);    }    for(; pl; pl = dllist_remove_entry(pl, pl))      re_free((re_entry *) pl->data);  }  else if(urlp->type != URLT_FILE && priv_cfg.condition.rskip_pattern)  {    char *p = url_get_full_path(urlp);    rv = cmp_rpattern(p, priv_cfg.condition.rskip_pattern);    _free(p);  }#endif  return rv;}static int cond_url_pattern(url * urlp, cond_info_t * condp){  int rv = TRUE;  if(condp->params)  {    char *p = url_to_urlstr(urlp, FALSE);    rv = !cmp_dlpattern(p, condp->params);    _free(p);  }  else if(urlp->type != URLT_FILE && priv_cfg.condition.url_pattern)  {    char *p = url_to_urlstr(urlp, FALSE);    rv = !cmp_pattern(p, priv_cfg.condition.url_pattern);    _free(p);  }  return rv;}static int cond_url_rpattern(url * urlp, cond_info_t * condp){  int rv = TRUE;#ifdef HAVE_REGEX  if(condp->params)  {    dllist *pl = str_list_to_re_list(condp->params);    if(pl)    {      char *p = url_to_urlstr(urlp, FALSE);      rv = !cmp_rpattern(p, pl);      _free(p);    }    for(; pl; pl = dllist_remove_entry(pl, pl))      re_free((re_entry *) pl->data);  }  else if(urlp->type != URLT_FILE && priv_cfg.condition.rurl_pattern)  {    char *p = url_to_urlstr(urlp, FALSE);    rv = !cmp_rpattern(p, priv_cfg.condition.rurl_pattern);    _free(p);  }#endif  return rv;}static int cond_skip_url_pattern(url * urlp, cond_info_t * condp){  int rv = TRUE;  if(condp->params)  {    char *p = url_to_urlstr(urlp, FALSE);    rv = cmp_dlpattern(p, condp->params);    _free(p);  }  else if(urlp->type != URLT_FILE && priv_cfg.condition.skip_url_pattern)  {    char *p = url_to_urlstr(urlp, FALSE);    rv = cmp_pattern(p, priv_cfg.condition.skip_url_pattern);    _free(p);  }  return rv;}static int cond_skip_url_rpattern(url * urlp, cond_info_t * condp){  int rv = TRUE;#ifdef HAVE_REGEX  if(condp->params)  {    dllist *pl = str_list_to_re_list(condp->params);    if(pl)    {      char *p = url_to_urlstr(urlp, FALSE);      rv = cmp_rpattern(p, pl);      _free(p);    }    for(; pl; pl = dllist_remove_entry(pl, pl))      re_free((re_entry *) pl->data);  }  else if(urlp->type != URLT_FILE && priv_cfg.condition.rskip_url_pattern)  {    char *p = url_to_urlstr(urlp, FALSE);    rv = cmp_rpattern(p, priv_cfg.condition.rskip_url_pattern);    _free(p);  }#endif  return rv;}static int cond_dont_leave_site(url * urlp, cond_info_t * condp)

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -