⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 condition.c

📁 网络爬虫程序
💻 C
📖 第 1 页 / 共 3 页
字号:
{
  /*
   * NOTE(review): the signature of this function lies above the visible
   * excerpt; only its body is reproduced here and its logic is unchanged.
   * It walks the parent_url chain up to the URL flagged URL_ISSTARTING (or
   * the first URL without a parent) and tests url_is_same_site() against it.
   */
  url *gparent = urlp;

  if(urlp->type != URLT_FILE && cfg.condition.dont_leave_site)
  {
    bool_t isgp = FALSE;

    while(!isgp)
    {
#ifdef HAVE_MT
      url *ogp = gparent;
#endif
      if(gparent->status & URL_ISSTARTING)
      {
        isgp = TRUE;
        break;
      }
      LOCK_URL(ogp);
      if(gparent->parent_url)
        gparent = (url *) gparent->parent_url->data;
      else
        isgp = TRUE;
      UNLOCK_URL(ogp);
    }
    return url_is_same_site(urlp, gparent);
  }
  else
    return TRUE;
}

/*
 * Directory restriction: locate the starting URL by following parent_url
 * links (stopping at a URL flagged URL_ISSTARTING or one with no parent),
 * then accept urlp only if it is on the same site and its path begins with
 * the starting URL's path up to (not including) the last '/'.
 *
 * Returns TRUE unconditionally for URLT_FILE URLs or when
 * cfg.condition.dont_leave_dir is not set.
 */
static int cond_dont_leave_dir(url * urlp, cond_info_t * condp)
{
  if(urlp->type != URLT_FILE && cfg.condition.dont_leave_dir)
  {
    url *gparent = urlp;
    char *p1, *p2, *p;
    int len = 0;
    bool_t isgp = FALSE;

    while(!isgp)
    {
#ifdef HAVE_MT
      url *ogp = gparent;
#endif
      if(gparent->status & URL_ISSTARTING)
      {
        isgp = TRUE;
        break;
      }
      LOCK_URL(ogp);
      if(gparent->parent_url)
        gparent = (url *) gparent->parent_url->data;
      else
        isgp = TRUE;
      UNLOCK_URL(ogp);
    }

    p1 = url_get_path(urlp);
    p2 = url_get_path(gparent);
    /* compare only the directory part of the starting URL's path */
    p = strrchr(p2, '/');
    if(p)
      len = p - p2;

    return url_is_same_site(urlp, gparent) && !strncmp(p1, p2, len);
  }
  else
    return TRUE;
}

/*
 * Delegates to uexit_condition() when priv_cfg.condition.uexit is set;
 * otherwise returns TRUE.
 */
static int cond_user_condition(url * urlp, cond_info_t * condp)
{
  return priv_cfg.condition.uexit ? uexit_condition(urlp, NULL, 0L) : TRUE;
}

/*
 * IP allow-pattern check (compiled only with HAVE_REGEX).
 *
 * With explicit params, the list is converted with str_list_to_re_list(),
 * passed to check_ip_list(), and the temporary list is released again.
 * Without params, priv_cfg.condition.aip is used when set.
 * Returns the check_ip_list() result, or TRUE when nothing applies.
 */
static int cond_aip_pattern(url * urlp, cond_info_t * condp)
{
#ifdef HAVE_REGEX
  if(condp->params)
  {
    int rv = TRUE;
    dllist *p = str_list_to_re_list(condp->params);

    if(p)
      rv = check_ip_list(urlp, p);

    /* release the temporary regex list built above */
    for(; p; p = dllist_remove_entry(p, p))
      re_free((re_entry *) p->data);

    return rv;
  }
  else if(priv_cfg.condition.aip)
    return check_ip_list(urlp, priv_cfg.condition.aip);
#endif
  return TRUE;
}

/*
 * IP deny-pattern check (compiled only with HAVE_REGEX).
 *
 * Same list handling as cond_aip_pattern(), but the result is mapped:
 * a check_ip_list() result of 0 or -1 yields TRUE, anything else FALSE.
 * Without params, priv_cfg.condition.skipip is used when set.
 */
static int cond_dip_pattern(url * urlp, cond_info_t * condp)
{
#ifdef HAVE_REGEX
  if(condp->params)
  {
    int rv = TRUE;
    dllist *p = str_list_to_re_list(condp->params);

    if(p)
    {
      rv = check_ip_list(urlp, p);
      if(rv == 0 || rv == -1)
        rv = TRUE;
      else
        rv = FALSE;
    }

    /* release the temporary regex list built above */
    for(; p; p = dllist_remove_entry(p, p))
      re_free((re_entry *) p->data);

    return rv;
  }
  else if(priv_cfg.condition.skipip)
  {
    int rv = check_ip_list(urlp, priv_cfg.condition.skipip);

    if(rv == 0 || rv == -1)
      return TRUE;
    else
      return FALSE;
  }
#endif
  return TRUE;
}

/*
 * Site-level limit.  The limit comes from the first param when params are
 * given, otherwise from cfg.condition.site_level; a limit of 0 or a
 * URLT_FILE URL disables the check.
 *
 * Walks the parent chain from urlp until a URL flagged URL_ISSTARTING (or
 * the end of the chain).  Each time a URL and its parent differ in type,
 * port or site, `level` is incremented unless the child has moved_to set
 * and no same-site step preceded it; same-site steps without moved_to are
 * counted in `slevel`.  Returns level <= limit.
 */
static int cond_site_level(url * urlp, cond_info_t * condp)
{
  int lvl;

  if(condp->params)
    lvl = _atoi((char *) condp->params->data);
  else
    lvl = cfg.condition.site_level;

  if(urlp->type != URLT_FILE && lvl)
  {
    url *curl = urlp;
    url *parent;
    int level = 0;
    int slevel = 0;

    LOCK_URL(urlp);
    if(urlp->parent_url)
      parent = (url *) urlp->parent_url->data;
    else
      parent = NULL;
    UNLOCK_URL(urlp);

    while(parent)
    {
      if(parent->status & URL_ISSTARTING)
        break;

      if((curl->type != parent->type) ||
        (url_get_port(curl) != url_get_port(parent)) ||
        strcmp(url_get_site(curl), url_get_site(parent)))
      {
        if(!curl->moved_to || slevel)
          level++;
        slevel = 0;
      }
      else if(!curl->moved_to)
        slevel++;

      curl = parent;
      LOCK_URL(curl);
      /*
       * Test the node that is about to be dereferenced.  The previous code
       * tested urlp->parent_url, which is always non-NULL inside this loop,
       * so a chain ending without a URL_ISSTARTING node dereferenced a NULL
       * parent_url.
       */
      if(parent->parent_url)
        parent = (url *) parent->parent_url->data;
      else
        parent = NULL;
      UNLOCK_URL(curl);
    }
    return level <= lvl;
  }
  else
    return TRUE;
}

/*
 * Active only for non-URLT_FILE URLs when cfg.condition.dont_leave_site_dir
 * is set.  Walks up the parent chain until either the starting URL is
 * reached (result: TRUE) or a step is found where the type changes or the
 * ancestor's port/site differs from urlp's; in the latter case the last
 * URL before that step (following moved_to links, but not past urlp) is
 * used as the directory reference and urlp must be on the same site with a
 * path sharing that reference's directory prefix.
 */
static int cond_dont_leave_site_enter_dir(url * urlp, cond_info_t * condp)
{
  if(urlp->type != URLT_FILE && cfg.condition.dont_leave_site_dir)
  {
    url *gparent = urlp;
    char *p1, *p2, *p;
    int len = 0;
    bool_t isgp = FALSE;

    while(!isgp)
    {
      url *ogp = gparent;

      if(gparent->status & URL_ISSTARTING)
      {
        isgp = TRUE;
      }
      else
      {
        LOCK_URL(ogp);
        if(gparent->parent_url)
          gparent = (url *) gparent->parent_url->data;
        else
          isgp = TRUE;
        UNLOCK_URL(ogp);
      }

      if((ogp->type != gparent->type) ||
        (url_get_port(urlp) != url_get_port(gparent)) ||
        strcmp(url_get_site(urlp), url_get_site(gparent)))
      {
        /* step back to the last URL before the type/port/site change */
        gparent = ogp;
        break;
      }
    }

    while(gparent->moved_to)
    {
      if(gparent == urlp)
        break;
      gparent = gparent->moved_to;
    }

    if(!isgp)
    {
      p1 = url_get_path(urlp);
      p2 = url_get_path(gparent);
      p = strrchr(p2, '/');
      if(p)
        len = p - p2;

      return url_is_same_site(urlp, gparent) && !strncmp(p1, p2, len);
    }
  }
  return TRUE;
}

/*
 * Leave-level limit.  The limit comes from the first param when params are
 * given, otherwise from cfg.condition.leave_level; a limit of 0 or a
 * URLT_FILE URL disables the check.
 *
 * First locates the starting URL, then walks from urlp up the parent chain
 * until a URL with the starting URL's type, port and site is found.  Every
 * visited URL without moved_to increments `level` (which starts at -1).
 * Returns FALSE as soon as level (reduced by one when urlp has
 * URL_INLINE_OBJ set) reaches the limit, TRUE otherwise.
 */
static int cond_leave_level(url * urlp, cond_info_t * condp)
{
  int lvl;

  if(condp->params)
    lvl = _atoi((char *) condp->params->data);
  else
    lvl = cfg.condition.leave_level;

  if(urlp->type != URLT_FILE && lvl)
  {
    url *gparent = urlp;
    url *pomurl = urlp;
    int level = -1;
    bool_t isgp = FALSE;

    while(!isgp)
    {
#ifdef HAVE_MT
      url *ogp = gparent;
#endif
      if(gparent->status & URL_ISSTARTING)
      {
        isgp = TRUE;
        break;
      }
      LOCK_URL(ogp);
      if(gparent->parent_url)
        gparent = (url *) gparent->parent_url->data;
      else
        isgp = TRUE;
      UNLOCK_URL(ogp);
    }

    while(pomurl)
    {
#ifdef HAVE_MT
      url *tempurl = pomurl;
#endif
      if((pomurl->type == gparent->type) &&
        (url_get_port(pomurl) == url_get_port(gparent)) &&
        !strcmp(url_get_site(pomurl), url_get_site(gparent)))
      {
        break;
      }

      if(!pomurl->moved_to)
        level++;

      if((level - ((urlp->status & URL_INLINE_OBJ) ? 1 : 0)) >= lvl)
      {
        return FALSE;
      }

      if(pomurl->status & URL_ISSTARTING)
      {
        pomurl = NULL;
      }
      else
      {
        LOCK_URL(tempurl);
        if(pomurl->parent_url)
          pomurl = (url *) pomurl->parent_url->data;
        else
          pomurl = NULL;
        UNLOCK_URL(tempurl);
      }
    }
  }
  return TRUE;
}

/*
 * Allowed-ports check.
 *
 * With explicit params the port is formatted as a decimal string and looked
 * up with dllist_find2()/str_comp_func.  Without params, and when
 * priv_cfg.condition.allow_ports is set, the port must be present in
 * priv_cfg.condition.ports.  Returns TRUE when nothing applies.
 *
 * NOTE(review): the params branch returns TRUE when dllist_find2() yields a
 * false value, exactly like cond_dport() -- confirm the intended polarity
 * against dllist_find2()/str_comp_func semantics.
 */
static int cond_aport(url * urlp, cond_info_t * condp)
{
  long port = url_get_port(urlp);

  if(condp->params)
  {
    /* 3 decimal digits per byte is an upper bound; +2 for sign and NUL */
    char pom[sizeof(long) * 3 + 2];

    sprintf(pom, "%ld", port);
    return !dllist_find2(condp->params, (dllist_t)pom, str_comp_func);
  }
  else if(priv_cfg.condition.ports && port && priv_cfg.condition.allow_ports)
  {
    return (dllist_find(priv_cfg.condition.ports, (dllist_t) port)
    ? TRUE : FALSE);
  }
  else
    return TRUE;
}

/*
 * Denied-ports check.
 *
 * With explicit params the port is formatted as a decimal string and looked
 * up with dllist_find2()/str_comp_func; TRUE is returned when the lookup
 * yields a false value.  Without params, and when
 * priv_cfg.condition.allow_ports is NOT set, the port must be absent from
 * priv_cfg.condition.ports.  Returns TRUE when nothing applies.
 */
static int cond_dport(url * urlp, cond_info_t * condp)
{
  long port = url_get_port(urlp);

  if(condp->params)
  {
    /* 3 decimal digits per byte is an upper bound; +2 for sign and NUL */
    char pom[sizeof(long) * 3 + 2];

    sprintf(pom, "%ld", port);
    return !dllist_find2(condp->params, (dllist_t)pom, str_comp_func);
  }
  else if(priv_cfg.condition.ports && port && !priv_cfg.condition.allow_ports)
  {
    return ((!dllist_find(priv_cfg.condition.ports, (dllist_t) port))
    ? TRUE : FALSE);
  }
  else
    return TRUE;
}

/*
 * Maximum-size check: TRUE when the limit is >= condp->size.  The limit is
 * the first param (ignored when it parses to 0) or cfg.condition.max_size;
 * the config branch is skipped when condp->size is 0.
 */
static int cond_max_size(url * urlp, cond_info_t * condp)
{
  if(condp->params)
  {
    int n = _atoi((char *) condp->params->data);

    if(n)
      return (n >= condp->size);
  }
  else if(cfg.condition.max_size && condp->size)
    return (cfg.condition.max_size >= condp->size);

  return TRUE;
}

/*
 * Minimum-size check: TRUE when the limit is <= condp->size.  The limit is
 * the first param (ignored when it parses to 0) or cfg.condition.min_size;
 * the config branch is skipped when condp->size is 0.
 */
static int cond_min_size(url * urlp, cond_info_t * condp)
{
  if(condp->params)
  {
    int n = _atoi((char *) condp->params->data);

    if(n)
      return (n <= condp->size);
  }
  else if(cfg.condition.min_size && condp->size)
    return (cfg.condition.min_size <= condp->size);

  return TRUE;
}

/*
 * Allowed-MIME-type check.  Without params, and when
 * priv_cfg.condition.allow_mime is set and a MIME type is known, returns
 * the is_in_pattern_list() result for priv_cfg.condition.mime.
 *
 * NOTE(review): the params branch negates is_in_pattern_dllist(), exactly
 * like cond_dmime_type() -- confirm the intended polarity against that
 * helper's return convention.
 */
static int cond_amime_type(url * urlp, cond_info_t * condp)
{
  if(condp->params)
  {
    return !is_in_pattern_dllist(condp->mimet, condp->params);
  }
  else if(priv_cfg.condition.mime && priv_cfg.condition.allow_mime &&
    condp->mimet)
  {
    return is_in_pattern_list(condp->mimet, priv_cfg.condition.mime);
  }
  else
    return TRUE;
}

/*
 * Denied-MIME-type check.  With params, returns the negated
 * is_in_pattern_dllist() result.  Without params, and when
 * priv_cfg.condition.allow_mime is NOT set and a MIME type is known,
 * returns the negated is_in_pattern_list() result for
 * priv_cfg.condition.mime.  Returns TRUE when nothing applies.
 */
static int cond_dmime_type(url * urlp, cond_info_t * condp)
{
  if(condp->params)
  {
    return !is_in_pattern_dllist(condp->mimet, condp->params);
  }
  else if(priv_cfg.condition.mime && !priv_cfg.condition.allow_mime &&
    condp->mimet)
  {
    return !is_in_pattern_list(condp->mimet, priv_cfg.condition.mime);
  }
  else
    return TRUE;
}

/*
 * Time check against an upper bound: TRUE when
 * difftime(condp->time, bound) <= 0.  The bound is the first param
 * (ignored when it parses to 0) or cfg.condition.etime; the config branch
 * is skipped when condp->time is 0.
 */
static int cond_newer_than(url * urlp, cond_info_t * condp)
{
  if(condp->params)
  {
    time_t t = _atoi((char *) condp->params->data);

    if(t)
      return difftime(condp->time, t) <= 0;
  }
  else if(cfg.condition.etime && condp->time)
    return difftime(condp->time, cfg.condition.etime) <= 0;

  return TRUE;
}

/*
 * Time check against a lower bound: TRUE when
 * difftime(condp->time, bound) >= 0.  The bound is the first param
 * (ignored when it parses to 0) or cfg.condition.btime; the config branch
 * is skipped when condp->time is 0.
 */
static int cond_older_than(url * urlp, cond_info_t * condp)
{
  if(condp->params)
  {
    time_t t = _atoi((char *) condp->params->data);

    if(t)
      return difftime(condp->time, t) >= 0;
  }
  /*
   * Guard on the same field that is compared.  The previous guard tested
   * cfg.condition.etime, so a btime set without an etime was never
   * enforced.
   */
  else if(cfg.condition.btime && condp->time)
    return difftime(condp->time, cfg.condition.btime) >= 0;

  return TRUE;
}

/*
 * Wildcard tag-pattern check.  Returns TRUE when no tag or attribute is
 * known.
 *
 * With params: exactly three params are required (otherwise FALSE); a
 * TAGP_WC pattern is built from them and matched against the tag, the
 * attribute and the URL string.  FALSE is returned when the pattern cannot
 * be created.
 *
 * Without params: TRUE when any entry of priv_cfg.condition.tag_patterns
 * matches, or when that list is empty; FALSE otherwise.
 */
static int cond_tag_pattern(url * urlp, cond_info_t * condp)
{
  tag_pattern_t *tp;
  char *p = NULL;

  if(!condp->tag || !condp->attrib)
    return TRUE;

  if(condp->params)
  {
    if(dllist_count(condp->params) != 3)
      return FALSE;
    else
      tp = tag_pattern_new(TAGP_WC,
        (char *) dllist_nth(condp->params, 0),
        (char *) dllist_nth(condp->params, 1),
        (char *) dllist_nth(condp->params, 2));

    if(tp)
    {
      int r;

      p = url_to_urlstr(urlp, FALSE);
      r = tag_pattern_match(tp, condp->tag, condp->attrib, p);
      tag_pattern_free(tp);
      _free(p);
      return r;
    }
    else
      return FALSE;
  }
  else
  {
    dllist *ptr;

    /* build the URL string only when there is something to match */
    if(priv_cfg.condition.tag_patterns)
      p = url_to_urlstr(urlp, FALSE);

    for(ptr = priv_cfg.condition.tag_patterns; ptr; ptr = ptr->next)
    {
      tp = (tag_pattern_t *) ptr->data;
      if(tag_pattern_match(tp, condp->tag, condp->attrib, p))
      {
        _free(p);
        return TRUE;
      }
    }
    _free(p);
    return priv_cfg.condition.tag_patterns == NULL;
  }
}

/*
 * Regex tag-pattern check.  Returns TRUE when no tag or attribute is known.
 *
 * With params: exactly three params are required (otherwise FALSE); a
 * TAGP_RE pattern is built from them and matched against the tag, the
 * attribute and the URL string.  FALSE is returned when the pattern cannot
 * be created.
 */
static int cond_tag_rpattern(url * urlp, cond_info_t * condp)
{
  if(!condp->tag || !condp->attrib)
    return TRUE;

  if(condp->params)
  {
    tag_pattern_t *tp;

    if(dllist_count(condp->params) != 3)
      return FALSE;
    else
      tp = tag_pattern_new(TAGP_RE,
        (char *) dllist_nth(condp->params, 0),
        (char *) dllist_nth(condp->params, 1),
        (char *) dllist_nth(condp->params, 2));

    if(tp)
    {
      int r;
      char *p;

      p = url_to_urlstr(urlp, FALSE);
      r = tag_pattern_match(tp, condp->tag, condp->attrib, p);
      tag_pattern_free(tp);
      _free(p);
      return r;
    }
    else
      return FALSE;
  }
  else
  {
    /* always return TRUE standard is handled by cond_tag_pattern */
    return TRUE;
  }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -