⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 url.c

📁 网络爬虫程序
💻 C
📖 第 1 页 / 共 5 页
字号:
            inl = dllist_append(inl, (dllist_t)urlp);
          else
            reg = dllist_append(reg, (dllist_t)urlp);
          break;
        default:
          break;
        }
        /* With hack_add_index set and no search string on the URL, also */
        /* parse the URL string cut right after its last '/' and append  */
        /* the resulting url to the list that is being walked.           */
        if(cfg.hack_add_index && !url_get_search_str(urlp))
        {
          char *pom;
          char *ustr = url_to_urlstr(urlp, FALSE);
          pom = strrchr(ustr, '/');
          /* skip URLs that already end with '/' */
          if(pom && pom[1])
          {
            url *nurl;
            pom[1] = '\0';
            nurl = url_parse(ustr);
            assert(nurl->type != URLT_FROMPARENT);
            dllist_append(p, (dllist_t) nurl);
          }
          _free(ustr);
        }
      }
    }
    else
    {
      /* count the entry in cfg.reject_cnt (under its lock) and release it */
      LOCK_REJCNT;
      cfg.reject_cnt++;
      UNLOCK_REJCNT;
      free_deep_url((url *) p->data);
      free((url *)p->data);
    }
    p = p->next;
  }
  dllist_free_all(l1);
  LOCK_CFG_URLSTACK;
  /* Splice the collected lists (inl first, then reg) into cfg.urlstack. */
  /* SIRKY variants pass NULL as insertion point, so the new entries are */
  /* concatenated to the end; HLBKY variants insert them after the node  */
  /* cfg.urlstack points to.                                             */
  switch (cfg.scheduling_strategie)
  {
  case SSTRAT_DO_SIRKY:
  case SSTRAT_DO_SIRKY_I:
    if(reg || inl)
      append_url_list_to_list(dllist_concat(inl, reg), NULL);
    break;
  case SSTRAT_DO_HLBKY:
  case SSTRAT_DO_HLBKY_I:
    if(reg || inl)
      append_url_list_to_list(dllist_concat(inl, reg), cfg.urlstack);
    break;
  default:
    break;
  }
  UNLOCK_CFG_URLSTACK;
#ifdef HAVE_MT
  /* this is here for signaling sleeping downloading processes which */
  /* wait for URL to be queued inside downloading queue              */
  for(; nadd > 0; nadd--)
  {
    mt_semaphore_up(&cfg.urlstack_sem);
  }
#endif
}

/*
 * Queue a single URL at the end of cfg.urlstack.
 *
 * URLs whose protocol is not marked as supported in prottable are only
 * reported through xprintf() and are not queued.  For a supported URL the
 * reference count is set to 1, a GUI tree entry is created when the GUI is
 * compiled in and active, the URL is inserted into the URL hash table,
 * cfg.total_cnt is incremented and, with HAVE_MT, the urlstack semaphore is
 * raised once.
 *
 * NOTE(review): cfg.urlstack and cfg.total_cnt are modified here without
 * LOCK_CFG_URLSTACK - presumably the caller holds it; confirm.
 */
void append_url_to_list(url * urlp)
{
  if(!prottable[urlp->type].supported)
  {
    xprintf(1, gettext("unsupported URL type \"%s\"\n"),
      prottable[urlp->type].urlid ?
      prottable[urlp->type].urlid :
      gettext("unknown"));
    return;
  }
  urlp->ref_cnt = 1;
#ifdef WITH_TREE
#ifdef I_FACE
  if(cfg.xi_face)
  {
    /* one tree entry slot per reference; this is the first one */
    urlp->tree_nfo = _malloc(sizeof(GUI_TREE_RTYPE));
    urlp->tree_nfo[0] = gui_tree_make_entry(urlp);
  }
#endif
#endif
  url_add_to_url_hash_tab(urlp);
  cfg.urlstack = dllist_append(cfg.urlstack, (dllist_t) urlp);
  cfg.total_cnt++;
#ifdef HAVE_MT
  mt_semaphore_up(&cfg.urlstack_sem);
#endif
}

/*
 * Merge a whole list of URLs into cfg.urlstack.
 *
 * list  - list to merge in
 * after - node of cfg.urlstack after which the list is inserted;
 *         when NULL the list is concatenated to the end of cfg.urlstack
 */
void append_url_list_to_list(dllist * list, dllist * after)
{
  if(after)
    cfg.urlstack = dllist_insert_list_after(cfg.urlstack, after, list);
  else
    cfg.urlstack = dllist_concat(cfg.urlstack, list);
}

/*
 * Register the first parent of "copy" as an additional parent of "orig".
 *
 * Nothing happens when "copy" has no parent, when that parent is "orig"
 * itself, or when it is already present in orig->parent_url.  Otherwise
 * orig->ref_cnt is incremented, the parent is appended to orig->parent_url
 * and (GUI builds) one more tree entry is created.  If "orig" is flagged
 * URL_MOVED and the end of its moved_to chain is flagged URL_DOWNLOADED,
 * rewrite_one_parent_links() is called with the file name of that final URL.
 */
void link_url_in_list(url * orig, url * copy)
{
  url *cpar;
  /* take the data of the first node of copy's parent list */
  LOCK_URL(copy);
  if(copy->parent_url)
    cpar = (url *) copy->parent_url->data;
  else
    cpar = NULL;
  UNLOCK_URL(copy);
  if(cpar && (orig != cpar))
  {
    dllist *ptr;
    bool_t found = FALSE;
    if(copy->parent_url)
    {
      /* is cpar already listed among the parents of orig? */
      LOCK_URL(orig);
      for(ptr = orig->parent_url; ptr; ptr = ptr->next)
        if((url *)ptr->data == cpar)
          found = TRUE;
      UNLOCK_URL(orig);
    }
    if(!found)
    {
      LOCK_URL(orig);
      orig->ref_cnt++;
      /* cpar is already known to be non-NULL from the enclosing test */
      if(cpar)
        orig->parent_url = dllist_append(orig->parent_url, (dllist_t) cpar);
#ifdef WITH_TREE
#ifdef I_FACE
      if(cfg.xi_face)
      {
        /* grow the tree entry array to ref_cnt slots and fill the new one */
        orig->tree_nfo =
          _realloc(orig->tree_nfo, orig->ref_cnt * sizeof(GUI_TREE_RTYPE));
        orig->tree_nfo[orig->ref_cnt - 1] = gui_tree_make_entry(orig);
      }
#endif
#endif
      UNLOCK_URL(orig);
      /* NOTE(review): the URL_MOVED test is written twice; the second   */
      /* operand was presumably meant to be a different condition (the   */
      /* similar code in url_redirect_to() tests cfg.rewrite_links) -    */
      /* confirm the intent.                                             */
      if(cpar && (orig->status & URL_MOVED) && (orig->status & URL_MOVED))
      {
        url *purl = orig;
        char *fn;
        /* follow the redirection chain to its last URL */
        while(purl->moved_to)
          purl = purl->moved_to;
        if(purl->status & URL_DOWNLOADED)
        {
          fn = url_to_filename(purl, TRUE);
          rewrite_one_parent_links(copy, cpar, fn);
        }
      }
    }
  }
}

/*
 * Record that "src" was redirected to "dst".
 *
 * src    - URL that was redirected; gets URL_MOVED set and moved_to filled
 * dst    - target URL; its anchor is cleared first.  When url_was_befor()
 *          returns an existing URL for it, "dst" is released with
 *          free_deep_url()/_free() and the existing URL is used instead,
 *          so the caller must not touch "dst" afterwards in that case.
 *          Otherwise "dst" is linked to "src", gets ref_cnt 1 and is added
 *          to the URL hash table.
 * is_303 - when zero and "src" is flagged URL_FORM_ACTION, the form info of
 *          "src" is duplicated into "dst" (only if dst has no extension)
 *
 * Returns 0 on success, -1 when "src" is found in the moved_to chain that
 * starts at the already known target (the URL_MOVED flag of "src" is
 * cleared again in that case).
 */
int url_redirect_to(url * src, url * dst, int is_303)
{
  url *pomurl, *pomurl2;
  src->status |= URL_MOVED;
  url_clear_anchor(dst);
  if((pomurl =
    url_was_befor(dst)))
  {
    free_deep_url(dst);
    _free(dst);
    /* refuse the redirect if following moved_to would lead back to src */
    pomurl2 = pomurl;
    while(pomurl2)
    {
      if(src == pomurl2)
      {
        src->status &= ~URL_MOVED;
        return -1;
      }
      pomurl2 = pomurl2->moved_to;
    }
    LOCK_URL(pomurl);
    pomurl->parent_url = dllist_append(pomurl->parent_url, (dllist_t) src);
    pomurl->ref_cnt++;
    src->moved_to = pomurl;
    src->status |= URL_MOVED;
#ifdef WITH_TREE
#ifdef I_FACE
    if(cfg.xi_face)
    {
      /* grow the tree entry array to ref_cnt slots and fill the new one */
      pomurl->tree_nfo = _realloc(pomurl->tree_nfo,
        (pomurl->ref_cnt) * sizeof(GUI_TREE_RTYPE));
      pomurl->tree_nfo[pomurl->ref_cnt - 1] = gui_tree_make_entry(pomurl);
    }
#endif
#endif
    UNLOCK_URL(pomurl);
    if((pomurl->status & URL_MOVED) || (pomurl->status & URL_DOWNLOADED))
    {
      url *purl = pomurl;
      char *fn;
      xprintf(1, gettext("Moved to already processed URL.\n"));
      if(pomurl->status & URL_MOVED)
      {
        /* the target was redirected itself: use the end of its chain */
        while(purl->moved_to)
          purl = purl->moved_to;
        fn = url_to_filename(purl, TRUE);
      }
      else
        fn = url_to_filename(pomurl, TRUE);
      if(cfg.rewrite_links && (purl->status & URL_DOWNLOADED))
        rewrite_parents_links(src, fn);
    }
  }
  else
  {
    dst->parent_url = dllist_append(dst->parent_url, (dllist_t) src);
    src->moved_to = dst;
    src->status |= URL_MOVED;
    if(!is_303 && !dst->extension && (src->status & URL_FORM_ACTION))
      dst->extension = form_info_dup(src->extension);
#ifdef WITH_TREE
#ifdef I_FACE
    if(cfg.xi_face)
    {
      dst->tree_nfo = _malloc(sizeof(GUI_TREE_RTYPE));
      dst->tree_nfo[0] = gui_tree_make_entry(dst);
    }
#endif
#endif
    dst->ref_cnt = 1;
    url_add_to_url_hash_tab(dst);
  }
  return 0;
}

/*
 * Clear the anchor of the URL and insert the URL into cfg.url_hash_tbl
 * while holding the URL hash lock.
 */
void url_add_to_url_hash_tab(url * urlp)
{
  url_clear_anchor(urlp);
  LOCK_CFG_URLHASH;
  dlhash_insert(cfg.url_hash_tbl, (dllist_t) urlp);
  UNLOCK_CFG_URLHASH;
}

/*
 * Take the URL out of cfg.url_hash_tbl while holding the URL hash lock.
 * URLs of unsupported protocol types are ignored.
 */
void url_remove_from_url_hash_tab(url * urlp)
{
  if(!prottable[urlp->type].supported)
    return;
  LOCK_CFG_URLHASH;
  dlhash_exclude(cfg.url_hash_tbl, (dllist_t) urlp);
  UNLOCK_CFG_URLHASH;
}

/*
 * For a URL of a supported protocol type call url_to_filename(urlp, TRUE)
 * and discard its result.
 * NOTE(review): presumably url_to_filename() registers the URL in the file
 * name hash table as a side effect - confirm against its definition.
 */
void url_add_to_file_hash_tab(url * urlp)
{
  if(!prottable[urlp->type].supported)
    return;
  url_to_filename(urlp, TRUE);
}

/*
 * Take the URL out of cfg.fn_hash_tbl while holding the file hash lock.
 * Nothing is done for unsupported protocol types or when the URL has no
 * local_name.
 */
void url_remove_from_file_hash_tab(url * urlp)
{
  if(!prottable[urlp->type].supported)
    return;
  if(urlp->local_name)
  {
    LOCK_CFG_FILEHASH;
    dlhash_exclude_exact(cfg.fn_hash_tbl, (dllist_t) urlp);
    UNLOCK_CFG_FILEHASH;
  }
}

/**********************************************/
/* copy the contents to a new place in memory */
/* (shallow copy: the structure is duplicated */
/* with memcpy, so pointer members of the new */
/* url refer to the same data as the old one) */
/**********************************************/
url *new_url(url * urlo)
{
  url *res = (url *) _malloc(sizeof(url));
  memcpy(res, urlo, sizeof(url));
  return res;
}

/* characters that must not appear literally in an encoded search string */
#define isforbiddenchar(a) ((a) == '\\' || (a) == '/')

/*
 * Return a newly allocated copy of "str" in which every '\' and '/' is
 * replaced by '%' followed by the two digit lower case hex code of the
 * character.  The result comes from _malloc() and has to be released by
 * the caller.
 */
static char *encode_forbiddenchars(const char *str)
{
  int size = 1;                 /* room for the terminating '\0' */
  const char *s;
  char *res, *r;
  /* each encoded character needs two bytes more than the original */
  for(s = str; *s; ++s)
  {
    if(isforbiddenchar(*s))
      size += 2;
  }
  size += (s-str); /* add string length */
  r = res = (char *) _malloc(size);
  for(s = str; *s; ++s)
  {
    if(isforbiddenchar(*s))
    {
      /* no buffer overflow possible here, sprintf is safe: the three */
      /* output characters were accounted for in the first pass       */
      sprintf(r, "%%%02x", *s);
      r += 3;
    }
    else
    {
      *(r++) = *s;
    }
  }
  *r = '\0';
  return res;
}

/*
 * Build the default local name for a URL from its components.
 *
 * urlp      - URL to build the name for
 * add_index - when nonzero, priv_cfg.index_name is appended for names that
 *             denote a directory (see the individual cases below)
 *
 * The name has the form <protocol dirname>/<host>_<port><path...> for HTTP,
 * HTTPS, FTP, FTPS and gopher URLs; for file URLs it is the decoded file
 * name (plus "?" and the decoded search string, if any).
 *
 * Returns a newly allocated string, or NULL for URLT_FROMPARENT and any
 * other type not handled here.
 */
static char *url_get_default_local_name_real(url * urlp, int add_index)
{
  char *pom2 = NULL;
  char pbuf[50];
  char *p;
  /* "_<port>" suffix that follows the host name */
  snprintf(pbuf, sizeof(pbuf), "_%d", url_get_port(urlp));
  switch (urlp->type)
  {
  case URLT_HTTP:
  case URLT_HTTPS:
    p = url_decode_str(urlp->p.http.document, strlen(urlp->p.http.document));
    pom2 = tl_str_concat(pom2,
      prottable[urlp->type].dirname, "/", urlp->p.http.host, pbuf, p, NULL);
    _free(p);
    if(urlp->p.http.searchstr)
    {
      /* search strings may have a / or \ inside, which must be encoded */
      char *sstr = encode_forbiddenchars(urlp->p.http.searchstr);
      pom2 = tl_str_concat(pom2, "?", sstr, NULL);
      _free(sstr);
    }
    if(urlp->status & URL_FORM_ACTION)
    {
      /* append the urlencoded form data: after "#" for POST forms, */
      /* after "?" for all other methods                            */
      form_info *fi = (form_info *) urlp->extension;
      p = form_encode_urlencoded(((form_info *) urlp->extension)->infos);
      if(p)
      {
        pom2 = tl_str_concat(pom2, (fi->method == FORM_M_POST) ? "#" : "?", p,
        NULL);
      }
      _free(p);
    }
    if(tl_is_dirname(pom2) && add_index)
      pom2 = tl_str_append(pom2, priv_cfg.index_name);
    break;
  case URLT_FILE:
    pom2 =
      url_decode_str(urlp->p.file.filename, strlen(urlp->p.file.filename));
    if(urlp->p.file.searchstr)
    {
      p = url_decode_str(urlp->p.file.searchstr,
        strlen(urlp->p.file.searchstr));
      pom2 = tl_str_concat(pom2, "?", p, NULL);
      /* NOTE(review): plain free() here, _free() everywhere else in */
      /* this function - confirm that the difference is intended     */
      free(p);
    }
    break;
  case URLT_FTP:
  case URLT_FTPS:
    /* NOTE(review): when ftp.dir is not set a NULL is passed in the   */
    /* middle of the argument list; if tl_str_concat() stops at the    */
    /* first NULL (as the trailing NULL of every call suggests), the   */
    /* index name is only appended for directories - confirm           */
    pom2 = tl_str_concat(pom2, prottable[urlp->type].dirname, "/",
      urlp->p.ftp.host, pbuf, "/", urlp->p.ftp.path,
      urlp->p.ftp.dir ? "/" : NULL,
      add_index ? priv_cfg.index_name : NULL, NULL);
    break;
  case URLT_GOPHER:
    /* the index name is added only for selectors starting with '1' */
    pom2 = tl_str_concat(pom2, prottable[URLT_GOPHER].dirname, "/",
      urlp->p.gopher.host, pbuf, urlp->p.gopher.selector,
      (urlp->p.gopher.selector[0] == '1' && add_index)
      ? priv_cfg.index_name : NULL, NULL);
    break;
  case URLT_FROMPARENT:
  default:
    return NULL;
  }
  return pom2;
}

/*
 * Default local name of a URL with the index name added for directories;
 * see url_get_default_local_name_real() for the result and its ownership.
 */
char *url_get_default_local_name(url * urlp)
{
  return url_get_default_local_name_real(urlp, TRUE);
}

/*
 * Turn a default local name into a name below priv_cfg.cache_dir.
 *
 * urlp       - URL the name belongs to
 * local_name - default local name, used when no rule from priv_cfg.lfnames
 *              matches; it is not freed here
 * mime_type  - passed through to lfname_get_by_url()
 * isdinfo    - set to TRUE when a rule from priv_cfg.lfnames matched the
 *              URL string; left untouched otherwise
 *
 * The chosen name is passed through tr(), prefixed with the cache directory
 * (inserting "/" unless the name already starts with one) and completed
 * with priv_cfg.index_name when it denotes a directory.
 *
 * Returns the newly built string.
 */
static char *url_get_local_name_tr(url * urlp, char *local_name,
  const char *mime_type, int *isdinfo)
{
  dllist *pl = priv_cfg.lfnames;
  char *ustr = url_to_urlstr(urlp, FALSE);
  char *trs, *lfstr = NULL;
  char *pom2 = local_name;
  char *rv = NULL;
  /* the first matching rule wins */
  while(pl)
  {
    if(lfname_match((lfname *) pl->data, ustr))
    {
      lfstr = lfname_get_by_url(urlp, ustr, mime_type, (lfname *) pl->data);
      pom2 = lfstr;
      *isdinfo = TRUE;
      break;
    }
    pl = pl->next;
  }
  _free(ustr);
  trs = tr(pom2);
  if(tl_is_dirname(trs))
    rv = tl_str_concat(NULL, priv_cfg.cache_dir,
      (*trs == '/' ? "" : "/"), trs, priv_cfg.index_name, NULL);
  else
    rv = tl_str_concat(NULL, priv_cfg.cache_dir,
      (*trs == '/' ?
      "" : "/"), trs, NULL);
  _free(trs);
  _free(lfstr);
  return rv;
}

/**********************************************/
char *url_get_local_name_real(url * urlp, const char *mime_type, int adj)
{
  char *pom = NULL;
  char *pom2 = NULL;
  char *p1, *p2;
  char *p;
  int isdinfo = FALSE;
  struct stat estat;
  if((urlp->status & URL_ISFIRST) &&
    priv_cfg.store_name /* && cfg.mode == MODE_SINGLE */ )
  {
    return get_abs_file_path_oss(priv_cfg.store_name);
  }
  pom = url_get_default_local_name(urlp);
  if(urlp->type != URLT_FILE)
  {
    pom2 = url_get_local_name_tr(urlp, pom, mime_type, &isdinfo);
    _free(pom);
    pom = pom2;
  }
#ifdef FS_UNSAFE_CHARACTERS
  /* This is for automatic handling of Windows  */
  /* filesystem unsafe characters - \:*?"<>|  */
  if(urlp->type != URLT_FILE
    && strlen(pom) != strcspn(pom, FS_UNSAFE_CHARACTERS))
  {
    /* unsafe characters are replaced by '_', unless '_' itself is */
    /* listed as unsafe - then they are deleted                    */
    if(strchr(FS_UNSAFE_CHARACTERS, '_'))
      p = tr_del_chr(FS_UNSAFE_CHARACTERS, pom);
    else
      p = tr_chr_chr(FS_UNSAFE_CHARACTERS, "_", pom);
    _free(pom);
    pom = p;
  }
#endif
  /* adjusting of filename size if required  */
  if(urlp->type != URLT_FILE && tl_filename_needs_adjust(pom))
  {
    p = tl_adjust_filename(pom);
    _free(pom);
    pom = p;
  }
  /* the name already exists as a directory: put the index file into it */
  if(!lstat(pom, &estat) && S_ISDIR(estat.st_mode) && adj)
  {
    pom = tl_str_concat(pom, "/", priv_cfg.index_name, NULL);
  }
  if((urlp->type != URLT_FILE) && cfg.base_level && !isdinfo)
  {
    p = get_abs_file_path_oss(pom);
    _free(pom);
    pom = p;
    /* p1: first character behind the cache directory prefix */
    p1 = pom + strlen(priv_cfg.cache_dir) +
      (tl_is_dirname(priv_cfg.cache_dir) == 0);
    /* p2: position found by strfindnchr() for '/' and cfg.base_level, */
    /* or, failing that, the character after the last '/'              */
    if(!(p2 = strfindnchr(p1, '/', cfg.base_level)))
    {
      if((p2 = strrchr(pom, '/')))
        p2++;
    }
    /* drop everything between p1 and p2 (regions overlap -> memmove) */
    if(p2)
      memmove(p1, p2, strlen(p2) + 1);
  }
  /* this is here to ensure that we */
  /* don't have directory as filename :-) */
  if(tl_is_dirname(pom))
    pom = tl_str_append(pom, priv_cfg.index_name);
  p = get_abs_file_path_oss(pom);
  _free(pom);
  /* In mode MIRROR we want to use exactly the same filenames as the
     remote server. Therefore we have to unquote our filename. */
  if(cfg.mode == MODE_MIRROR)
  {
    /* now we unquote the string in place: s reads, t writes */
    /* NOTE(review): isxdigit() is called with plain char values and */
    /* "%2x" is given an int pointer (it expects unsigned int) -     */
    /* consider an unsigned char cast and an unsigned variable       */
    char *s = p;
    char *t = p;
    int hex;
    while(*s != 0)
    {
      if(s[0] == '%' && isxdigit(s[1]) && isxdigit(s[2]))
      {
        sscanf(s + 1, "%2x", &hex);
        *t++ = hex;
        s += 3;
        continue;
      }
      *t++ = *s++;
    }
    *t = 0;
  }

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -