⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 url.c

📁 网络爬虫程序
💻 C
📖 第 1 页 / 共 5 页
字号:
int url_get_auth_scheme(url * urlr, char *realm){  authinfo *ai;  int scheme = cfg.auth_scheme;  ai = authinfo_match_entry(urlr->type, url_get_site(urlr),    url_get_port(urlr), url_get_path(urlr), realm);  if(ai)    scheme = ai->type;  return scheme;}char *url_get_anchor_name(url * urlp){  char *anchor;  switch (urlp->type)  {  case URLT_HTTP:  case URLT_HTTPS:    anchor = urlp->p.http.anchor_name;    break;  case URLT_FTP:  case URLT_FTPS:    anchor = urlp->p.ftp.anchor_name;    break;  case URLT_FILE:    anchor = urlp->p.file.anchor_name;    break;  default:    anchor = NULL;    break;  }  return anchor;}void url_clear_anchor(url * urlp){  switch (urlp->type)  {  case URLT_HTTP:  case URLT_HTTPS:    _free(urlp->p.http.anchor_name);    break;  case URLT_FTP:  case URLT_FTPS:    _free(urlp->p.ftp.anchor_name);    break;  case URLT_FILE:    _free(urlp->p.file.anchor_name);    break;  default:    break;  }}char *url_get_search_str(url * urlp){  char *sstr;  switch (urlp->type)  {  case URLT_HTTP:  case URLT_HTTPS:    sstr = urlp->p.http.searchstr;    break;  case URLT_FILE:    sstr = urlp->p.file.searchstr;    break;  default:    sstr = NULL;    break;  }  return sstr;}int url_is_dir_index(url * urlp){  return ((urlp->type == URLT_HTTP || urlp->type == URLT_HTTPS) &&    tl_is_dirname(urlp->p.http.document)) ||    ((urlp->type == URLT_FTP || urlp->type == URLT_FTPS) && urlp->p.ftp.dir);}/* Check if URL is on same site. Be careful not to disallow   protocol changes like HTTP to HTTPS. 
*/int url_is_same_site(url * urla, url * urlb){  return  /* (urla->type == urlb->type) &&     (url_get_port(urla) == url_get_port(urlb)) && */   !strcmp(url_get_site(urla), url_get_site(urlb));}/**************************************************//* FIXME: Translate me                            *//* absolutna cesta k dokumentu z lokalneho stromu *//* ktory je referencovany relativne               *//**************************************************/char *get_redirect_abs_path(url * rurl, char *fstr){  char *pom, *p, *p1;  pom = tl_strdup(url_to_filename(rurl, TRUE));  p = strrchr(pom, '/');  p1 = realloc(pom, strlen(fstr) + (p - pom) + 2);  strcpy(p1 + (p - pom) + 1, fstr);  p = get_abs_file_path_oss(p1);  free(p1);  return p;}void url_path_abs(url * urlp){  char *p;  switch (urlp->type)  {  case URLT_HTTP:  case URLT_HTTPS:    p = get_abs_file_path(urlp->p.http.document);    free(urlp->p.http.document);    urlp->p.http.document = p;    break;  case URLT_FTP:  case URLT_FTPS:    p = get_abs_file_path(urlp->p.ftp.path);    if(urlp->p.ftp.path[0] == '/' && urlp->p.ftp.path[1] == '/')    {      char *pp = tl_str_concat(NULL, "/", p, NULL);      _free(p);      p = pp;    }    free(urlp->p.ftp.path);    urlp->p.ftp.path = p;    break;  case URLT_FILE:    p = get_abs_file_path(urlp->p.file.filename);    free(urlp->p.file.filename);    urlp->p.file.filename = p;    break;  default:    break;  }}url *filename_to_url(char *ifn){  int cdln = strlen(priv_cfg.cache_dir);  bool_t isok = FALSE;  if(*ifn != '/')    return NULL;  if(cfg.enable_info)  {    url *nurl = dinfo_get_url_for_filename(ifn);    if(nurl)      return nurl;  }  if(!strncmp(ifn, priv_cfg.cache_dir, cdln))  {    char *p;    int i;    url *nurl = _malloc(sizeof(url));    char *fn = tl_strdup(ifn);    p = fn + cdln;    p += (*p == '/');    if(!strcasecmp(tl_get_extension(fn), "css"))      nurl->status = URL_STYLE;    else      nurl->status = 0;    nurl->level = 0;    nurl->parent_url = NULL;    nurl->moved_to = 
NULL;    nurl->extension = NULL;    nurl->local_name = tl_is_dirname(ifn) ?      tl_str_concat(NULL, ifn, priv_cfg.index_name, NULL) : tl_strdup(ifn);#ifdef HAVE_MT    pthread_mutex_init(&nurl->lock, NULL);#endif#ifdef WITH_TREE#ifdef I_FACE    nurl->prop = NULL;    nurl->tree_nfo = NULL;#endif#endif    if(cfg.base_level && cfg.default_prefix)    {      char *tfn, *pfn;      url *purl = url_parse(priv_cfg.default_prefix);      assert(purl->type != URLT_FROMPARENT);      pfn = url_get_default_local_name_real(purl, FALSE);      tfn = tl_str_concat(NULL, priv_cfg.cache_dir,        tl_is_dirname(priv_cfg.cache_dir) ? "" : "/",        pfn, tl_is_dirname(pfn) ? "" : "/", p, NULL);      _free(pfn);      _free(fn);      fn = tfn;      p = fn + cdln;      p += (*p == '/');      free_deep_url(purl);    }    for(i = 0; i < NUM_ELEM(prottable); i++)    {      if(prottable[i].dirname &&        !strncmp(p, prottable[i].dirname,          strlen(prottable[i].dirname)) &&        p[strlen(prottable[i].dirname)] == '/')      {        isok = TRUE;        break;      }    }    if(isok)    {      char *p2, *p3;      nurl->type = prottable[i].id;      nurl->parent_url = NULL;      p += strlen(prottable[i].dirname) + 1;      if(!p)      {        free(nurl);        free(fn);        return NULL;      }      switch (nurl->type)      {      case URLT_HTTP:      case URLT_HTTPS:        nurl->p.http.password = NULL;        nurl->p.http.user = NULL;        nurl->p.http.anchor_name = NULL;        nurl->p.http.searchstr = NULL;        nurl->p.http.port = prottable[i].default_port;        if((p2 = strchr(p, '/')))        {          int p2_len = strlen(p2);          int idx_len = strlen(priv_cfg.index_name);          char *query = NULL;          if(idx_len <= p2_len &&            !strcmp((p2 + p2_len - idx_len), priv_cfg.index_name) &&            ((p2_len > idx_len && *(p2 + p2_len - idx_len - 1) == '/')              || idx_len == p2_len))          {            *(p2 + p2_len - idx_len) = '\0';       
   }          /* for POST #query */          p3 = strchr(p2, '#');          if(p3)          {            form_info *fi;            *p3 = '\0';            query = p3 + 1;            fi = _malloc(sizeof(form_info));            fi->method = FORM_M_POST;            fi->encoding = FORM_E_URLENCODED;            fi->action = NULL;            fi->text = NULL;            fi->infos = form_parse_urlencoded_query(query);            fi->parent_url = NULL;            nurl->extension = fi;            nurl->status |= URL_FORM_ACTION;          }          /* for query part of GET request URL */          p3 = strchr(p2, '?');          if(p3)          {            *p3 = '\0';            nurl->p.http.searchstr = tl_strdup(p3 + 1);          }          nurl->p.http.document = tl_strdup(p2);          *p2 = '\0';          p2 = strrchr(p, '_');          if(p2)          {            p2++;            nurl->p.http.port = _atoi(p2);            if(errno == ERANGE)            {              nurl->p.http.host = tl_strdup(p);              nurl->p.http.port = prottable[i].default_port;            }            else            {              nurl->p.http.host = tl_strndup(p, p2 - p - 1);            }          }          else            nurl->p.http.host = tl_strdup(p);        }        else        {          free(nurl);          free(fn);          return NULL;        }        break;      case URLT_GOPHER:        nurl->p.gopher.port = prottable[i].default_port;        if((p2 = strchr(p, '/')))        {          int p2_len = strlen(p2);          int idx_len = strlen(priv_cfg.index_name);          p2++;          if(idx_len <= p2_len &&            !strcmp((p2 + p2_len - idx_len), priv_cfg.index_name) &&            ((p2_len > idx_len && *(p2 + p2_len - idx_len - 1) == '1')              || idx_len == p2_len))          {            *(p2 + p2_len - idx_len) = '\0';          }          nurl->p.gopher.selector = tl_strdup(p2);          *p2 = '\0';          p2 = strrchr(p, '_');          if(p2)          {         
   p2++;            nurl->p.gopher.port = _atoi(p2);            if(errno == ERANGE)            {              nurl->p.gopher.host = tl_strdup(p);              nurl->p.gopher.port = prottable[i].default_port;            }            else            {              nurl->p.gopher.host = tl_strndup(p, p2 - p - 1);            }          }          else            nurl->p.gopher.host = tl_strdup(p);        }        else        {          free(nurl);          free(fn);          return NULL;        }        break;      case URLT_FTP:      case URLT_FTPS:        nurl->p.ftp.port = prottable[i].default_port;        nurl->p.ftp.password = NULL;        nurl->p.ftp.user = NULL;        nurl->p.ftp.dir = FALSE;        nurl->p.ftp.anchor_name = NULL;        if((p2 = strchr(p, '/')))        {          int p2_len = strlen(p2);          int idx_len = strlen(priv_cfg.index_name);          if(idx_len <= p2_len &&            !strcmp((p2 + p2_len - idx_len), priv_cfg.index_name) &&            ((p2_len > idx_len && *(p2 + p2_len - idx_len - 1) == '/')              || idx_len == p2_len))          {            *(p2 + p2_len - idx_len) = '\0';            nurl->p.ftp.dir = TRUE;          }          nurl->p.ftp.path = tl_strdup(p2);          *p2 = '\0';          p2 = strrchr(p, '_');          if(p2)          {            p2++;            nurl->p.ftp.port = _atoi(p2);            if(errno == ERANGE)            {              nurl->p.ftp.host = tl_strdup(p);              nurl->p.ftp.port = prottable[i].default_port;            }            else            {              nurl->p.ftp.host = tl_strndup(p, p2 - p - 1);            }          }          else            nurl->p.ftp.host = tl_strdup(p);        }        else        {          free(nurl);          free(fn);          return NULL;        }        break;      default:        free(nurl);        nurl = NULL;        break;      }      free(fn);      return nurl;    }    free(nurl);  }  return NULL;}/****************************************//* 
 Find out whether the document was    */
/* already referenced in previous       */
/* cycles.                              */
/****************************************/
/*
 * Returns the entry found for urlp in cfg.url_hash_tbl, or NULL when
 * there is none or when prottable marks the URL's type as not
 * supported.  The lookup is bracketed by LOCK_CFG_URLHASH /
 * UNLOCK_CFG_URLHASH.
 */
url *url_was_befor(url * urlp)
{
  url *ret;
  if(!prottable[urlp->type].supported)
    return NULL;
  LOCK_CFG_URLHASH;
  ret = (url *) dlhash_find(cfg.url_hash_tbl, (dllist_t) urlp);
  UNLOCK_CFG_URLHASH;
  return ret;
}

/*
 * Drop the local file name associated with the URL: remove it via
 * dinfo_remove() when both cfg.enable_info and cfg.post_update are
 * set, take the URL out of the file hash table and release
 * urlp->local_name through _free().
 */
void url_forget_filename(url * urlp)
{
  if(cfg.enable_info && cfg.post_update)
    dinfo_remove(urlp->local_name);
  url_remove_from_file_hash_tab(urlp);
  _free(urlp->local_name);
}

/*
 * Adapter with dllist_t arguments: casts both keys to url pointers
 * and forwards to url_compare().
 */
int dllist_url_compare(dllist_t key1, dllist_t key2)
{
  return url_compare((url *) key1, (url *) key2);
}

/*
 * Compare two URLs field by field.  In the part visible here, any
 * mismatch yields 0/FALSE and a full HTTP(S) or FTP(S) match yields
 * nonzero.
 */
int url_compare(url * u1, url * u2)
{
  int rv;
  if(u1->type != u2->type)
    return 0;
  switch (u1->type)
  {
  case URLT_HTTP:
  case URLT_HTTPS:
    /* rv is nonzero inside each of these ifs, so !rv evaluates to 0 */
    if((rv = strcmp(u1->p.http.document, u2->p.http.document)))
      return !rv;
    /* optional strings: compare contents when both are set, otherwise
       fall back to the pointer difference */
    if(u1->p.http.searchstr && u2->p.http.searchstr)
      rv = strcmp(u1->p.http.searchstr, u2->p.http.searchstr);
    else
      rv = u1->p.http.searchstr - u2->p.http.searchstr;
    if(rv)
      return !rv;
    if(u1->p.http.user && u2->p.http.user)
      rv = strcmp(u1->p.http.user, u2->p.http.user);
    else
      rv = u1->p.http.user - u2->p.http.user;
    if(rv)
      return !rv;
    if(u1->p.http.password && u2->p.http.password)
      rv = strcmp(u1->p.http.password, u2->p.http.password);
    else
      rv = u1->p.http.password - u2->p.http.password;
    if(rv)
      return !rv;
    if((rv = strcmp(u1->p.http.host, u2->p.http.host)))
      return !rv;
    if(u1->p.http.port != u2->p.http.port)
      return FALSE;
    /* both URLs must agree on whether they are form actions */
    if((u1->status & URL_FORM_ACTION) != (u2->status & URL_FORM_ACTION))
      return FALSE;
    if((u1->status & URL_FORM_ACTION) && (u2->status & URL_FORM_ACTION))
    {
      dllist *ptr;
      form_info *fi1 = (form_info *) u1->extension;
      form_info *fi2 = (form_info *) u2->extension;
      if(fi1->method != fi2->method)
        return FALSE;
      if(fi1->encoding != fi2->encoding)
        return FALSE;
ptr = fi1->infos;      while(ptr)      {        if(!dllist_find2(fi2->infos, ptr->data, form_field_compare))          return FALSE;        ptr = ptr->next;      }    }    return TRUE;    break;  case URLT_FTP:  case URLT_FTPS:    if((rv = strcmp(u1->p.ftp.path, u2->p.ftp.path)))      return !rv;    if(u1->p.ftp.user && u2->p.ftp.user)      rv = strcmp(u1->p.ftp.user, u2->p.ftp.user);    else      rv = u1->p.ftp.user - u2->p.ftp.user;    if(rv)      return !rv;    if(u1->p.ftp.password && u2->p.ftp.password)      rv = strcmp(u1->p.ftp.password, u2->p.ftp.password);    else      rv = u1->p.ftp.password - u2->p.ftp.password;    if(rv)      return !rv;    if((rv = strcmp(u1->p.ftp.host, u2->p.ftp.host)))      return !rv;    return u1->p.ftp.port == u2->p.ftp.port;    break;  case URLT_GOPHER:    if((rv = strcmp(u1->p.gopher.selector, u2->p.gopher.s

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -