📄 url.c

📁 网络爬虫程序
💻 C
📖 第 1 页 / 共 5 页
字号:
12 3 4 5 下一页
/***************************************************************************/
/*    This code is part of WWW grabber called pavuk                        */
/*    Copyright (c) 1997 - 2001 Stefan Ondrejicka                          */
/*    Distributed under GPL 2 or later                                     */
/***************************************************************************/

#include "config.h"

#include <assert.h>
#include <ctype.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <limits.h>
#include <errno.h>

#include "gui.h"
#include "http.h"
#include "ftp.h"
#include "gopher.h"
#include "url.h"
#include "html.h"
#include "tools.h"
#include "authinfo.h"
#include "tr.h"
#include "dinfo.h"
#include "form.h"
#include "gui_api.h"
#include "lfname.h"

static char *url_decode_html(const char *, int);

/* Here you can specify characters */
/* which are unsafe in file names. */
#ifdef __CYGWIN__
#define FS_UNSAFE_CHARACTERS "\\:*?\"<>|"
#endif

/* for hexadecimal encoding */
static const char hexa[] = "0123456789ABCDEF";

/*
 * Value (0-15) of a single hexadecimal digit character.
 * NOTE: evaluates its argument more than once - do not pass an
 * expression with side effects.
 */
#define HEXASC2HEXNR(x) (((x) >= '0' && (x) <= '9') ? \
  ((x) - '0') : (tl_ascii_toupper(x) - 'A' + 10))

/*
 * Combine the two hex digits at x[1] and x[2] into one value; x[0] is
 * skipped. The argument and the whole expansion are parenthesized so
 * the macro can be used safely inside larger expressions.
 */
#define HEX2CHAR(x) \
  ((HEXASC2HEXNR(*((x) + 1)) << 4) + HEXASC2HEXNR(*((x) + 2)))

/*
 * Table of known protocols. Other code in this file indexes it by
 * protocol id (prottable[type].default_port), so the order of the
 * entries has to follow the numeric order of the URLT_* ids.
 * The second field (urlid) is the scheme name matched by
 * url_scheme_to_schemeid(); entries with a NULL urlid are never
 * matched there. The last field differs only with USE_SSL -
 * presumably a "supported" flag; confirm against protinfo in url.h.
 */
const protinfo prottable[] = {
  {URLT_UNKNOWN, NULL, "unknown", NULL, 0, FALSE},
  {URLT_HTTP, "http", "http", "http://", 80, TRUE},
#ifdef USE_SSL
  {URLT_HTTPS, "https", "https", "https://", 443, TRUE},
#else
  {URLT_HTTPS, "https", "https", "https://", 443, FALSE},
#endif
  {URLT_FTP, "ftp", "ftp", "ftp://", 21, TRUE},
#ifdef USE_SSL
  {URLT_FTPS, "ftps", "ftps", "ftps://", 21, TRUE},
#else
  {URLT_FTPS, "ftps", "ftps", "ftps://", 21, FALSE},
#endif
  {URLT_FILE, NULL, "file", "file://", 0, TRUE},
  {URLT_GOPHER, "gopher", "gopher", "gopher://", 70, TRUE},
  {URLT_FROMPARENT, NULL, "//", "//", 80, TRUE}
};

#define _STRCLS_LOWER "abcdefghijklmnopqrstuvwxyz"
#define _STRCLS_UPER "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
#define _STRCLS_DIGIT "0123456789"

/*
 * Extract the scheme part of an URL string.
 *
 * When urlstr starts with a letter and everything up to the first ':'
 * consists only of letters, digits, '+', '-' and '.', that prefix is
 * duplicated, passed through lowerstr() and returned.
 * When urlstr contains no ':' or does not start with a letter, but
 * starts with "//", a copy of "//" is returned.
 * In every other case NULL is returned.
 *
 * The returned string is allocated; the caller releases it.
 */
char *url_parse_scheme(char *urlstr)
{
  char *p;
  char *retv = NULL;

  if((p = strchr(urlstr, ':')) && tl_ascii_isalpha(*urlstr))
  {
    int l1 = strspn(urlstr, _STRCLS_LOWER _STRCLS_UPER _STRCLS_DIGIT "+-.");

    /* valid only if the run of scheme characters ends exactly at ':' */
    if(l1 == (p - urlstr))
    {
      retv = tl_strndup(urlstr, l1);
      lowerstr(retv);
    }
  }
  else
  {
    if(urlstr[0] == '/' && urlstr[1] == '/')
      retv = strdup("//");
  }

  return retv;
}

/*
 * Extract the authority part from the text following the scheme.
 *
 * urlschpart has to start with "//" for an authority to be present;
 * the authority then runs up to the first of '/', '?', '#', ';' or
 * the end of the string. Returns an allocated copy (possibly the
 * empty string), or NULL when urlschpart does not start with "//".
 */
static char *url_parse_authority(char *urlschpart)
{
  char *retv = NULL;

  if(urlschpart[0] == '/' && urlschpart[1] == '/')
  {
    int l1 = strcspn(urlschpart + 2, "/?#;");

    retv = tl_strndup(urlschpart + 2, l1);
  }

  return retv;
}

/*
 * Split an authority string of the form [user[:password]@]host[:port].
 *
 * user, password - optional outputs; when user is NULL the part before
 *                  '@' is not split off and the whole string is treated
 *                  as host[:port]
 * host           - receives an allocated copy of the host, passed
 *                  through lowerstr()
 * port           - receives _atoi() of the text after the last ':' of
 *                  the host part, or 0 when there is no ':'
 *
 * All output strings are allocated copies. Always returns 0.
 */
static int url_split_authority(char *authority, char **user, char **password,
  char **host, unsigned short *port)
{
  char *p, *p2;

  if(user)
    *user = NULL;
  if(password)
    *password = NULL;
  *host = NULL;
  *port = 0;

  if(user && (p = strrchr(authority, '@')))
  {
    p2 = strchr(authority, ':');
    if(p2 && p2 < p)
    {
      *user = tl_strndup(authority, p2 - authority);
      /* password is an optional output just like user */
      if(password)
        *password = tl_strndup(p2 + 1, p - p2 - 1);
    }
    else
    {
      *user = tl_strndup(authority, p - authority);
    }
    p++;                        /* host part starts after '@' */
  }
  else
    p = authority;

  if((p2 = strrchr(p, ':')))
  {
    *host = tl_strndup(p, p2 - p);
    *port = _atoi(p2 + 1);
  }
  else
  {
    *host = tl_strdup(p);
  }

  lowerstr(*host);

  return 0;
}

/*
 * Helper for url_split_path(): a path starting with '/' is run through
 * _strtrchr(path, '\\', '/') and get_abs_file_path(); the result of
 * get_abs_file_path() is returned and the input string is freed.
 * Any other path is returned unchanged.
 */
static char *url_normalize_abs_path(char *path)
{
  char *abs_path;

  if(*path != '/')
    return path;

  abs_path = get_abs_file_path(_strtrchr(path, '\\', '/'));
  free(path);

  return abs_path;
}

/*
 * Split the part of an URL following the authority into path, query
 * string and anchor.
 *
 * path   - receives an allocated copy of the text before the first
 *          '?' / '#' that is being looked for, or NULL when that text
 *          is empty
 * query  - optional; when NULL, '?' is not treated as a separator.
 *          Otherwise receives the text after '?' processed by
 *          url_decode_html(), or NULL when there is no '?'
 * anchor - optional; when NULL, '#' is not treated as a separator.
 *          Otherwise receives a copy of the text after '#', or NULL
 *          when there is no '#'
 *
 * Both orders ("?query#anchor" and "#anchor?query") are handled.
 * Always returns 0.
 */
static int url_split_path(char *urlpath, char **path, char **query,
  char **anchor)
{
  char *cut = NULL;             /* where the path part ends */
  char *hash = NULL;
  char *qmark = NULL;

  *path = NULL;
  if(query)
    *query = NULL;
  if(anchor)
    *anchor = NULL;

  if(anchor)
    hash = strchr(urlpath, '#');
  if(query)
    qmark = strchr(urlpath, '?');

  if(hash && qmark)
  {
    if(hash > qmark)
    {
      /* path?query#anchor */
      *anchor = tl_strdup(hash + 1);
      *query = url_decode_html(qmark + 1, hash - (qmark + 1));
      cut = qmark;
    }
    else
    {
      /* path#anchor?query */
      *query = url_decode_html(qmark + 1, strlen(qmark + 1));
      *anchor = tl_strndup(hash + 1, qmark - (hash + 1));
      cut = hash;
    }
  }
  else if(hash)
  {
    *anchor = tl_strdup(hash + 1);
    cut = hash;
  }
  else if(qmark)
  {
    *query = url_decode_html(qmark + 1, strlen(qmark + 1));
    cut = qmark;
  }

  if(cut)
  {
    if(cut - urlpath)
      *path = tl_strndup(urlpath, cut - urlpath);
  }
  else if(*urlpath)
  {
    *path = tl_strdup(urlpath);
  }

  if(*path)
    *path = url_normalize_abs_path(*path);

  return 0;
}

/*
 * Map a scheme name to its protocol id by comparing it (case
 * sensitively) with the urlid of every prottable entry that has one.
 * Returns URLT_UNKNOWN when nothing matches.
 */
protocol url_scheme_to_schemeid(char *scheme)
{
  int i;

  for(i = 0; i < NUM_ELEM(prottable); i++)
  {
    if(prottable[i].urlid && !strcmp(prottable[i].urlid, scheme))
    {
      return prottable[i].id;
    }
  }

  return URLT_UNKNOWN;
}

/*
 * If a path is relative and starts // we need to get the type from
 * the parent, which only the caller can do. This function is called
 * by the caller of url_parse when url_parse has returned  type = URLT_FROMPARENT
 * and the parent can figure out the path. It basically does all the work
 * that url_parse would do once it knew the scheme.
* however, we start with the urlstr in url->p.unsup.urlstr rather * than as an argument */static void url_finishpath(url * url){  char *authority = NULL;  char *p;  if(url->type == URLT_FROMPARENT)    url->type = URLT_UNKNOWN;  if(url->type == URLT_UNKNOWN)    return;                     /* can't help here */  p = url->p.unsup.urlstr;  authority = url_parse_authority(p);  if(authority)    p += strlen(authority) + 2;  if(authority && *authority)  {    switch (url->type)    {    case URLT_FROMPARENT:      break;    case URLT_HTTP:    case URLT_HTTPS:      url_split_authority(authority,        &(url->p.http.user),        &(url->p.http.password), &(url->p.http.host), &(url->p.http.port));      if(!url->p.http.port)        url->p.http.port = prottable[url->type].default_port;      url_split_path(p,        &(url->p.http.document),        &(url->p.http.searchstr), &(url->p.http.anchor_name));      if(!url->p.http.document)        url->p.http.document = tl_strdup("/");      break;    case URLT_FTP:    case URLT_FTPS:      url_split_authority(authority,        &(url->p.ftp.user),        &(url->p.ftp.password), &(url->p.ftp.host), &(url->p.ftp.port));      if(!url->p.ftp.port)        url->p.ftp.port = prottable[url->type].default_port;      url_split_path(p, &url->p.ftp.path, NULL, &url->p.ftp.anchor_name);      if(!url->p.ftp.path)        url->p.ftp.path = tl_strdup("/");      if(p && p[0] == '/' && p[1] == '/')      {        char *pp = tl_str_concat(NULL, "/", url->p.ftp.path, NULL);        _free(url->p.ftp.path);        url->p.ftp.path = pp;      }      if((p = strrchr(url->p.ftp.path, ';')) && !strncasecmp(p, ";type=", 6))        *p = '\0';      url->p.ftp.dir = tl_is_dirname(url->p.ftp.path) != 0;      break;    case URLT_GOPHER:      url_split_authority(authority,        NULL, NULL, &(url->p.gopher.host), &(url->p.gopher.port));      if(!url->p.gopher.port)        url->p.gopher.port = prottable[url->type].default_port;      if(*(p + 1))        url->p.gopher.selector = 
tl_strdup(p + 1);      else        url->p.gopher.selector = tl_strdup("1");      break;    case URLT_FILE:      url_split_path(p,        &(url->p.file.filename),        &(url->p.file.searchstr), &(url->p.file.anchor_name));      if(!url->p.file.filename)        url->p.file.filename = tl_strdup("");      break;    default:      return;    }  }  if(!authority || !*authority)  {    switch (url->type)    {    case URLT_FILE:    case URLT_FTP:    case URLT_FTPS:    case URLT_HTTP:    case URLT_HTTPS:      url->type = URLT_FILE;      url_split_path(p,        &(url->p.file.filename),        &(url->p.file.searchstr), &(url->p.file.anchor_name));      if(!url->p.file.filename)        url->p.file.filename = tl_strdup("");      break;    default:      url->type = URLT_UNKNOWN;      return;      break;    }  }  _free(authority);  return;}url *url_parse(char *urlstr){  char *scheme = NULL;  char *authority = NULL;  char *p;  url ret_url;  ret_url.type = URLT_UNKNOWN;  ret_url.status = 0;  ret_url.parent_url = NULL;  ret_url.moved_to = NULL;  ret_url.ref_cnt = 1;  ret_url.level = 0;  ret_url.extension = NULL;  ret_url.local_name = NULL;#ifdef WITH_TREE#ifdef I_FACE  ret_url.prop = NULL;  ret_url.tree_nfo = NULL;#endif#endif#ifdef HAVE_MT  pthread_mutex_init(&ret_url.lock, NULL);#endif  p = urlstr;  if(p)    scheme = url_parse_scheme(urlstr);  if(scheme)  {    ret_url.type = url_scheme_to_schemeid(scheme);    /* If the string starts with // then we */    /* don't know the scheme type so we have */    /* to wait for the parent to set it. 
*/    if(ret_url.type == URLT_FROMPARENT)    {      ret_url.p.unsup.urlstr = tl_strdup(urlstr);      authority = url_parse_authority(urlstr);    }    else    {      /* We do know the scheme type, so move past it */      /* and get the 'authority' */      p += strlen(scheme) + 1;      authority = url_parse_authority(p);    }    if(authority)      p += strlen(authority) + 2;    if(authority && *authority)    {      switch (ret_url.type)      {      case URLT_FROMPARENT:        break;      case URLT_HTTP:      case URLT_HTTPS:        url_split_authority(authority,          &ret_url.p.http.user,          &ret_url.p.http.password,          &ret_url.p.http.host, &ret_url.p.http.port);        if(!ret_url.p.http.port)          ret_url.p.http.port = prottable[ret_url.type].default_port;        url_split_path(p,          &ret_url.p.http.document,          &ret_url.p.http.searchstr, &ret_url.p.http.anchor_name);        if(!ret_url.p.http.document)          ret_url.p.http.document = tl_strdup("/");        break;      case URLT_FTP:      case URLT_FTPS:        url_split_authority(authority,          &ret_url.p.ftp.user,          &ret_url.p.ftp.password, &ret_url.p.ftp.host, &ret_url.p.ftp.port);        if(!ret_url.p.ftp.port)          ret_url.p.ftp.port = prottable[ret_url.type].default_port;        url_split_path(p,          &ret_url.p.ftp.path, NULL, &ret_url.p.ftp.anchor_name);        if(!ret_url.p.ftp.path)          ret_url.p.ftp.path = tl_strdup("/");        if(p && p[0] == '/' && p[1] == '/')        {          char *pp = tl_str_concat(NULL, "/", ret_url.p.ftp.path, NULL);          _free(ret_url.p.ftp.path);          ret_url.p.ftp.path = pp;        }        if((p = strrchr(ret_url.p.ftp.path, ';')) &&          !strncasecmp(p, ";type=", 6))          *p = '\0';        ret_url.p.ftp.dir = tl_is_dirname(ret_url.p.ftp.path) != 0;        break;      case URLT_GOPHER:        url_split_authority(authority,          NULL, NULL, &ret_url.p.gopher.host, &ret_url.p.gopher.port);       
 if(!ret_url.p.gopher.port)          ret_url.p.gopher.port = prottable[ret_url.type].default_port;        if(*(p + 1))          ret_url.p.gopher.selector = tl_strdup(p + 1);        else          ret_url.p.gopher.selector = tl_strdup("1");        break;      case URLT_FILE:        url_split_path(p,          &ret_url.p.file.filename,          &ret_url.p.file.searchstr, &ret_url.p.file.anchor_name);        if(!ret_url.p.file.filename)          ret_url.p.file.filename = tl_strdup("");        break;      default:        ret_url.p.unsup.urlstr = tl_strdup(urlstr);        break;      }    }  }  if(!scheme || !authority || !*authority)  {    if(!scheme)      ret_url.type = URLT_FILE;    switch (ret_url.type)    {    case URLT_FILE:    case URLT_FTP:    case URLT_FTPS:    case URLT_HTTP:    case URLT_HTTPS:    case URLT_FROMPARENT:      ret_url.type = URLT_FILE;      url_split_path(p,        &ret_url.p.file.filename,        &ret_url.p.file.searchstr, &ret_url.p.file.anchor_name);      if(!ret_url.p.file.filename)        ret_url.p.file.filename = tl_strdup("");      break;    default:      ret_url.type = URLT_UNKNOWN;      ret_url.p.unsup.urlstr = tl_strdup(urlstr);      break;    }  }  _free(authority);  _free(scheme);  return new_url(&ret_url);}url *url_dup_url(url * src){  url dst;  dst.type = src->type;  dst.parent_url = NULL;  dst.moved_to = NULL;  dst.level = src->level;  dst.ref_cnt = 1;  dst.status = src->status &    (URL_INLINE_OBJ | URL_STYLE | URL_ISHTML | URL_NORECURSE |    URL_FORM_ACTION | URL_ISSCRIPT | URL_ISSTARTING);  dst.extension = NULL;  dst.local_name = NULL;#ifdef WITH_TREE#ifdef I_FACE  dst.prop = NULL;  dst.tree_nfo = NULL;#endif#endif#ifdef HAVE_MT  pthread_mutex_init(&dst.lock, NULL);#endif  switch (dst.type)  {  case URLT_FILE:    dst.p.file.filename = tl_strdup(src->p.file.filename);    dst.p.file.searchstr = tl_strdup(src->p.file.searchstr);    dst.p.file.anchor_name = tl_strdup(src->p.file.anchor_name);    break;
12 3 4 5 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -