📄 url.c

📁 wget讓你可以在console介面下
💻 C
📖 第 1 页 / 共 4 页
字号:
/* Used by main.c: detect URLs written using the "shorthand" URL forms   popularized by Netscape and NcFTP.  HTTP shorthands look like this:   www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file   www.foo.com[:port]            -> http://www.foo.com[:port]   FTP shorthands look like this:   foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file   foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file   If the URL needs not or cannot be rewritten, return NULL.  */char *rewrite_shorthand_url (const char *url){  const char *p;  if (url_scheme (url) != SCHEME_INVALID)    return NULL;  /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the     latter Netscape.  */  for (p = url; *p && *p != ':' && *p != '/'; p++)    ;  if (p == url)    return NULL;  /* If we're looking at "://", it means the URL uses a scheme we     don't support, which may include "https" when compiled without     SSL support.  Don't bogusly rewrite such URLs.  */  if (p[0] == ':' && p[1] == '/' && p[2] == '/')    return NULL;  if (*p == ':')    {      const char *pp;      char *res;      /* If the characters after the colon and before the next slash	 or end of string are all digits, it's HTTP.  */      int digits = 0;      for (pp = p + 1; ISDIGIT (*pp); pp++)	++digits;      if (digits > 0 && (*pp == '/' || *pp == '\0'))	goto http;      /* Prepend "ftp://" to the entire URL... */      res = xmalloc (6 + strlen (url) + 1);      sprintf (res, "ftp://%s", url);      /* ...and replace ':' with '/'. */      res[6 + (p - url)] = '/';      return res;    }  else    {      char *res;    http:      /* Just prepend "http://" to what we have. */      res = xmalloc (7 + strlen (url) + 1);      sprintf (res, "http://%s", url);      return res;    }}static void split_path PARAMS ((const char *, char **, char **));/* Like strpbrk, with the exception that it returns the pointer to the   terminating zero (end-of-string aka "eos") if no matching character   is found.   Although I normally balk at Gcc-specific optimizations, it probably   makes sense here: glibc has optimizations that detect strpbrk being   called with literal string as ACCEPT and inline the search.  That   optimization is defeated if strpbrk is hidden within the call to   another function.  (And no, making strpbrk_or_eos inline doesn't   help because the check for literal accept is in the   preprocessor.)  */#if defined(__GNUC__) && __GNUC__ >= 3#define strpbrk_or_eos(s, accept) ({		\  char *SOE_p = strpbrk (s, accept);		\  if (!SOE_p)					\    SOE_p = strchr (s, '\0');			\  SOE_p;					\})#else  /* not __GNUC__ or old gcc */static inline char *strpbrk_or_eos (const char *s, const char *accept){  char *p = strpbrk (s, accept);  if (!p)    p = strchr (s, '\0');  return p;}#endif /* not __GNUC__ or old gcc *//* Turn STR into lowercase; return non-zero if a character was   actually changed. */static intlowercase_str (char *str){  int change = 0;  for (; *str; str++)    if (ISUPPER (*str))      {	change = 1;	*str = TOLOWER (*str);      }  return change;}static const char *parse_errors[] = {#define PE_NO_ERROR			0  N_("No error"),#define PE_UNSUPPORTED_SCHEME		1  N_("Unsupported scheme"),#define PE_INVALID_HOST_NAME		2  N_("Invalid host name"),#define PE_BAD_PORT_NUMBER		3  N_("Bad port number"),#define PE_INVALID_USER_NAME		4  N_("Invalid user name"),#define PE_UNTERMINATED_IPV6_ADDRESS	5  N_("Unterminated IPv6 numeric address"),#define PE_IPV6_NOT_SUPPORTED		6  N_("IPv6 addresses not supported"),#define PE_INVALID_IPV6_ADDRESS		7  N_("Invalid IPv6 numeric address")};/* Parse a URL.   Return a new struct url if successful, NULL on error.  In case of   error, and if ERROR is not NULL, also set *ERROR to the appropriate   error code. */struct url *url_parse (const char *url, int *error){  struct url *u;  const char *p;  int path_modified, host_modified;  enum url_scheme scheme;  const char *uname_b,     *uname_e;  const char *host_b,      *host_e;  const char *path_b,      *path_e;  const char *params_b,    *params_e;  const char *query_b,     *query_e;  const char *fragment_b,  *fragment_e;  int port;  char *user = NULL, *passwd = NULL;  char *url_encoded = NULL;  int error_code;  scheme = url_scheme (url);  if (scheme == SCHEME_INVALID)    {      error_code = PE_UNSUPPORTED_SCHEME;      goto err;    }  url_encoded = reencode_escapes (url);  p = url_encoded;  p += strlen (supported_schemes[scheme].leading_string);  uname_b = p;  p = url_skip_credentials (p);  uname_e = p;  /* scheme://user:pass@host[:port]... */  /*                    ^              */  /* We attempt to break down the URL into the components path,     params, query, and fragment.  They are ordered like this:       scheme://host[:port][/path][;params][?query][#fragment]  */  params_b   = params_e   = NULL;  query_b    = query_e    = NULL;  fragment_b = fragment_e = NULL;  host_b = p;  if (*p == '[')    {      /* Handle IPv6 address inside square brackets.  Ideally we'd	 just look for the terminating ']', but rfc2732 mandates	 rejecting invalid IPv6 addresses.  */      /* The address begins after '['. */      host_b = p + 1;      host_e = strchr (host_b, ']');      if (!host_e)	{	  error_code = PE_UNTERMINATED_IPV6_ADDRESS;	  goto err;	}#ifdef ENABLE_IPV6      /* Check if the IPv6 address is valid. */      if (!is_valid_ipv6_address(host_b, host_e))	{	  error_code = PE_INVALID_IPV6_ADDRESS;	  goto err;	}      /* Continue parsing after the closing ']'. */      p = host_e + 1;#else      error_code = PE_IPV6_NOT_SUPPORTED;      goto err;#endif      /* The closing bracket must be followed by a separator or by the	 null char.  */      /* http://[::1]... */      /*             ^   */      if (!strchr (":/;?#", *p))	{	  /* Trailing garbage after []-delimited IPv6 address. */	  error_code = PE_INVALID_HOST_NAME;	  goto err;	}    }  else    {      p = strpbrk_or_eos (p, ":/;?#");      host_e = p;    }  if (host_b == host_e)    {      error_code = PE_INVALID_HOST_NAME;      goto err;    }  port = scheme_default_port (scheme);  if (*p == ':')    {      const char *port_b, *port_e, *pp;      /* scheme://host:port/tralala */      /*              ^             */      ++p;      port_b = p;      p = strpbrk_or_eos (p, "/;?#");      port_e = p;      /* Allow empty port, as per rfc2396. */      if (port_b != port_e)	{	  for (port = 0, pp = port_b; pp < port_e; pp++)	    {	      if (!ISDIGIT (*pp))		{	 	  /* http://host:12randomgarbage/blah */		  /*               ^                  */		  error_code = PE_BAD_PORT_NUMBER;		  goto err;		}	      port = 10 * port + (*pp - '0');	      /* Check for too large port numbers here, before we have		 a chance to overflow on bogus port values.  */	      if (port > 65535)		{		  error_code = PE_BAD_PORT_NUMBER;		  goto err;		}	    }	}    }  if (*p == '/')    {      ++p;      path_b = p;      p = strpbrk_or_eos (p, ";?#");      path_e = p;    }  else    {      /* Path is not allowed not to exist. */      path_b = path_e = p;    }  if (*p == ';')    {      ++p;      params_b = p;      p = strpbrk_or_eos (p, "?#");      params_e = p;    }  if (*p == '?')    {      ++p;      query_b = p;      p = strpbrk_or_eos (p, "#");      query_e = p;      /* Hack that allows users to use '?' (a wildcard character) in	 FTP URLs without it being interpreted as a query string	 delimiter.  */      if (scheme == SCHEME_FTP)	{	  query_b = query_e = NULL;	  path_e = p;	}    }  if (*p == '#')    {      ++p;      fragment_b = p;      p += strlen (p);      fragment_e = p;    }  assert (*p == 0);  if (uname_b != uname_e)    {      /* http://user:pass@host */      /*        ^         ^    */      /*     uname_b   uname_e */      if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd))	{	  error_code = PE_INVALID_USER_NAME;	  goto err;	}    }  u = xnew0 (struct url);  u->scheme = scheme;  u->host   = strdupdelim (host_b, host_e);  u->port   = port;  u->user   = user;  u->passwd = passwd;  u->path = strdupdelim (path_b, path_e);  path_modified = path_simplify (u->path);  split_path (u->path, &u->dir, &u->file);  host_modified = lowercase_str (u->host);  /* Decode %HH sequences in host name.  This is important not so much     to support %HH sequences in host names (which other browser     don't), but to support binary characters (which will have been     converted to %HH by reencode_escapes).  */  if (strchr (u->host, '%'))    {      url_unescape (u->host);      host_modified = 1;    }  if (params_b)    u->params = strdupdelim (params_b, params_e);  if (query_b)    u->query = strdupdelim (query_b, query_e);  if (fragment_b)    u->fragment = strdupdelim (fragment_b, fragment_e);  if (path_modified || u->fragment || host_modified || path_b == path_e)    {      /* If we suspect that a transformation has rendered what	 url_string might return different from URL_ENCODED, rebuild	 u->url using url_string.  */      u->url = url_string (u, 0);      if (url_encoded != url)	xfree ((char *) url_encoded);    }  else    {      if (url_encoded == url)	u->url = xstrdup (url);      else	u->url = url_encoded;    }  return u; err:  /* Cleanup in case of error: */  if (url_encoded && url_encoded != url)    xfree (url_encoded);  /* Transmit the error code to the caller, if the caller wants to     know.  */  if (error)    *error = error_code;  return NULL;}/* Return the error message string from ERROR_CODE, which should have   been retrieved from url_parse.  The error message is translated.  */const char *url_error (int error_code){  assert (error_code >= 0 && error_code < countof (parse_errors));  return _(parse_errors[error_code]);}/* Split PATH into DIR and FILE.  PATH comes from the URL and is   expected to be URL-escaped.   The path is split into directory (the part up to the last slash)   and file (the part after the last slash), which are subsequently   unescaped.  Examples:   PATH                 DIR           FILE   "foo/bar/baz"        "foo/bar"     "baz"   "foo/bar/"           "foo/bar"     ""   "foo"                ""            "foo"   "foo/bar/baz%2fqux"  "foo/bar"     "baz/qux" (!)   DIR and FILE are freshly allocated.  */static voidsplit_path (const char *path, char **dir, char **file){  char *last_slash = strrchr (path, '/');  if (!last_slash)    {      *dir = xstrdup ("");      *file = xstrdup (path);    }  else    {      *dir = strdupdelim (path, last_slash);      *file = xstrdup (last_slash + 1);    }  url_unescape (*dir);  url_unescape (*file);}/* Note: URL's "full path" is the path with the query string and   params appended.  The "fragment" (#foo) is intentionally ignored,   but that might be changed.  For example, if the original URL was   "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",   the full path will be "/foo/bar/baz;bullshit?querystring".  *//* Return the length of the full path, without the terminating   zero.  */static intfull_path_length (const struct url *url){  int len = 0;#define FROB(el) if (url->el) len += 1 + strlen (url->el)  FROB (path);  FROB (params);  FROB (query);#undef FROB  return len;}/* Write out the full path. */static voidfull_path_write (const struct url *url, char *where){#define FROB(el, chr) do {			\  char *f_el = url->el;				\  if (f_el) {					\    int l = strlen (f_el);			\    *where++ = chr;				\    memcpy (where, f_el, l);			\    where += l;					\  }						\} while (0)  FROB (path, '/');  FROB (params, ';');  FROB (query, '?');#undef FROB}/* Public function for getting the "full path".  E.g. if u->path is   "foo/bar" and u->query is "param=value", full_path will be   "/foo/bar?param=value". */char *url_full_path (const struct url *url){  int length = full_path_length (url);  char *full_path = (char *) xmalloc (length + 1);  full_path_write (url, full_path);  full_path[length] = '\0';  return full_path;}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -