⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 url.c

📁 wget讓你可以在console介面下
💻 C
📖 第 1 页 / 共 4 页
字号:
/* URL handling.
   Copyright (C) 2005 Free Software Foundation, Inc.

This file is part of GNU Wget.

GNU Wget is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version.

GNU Wget is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Wget; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

In addition, as a special exception, the Free Software Foundation
gives permission to link the code of its release of Wget with the
OpenSSL project's "OpenSSL" library (or with modified versions of it
that use the same license as the "OpenSSL" library), and distribute
the linked executables.  You must obey the GNU General Public License
in all respects for all of the code used other than "OpenSSL".  If you
modify this file, you may extend this exception to your version of the
file, but you are not obligated to do so.  If you do not wish to do
so, delete this exception statement from your version.  */

#include <config.h>

#include <stdio.h>
#include <stdlib.h>
#ifdef HAVE_STRING_H
# include <string.h>
#else
# include <strings.h>
#endif
#include <sys/types.h>
#ifdef HAVE_UNISTD_H
# include <unistd.h>
#endif
#include <errno.h>
#include <assert.h>

#include "wget.h"
#include "utils.h"
#include "url.h"
#include "host.h"  /* for is_valid_ipv6_address */

#ifndef errno
extern int errno;
#endif

/* Per-scheme bookkeeping: canonical name, the "scheme://" prefix used
   for matching, the port assumed when the URL names none, and a flag
   that allows disabling a scheme at runtime (see scheme_disable).  */
struct scheme_data
{
  const char *name;
  const char *leading_string;
  int default_port;
  int enabled;
};

/* Supported schemes.  The order must correspond to enum url_scheme;
   the all-NULL sentinel terminates lookups and stands in for
   SCHEME_INVALID.  */
static struct scheme_data supported_schemes[] =
{
  { "http",	"http://",  DEFAULT_HTTP_PORT,  1 },
#ifdef HAVE_SSL
  { "https",	"https://", DEFAULT_HTTPS_PORT, 1 },
#endif
  { "ftp",	"ftp://",   DEFAULT_FTP_PORT,   1 },

  /* SCHEME_INVALID */
  { NULL,	NULL,       -1,                 0 }
};

/* Forward declarations: */

static int path_simplify PARAMS ((char *));

/* Support for escaping and unescaping of URL strings.  */

/* Classification table for "reserved" and "unsafe" characters.  The
   terms come from rfc1738 -- largely superseded by rfc2396 and later
   specs, but the idea stands.

   A RESERVED character cannot be %-decoded without changing the URL's
   meaning: decoding "/foo/%2f/bar" to "/foo///bar" changes the number
   and contents of path components, whereas a non-reserved escape such
   as "%78" ('x') may be decoded freely.  The reserved set here is
   rfc1738's, plus "$" and "," (as recommended by rfc2396), minus "~",
   which appears too often in real URLs to quote (and some broken
   servers choke on %7E).

   An UNSAFE character is one that must be encoded before the URL is
   placed in a foreign environment: space and newline act as separator
   and terminator in HTTP and so become %20 and %0A; "*" is special to
   the shell; and so on.

   Membership is decided by a single table lookup.  The table assumes
   an ASCII character set and 8-bit chars.  */

enum {
  /* rfc1738 reserved chars + "$" and ",".  */
  urlchr_reserved = 1,

  /* rfc1738 unsafe chars, plus non-printables.  */
  urlchr_unsafe   = 2
};

#define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
#define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
#define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)

/* Shorthands for the table: */
#define R  urlchr_reserved
#define U  urlchr_unsafe
#define RU R|U

static const unsigned char urlchr_table[256] =
{
  U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  U,  0,  U, RU,   R,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  0,  R,   R,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
 RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0, RU,   U, RU,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
  U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  U,   U,  U,  0,  U,   /* x   y   z   {    |   }   ~   DEL */

  /* The upper half (128-255) is uniformly unsafe.  */
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
};
#undef R
#undef U
#undef RU

/* URL-unescape the string S in place: each "%HH" sequence becomes the
   character with hexadecimal value HH.  A '%' not followed by two hex
   digits is kept literally, as is "%00" (a NUL cannot be embedded in a
   C string without truncating it).  Make a copy first if you need the
   original string intact.  */

static void
url_unescape (char *s)
{
  char *w = s;			/* write position, trails the read one */
  const char *r = s;		/* read position */

  while (*r)
    {
      if (r[0] == '%' && ISXDIGIT (r[1]) && ISXDIGIT (r[2]))
	{
	  char decoded = X2DIGITS_TO_NUM (r[1], r[2]);
	  /* Refuse to decode %00 -- see the function comment.  */
	  if (decoded != '\0')
	    {
	      *w++ = decoded;
	      r += 3;
	      continue;
	    }
	}
      /* Malformed escape (or %00): copy one char verbatim.  */
      *w++ = *r++;
    }
  *w = '\0';
}

/* The core of the url_escape_* functions: %-quote every character of
   S that matches MASK in urlchr_table.

   If ALLOW_PASSTHROUGH is non-zero and nothing needs quoting, S itself
   is returned; otherwise the result is always freshly allocated.  */

static char *
url_escape_1 (const char *s, unsigned char mask, int allow_passthrough)
{
  const char *scan;		/* first-pass cursor */
  const char *in;		/* second-pass read cursor */
  char *out, *result;
  int result_len;
  int quoted = 0;		/* how many chars will be %-quoted */

  for (scan = s; *scan; scan++)
    if (urlchr_test (*scan, mask))
      ++quoted;

  if (quoted == 0)
    return allow_passthrough ? (char *) s : xstrdup (s);

  /* Each quoted char grows by two (the hex digits).  */
  result_len = (scan - s) + 2 * quoted;
  result = (char *) xmalloc (result_len + 1);

  out = result;
  for (in = s; *in; in++)
    {
      if (urlchr_test (*in, mask))
	{
	  unsigned char c = *in;
	  *out++ = '%';
	  *out++ = XNUM_TO_DIGIT (c >> 4);
	  *out++ = XNUM_TO_DIGIT (c & 0xf);
	}
      else
	*out++ = *in;
    }
  *out = '\0';

  assert (out - result == result_len);
  return result;
}

/* URL-escape the unsafe characters (see urlchr_table) in S, returning
   a freshly allocated string.  */

char *
url_escape (const char *s)
{
  return url_escape_1 (s, urlchr_unsafe, 0);
}

/* Like url_escape, but return S itself when no character is unsafe.  */

static char *
url_escape_allow_passthrough (const char *s)
{
  return url_escape_1 (s, urlchr_unsafe, 1);
}

/* Decide whether the char at position P must be %-encoded by
   reencode_escapes.  (A full pointer is needed, not just the char,
   because the decision inspects the following two characters.)

   Returns 1 if the char should become %XX, 0 otherwise.  */

static inline int
char_needs_escaping (const char *p)
{
  if (*p == '%')
    /* A well-formed "%XX" passes through; a garbled '%' is itself
       encoded.  (The && short-circuits before reading past a NUL.)  */
    return !(ISXDIGIT (p[1]) && ISXDIGIT (p[2]));

  return URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p);
}

/* Translate a %-escaped (but possibly non-conformant) input string S
   into a %-escaped and conformant one.  If nothing needs changing, S
   itself is returned; otherwise the result is freshly allocated.  The
   function is stable: reapplying it to its own output is a no-op, so
   protocols that quote with '%' can send the result as-is, and the
   others can url_unescape it to recover the intended data.

   Why is this needed?  Given `http://abc.xyz/abc def', the space must
   be quoted to produce "GET /abc%20def".  But blindly applying
   url_escape to `abc%20def' would yield `abc%2520def', which is wrong:
   on the command line "%20" already denotes a literal space.  Worse,
   `abc%20 def' should become `/abc%20%20def' -- neither escaping
   everything nor escaping nothing gets that right.

   Decoding first and re-encoding afterwards (what Wget used to do) is
   also wrong, because it loses track of reserved characters: with
   "a%2B+b" (0x2b == '+') you end up with either "a%2B%2Bb" or "a++b",
   and in CGI parameters '+' means space while %2B means a literal
   plus.  This function returns "a%2B+b" unchanged.

   The algorithm (after Anon Sricharoenchai, modified):

   * Encode the "unsafe", non-"reserved" characters to %XX (see
     urlchr_table).

   * Encode any '%' not followed by two hex digits to "%25".

   * Pass everything else, including well-formed %XX escapes, through
     untouched.  (Up to Wget 1.10 escapes of "safe" chars were decoded
     here, but that was obtrusive and broke some servers.)

   Anon's test case:
   "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
   ->
   "http://abc.xyz/%20%3F%25%36%31%25aa%25%20a?a=%61+a%2Ba&b=b%26c%3Dc"

   Simpler test cases:
   "foo bar"         -> "foo%20bar"
   "foo%20bar"       -> "foo%20bar"
   "foo %20bar"      -> "foo%20%20bar"
   "foo%%20bar"      -> "foo%25%20bar"       (0x25 == '%')
   "foo%25%20bar"    -> "foo%25%20bar"
   "foo%2%20bar"     -> "foo%252%20bar"
   "foo+bar"         -> "foo+bar"            (plus is reserved!)
   "foo%2b+bar"      -> "foo%2b+bar"  */

static char *
reencode_escapes (const char *s)
{
  const char *rd;
  char *result, *wr;
  int srclen, dstlen;
  int encode_count = 0;

  /* First pass: see whether anything needs doing and compute the
     output length.  */
  for (rd = s; *rd; rd++)
    if (char_needs_escaping (rd))
      ++encode_count;

  if (!encode_count)
    /* The string is good as it is. */
    return (char *) s;		/* C const model sucks. */

  srclen = rd - s;
  /* Each encoding adds two characters (hex digits).  */
  dstlen = srclen + 2 * encode_count;
  result = xmalloc (dstlen + 1);

  /* Second pass: copy, encoding where flagged.  */
  rd = s;
  wr = result;
  while (*rd)
    {
      if (char_needs_escaping (rd))
	{
	  unsigned char c = *rd++;
	  *wr++ = '%';
	  *wr++ = XNUM_TO_DIGIT (c >> 4);
	  *wr++ = XNUM_TO_DIGIT (c & 0xf);
	}
      else
	*wr++ = *rd++;
    }
  *wr = '\0';

  assert (wr - result == dstlen);
  return result;
}

/* Return the scheme type of URL, or SCHEME_INVALID if the scheme is
   unsupported or has been disabled.  */

enum url_scheme
url_scheme (const char *url)
{
  int i;

  for (i = 0; supported_schemes[i].leading_string; i++)
    {
      const char *lead = supported_schemes[i].leading_string;
      if (strncasecmp (url, lead, strlen (lead)) != 0)
	continue;
      return supported_schemes[i].enabled
	? (enum url_scheme) i
	: SCHEME_INVALID;
    }
  return SCHEME_INVALID;
}

#define SCHEME_CHAR(ch) (ISALNUM (ch) || (ch) == '-' || (ch) == '+')

/* Return 1 if URL begins with any "scheme", i.e. matches
   [-+a-zA-Z0-9]+: -- 0 otherwise.  */

int
url_has_scheme (const char *url)
{
  const char *p = url;

  /* At least one scheme char, terminated by ':'.  (SCHEME_CHAR is
     false for NUL, so the empty string is rejected here too.)  */
  if (!SCHEME_CHAR (*p))
    return 0;
  do
    ++p;
  while (SCHEME_CHAR (*p));

  return *p == ':';
}

int
scheme_default_port (enum url_scheme scheme)
{
  return supported_schemes[scheme].default_port;
}

void
scheme_disable (enum url_scheme scheme)
{
  supported_schemes[scheme].enabled = 0;
}

/* Skip a "user:password@" prefix, if present.  Call this with the
   portion of the URL *after* the scheme, not the complete URL.
   Returns URL unchanged when no credentials are found.  */

static const char *
url_skip_credentials (const char *url)
{
  /* The '@' counts only if it precedes every terminator ('/', '?',
     '#', ';').  */
  const char *sep = strpbrk (url, "@/?#;");
  return (sep && *sep == '@') ? sep + 1 : url;
}

/* Parse the credentials in [BEG, END), a region cut from a URL, into
   freshly allocated, unescaped *USER and *PASSWD (*PASSWD is NULL
   when no ':' separator is present).  Returns 1 on success, 0 when
   the user name is empty.  */

static int
parse_credentials (const char *beg, const char *end, char **user, char **passwd)
{
  const char *sep;		/* the ':' between user and password */
  const char *uend;		/* end of the user-name part */

  if (beg == end)
    return 0;			/* empty user name */

  sep = memchr (beg, ':', end - beg);
  if (sep == beg)
    return 0;			/* again empty user name */

  if (sep)
    {
      *passwd = strdupdelim (sep + 1, end);
      url_unescape (*passwd);
      uend = sep;
    }
  else
    {
      *passwd = NULL;
      uend = end;
    }

  *user = strdupdelim (beg, uend);
  url_unescape (*user);
  return 1;
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -