📄 url.c

📁 wget (command line browser) source code
💻 C
📖 Page 1 of 4
/* URL handling.
   Copyright (C) 1995, 1996, 1997, 2000, 2001, 2003, 2003
   Free Software Foundation, Inc.

This file is part of GNU Wget.

GNU Wget is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version.

GNU Wget is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Wget; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

In addition, as a special exception, the Free Software Foundation
gives permission to link the code of its release of Wget with the
OpenSSL project's "OpenSSL" library (or with modified versions of it
that use the same license as the "OpenSSL" library), and distribute
the linked executables.  You must obey the GNU General Public License
in all respects for all of the code used other than "OpenSSL".  If you
modify this file, you may extend this exception to your version of the
file, but you are not obligated to do so.  If you do not wish to do
so, delete this exception statement from your version.  */

#include <config.h>

#include <stdio.h>
#include <stdlib.h>
#ifdef HAVE_STRING_H
# include <string.h>
#else
# include <strings.h>
#endif
#include <sys/types.h>
#ifdef HAVE_UNISTD_H
# include <unistd.h>
#endif
#include <errno.h>
#include <assert.h>

#include "wget.h"
#include "utils.h"
#include "url.h"

#ifndef errno
extern int errno;
#endif

struct scheme_data
{
  char *leading_string;
  int default_port;
  int enabled;
};

/* Supported schemes: */
static struct scheme_data supported_schemes[] =
{
  { "http://",  DEFAULT_HTTP_PORT,  1 },
#ifdef HAVE_SSL
  { "https://", DEFAULT_HTTPS_PORT, 1 },
#endif
  { "ftp://",   DEFAULT_FTP_PORT,   1 },

  /* SCHEME_INVALID */
  { NULL,       -1,                 0 }
};

/* Forward declarations: */

static int path_simplify PARAMS ((char *));

/* Support for encoding and decoding of URL strings.  We determine
   whether a character is unsafe through static table lookup.  This
   code assumes ASCII character set and 8-bit chars.  */

enum {
  /* rfc1738 reserved chars, preserved from encoding.  */
  urlchr_reserved = 1,

  /* rfc1738 unsafe chars, plus some more.  */
  urlchr_unsafe   = 2
};

#define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
#define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
#define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)

/* Shorthands for the table: */
#define R  urlchr_reserved
#define U  urlchr_unsafe
#define RU R|U

const static unsigned char urlchr_table[256] =
{
  U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  U,  0,  U, RU,   0,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  0,  R,   0,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
 RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0, RU,   U, RU,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
  U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  U,   U,  U,  U,  U,   /* x   y   z   {    |   }   ~   DEL */

  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,

  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
};
#undef R
#undef U
#undef RU

/* URL-unescape the string S.

   This is done by transforming the sequences "%HH" to the character
   represented by the hexadecimal digits HH.  If % is not followed by
   two hexadecimal digits, it is inserted literally.

   The transformation is done in place.  If you need the original
   string intact, make a copy before calling this function.  */

static void
url_unescape (char *s)
{
  char *t = s;			/* t - tortoise */
  char *h = s;			/* h - hare     */

  for (; *h; h++, t++)
    {
      if (*h != '%')
	{
	copychar:
	  *t = *h;
	}
      else
	{
	  /* Do nothing if '%' is not followed by two hex digits. */
	  if (!h[1] || !h[2] || !(ISXDIGIT (h[1]) && ISXDIGIT (h[2])))
	    goto copychar;
	  *t = X2DIGITS_TO_NUM (h[1], h[2]);
	  h += 2;
	}
    }
  *t = '\0';
}

/* The core of url_escape_* functions.  Escapes the characters that
   match the provided mask in urlchr_table.

   If ALLOW_PASSTHROUGH is non-zero, a string with no unsafe chars
   will be returned unchanged.  If ALLOW_PASSTHROUGH is zero, a
   freshly allocated string will be returned in all cases.  */

static char *
url_escape_1 (const char *s, unsigned char mask, int allow_passthrough)
{
  const char *p1;
  char *p2, *newstr;
  int newlen;
  int addition = 0;

  for (p1 = s; *p1; p1++)
    if (urlchr_test (*p1, mask))
      addition += 2;		/* Two more characters (hex digits) */

  if (!addition)
    return allow_passthrough ? (char *)s : xstrdup (s);

  newlen = (p1 - s) + addition;
  newstr = (char *)xmalloc (newlen + 1);

  p1 = s;
  p2 = newstr;
  while (*p1)
    {
      /* Quote the characters that match the test mask. */
      if (urlchr_test (*p1, mask))
	{
	  unsigned char c = *p1++;
	  *p2++ = '%';
	  *p2++ = XNUM_TO_DIGIT (c >> 4);
	  *p2++ = XNUM_TO_DIGIT (c & 0xf);
	}
      else
	*p2++ = *p1++;
    }
  assert (p2 - newstr == newlen);
  *p2 = '\0';

  return newstr;
}

/* URL-escape the unsafe characters (see urlchr_table) in a given
   string, returning a freshly allocated string.  */

char *
url_escape (const char *s)
{
  return url_escape_1 (s, urlchr_unsafe, 0);
}

/* URL-escape the unsafe characters (see urlchr_table) in a given
   string.  If no characters are unsafe, S is returned.  */

static char *
url_escape_allow_passthrough (const char *s)
{
  return url_escape_1 (s, urlchr_unsafe, 1);
}

enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };

/* Decide whether to encode, decode, or pass through the char at P.
   This used to be a macro, but it got a little too convoluted.  */

static inline enum copy_method
decide_copy_method (const char *p)
{
  if (*p == '%')
    {
      if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
	{
	  /* %xx sequence: decode it, unless it would decode to an
	     unsafe or a reserved char; in that case, leave it as
	     is. */
	  char preempt = X2DIGITS_TO_NUM (*(p + 1), *(p + 2));

	  if (URL_UNSAFE_CHAR (preempt) || URL_RESERVED_CHAR (preempt))
	    return CM_PASSTHROUGH;
	  else
	    return CM_DECODE;
	}
      else
	/* Garbled %.. sequence: encode `%'. */
	return CM_ENCODE;
    }
  else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
    return CM_ENCODE;
  else
    return CM_PASSTHROUGH;
}

/* Translate a %-escaped (but possibly non-conformant) input string S
   into a %-escaped (and conformant) output string.  If no characters
   are encoded or decoded, return the same string S; otherwise, return
   a freshly allocated string with the new contents.

   After a URL has been run through this function, the protocols that
   use `%' as the quote character can use the resulting string as-is,
   while those that don't call url_unescape() to get to the intended
   data.  This function is also stable: after an input string is
   transformed the first time, all further transformations of the
   result yield the same result string.

   Let's discuss why this function is needed.

   Imagine Wget is to retrieve `http://abc.xyz/abc def'.  Since a raw
   space character would mess up the HTTP request, it needs to be
   quoted, like this:

       GET /abc%20def HTTP/1.0

   It appears that the unsafe chars need to be quoted, for example
   with url_escape.  But what if we're requested to download
   `abc%20def'?  url_escape transforms "%" to "%25", which would leave
   us with `abc%2520def'.  This is incorrect -- since %-escapes are
   part of URL syntax, "%20" is the correct way to denote a literal
   space on the Wget command line.  This leaves us in the conclusion
   that in that case Wget should not call url_escape, but leave the
   `%20' as is.

   And what if the requested URI is `abc%20 def'?  If we call
   url_escape, we end up with `/abc%2520%20def', which is almost
   certainly not intended.  If we don't call url_escape, we are left
   with the embedded space and cannot complete the request.  What the
   user meant was for Wget to request `/abc%20%20def', and this is
   where reencode_escapes kicks in.

   Wget used to solve this by first decoding %-quotes, and then
   encoding all the "unsafe" characters found in the resulting string.
   This was wrong because it didn't preserve certain URL special
   (reserved) characters.  For instance, URI containing "a%2B+b" (0x2b
   == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
   whether we considered `+' reserved (it is).  One of these results
   is inevitable because by the second step we would lose information
   on whether the `+' was originally encoded or not.  Both results
   were wrong because in CGI parameters + means space, while %2B means
   literal plus.  reencode_escapes correctly translates the above to
   "a%2B+b", i.e. returns the original string.

   This function uses an algorithm proposed by Anon Sricharoenchai:

   1. Encode all URL_UNSAFE and the "%" that are not followed by 2
      hexdigits.

   2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and
      "+".

   ...except that this code conflates the two steps, and decides
   whether to encode, decode, or pass through each character in turn.
   The function still uses two passes, but their logic is the same --
   the first pass exists merely for the sake of allocation.  Another
   small difference is that we include `+' to URL_RESERVED.

   Anon's test case:

   "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
   ->
   "http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"

   Simpler test cases:

   "foo bar"         -> "foo%20bar"
   "foo%20bar"       -> "foo%20bar"
   "foo %20bar"      -> "foo%20%20bar"
   "foo%%20bar"      -> "foo%25%20bar"       (0x25 == '%')
   "foo%25%20bar"    -> "foo%25%20bar"
   "foo%2%20bar"     -> "foo%252%20bar"
   "foo+bar"         -> "foo+bar"            (plus is reserved!)
   "foo%2b+bar"      -> "foo%2b+bar"  */

static char *
reencode_escapes (const char *s)
{
  const char *p1;
  char *newstr, *p2;
  int oldlen, newlen;

  int encode_count = 0;
  int decode_count = 0;

  /* First, pass through the string to see if there's anything to do,
     and to calculate the new length.  */
  for (p1 = s; *p1; p1++)
    {
      switch (decide_copy_method (p1))
	{
	case CM_ENCODE:
	  ++encode_count;
	  break;
	case CM_DECODE:
	  ++decode_count;
	  break;
	case CM_PASSTHROUGH:
	  break;
	}
    }

  if (!encode_count && !decode_count)
    /* The string is good as it is. */
    return (char *)s;		/* C const model sucks. */

  oldlen = p1 - s;
  /* Each encoding adds two characters (hex digits), while each
     decoding removes two characters.  */
  newlen = oldlen + 2 * (encode_count - decode_count);
  newstr = xmalloc (newlen + 1);

  p1 = s;
  p2 = newstr;

  while (*p1)
    {
      switch (decide_copy_method (p1))
	{
	case CM_ENCODE:
	  {
	    unsigned char c = *p1++;
	    *p2++ = '%';
	    *p2++ = XNUM_TO_DIGIT (c >> 4);
	    *p2++ = XNUM_TO_DIGIT (c & 0xf);
	  }
	  break;
	case CM_DECODE:
	  *p2++ = X2DIGITS_TO_NUM (p1[1], p1[2]);
	  p1 += 3;		/* skip %xx */
	  break;
	case CM_PASSTHROUGH:
	  *p2++ = *p1++;
	}
    }
  *p2 = '\0';
  assert (p2 - newstr == newlen);
  return newstr;
}

/* Returns the scheme type if the scheme is supported, or
   SCHEME_INVALID if not.  */

enum url_scheme
url_scheme (const char *url)
{
  int i;

  for (i = 0; supported_schemes[i].leading_string; i++)
    if (0 == strncasecmp (url, supported_schemes[i].leading_string,
			  strlen (supported_schemes[i].leading_string)))
      {
	if (supported_schemes[i].enabled)
	  return (enum url_scheme) i;
	else
	  return SCHEME_INVALID;
      }

  return SCHEME_INVALID;
}

#define SCHEME_CHAR(ch) (ISALNUM (ch) || (ch) == '-' || (ch) == '+')

/* Return 1 if the URL begins with any "scheme", 0 otherwise.  As
   currently implemented, it returns true if URL begins with
   [-+a-zA-Z0-9]+: .  */

int
url_has_scheme (const char *url)
{
  const char *p = url;

  /* The first char must be a scheme char. */
  if (!*p || !SCHEME_CHAR (*p))
    return 0;
  ++p;
  /* Followed by 0 or more scheme chars. */
  while (*p && SCHEME_CHAR (*p))
    ++p;
  /* Terminated by ':'. */
  return *p == ':';
}

int
scheme_default_port (enum url_scheme scheme)
{
  return supported_schemes[scheme].default_port;
}

void
scheme_disable (enum url_scheme scheme)
{
  supported_schemes[scheme].enabled = 0;
}

/* Skip the username and password, if present here.  The function
   should *not* be called with the complete URL, but with the part
   right after the scheme.

   If no username and password are found, return 0.  */

static int
url_skip_credentials (const char *url)
{
  /* Look for '@' that comes before terminators, such as '/', '?',
     '#', or ';'.  */
  const char *p = (const char *)strpbrk (url, "@/?#;");
  if (!p || *p != '@')
    return 0;
  return p + 1 - url;
}

/* Parse credentials contained in [BEG, END).  The region is expected
   to have come from a URL and is unescaped.  */

static int
parse_credentials (const char *beg, const char *end, char **user, char **passwd)
{
  char *colon;
  const char *userend;

  if (beg == end)
    return 0;			/* empty user name */

  colon = memchr (beg, ':', end - beg);
  if (colon == beg)
    return 0;			/* again empty user name */

  if (colon)
    {
      *passwd = strdupdelim (colon + 1, end);
      userend = colon;
      url_unescape (*passwd);
    }
  else
    {
      *passwd = NULL;
      userend = end;
    }
  *user = strdupdelim (beg, userend);
  url_unescape (*user);
  return 1;
}

/* Used by main.c: detect URLs written using the "shorthand" URL forms
   popularized by Netscape and NcFTP.  HTTP shorthands look like this:

   www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
   www.foo.com[:port]            -> http://www.foo.com[:port]

   FTP shorthands look like this:

   foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
   foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file

   If the URL needs not or cannot be rewritten, return NULL.  */

char *
rewrite_shorthand_url (const char *url)
{
  const char *p;

  if (url_has_scheme (url))
    return NULL;

  /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
     latter Netscape.  */
  for (p = url; *p && *p != ':' && *p != '/'; p++)
    ;

  if (p == url)
    return NULL;
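
This page ends partway through rewrite_shorthand_url; the rest of the file is on the following pages. As a quick orientation, here is a minimal usage sketch for the entry points shown so far. It is not part of url.c; it assumes the declarations wget's url.h provides for url_escape, url_has_scheme and url_scheme (including enum url_scheme constants such as SCHEME_FTP) and would be built inside the wget source tree. The expected values follow the comments and the urlchr_table above.

/* Usage sketch (assumption: built in the wget tree against url.h).  */
#include <config.h>
#include <stdio.h>
#include <stdlib.h>
#include "wget.h"
#include "url.h"

int
main (void)
{
  /* Space is marked urlchr_unsafe in urlchr_table, so url_escape
     quotes it as %20 and returns a freshly allocated string.  */
  char *e = url_escape ("foo bar");
  printf ("%s\n", e);                                      /* foo%20bar */
  free (e);

  /* url_has_scheme only checks for a leading [-+a-zA-Z0-9]+: prefix.  */
  printf ("%d\n", url_has_scheme ("http://abc.xyz/"));     /* 1 */
  printf ("%d\n", url_has_scheme ("www.foo.com/dir"));     /* 0 */

  /* url_scheme matches the supported_schemes prefixes case-insensitively.  */
  printf ("%d\n", url_scheme ("FTP://foo.bar.com/") == SCHEME_FTP);   /* 1 */
  return 0;
}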
