📄 url.c
/* URL handling.
   Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   2005, 2006, 2007, 2008 Free Software Foundation, Inc.

This file is part of GNU Wget.

GNU Wget is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at
your option) any later version.

GNU Wget is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Wget.  If not, see <http://www.gnu.org/licenses/>.

Additional permission under GNU GPL version 3 section 7

If you modify this program, or any covered work, by linking or
combining it with the OpenSSL project's OpenSSL library (or a
modified version of that library), containing parts covered by the
terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
grants you additional permission to convey the resulting work.
Corresponding Source for a non-source form of such a combination
shall include the source code for the parts of OpenSSL used as well
as that of the covered work.  */

#include <config.h>

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef HAVE_UNISTD_H
# include <unistd.h>
#endif
#include <errno.h>
#include <assert.h>

#include "wget.h"
#include "utils.h"
#include "url.h"
#include "host.h"  /* for is_valid_ipv6_address */

#ifdef TESTING
#include "test.h"
#endif

enum {
  scm_disabled = 1,             /* for https when OpenSSL fails to init. */
  scm_has_params = 2,           /* whether scheme has ;params */
  scm_has_query = 4,            /* whether scheme has ?query */
  scm_has_fragment = 8          /* whether scheme has #fragment */
};

struct scheme_data
{
  /* Short name of the scheme, such as "http" or "ftp". */
  const char *name;
  /* Leading string that identifies the scheme, such as "https://". */
  const char *leading_string;
  /* Default port of the scheme when none is specified. */
  int default_port;
  /* Various flags. */
  int flags;
};

/* Supported schemes: */
static struct scheme_data supported_schemes[] =
{
  { "http",  "http://",  DEFAULT_HTTP_PORT,  scm_has_query|scm_has_fragment },
#ifdef HAVE_SSL
  { "https", "https://", DEFAULT_HTTPS_PORT, scm_has_query|scm_has_fragment },
#endif
  { "ftp",   "ftp://",   DEFAULT_FTP_PORT,   scm_has_params|scm_has_fragment },

  /* SCHEME_INVALID */
  { NULL,    NULL,       -1,                 0 }
};

/* Forward declarations: */

static bool path_simplify (char *);

/* Support for escaping and unescaping of URL strings.  */

/* Table of "reserved" and "unsafe" characters.  Those terms are
   rfc1738-speak, as such largely obsoleted by rfc2396 and later specs,
   but the general idea remains.

   A reserved character is the one that you can't decode without
   changing the meaning of the URL.  For example, you can't decode
   "/foo/%2f/bar" into "/foo///bar" because the number and contents of
   path components is different.  Non-reserved characters can be
   changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar".  The
   unsafe characters are loosely based on rfc1738, plus "$" and ",",
   as recommended by rfc2396, and minus "~", which is very frequently
   used (and sometimes unrecognized as %7E by broken servers).

   An unsafe character is the one that should be encoded when URLs are
   placed in foreign environments.  E.g. space and newline are unsafe
   in HTTP contexts because HTTP uses them as separator and line
   terminator, so they must be encoded to %20 and %0A respectively.
   "*" is unsafe in shell context, etc.

   We determine whether a character is unsafe through static table
   lookup.  This code assumes ASCII character set and 8-bit chars.  */

enum {
  /* rfc1738 reserved chars + "$" and ",".  */
  urlchr_reserved = 1,

  /* rfc1738 unsafe chars, plus non-printables.  */
  urlchr_unsafe   = 2
};

#define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
#define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
#define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)

/* Shorthands for the table: */
#define R  urlchr_reserved
#define U  urlchr_unsafe
#define RU R|U

static const unsigned char urlchr_table[256] =
{
  U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  U,  0,  U, RU,   R,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  0,  R,   R,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
 RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0, RU,   U, RU,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
  U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  U,   U,  U,  0,  U,   /* x   y   z   {    |   }   ~   DEL */

  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,

  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
};
#undef R
#undef U
#undef RU

/* URL-unescape the string S.

   This is done by transforming the sequences "%HH" to the character
   represented by the hexadecimal digits HH.  If % is not followed by
   two hexadecimal digits, it is inserted literally.

   The transformation is done in place.  If you need the original
   string intact, make a copy before calling this function.  */

static void
url_unescape (char *s)
{
  char *t = s;                  /* t - tortoise */
  char *h = s;                  /* h - hare     */

  for (; *h; h++, t++)
    {
      if (*h != '%')
        {
        copychar:
          *t = *h;
        }
      else
        {
          char c;
          /* Do nothing if '%' is not followed by two hex digits. */
          if (!h[1] || !h[2] || !(ISXDIGIT (h[1]) && ISXDIGIT (h[2])))
            goto copychar;
          c = X2DIGITS_TO_NUM (h[1], h[2]);
          /* Don't unescape %00 because there is no way to insert it
             into a C string without effectively truncating it. */
          if (c == '\0')
            goto copychar;
          *t = c;
          h += 2;
        }
    }
  *t = '\0';
}
/* The core of url_escape_* functions.  Escapes the characters that
   match the provided mask in urlchr_table.

   If ALLOW_PASSTHROUGH is true, a string with no unsafe chars will be
   returned unchanged.  If ALLOW_PASSTHROUGH is false, a freshly
   allocated string will be returned in all cases.  */

static char *
url_escape_1 (const char *s, unsigned char mask, bool allow_passthrough)
{
  const char *p1;
  char *p2, *newstr;
  int newlen;
  int addition = 0;

  for (p1 = s; *p1; p1++)
    if (urlchr_test (*p1, mask))
      addition += 2;            /* Two more characters (hex digits) */

  if (!addition)
    return allow_passthrough ? (char *) s : xstrdup (s);

  newlen = (p1 - s) + addition;
  newstr = xmalloc (newlen + 1);

  p1 = s;
  p2 = newstr;
  while (*p1)
    {
      /* Quote the characters that match the test mask. */
      if (urlchr_test (*p1, mask))
        {
          unsigned char c = *p1++;
          *p2++ = '%';
          *p2++ = XNUM_TO_DIGIT (c >> 4);
          *p2++ = XNUM_TO_DIGIT (c & 0xf);
        }
      else
        *p2++ = *p1++;
    }
  assert (p2 - newstr == newlen);
  *p2 = '\0';

  return newstr;
}

/* URL-escape the unsafe characters (see urlchr_table) in a given
   string, returning a freshly allocated string.  */

char *
url_escape (const char *s)
{
  return url_escape_1 (s, urlchr_unsafe, false);
}

/* URL-escape the unsafe characters (see urlchr_table) in a given
   string.  If no characters are unsafe, S is returned.  */

static char *
url_escape_allow_passthrough (const char *s)
{
  return url_escape_1 (s, urlchr_unsafe, true);
}

/* Decide whether the char at position P needs to be encoded.  (It is
   not enough to pass a single char *P because the function may need
   to inspect the surrounding context.)

   Return true if the char should be escaped as %XX, false
   otherwise.  */

static inline bool
char_needs_escaping (const char *p)
{
  if (*p == '%')
    {
      if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
        return false;
      else
        /* Garbled %.. sequence: encode `%'. */
        return true;
    }
  else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
    return true;
  else
    return false;
}
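/* Illustrative sketch: a few escaping decisions that follow from
   urlchr_table and the functions above.  The helper name and the
   sample strings are hypothetical; the expected results are spelled
   out in the comments.  */
static void
example_url_escape_usage (void)
{
  /* ' ' is unsafe and gets quoted; '?' and '=' are merely reserved,
     so url_escape leaves them alone.  */
  char *quoted = url_escape ("foo bar?x=1");       /* "foo%20bar?x=1" */

  /* char_needs_escaping encodes a '%' only when it does not start a
     valid %XX escape.  */
  bool needs_a = char_needs_escaping ("%20");      /* false: valid escape */
  bool needs_b = char_needs_escaping ("%2x");      /* true: garbled %..   */

  free (quoted);
  (void) needs_a;
  (void) needs_b;
}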
/* Translate a %-escaped (but possibly non-conformant) input string S
   into a %-escaped (and conformant) output string.  If no characters
   are encoded or decoded, return the same string S; otherwise, return
   a freshly allocated string with the new contents.

   After a URL has been run through this function, the protocols that
   use `%' as the quote character can use the resulting string as-is,
   while those that don't can use url_unescape to get to the intended
   data.  This function is stable: once the input is transformed,
   further transformations of the result yield the same output.

   Let's discuss why this function is needed.

   Imagine Wget is asked to retrieve `http://abc.xyz/abc def'.  Since
   a raw space character would mess up the HTTP request, it needs to
   be quoted, like this:

       GET /abc%20def HTTP/1.0

   It would appear that the unsafe chars need to be quoted, for
   example with url_escape.  But what if we're requested to download
   `abc%20def'?  url_escape transforms "%" to "%25", which would leave
   us with `abc%2520def'.  This is incorrect -- since %-escapes are
   part of URL syntax, "%20" is the correct way to denote a literal
   space on the Wget command line.  This leads to the conclusion that
   in that case Wget should not call url_escape, but leave the `%20'
   as is.  This is clearly contradictory, but it only gets worse.

   What if the requested URI is `abc%20 def'?  If we call url_escape,
   we end up with `/abc%2520%20def', which is almost certainly not
   intended.  If we don't call url_escape, we are left with the
   embedded space and cannot complete the request.  What the user
   meant was for Wget to request `/abc%20%20def', and this is where
   reencode_escapes kicks in.

   Wget used to solve this by first decoding %-quotes, and then
   encoding all the "unsafe" characters found in the resulting string.
   This was wrong because it didn't preserve certain URL special
   (reserved) characters.  For instance, URI containing "a%2B+b"
   (0x2b == '+') would get translated to "a%2B%2Bb" or "a++b"
   depending on whether we considered `+' reserved (it is).  One of
   these results is inevitable because by the second step we would
   lose information on whether the `+' was originally encoded or not.
   Both results were wrong because in CGI parameters + means space,
   while %2B means literal plus.  reencode_escapes correctly
   translates the above to "a%2B+b", i.e. returns the original string.

   This function uses a modified version of the algorithm originally
   proposed by Anon Sricharoenchai:

   * Encode all "unsafe" characters, except those that are also
     "reserved", to %XX.  See urlchr_table for which characters are
     unsafe and reserved.

   * Encode the "%" characters not followed by two hex digits to
     "%25".

   * Pass through all other characters and %XX escapes as-is.  (Up to
     Wget 1.10 this decoded %XX escapes corresponding to "safe"
     characters, but that was obtrusive and broke some servers.)

   Anon's test case:

   "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
   ->
   "http://abc.xyz/%20%3F%25%36%31%25aa%25%20a?a=%61+a%2Ba&b=b%26c%3Dc"

   Simpler test cases:

   "foo bar"         -> "foo%20bar"
   "foo%20bar"       -> "foo%20bar"
   "foo %20bar"      -> "foo%20%20bar"
   "foo%%20bar"      -> "foo%25%20bar"       (0x25 == '%')
   "foo%25%20bar"    -> "foo%25%20bar"
   "foo%2%20bar"     -> "foo%252%20bar"
   "foo+bar"         -> "foo+bar"            (plus is reserved!)
   "foo%2b+bar"      -> "foo%2b+bar"  */

static char *
reencode_escapes (const char *s)
{
  const char *p1;
  char *newstr, *p2;
  int oldlen, newlen;

  int encode_count = 0;

  /* First pass: inspect the string to see if there's anything to do,
     and to calculate the new length.  */
  for (p1 = s; *p1; p1++)
    if (char_needs_escaping (p1))
      ++encode_count;

  if (!encode_count)
    /* The string is good as it is. */
    return (char *) s;          /* C const model sucks. */

  oldlen = p1 - s;
  /* Each encoding adds two characters (hex digits).  */
  newlen = oldlen + 2 * encode_count;
  newstr = xmalloc (newlen + 1);

  /* Second pass: copy the string to the destination address, encoding
     chars when needed.  */
  p1 = s;
  p2 = newstr;

  while (*p1)
    if (char_needs_escaping (p1))
      {
        unsigned char c = *p1++;
        *p2++ = '%';
        *p2++ = XNUM_TO_DIGIT (c >> 4);
        *p2++ = XNUM_TO_DIGIT (c & 0xf);
      }
    else
      *p2++ = *p1++;

  *p2 = '\0';
  assert (p2 - newstr == newlen);
  return newstr;
}

/* Returns the scheme type if the scheme is supported, or
   SCHEME_INVALID if not.  */

enum url_scheme
url_scheme (const char *url)
{
  int i;

  for (i = 0; supported_schemes[i].leading_string; i++)
    if (0 == strncasecmp (url, supported_schemes[i].leading_string,
                          strlen (supported_schemes[i].leading_string)))
      {
        if (!(supported_schemes[i].flags & scm_disabled))
          return (enum url_scheme) i;
        else
          return SCHEME_INVALID;
      }

  return SCHEME_INVALID;
}
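/* Illustrative sketch: the behaviour documented above, exercised
   end-to-end.  The function name is hypothetical; the inputs and the
   expected results come from the test cases and comments in this
   file.  */
static void
example_reencode_and_scheme (void)
{
  /* A raw space is encoded, a garbled '%' becomes "%25", and an input
     that is already conformant is returned as the same pointer, so
     only genuinely new strings may be freed.  */
  char *a = reencode_escapes ("foo bar");           /* -> "foo%20bar"     */
  char *b = reencode_escapes ("foo%2%20bar");       /* -> "foo%252%20bar" */
  const char *c = reencode_escapes ("foo%2b+bar");  /* same pointer back  */

  /* Scheme detection is case-insensitive and yields an index into
     supported_schemes, falling back to SCHEME_INVALID for anything
     not in the table.  */
  enum url_scheme s = url_scheme ("HTTP://example.com/");
  int default_port = (s != SCHEME_INVALID ? supported_schemes[s].default_port : -1);
  bool unknown = (url_scheme ("gopher://example.com/") == SCHEME_INVALID);

  free (a);
  free (b);
  (void) c;
  (void) default_port;
  (void) unknown;
}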