📄 url.c
/* URL handling.
   Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   2005, 2006, 2007, 2008 Free Software Foundation, Inc.

This file is part of GNU Wget.

GNU Wget is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at
your option) any later version.

GNU Wget is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Wget.  If not, see <http://www.gnu.org/licenses/>.

Additional permission under GNU GPL version 3 section 7

If you modify this program, or any covered work, by linking or
combining it with the OpenSSL project's OpenSSL library (or a
modified version of that library), containing parts covered by the
terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
grants you additional permission to convey the resulting work.
Corresponding Source for a non-source form of such a combination
shall include the source code for the parts of OpenSSL used as well
as that of the covered work.  */

#include <config.h>

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef HAVE_UNISTD_H
# include <unistd.h>
#endif
#include <errno.h>
#include <assert.h>

#include "wget.h"
#include "utils.h"
#include "url.h"
#include "host.h"  /* for is_valid_ipv6_address */

#ifdef TESTING
#include "test.h"
#endif

enum {
  scm_disabled = 1,             /* for https when OpenSSL fails to init. */
  scm_has_params = 2,           /* whether scheme has ;params */
  scm_has_query = 4,            /* whether scheme has ?query */
  scm_has_fragment = 8          /* whether scheme has #fragment */
};

struct scheme_data
{
  /* Short name of the scheme, such as "http" or "ftp". */
  const char *name;
  /* Leading string that identifies the scheme, such as "https://". */
  const char *leading_string;
  /* Default port of the scheme when none is specified. */
  int default_port;
  /* Various flags. */
  int flags;
};

/* Supported schemes: */
static struct scheme_data supported_schemes[] =
{
  { "http",  "http://",  DEFAULT_HTTP_PORT,  scm_has_query|scm_has_fragment },
#ifdef HAVE_SSL
  { "https", "https://", DEFAULT_HTTPS_PORT, scm_has_query|scm_has_fragment },
#endif
  { "ftp",   "ftp://",   DEFAULT_FTP_PORT,   scm_has_params|scm_has_fragment },

  /* SCHEME_INVALID */
  { NULL,    NULL,       -1,                 0 }
};

/* Forward declarations: */

static bool path_simplify (char *);

/* Support for escaping and unescaping of URL strings.  */

/* Table of "reserved" and "unsafe" characters.  Those terms are
   rfc1738-speak, as such largely obsoleted by rfc2396 and later specs,
   but the general idea remains.

   A reserved character is the one that you can't decode without
   changing the meaning of the URL.  For example, you can't decode
   "/foo/%2f/bar" into "/foo///bar" because the number and contents of
   path components is different.  Non-reserved characters can be
   changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar".  The
   unsafe characters are loosely based on rfc1738, plus "$" and ",",
   as recommended by rfc2396, and minus "~", which is very frequently
   used (and sometimes unrecognized as %7E by broken servers).

   An unsafe character is the one that should be encoded when URLs are
   placed in foreign environments.  E.g. space and newline are unsafe
   in HTTP contexts because HTTP uses them as separator and line
   terminator, so they must be encoded to %20 and %0A respectively.
   "*" is unsafe in shell context, etc.

   We determine whether a character is unsafe through static table
   lookup.  This code assumes ASCII character set and 8-bit chars.  */

enum {
  /* rfc1738 reserved chars + "$" and ",".  */
  urlchr_reserved = 1,

  /* rfc1738 unsafe chars, plus non-printables.  */
  urlchr_unsafe   = 2
};

#define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
#define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
#define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)

/* Shorthands for the table: */
#define R  urlchr_reserved
#define U  urlchr_unsafe
#define RU R|U

static const unsigned char urlchr_table[256] =
{
  U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  U,  0,  U, RU,   R,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  0,  R,   R,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
 RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0, RU,   U, RU,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
  U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  U,   U,  U,  0,  U,   /* x   y   z   {    |   }   ~   DEL */

  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,

  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
};
#undef R
#undef U
#undef RU

/* URL-unescape the string S.

   This is done by transforming the sequences "%HH" to the character
   represented by the hexadecimal digits HH.  If % is not followed by
   two hexadecimal digits, it is inserted literally.

   The transformation is done in place.  If you need the original
   string intact, make a copy before calling this function.  */

static void
url_unescape (char *s)
{
  char *t = s;                  /* t - tortoise */
  char *h = s;                  /* h - hare     */

  for (; *h; h++, t++)
    {
      if (*h != '%')
        {
        copychar:
          *t = *h;
        }
      else
        {
          char c;
          /* Do nothing if '%' is not followed by two hex digits. */
          if (!h[1] || !h[2] || !(ISXDIGIT (h[1]) && ISXDIGIT (h[2])))
            goto copychar;
          c = X2DIGITS_TO_NUM (h[1], h[2]);
          /* Don't unescape %00 because there is no way to insert it
             into a C string without effectively truncating it. */
          if (c == '\0')
            goto copychar;
          *t = c;
          h += 2;
        }
    }
  *t = '\0';
}
/* The core of url_escape_* functions.  Escapes the characters that
   match the provided mask in urlchr_table.

   If ALLOW_PASSTHROUGH is true, a string with no unsafe chars will be
   returned unchanged.  If ALLOW_PASSTHROUGH is false, a freshly
   allocated string will be returned in all cases.  */

static char *
url_escape_1 (const char *s, unsigned char mask, bool allow_passthrough)
{
  const char *p1;
  char *p2, *newstr;
  int newlen;
  int addition = 0;

  for (p1 = s; *p1; p1++)
    if (urlchr_test (*p1, mask))
      addition += 2;            /* Two more characters (hex digits) */

  if (!addition)
    return allow_passthrough ? (char *) s : xstrdup (s);

  newlen = (p1 - s) + addition;
  newstr = xmalloc (newlen + 1);

  p1 = s;
  p2 = newstr;
  while (*p1)
    {
      /* Quote the characters that match the test mask. */
      if (urlchr_test (*p1, mask))
        {
          unsigned char c = *p1++;
          *p2++ = '%';
          *p2++ = XNUM_TO_DIGIT (c >> 4);
          *p2++ = XNUM_TO_DIGIT (c & 0xf);
        }
      else
        *p2++ = *p1++;
    }
  assert (p2 - newstr == newlen);
  *p2 = '\0';

  return newstr;
}

/* URL-escape the unsafe characters (see urlchr_table) in a given
   string, returning a freshly allocated string.  */

char *
url_escape (const char *s)
{
  return url_escape_1 (s, urlchr_unsafe, false);
}

/* URL-escape the unsafe characters (see urlchr_table) in a given
   string.  If no characters are unsafe, S is returned.  */

static char *
url_escape_allow_passthrough (const char *s)
{
  return url_escape_1 (s, urlchr_unsafe, true);
}

/* Decide whether the char at position P needs to be encoded.  (It is
   not enough to pass a single char *P because the function may need
   to inspect the surrounding context.)

   Return true if the char should be escaped as %XX, false
   otherwise.  */

static inline bool
char_needs_escaping (const char *p)
{
  if (*p == '%')
    {
      if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))
        return false;
      else
        /* Garbled %.. sequence: encode `%'. */
        return true;
    }
  else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
    return true;
  else
    return false;
}
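/* Illustrative sketch: a few escaping decisions that follow from
   urlchr_table and the functions above.  The helper name and the
   sample strings are hypothetical; the expected results are spelled
   out in the comments.  */
static void
example_url_escape_usage (void)
{
  /* ' ' is unsafe and gets quoted; '?' and '=' are merely reserved,
     so url_escape leaves them alone.  */
  char *quoted = url_escape ("foo bar?x=1");       /* "foo%20bar?x=1" */

  /* char_needs_escaping encodes a '%' only when it does not start a
     valid %XX escape.  */
  bool needs_a = char_needs_escaping ("%20");      /* false: valid escape */
  bool needs_b = char_needs_escaping ("%2x");      /* true: garbled %..   */

  free (quoted);
  (void) needs_a;
  (void) needs_b;
}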
/* Translate a %-escaped (but possibly non-conformant) input string S
   into a %-escaped (and conformant) output string.  If no characters
   are encoded or decoded, return the same string S; otherwise, return
   a freshly allocated string with the new contents.

   After a URL has been run through this function, the protocols that
   use `%' as the quote character can use the resulting string as-is,
   while those that don't can use url_unescape to get to the intended
   data.  This function is stable: once the input is transformed,
   further transformations of the result yield the same output.

   Let's discuss why this function is needed.

   Imagine Wget is asked to retrieve `http://abc.xyz/abc def'.  Since
   a raw space character would mess up the HTTP request, it needs to
   be quoted, like this:

       GET /abc%20def HTTP/1.0

   It would appear that the unsafe chars need to be quoted, for
   example with url_escape.  But what if we're requested to download
   `abc%20def'?  url_escape transforms "%" to "%25", which would leave
   us with `abc%2520def'.  This is incorrect -- since %-escapes are
   part of URL syntax, "%20" is the correct way to denote a literal
   space on the Wget command line.  This leads to the conclusion that
   in that case Wget should not call url_escape, but leave the `%20'
   as is.  This is clearly contradictory, but it only gets worse.

   What if the requested URI is `abc%20 def'?  If we call url_escape,
   we end up with `/abc%2520%20def', which is almost certainly not
   intended.  If we don't call url_escape, we are left with the
   embedded space and cannot complete the request.  What the user
   meant was for Wget to request `/abc%20%20def', and this is where
   reencode_escapes kicks in.

   Wget used to solve this by first decoding %-quotes, and then
   encoding all the "unsafe" characters found in the resulting string.
   This was wrong because it didn't preserve certain URL special
   (reserved) characters.  For instance, URI containing "a%2B+b"
   (0x2b == '+') would get translated to "a%2B%2Bb" or "a++b"
   depending on whether we considered `+' reserved (it is).  One of
   these results is inevitable because by the second step we would
   lose information on whether the `+' was originally encoded or not.
   Both results were wrong because in CGI parameters + means space,
   while %2B means literal plus.  reencode_escapes correctly
   translates the above to "a%2B+b", i.e. returns the original string.

   This function uses a modified version of the algorithm originally
   proposed by Anon Sricharoenchai:

   * Encode all "unsafe" characters, except those that are also
     "reserved", to %XX.  See urlchr_table for which characters are
     unsafe and reserved.

   * Encode the "%" characters not followed by two hex digits to
     "%25".

   * Pass through all other characters and %XX escapes as-is.  (Up to
     Wget 1.10 this decoded %XX escapes corresponding to "safe"
     characters, but that was obtrusive and broke some servers.)

   Anon's test case:

   "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
   ->
   "http://abc.xyz/%20%3F%25%36%31%25aa%25%20a?a=%61+a%2Ba&b=b%26c%3Dc"

   Simpler test cases:

   "foo bar"         -> "foo%20bar"
   "foo%20bar"       -> "foo%20bar"
   "foo %20bar"      -> "foo%20%20bar"
   "foo%%20bar"      -> "foo%25%20bar"       (0x25 == '%')
   "foo%25%20bar"    -> "foo%25%20bar"
   "foo%2%20bar"     -> "foo%252%20bar"
   "foo+bar"         -> "foo+bar"            (plus is reserved!)
   "foo%2b+bar"      -> "foo%2b+bar"  */

static char *
reencode_escapes (const char *s)
{
  const char *p1;
  char *newstr, *p2;
  int oldlen, newlen;

  int encode_count = 0;

  /* First pass: inspect the string to see if there's anything to do,
     and to calculate the new length.  */
  for (p1 = s; *p1; p1++)
    if (char_needs_escaping (p1))
      ++encode_count;

  if (!encode_count)
    /* The string is good as it is. */
    return (char *) s;          /* C const model sucks. */

  oldlen = p1 - s;
  /* Each encoding adds two characters (hex digits).  */
  newlen = oldlen + 2 * encode_count;
  newstr = xmalloc (newlen + 1);

  /* Second pass: copy the string to the destination address, encoding
     chars when needed.  */
  p1 = s;
  p2 = newstr;

  while (*p1)
    if (char_needs_escaping (p1))
      {
        unsigned char c = *p1++;
        *p2++ = '%';
        *p2++ = XNUM_TO_DIGIT (c >> 4);
        *p2++ = XNUM_TO_DIGIT (c & 0xf);
      }
    else
      *p2++ = *p1++;

  *p2 = '\0';
  assert (p2 - newstr == newlen);
  return newstr;
}

/* Returns the scheme type if the scheme is supported, or
   SCHEME_INVALID if not.  */

enum url_scheme
url_scheme (const char *url)
{
  int i;

  for (i = 0; supported_schemes[i].leading_string; i++)
    if (0 == strncasecmp (url, supported_schemes[i].leading_string,
                          strlen (supported_schemes[i].leading_string)))
      {
        if (!(supported_schemes[i].flags & scm_disabled))
          return (enum url_scheme) i;
        else
          return SCHEME_INVALID;
      }

  return SCHEME_INVALID;
}
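/* Illustrative sketch: the behaviour documented above, exercised
   end-to-end.  The function name is hypothetical; the inputs and the
   expected results come from the test cases and comments in this
   file.  */
static void
example_reencode_and_scheme (void)
{
  /* A raw space is encoded, a garbled '%' becomes "%25", and an input
     that is already conformant is returned as the same pointer, so
     only genuinely new strings may be freed.  */
  char *a = reencode_escapes ("foo bar");           /* -> "foo%20bar"     */
  char *b = reencode_escapes ("foo%2%20bar");       /* -> "foo%252%20bar" */
  const char *c = reencode_escapes ("foo%2b+bar");  /* same pointer back  */

  /* Scheme detection is case-insensitive and yields an index into
     supported_schemes, falling back to SCHEME_INVALID for anything
     not in the table.  */
  enum url_scheme s = url_scheme ("HTTP://example.com/");
  int default_port = (s != SCHEME_INVALID ? supported_schemes[s].default_port : -1);
  bool unknown = (url_scheme ("gopher://example.com/") == SCHEME_INVALID);

  free (a);
  free (b);
  (void) c;
  (void) default_port;
  (void) unknown;
}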