📄 url.c
字号:
/* Used by main.c: detect URLs written using the "shorthand" URL forms
   popularized by Netscape and NcFTP.  HTTP shorthands look like this:

   www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
   www.foo.com[:port]            -> http://www.foo.com[:port]

   FTP shorthands look like this:

   foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
   foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file

   If the URL needs not or cannot be rewritten, return NULL.

   URL is the NUL-terminated string to examine; it is not modified.
   When a rewrite happens the result is a freshly xmalloc'ed string
   that the caller owns (presumably released with xfree -- confirm
   against the callers).  NULL is returned when URL already has a
   scheme that url_scheme recognizes, when it begins with ':' or '/',
   or when it contains "://" after an unrecognized scheme name.  */

char *
rewrite_shorthand_url (const char *url)
{
  const char *p;

  if (url_scheme (url) != SCHEME_INVALID)
    return NULL;

  /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
     latter Netscape.  If neither is found, P ends up at the
     terminating NUL and the URL is treated as an HTTP shorthand.  */
  for (p = url; *p && *p != ':' && *p != '/'; p++)
    ;

  /* An empty host part (URL is empty or starts with ':' or '/') is
     not a shorthand.  */
  if (p == url)
    return NULL;

  /* If we're looking at "://", it means the URL uses a scheme we
     don't support, which may include "https" when compiled without
     SSL support.  Don't bogusly rewrite such URLs.  */
  if (p[0] == ':' && p[1] == '/' && p[2] == '/')
    return NULL;

  if (*p == ':')
    {
      const char *pp;
      char *res;
      /* If the characters after the colon and before the next slash
         or end of string are all digits, it's HTTP.  */
      int digits = 0;
      for (pp = p + 1; ISDIGIT (*pp); pp++)
        ++digits;
      if (digits > 0 && (*pp == '/' || *pp == '\0'))
        /* "host:port[/...]" -- jumps into the else branch below.  */
        goto http;

      /* Prepend "ftp://" to the entire URL...  (6 is the length of
         "ftp://", the extra 1 is for the terminating NUL.)  */
      res = xmalloc (6 + strlen (url) + 1);
      sprintf (res, "ftp://%s", url);
      /* ...and replace ':' with '/'.  The colon sits at offset
         P - URL in the original, hence 6 bytes further in RES.  */
      res[6 + (p - url)] = '/';
      return res;
    }
  else
    {
      char *res;
    http:
      /* Just prepend "http://" to what we have.  (7 is the length of
         "http://", the extra 1 is for the terminating NUL.)  */
      res = xmalloc (7 + strlen (url) + 1);
      sprintf (res, "http://%s", url);
      return res;
    }
}

/* Forward declaration: split_path is defined further down but is
   called from url_parse.  */
static void split_path PARAMS ((const char *, char **, char **));

/* Like strpbrk, with the exception that it returns the pointer to the
   terminating zero (end-of-string aka "eos") if no matching character
   is found.

   Although I normally balk at Gcc-specific optimizations, it probably
   makes sense here: glibc has optimizations that detect strpbrk being
   called with literal string as ACCEPT and inline the search.  That
   optimization is defeated if strpbrk is hidden within the call to
   another function.  (And no, making strpbrk_or_eos inline doesn't
   help because the check for literal accept is in the
   preprocessor.)

   Note that the macro form evaluates S twice when nothing matches
   (once for strpbrk, once for strchr), so S should be free of side
   effects.  */

#if defined(__GNUC__) && __GNUC__ >= 3

#define strpbrk_or_eos(s, accept) ({            \
  char *SOE_p = strpbrk (s, accept);            \
  if (!SOE_p)                                   \
    SOE_p = strchr (s, '\0');                   \
  SOE_p;                                        \
})

#else  /* not __GNUC__ or old gcc */

static inline char *
strpbrk_or_eos (const char *s, const char *accept)
{
  char *p = strpbrk (s, accept);
  if (!p)
    p = strchr (s, '\0');
  return p;
}
#endif /* not __GNUC__ or old gcc */

/* Turn STR into lowercase; return non-zero if a character was
   actually changed.  STR is modified in place.  */

static int
lowercase_str (char *str)
{
  int change = 0;
  for (; *str; str++)
    if (ISUPPER (*str))
      {
        change = 1;
        *str = TOLOWER (*str);
      }
  return change;
}

/* Messages for the error codes reported by url_parse.  Each PE_*
   constant is defined right next to its message so that the constant
   equals the message's index in the array; url_error relies on that
   when it indexes the table.  Keep the two in step when adding
   entries.  */

static const char *parse_errors[] = {
#define PE_NO_ERROR                     0
  N_("No error"),
#define PE_UNSUPPORTED_SCHEME           1
  N_("Unsupported scheme"),
#define PE_INVALID_HOST_NAME            2
  N_("Invalid host name"),
#define PE_BAD_PORT_NUMBER              3
  N_("Bad port number"),
#define PE_INVALID_USER_NAME            4
  N_("Invalid user name"),
#define PE_UNTERMINATED_IPV6_ADDRESS    5
  N_("Unterminated IPv6 numeric address"),
#define PE_IPV6_NOT_SUPPORTED           6
  N_("IPv6 addresses not supported"),
#define PE_INVALID_IPV6_ADDRESS         7
  N_("Invalid IPv6 numeric address")
};

/* Parse a URL.

   Return a new struct url if successful, NULL on error.  In case of
   error, and if ERROR is not NULL, also set *ERROR to the appropriate
   error code (one of the PE_* values above; pass it to url_error to
   obtain a message).  *ERROR is left untouched on success.

   URL itself is never modified or freed.  The function works on the
   string returned by reencode_escapes, which apparently may be URL
   itself (the code compares the two pointers before freeing or
   adopting it).  */

struct url *
url_parse (const char *url, int *error)
{
  struct url *u;
  const char *p;
  int path_modified, host_modified;

  enum url_scheme scheme;

  /* Each X_b/X_e pair delimits component X inside URL_ENCODED as a
     half-open range [X_b, X_e).  */
  const char *uname_b,     *uname_e;
  const char *host_b,      *host_e;
  const char *path_b,      *path_e;
  const char *params_b,    *params_e;
  const char *query_b,     *query_e;
  const char *fragment_b,  *fragment_e;

  int port;
  char *user = NULL, *passwd = NULL;

  char *url_encoded = NULL;

  /* Always assigned before any "goto err".  */
  int error_code;

  scheme = url_scheme (url);
  if (scheme == SCHEME_INVALID)
    {
      error_code = PE_UNSUPPORTED_SCHEME;
      goto err;
    }

  url_encoded = reencode_escapes (url);
  p = url_encoded;

  /* Skip the scheme prefix, then the optional credentials.  */
  p += strlen (supported_schemes[scheme].leading_string);
  uname_b = p;
  p = url_skip_credentials (p);
  uname_e = p;

  /* scheme://user:pass@host[:port]... */
  /*                    ^              */

  /* We attempt to break down the URL into the components path,
     params, query, and fragment.  They are ordered like this:

       scheme://host[:port][/path][;params][?query][#fragment]  */

  params_b   = params_e   = NULL;
  query_b    = query_e    = NULL;
  fragment_b = fragment_e = NULL;

  host_b = p;

  if (*p == '[')
    {
      /* Handle IPv6 address inside square brackets.  Ideally we'd
         just look for the terminating ']', but rfc2732 mandates
         rejecting invalid IPv6 addresses.  */

      /* The address begins after '['. */
      host_b = p + 1;
      host_e = strchr (host_b, ']');

      if (!host_e)
        {
          error_code = PE_UNTERMINATED_IPV6_ADDRESS;
          goto err;
        }

#ifdef ENABLE_IPV6
      /* Check if the IPv6 address is valid. */
      if (!is_valid_ipv6_address(host_b, host_e))
        {
          error_code = PE_INVALID_IPV6_ADDRESS;
          goto err;
        }

      /* Continue parsing after the closing ']'. */
      p = host_e + 1;
#else
      /* Without ENABLE_IPV6 every bracketed host is rejected, so the
         remainder of this branch is unreachable in that build.  */
      error_code = PE_IPV6_NOT_SUPPORTED;
      goto err;
#endif

      /* The closing bracket must be followed by a separator or by the
         null char.  (strchr also matches the terminating NUL of the
         separator string, which is what lets end-of-string through.)  */
      /* http://[::1]... */
      /*             ^   */
      if (!strchr (":/;?#", *p))
        {
          /* Trailing garbage after []-delimited IPv6 address. */
          error_code = PE_INVALID_HOST_NAME;
          goto err;
        }
    }
  else
    {
      p = strpbrk_or_eos (p, ":/;?#");
      host_e = p;
    }

  /* An empty host name is an error.  */
  if (host_b == host_e)
    {
      error_code = PE_INVALID_HOST_NAME;
      goto err;
    }

  /* Start from the scheme's default port; an explicit non-empty
     ":port" overrides it below.  */
  port = scheme_default_port (scheme);
  if (*p == ':')
    {
      const char *port_b, *port_e, *pp;

      /* scheme://host:port/tralala */
      /*              ^             */
      ++p;
      port_b = p;
      p = strpbrk_or_eos (p, "/;?#");
      port_e = p;

      /* Allow empty port, as per rfc2396. */
      if (port_b != port_e)
        {
          for (port = 0, pp = port_b; pp < port_e; pp++)
            {
              if (!ISDIGIT (*pp))
                {
                  /* http://host:12randomgarbage/blah */
                  /*               ^                  */
                  error_code = PE_BAD_PORT_NUMBER;
                  goto err;
                }
              port = 10 * port + (*pp - '0');
              /* Check for too large port numbers here, before we have
                 a chance to overflow on bogus port values.  */
              if (port > 65535)
                {
                  error_code = PE_BAD_PORT_NUMBER;
                  goto err;
                }
            }
        }
    }

  if (*p == '/')
    {
      /* The leading slash is a separator and is not stored as part of
         the path.  */
      ++p;
      path_b = p;
      p = strpbrk_or_eos (p, ";?#");
      path_e = p;
    }
  else
    {
      /* Path is not allowed not to exist.  Record an empty one; the
         PATH_B == PATH_E test further down then forces u->url to be
         rebuilt with url_string.  */
      path_b = path_e = p;
    }

  if (*p == ';')
    {
      ++p;
      params_b = p;
      p = strpbrk_or_eos (p, "?#");
      params_e = p;
    }
  if (*p == '?')
    {
      ++p;
      query_b = p;
      p = strpbrk_or_eos (p, "#");
      query_e = p;

      /* Hack that allows users to use '?' (a wildcard character) in
         FTP URLs without it being interpreted as a query string
         delimiter.  The path is simply extended up to the fragment
         (or end of string) and no query is recorded.  */
      if (scheme == SCHEME_FTP)
        {
          query_b = query_e = NULL;
          path_e = p;
        }
    }
  if (*p == '#')
    {
      ++p;
      fragment_b = p;
      p += strlen (p);
      fragment_e = p;
    }
  /* Every branch above must have consumed the whole string.  */
  assert (*p == 0);

  if (uname_b != uname_e)
    {
      /* http://user:pass@host */
      /*        ^         ^    */
      /*     uname_b   uname_e */
      /* UNAME_E - 1 excludes the '@' that terminates the credentials.  */
      if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd))
        {
          error_code = PE_INVALID_USER_NAME;
          goto err;
        }
    }

  /* No error exits past this point: build the result.  */
  u = xnew0 (struct url);
  u->scheme = scheme;
  u->host   = strdupdelim (host_b, host_e);
  u->port   = port;
  u->user   = user;
  u->passwd = passwd;

  u->path = strdupdelim (path_b, path_e);
  path_modified = path_simplify (u->path);
  split_path (u->path, &u->dir, &u->file);

  host_modified = lowercase_str (u->host);

  /* Decode %HH sequences in host name.  This is important not so much
     to support %HH sequences in host names (which other browser
     don't), but to support binary characters (which will have been
     converted to %HH by reencode_escapes).  */
  if (strchr (u->host, '%'))
    {
      url_unescape (u->host);
      host_modified = 1;
    }

  if (params_b)
    u->params = strdupdelim (params_b, params_e);
  if (query_b)
    u->query = strdupdelim (query_b, query_e);
  if (fragment_b)
    u->fragment = strdupdelim (fragment_b, fragment_e);

  if (path_modified || u->fragment || host_modified || path_b == path_e)
    {
      /* If we suspect that a transformation has rendered what
         url_string might return different from URL_ENCODED, rebuild
         u->url using url_string.  */
      u->url = url_string (u, 0);

      /* URL_ENCODED is no longer needed; free it unless it is the
         caller's own string.  */
      if (url_encoded != url)
        xfree ((char *) url_encoded);
    }
  else
    {
      /* Otherwise reuse the encoded string: copy it when it is the
         caller's string, or hand ownership over to U when it was
         allocated here.  */
      if (url_encoded == url)
        u->url = xstrdup (url);
      else
        u->url = url_encoded;
    }

  return u;

 err:
  /* Cleanup in case of error: */
  if (url_encoded && url_encoded != url)
    xfree (url_encoded);

  /* Transmit the error code to the caller, if the caller wants to
     know.  */
  if (error)
    *error = error_code;
  return NULL;
}

/* Return the error message string from ERROR_CODE, which should have
   been retrieved from url_parse.  The error message is translated.

   ERROR_CODE must be a valid index into parse_errors; this is
   enforced only by an assertion.  */

const char *
url_error (int error_code)
{
  assert (error_code >= 0 && error_code < countof (parse_errors));
  return _(parse_errors[error_code]);
}

/* Split PATH into DIR and FILE.  PATH comes from the URL and is
   expected to be URL-escaped.

   The path is split into directory (the part up to the last slash)
   and file (the part after the last slash), which are subsequently
   unescaped.  Examples:

   PATH                 DIR           FILE
   "foo/bar/baz"        "foo/bar"     "baz"
   "foo/bar/"           "foo/bar"     ""
   "foo"                ""            "foo"
   "foo/bar/baz%2fqux"  "foo/bar"     "baz/qux" (!)

   DIR and FILE are freshly allocated.  PATH itself is not
   modified.  */

static void
split_path (const char *path, char **dir, char **file)
{
  char *last_slash = strrchr (path, '/');
  if (!last_slash)
    {
      /* No slash at all: everything is the file name.  */
      *dir = xstrdup ("");
      *file = xstrdup (path);
    }
  else
    {
      /* The slash itself belongs to neither part.  */
      *dir = strdupdelim (path, last_slash);
      *file = xstrdup (last_slash + 1);
    }
  /* Unescaping happens after the split, which is why an escaped slash
     (%2f) in the last component ends up inside FILE.  */
  url_unescape (*dir);
  url_unescape (*file);
}

/* Note: URL's "full path" is the path with the query string and
   params appended.  The "fragment" (#foo) is intentionally ignored,
   but that might be changed.  For example, if the original URL was
   "http://host:port/foo/bar/baz;params?querystring#uselessfragment",
   the full path will be "/foo/bar/baz;params?querystring".  */

/* Return the length of the full path, without the terminating
   zero.  Each component that is present (non-NULL) contributes its
   own length plus one byte for the separator that full_path_write
   puts in front of it.  */

static int
full_path_length (const struct url *url)
{
  int len = 0;

#define FROB(el) if (url->el) len += 1 + strlen (url->el)

  FROB (path);
  FROB (params);
  FROB (query);

#undef FROB

  return len;
}

/* Write out the full path.  WHERE must have room for at least
   full_path_length (URL) bytes.  Each present component is written
   preceded by its separator ('/' for path, ';' for params, '?' for
   query).  No terminating zero is written; the caller adds it.  */

static void
full_path_write (const struct url *url, char *where)
{
#define FROB(el, chr) do {                      \
  char *f_el = url->el;                         \
  if (f_el) {                                   \
    int l = strlen (f_el);                      \
    *where++ = chr;                             \
    memcpy (where, f_el, l);                    \
    where += l;                                 \
  }                                             \
} while (0)

  FROB (path, '/');
  FROB (params, ';');
  FROB (query, '?');

#undef FROB
}

/* Public function for getting the "full path".  E.g. if u->path is
   "foo/bar" and u->query is "param=value", full_path will be
   "/foo/bar?param=value".

   The result is a freshly xmalloc'ed, zero-terminated string owned by
   the caller.  */

char *
url_full_path (const struct url *url)
{
  int length = full_path_length (url);
  char *full_path = (char *) xmalloc (length + 1);

  full_path_write (url, full_path);
  /* full_path_write does not terminate the string.  */
  full_path[length] = '\0';

  return full_path;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -