📄 uri.c
字号:
/* URL parser and translator; implementation of RFC 2396. *//* $Id: uri.c,v 1.304.2.4 2005/05/01 20:57:46 jonas Exp $ */#ifdef HAVE_CONFIG_H#include "config.h"#endif#include <ctype.h>#include <errno.h>#ifdef HAVE_IDNA_H#include <idna.h>#endif#include <stdio.h>#include <stdlib.h>#include <string.h>#include <sys/types.h>#ifdef HAVE_NETDB_H#include <netdb.h> /* OS/2 needs this after sys/types.h */#endif#ifdef HAVE_SYS_SOCKET_H#include <sys/socket.h>#endif#ifdef HAVE_NETINET_IN_H#include <netinet/in.h>#endif#ifdef HAVE_ARPA_INET_H#include <arpa/inet.h>#endif#include "elinks.h"#include "protocol/protocol.h"#include "protocol/uri.h"#include "util/conv.h"#include "util/error.h"#include "util/file.h"#include "util/hash.h"#include "util/memory.h"#include "util/object.h"#include "util/string.h"static inline intend_of_dir(unsigned char c){ return c == POST_CHAR || c == '#' || c == ';' || c == '?';}static inline intis_uri_dir_sep(struct uri *uri, unsigned char pos){ return (uri->protocol == PROTOCOL_FILE ? dir_sep(pos) : pos == '/');}intis_ip_address(unsigned char *address, int addresslen){ /* The @address has well defined limits so it would be a shame to * allocate it. */ unsigned char buffer[IP_ADDRESS_BUFFER_SIZE]; if (addresslen >= sizeof(buffer)) return 0; safe_strncpy(buffer, address, addresslen + 1);#ifdef HAVE_INET_PTON#ifdef CONFIG_IPV6 { struct sockaddr_in6 addr6; if (inet_pton(AF_INET6, buffer, &addr6.sin6_addr) > 0) return 1; }#endif /* CONFIG_IPV6 */ { struct in_addr addr4; if (inet_pton(AF_INET, buffer, &addr4) > 0) return 1; } return 0;#else /* FIXME: Is this ever the case? */ return 0;#endif /* HAVE_INET_PTON */}intend_with_known_tld(unsigned char *s, int slen){ int i; static const unsigned char *tld[] = { "com", "edu", "net", "org", "gov", "mil", "int", "biz", "arpa", "aero", "coop", "info", "museum", "name", "pro", NULL }; if (!slen) return -1; if (slen < 0) slen = strlen(s); for (i = 0; tld[i]; i++) { int tldlen = strlen(tld[i]); int pos = slen - tldlen; if (pos >= 0 && !strncasecmp(&s[pos], tld[i], tldlen)) return pos; } return -1;}static intcheck_uri_file(unsigned char *name){ /* Check POST_CHAR etc ... */ static const unsigned char chars[] = POST_CHAR_S "#?"; int i; if (file_exists(name)) return strlen(name); for (i = 0; i < sizeof(chars) - 1; i++) { unsigned char *pos = strchr(name, chars[i]); int namelen = -1; if (!pos) continue; *pos = 0; if (file_exists(name)) namelen = strlen(name); *pos = chars[i]; if (namelen >= 0) return namelen; } return -1;}/* Encodes URIs without encoding stuff like fragments and query separators. */static voidencode_file_uri_string(struct string *string, unsigned char *uristring){ int filenamelen = check_uri_file(uristring); unsigned char saved = 0; if (filenamelen >= 0) { saved = uristring[filenamelen]; uristring[filenamelen] = 0; } encode_uri_string(string, uristring, 0); if (filenamelen >= 0) { uristring[filenamelen] = saved; add_to_string(string, &uristring[filenamelen]); }}static inline intget_protocol_length(const unsigned char *url){ unsigned char *end = (unsigned char *) url; /* Seek the end of the protocol name if any. */ /* RFC1738: * scheme = 1*[ lowalpha | digit | "+" | "-" | "." ] * (but per its recommendations we accept "upalpha" too) */ while (isalnum(*end) || *end == '+' || *end == '-' || *end == '.') end++; /* Now we make something to support our "IP version in protocol scheme * name" hack and silently chop off the last digit if it's there. The * IETF's not gonna notice I hope or it'd be going after us hard. */ if (end != url && isdigit(end[-1])) end--; /* Also return 0 if there's no protocol name (@end == @url). */ return (*end == ':' || isdigit(*end)) ? end - url : 0;}enum uri_errnoparse_uri(struct uri *uri, unsigned char *uristring){ unsigned char *prefix_end, *host_end;#ifdef CONFIG_IPV6 unsigned char *lbracket, *rbracket;#endif assertm(uristring, "No uri to parse."); memset(uri, 0, sizeof(*uri)); /* Nothing to do for an empty url. */ if_assert_failed return 0; if (!*uristring) return URI_ERRNO_EMPTY; uri->string = uristring; uri->protocollen = get_protocol_length(uristring); /* Invalid */ if (!uri->protocollen) return URI_ERRNO_INVALID_PROTOCOL; /* Figure out whether the protocol is known */ uri->protocol = get_protocol(struri(uri), uri->protocollen); prefix_end = uristring + uri->protocollen; /* ':' */ /* Check if there's a digit after the protocol name. */ if (isdigit(*prefix_end)) { uri->ip_family = uristring[uri->protocollen] - '0'; prefix_end++; } if (*prefix_end != ':') return URI_ERRNO_INVALID_PROTOCOL; prefix_end++; /* Skip slashes */ if (prefix_end[0] == '/' && prefix_end[1] == '/') { if (prefix_end[2] == '/' && get_protocol_need_slash_after_host(uri->protocol)) return URI_ERRNO_TOO_MANY_SLASHES; prefix_end += 2; } else if (get_protocol_need_slashes(uri->protocol)) { return URI_ERRNO_NO_SLASHES; } if (get_protocol_free_syntax(uri->protocol)) { uri->data = prefix_end; uri->datalen = strlen(prefix_end); return URI_ERRNO_OK; } else if (uri->protocol == PROTOCOL_FILE) { int datalen = check_uri_file(prefix_end); /* Extract the fragment part. */ if (datalen >= 0 && prefix_end[datalen] == '#') { uri->fragment = prefix_end + datalen + 1; uri->fragmentlen = strlen(uri->fragment); } else { datalen = strlen(prefix_end); } uri->data = prefix_end; uri->datalen = datalen; return URI_ERRNO_OK; } /* Isolate host */#ifdef CONFIG_IPV6 /* Get brackets enclosing IPv6 address */ lbracket = strchr(prefix_end, '['); if (lbracket) { rbracket = strchr(lbracket, ']'); /* [address] is handled only inside of hostname part (surprisingly). */ if (rbracket && rbracket < prefix_end + strcspn(prefix_end, "/")) uri->ipv6 = 1; else lbracket = rbracket = NULL; } else { rbracket = NULL; }#endif /* Possibly skip auth part */ host_end = prefix_end + strcspn(prefix_end, "@"); if (prefix_end + strcspn(prefix_end, "/") > host_end && *host_end) { /* we have auth info here */ unsigned char *user_end; /* Allow '@' in the password component */ while (strcspn(host_end + 1, "@") < strcspn(host_end + 1, "/?")) host_end = host_end + 1 + strcspn(host_end + 1, "@"); user_end = strchr(prefix_end, ':'); if (!user_end || user_end > host_end) { uri->user = prefix_end; uri->userlen = host_end - prefix_end; } else { uri->user = prefix_end; uri->userlen = user_end - prefix_end; uri->password = user_end + 1; uri->passwordlen = host_end - user_end - 1; } prefix_end = host_end + 1; }#ifdef CONFIG_IPV6 if (uri->ipv6) host_end = rbracket + strcspn(rbracket, ":/?"); else#endif host_end = prefix_end + strcspn(prefix_end, ":/?");#ifdef CONFIG_IPV6 if (uri->ipv6) { int addrlen = rbracket - lbracket - 1; /* Check for valid length. * addrlen >= sizeof(hostbuf) is theorically impossible * but i keep the test in case of... Safer, imho --Zas */ assertm(addrlen >= 0 && addrlen < NI_MAXHOST, "parse_uri(): addrlen value is bad (%d) for URL '%s'. " "Problems are likely to be encountered. Please report " "this, it is a security bug!", addrlen, uristring); if_assert_failed return URI_ERRNO_IPV6_SECURITY; uri->host = lbracket + 1; uri->hostlen = addrlen; } else#endif { uri->host = prefix_end; uri->hostlen = host_end - prefix_end; /* Trim trailing '.'s */ if (uri->hostlen && uri->host[uri->hostlen - 1] == '.') return URI_ERRNO_TRAILING_DOTS; } if (*host_end == ':') { /* we have port here */ unsigned char *port_end = host_end + 1 + strcspn(host_end + 1, "/"); host_end++; uri->port = host_end; uri->portlen = port_end - host_end; if (uri->portlen == 0) return URI_ERRNO_NO_PORT_COLON; /* We only use 8 bits for portlen so better check */ if (uri->portlen != port_end - host_end) return URI_ERRNO_INVALID_PORT; /* test if port is number */ /* TODO: possibly lookup for the service otherwise? --pasky */ for (; host_end < port_end; host_end++) if (!isdigit(*host_end)) return URI_ERRNO_INVALID_PORT; /* Check valid port value, and let show an error message * about invalid url syntax. */ if (uri->port && uri->portlen) { int n; errno = 0; n = strtol(uri->port, NULL, 10); if (errno || !uri_port_is_valid(n)) return URI_ERRNO_INVALID_PORT; } } if (*host_end == '/') { host_end++; } else if (get_protocol_need_slash_after_host(uri->protocol)) { /* The need for slash after the host component depends on the * need for a host component. -- The dangerous mind of Jonah */ if (!uri->hostlen) return URI_ERRNO_NO_HOST; return URI_ERRNO_NO_HOST_SLASH; } /* Look for #fragment or POST_CHAR */ prefix_end = host_end + strcspn(host_end, "#" POST_CHAR_S); uri->data = host_end; uri->datalen = prefix_end - host_end; if (*prefix_end == '#') { uri->fragment = prefix_end + 1; uri->fragmentlen = strcspn(uri->fragment, POST_CHAR_S); prefix_end = uri->fragment + uri->fragmentlen; } if (*prefix_end == POST_CHAR) { uri->post = prefix_end + 1; } return URI_ERRNO_OK;}intget_uri_port(struct uri *uri){ if (uri->port && uri->portlen) { unsigned char *end = uri->port; int port = strtol(uri->port, (char **) &end, 10); if (end != uri->port) { assert(uri_port_is_valid(port)); return port; } } return get_protocol_port(uri->protocol);}#define can_compare_uri_components(comp) !(((comp) & (URI_SPECIAL | URI_IDN)))static inline intcompare_component(unsigned char *a, int alen, unsigned char *b, int blen){ /* Check that the length and the strings are both set or unset */ if (alen != blen || !!a != !!b) return 0; /* Both are unset so that will make a perfect match */ if (!a || !alen) return 1; /* Let the higher forces decide */ return !memcmp(a, b, blen);}#define wants(x) (components & (x))intcompare_uri(struct uri *a, struct uri *b, enum uri_component components){ if (a == b) return 1; if (!components) return 0; assertm(can_compare_uri_components(components), "compare_uri() is a work in progress. Component unsupported"); return (!wants(URI_PROTOCOL) || a->protocol == b->protocol) && (!wants(URI_IP_FAMILY) || a->ip_family == b->ip_family) && (!wants(URI_USER) || compare_component(a->user, a->userlen, b->user, b->userlen)) && (!wants(URI_PASSWORD) || compare_component(a->password, a->passwordlen, b->password, b->passwordlen)) && (!wants(URI_HOST) || compare_component(a->host, a->hostlen, b->host, b->hostlen)) && (!wants(URI_PORT) || compare_component(a->port, a->portlen, b->port, b->portlen)) && (!wants(URI_DATA) || compare_component(a->data, a->datalen, b->data, b->datalen)) && (!wants(URI_FRAGMENT) || compare_component(a->fragment, a->fragmentlen, b->fragment, b->fragmentlen)) && (!wants(URI_POST) || compare_component(a->post, a->post ? strlen(a->post) : 0, b->post, b->post ? strlen(b->post) : 0));}/* We might need something more intelligent than this Swiss army knife. */struct string *add_uri_to_string(struct string *string, struct uri *uri, enum uri_component components){ /* Custom or unknown keep the URI untouched. */ if (uri->protocol == PROTOCOL_UNKNOWN) return add_to_string(string, struri(uri)); if (wants(URI_PROTOCOL)) { add_bytes_to_string(string, uri->string, uri->protocollen); if (wants(URI_IP_FAMILY) && uri->ip_family) add_long_to_string(string, uri->ip_family); add_char_to_string(string, ':'); if (get_protocol_need_slashes(uri->protocol)) add_to_string(string, "//"); } if (wants(URI_USER) && uri->userlen) { add_bytes_to_string(string, uri->user, uri->userlen); if (wants(URI_PASSWORD) && uri->passwordlen) { add_char_to_string(string, ':'); add_bytes_to_string(string, uri->password, uri->passwordlen); } add_char_to_string(string, '@'); } if (wants(URI_HOST) && uri->hostlen) { int add_host = 1;#ifdef CONFIG_IPV6 /* Rationale for wants(URI_PORT): The [notation] was invented * so that you can have an IPv6 addy and a port together. So * we want to use it when that happens, otherwise we need not * bother (that happens only when we want it for DNS anyway). * I insist on an implied elegancy of this way, but YMMV. ;-) * --pasky */ if (uri->ipv6 && wants(URI_PORT)) add_char_to_string(string, '[');#endif#ifdef CONFIG_IDN /* Support for the GNU International Domain Name library. * * http://www.gnu.org/software/libidn/manual/html_node/IDNA-Functions.html * * Now it is probably not perfect because idna_to_ascii_lz() * will be using a ``zero terminated input string encoded in * the current locale's character set''. Anyway I don't know * how to convert anything to UTF-8 or Unicode. --jonas */ if (wants(URI_IDN)) { unsigned char *host = memacpy(uri->host, uri->hostlen); if (host) { char *idname; int code = idna_to_ascii_lz(host, &idname, 0); /* FIXME: Return NULL if it coughed? --jonas */ if (code == IDNA_SUCCESS) { add_to_string(string, idname); free(idname); add_host = 0;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -