📄 url.c
字号:
/* URL handling. Copyright (C) 1995, 1996, 1997 Free Software Foundation, Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */#ifdef HAVE_CONFIG_H# include <config.h>#endif /* * HAVE_CONFIG_H */#include <stdio.h>#include <stdlib.h>#include <string.h>#include <ctype.h>#include <sys/types.h>#include <unistd.h>#include <errno.h>#include <assert.h>#include "main.h"#include "url.h"#include "misc.h"/* Is X "."? */#define DOTP(x) ((*(x) == '.') && (!*(x + 1)))/* Is X ".."? */#define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))char *protostrings[] = { "cid:", "clsid:", "file:", "finger:", "ftp:", "gopher:", "hdl:", "http:", "https:", "ilu:", "ior:", "irc:", "java:", "javascript:", "lifn:", "mailto:", "mid:", "news:", "nntp:", "path:", "prospero:", "rlogin:", "service:", "shttp:", "snews:", "stanf:", "telnet:", "tn3270:", "wais:", "whois++:", NULL};/* TODO remove this stupid things... *//* Similar to former, but for supported protocols: */proto_t sup_protos[] = { {"http://", URLHTTP, DEFAULT_HTTP_PORT}, {"ftp://", URLFTP, DEFAULT_FTP_PORT} /* { "file://", URLFILE, DEFAULT_FTP_PORT } */};/* Support for encoding and decoding of URL strings. We determine whether a character is unsafe through table lookup. This code assumes ASCII character set and 8-bit chars. */enum { urlchr_reserved = 1, urlchr_unsafe = 2};#define R urlchr_reserved#define U urlchr_unsafe#define RU R|U#define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))/* rfc1738 reserved chars. We don't use this yet; preservation of reserved chars will be implemented when I integrate the new `reencode_string' function. */#define RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)/* Unsafe chars: - anything <= 32; - stuff from rfc1738 ("<>\"#%{}|\\^~[]`"); - '@' and ':'; needed for encoding URL username and password. - anything >= 127. */#define UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)/* Convert the ASCII character X to a hex-digit. X should be between '0' and '9', or between 'A' and 'F', or between 'a' and 'f'. The result is a number between 0 and 15. If X is not a hexadecimal digit character, the result is undefined. */#define XCHAR_TO_XDIGIT(x) \ (((x) >= '0' && (x) <= '9') ? \ ((x) - '0') : (toupper(x) - 'A' + 10))/* The reverse of the above: convert a HEX digit in the [0, 15] range to an ASCII character representing it. The A-F characters are always in upper case. */#define XDIGIT_TO_XCHAR(x) (((x) < 10) ? ((x) + '0') : ((x) - 10 + 'A'))#define ARRAY_SIZE(array) (sizeof (array) / sizeof (*(array)))static const unsigned char urlchr_table[256] = { U, U, U, U, U, U, U, U, /* NUL SOH STX ETX EOT ENQ ACK BEL */ U, U, U, U, U, U, U, U, /* BS HT LF VT FF CR SO SI */ U, U, U, U, U, U, U, U, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */ U, U, U, U, U, U, U, U, /* CAN EM SUB ESC FS GS RS US */ U, 0, U, U, 0, U, R, 0, /* SP ! " # $ % & ' */ 0, 0, 0, R, 0, 0, 0, R, /* ( ) * + , - . / */ 0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */ 0, 0, U, R, U, R, U, R, /* 8 9 : ; < = > ? */ RU, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */ 0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */ 0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */ 0, 0, 0, U, U, U, U, 0, /* X Y Z [ \ ] ^ _ */ U, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */ 0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */ 0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */ 0, 0, 0, U, U, U, U, U, /* x y z { | } ~ DEL */ U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,};/* Returns 1 if the URL begins with a protocol (supported or unsupported), 0 otherwise. */int has_proto(const char *url){ char **s; for (s = protostrings; *s; s++) if (strncasecmp(url, *s, strlen(*s)) == 0) return 1; return 0;}/* Skip the username and password, if present here. The function should be called *not* with the complete URL, but with the part right after the protocol. If no username and password are found, return 0. */int skip_uname(const char *url){ const char *p; const char *q = NULL; for (p = url; *p && *p != '/'; p++) if (*p == '@') q = p; /* If a `@' was found before the first occurrence of `/', skip it. */ if (q != NULL) return q - url + 1; else return 0;}/* Decodes the forms %xy in a URL to the character the hexadecimal code of which is xy. xy are hexadecimal digits from [0123456789ABCDEF] (case-insensitive). If x or y are not hex-digits or `%' precedes `\0', the sequence is inserted literally. */void decode_string(char *s){ char *t = s; /* t - tortoise */ char *h = s; /* h - hare */ for (; *h; h++, t++) { if (*h != '%') { copychar: *t = *h; } else { /* Do nothing if '%' is not followed by two hex digits. */ if (!*(h + 1) || !*(h + 2) || !(isxdigit(*(h + 1)) && isxdigit(*(h + 2)))) goto copychar; *t = (XCHAR_TO_XDIGIT(*(h + 1)) << 4) + XCHAR_TO_XDIGIT(*(h + 2)); h += 2; } } *t = '\0';}/* Like encode_string, but return S if there are no unsafe chars. */char *encode_string_maybe(const char *s){ const char *p1; char *p2, *newstr; int newlen; int addition = 0; for (p1 = s; *p1; p1++) if (UNSAFE_CHAR(*p1)) addition += 2; /* Two more characters (hex digits) */ if (!addition) return (char *) s; newlen = (p1 - s) + addition; newstr = kmalloc(newlen + 1); p1 = s; p2 = newstr; while (*p1) { if (UNSAFE_CHAR(*p1)) { const unsigned char c = *p1++; *p2++ = '%'; *p2++ = XDIGIT_TO_XCHAR(c >> 4); *p2++ = XDIGIT_TO_XCHAR(c & 0xf); } else *p2++ = *p1++; } *p2 = '\0'; assert(p2 - newstr == newlen); return newstr;}/* Encode the unsafe characters (as determined by UNSAFE_CHAR) in a given string, returning a malloc-ed %XX encoded string. */char *encode_string(const char *s){ char *encoded = encode_string_maybe(s); if (encoded != s) return encoded; else return kstrdup(s);}/* Encode unsafe characters in PTR to %xx. If such encoding is done, the old value of PTR is freed and PTR is made to point to the newly allocated storage. */#define ENCODE(ptr) do { \ char *e_new = encode_string_maybe (ptr); \ if (e_new != ptr) \ { \ kfree (ptr); \ ptr = e_new; \ } \} while (0)/* Returns the protocol type if URL's protocol is supported, or URLUNKNOWN if not. */uerr_t urlproto(const char *url){ int i; for (i = 0; i < ARRAY_SIZE(sup_protos); i++) if (!strncasecmp(url, sup_protos[i].name, strlen(sup_protos[i].name))) return sup_protos[i].ind; for (i = 0; url[i] && url[i] != ':' && url[i] != '/'; i++); if (url[i] == ':') { for (++i; url[i] && url[i] != '/'; i++) if (!isdigit(url[i])) return URLBADPORT; if (url[i - 1] == ':') return URLFTP; else return URLHTTP; } else return URLHTTP;}/* If PATH ends with `;type=X', return the character X. */char process_ftp_type(char *path){ int len = strlen(path); if (len >= 7 && !memcmp(path + len - 7, ";type=", 6)) { path[len - 7] = '\0'; return path[len - 1]; } else return '\0';}/* Canonicalize PATH, and return a new path. The new path differs from PATH in that: Multple `/'s are collapsed to a single `/'. Leading `./'s and trailing `/.'s are removed. Trailing `/'s are removed. Non-leading `../'s and trailing `..'s are handled by removing portions of the path. E.g. "a/b/c/./../d/.." will yield "a/b". This function originates from GNU Bash. Changes for Wget: Always use '/' as stub_char. Don't check for local things using canon_stat. Change the original string instead of strdup-ing. React correctly when beginning with `./' and `../'. */void path_simplify(char *path){ register int i, start, ddot; char stub_char; if (!*path) return; /*stub_char = (*path == '/') ? '/' : '.'; */ stub_char = '/'; /* Addition: Remove all `./'-s preceding the string. If `../'-s precede, put `/' in front and remove them too. */ i = 0; ddot = 0; while (1) { if (path[i] == '.' && path[i + 1] == '/') i += 2; else if (path[i] == '.' && path[i + 1] == '.' && path[i + 2] == '/') { i += 3; ddot = 1; } else break; } if (i) strcpy(path, path + i - ddot); /* Replace single `.' or `..' with `/'. */ if ((path[0] == '.' && path[1] == '\0') || (path[0] == '.' && path[1] == '.' && path[2] == '\0')) { path[0] = stub_char; path[1] = '\0'; return; } /* Walk along PATH looking for things to compact. */ i = 0; while (1) { if (!path[i]) break; while (path[i] && path[i] != '/') i++; start = i++; /* If we didn't find any slashes, then there is nothing left to do. */ if (!path[start]) break; /* Handle multiple `/'s in a row. */ while (path[i] == '/') i++; if ((start + 1) != i) { strcpy(path + start + 1, path + i); i = start + 1; } /* Check for trailing `/'. */ if (start && !path[i]) { zero_last: path[--i] = '\0'; break; } /* Check for `../', `./' or trailing `.' by itself. */ if (path[i] == '.') { /* Handle trailing `.' by itself. */ if (!path[i + 1]) goto zero_last; /* Handle `./'. */ if (path[i + 1] == '/') { strcpy(path + i, path + i + 1); i = (start < 0) ? 0 : start; continue; } /* Handle `../' or trailing `..' by itself. */ if (path[i + 1] == '.' && (path[i + 2] == '/' || !path[i + 2])) { while (--start > -1 && path[start] != '/'); strcpy(path + start + 1, path + i + 2); i = (start < 0) ? 0 : start; continue; } } /* path == '.' */ } /* while */ if (!*path) { *path = stub_char; path[1] = '\0'; }}/* Special versions of DOTP and DDOTP for parse_dir(). They work like DOTP and DDOTP, but they also recognize `?' as end-of-string delimiter. This is needed for correct handling of query strings. */#define PD_DOTP(x) ((*(x) == '.') && (!*((x) + 1) || *((x) + 1) == '?'))#define PD_DDOTP(x) ((*(x) == '.') && (*(x) == '.') \ && (!*((x) + 2) || *((x) + 2) == '?'))/* Like strlen(), but allow the URL to be ended with '?'. */int urlpath_length(const char *url){ const char *q = strchr(url, '?'); if (q) return q - url; return strlen(url);}/* Build the directory and filename components of the path. Both components are *separately* malloc-ed strings! It does not change the contents of path. If the path ends with "." or "..", they are (correctly) counted as directories. */void parse_dir(const char *path, char **dir, char **file){ int i, l; l = urlpath_length(path); for (i = l; i && path[i] != '/'; i--); if (!i && *path != '/') /* Just filename */ { if (PD_DOTP(path) || PD_DDOTP(path)) { *dir = strdupdelim(path, path + l); *file = kstrdup(path + l); /* normally empty, but could contain ?... */ } else { *dir = kstrdup(""); /* This is required because of FTP */ *file = kstrdup(path); } } else if (!i) /* /filename */ { if (PD_DOTP(path + 1) || PD_DDOTP(path + 1)) { *dir = strdupdelim(path, path + l); *file = kstrdup(path + l); /* normally empty, but could contain ?... */ } else { *dir = kstrdup("/"); *file = kstrdup(path + 1); } } else /* Nonempty directory with or without a filename */ { if (PD_DOTP(path + i + 1) || PD_DDOTP(path + i + 1)) { *dir = strdupdelim(path, path + l); *file = kstrdup(path + l); /* normally empty, but could contain ?... */ } else { *dir = strdupdelim(path, path + i); *file = kstrdup(path + i + 1); } }}/* Skip the protocol part of the URL, e.g. `http://'. If no protocol part is found, returns 0. */int skip_proto(const char *url){ char **s; int l; for (s = protostrings; *s; s++) if (!strncasecmp(*s, url, strlen(*s))) break; if (!*s) return 0; l = strlen(*s); /* HTTP and FTP protocols are expected to yield exact host names (i.e. the `//' part must be skipped, too). */ if (!strcmp(*s, "http:") || !strcmp(*s, "ftp:")) l += 2; return l;}/* Find the optional username and password within the URL, as per RFC1738. The returned user and passwd char pointers are malloc-ed. */static uerr_t parse_uname(const char *url, char **user, char **passwd){ int l; const char *p, *q, *col; char **where; *user = NULL; *passwd = NULL; /* Look for the end of the protocol string. */ l = skip_proto(url);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -