📄 url.c
字号:
/* URL handling.
   Copyright (C) 1995, 1996, 1997 Free Software Foundation, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */

/* $Id: url.c,v 1.23 2001/10/27 11:24:40 kalum Exp $ */

#include "common.h"
#include "prozilla.h"
#include "url.h"
#include "misc.h"

/* Is X "."?  X is a pointer to a NUL-terminated string; it is
   evaluated more than once, so it must not have side effects. */
#define DOTP(x) ((*(x) == '.') && (!*((x) + 1)))

/* Is X ".."?  Same evaluation caveat as DOTP. */
#define DDOTP(x) ((*(x) == '.') && (*((x) + 1) == '.') && (!*((x) + 2)))

/* NULL-terminated list of strings to be recognized as protocol
   prefixes (URL schemes). Note that recognized doesn't mean
   supported -- only HTTP and FTP are supported for now. However, a
   string that does not match anything in the list will be considered
   a relative URL. Thus it's important that this list has anything
   anyone could think of being legal.

   There are wild things here. :-) Take a look at
   <URL:http://www.w3.org/pub/WWW/Addressing/schemes.html> to see more
   fun. */
char *protostrings[] = {
    "cid:",
    "clsid:",
    "file:",
    "finger:",
    "ftp:",
    "gopher:",
    "hdl:",
    "http:",
    "https:",
    "ilu:",
    "ior:",
    "irc:",
    "java:",
    "javascript:",
    "lifn:",
    "mailto:",
    "mid:",
    "news:",
    "nntp:",
    "path:",
    "prospero:",
    "rlogin:",
    "service:",
    "shttp:",
    "snews:",
    "stanf:",
    "telnet:",
    "tn3270:",
    "wais:",
    "whois++:",
    NULL
};

/* TODO remove this stupid things...
*//* Similar to former, but for supported protocols: */proto_t sup_protos[] = { {"http://", URLHTTP, DEFAULT_HTTP_PORT}, {"ftp://", URLFTP, DEFAULT_FTP_PORT} /* { "file://", URLFILE, DEFAULT_FTP_PORT } */};/* Support for encoding and decoding of URL strings. We determine whether a character is unsafe through table lookup. This code assumes ASCII character set and 8-bit chars. */enum { urlchr_reserved = 1, urlchr_unsafe = 2};#define R urlchr_reserved#define U urlchr_unsafe#define RU R|U#define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))/* rfc1738 reserved chars. We don't use this yet; preservation of reserved chars will be implemented when I integrate the new `reencode_string' function. */#define RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)/* Unsafe chars: - anything <= 32; - stuff from rfc1738 ("<>\"#%{}|\\^~[]`"); - '@' and ':'; needed for encoding URL username and password. - anything >= 127. */#define UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)/* Convert the ASCII character X to a hex-digit. X should be between '0' and '9', or between 'A' and 'F', or between 'a' and 'f'. The result is a number between 0 and 15. If X is not a hexadecimal digit character, the result is undefined. */#define XCHAR_TO_XDIGIT(x) \ (((x) >= '0' && (x) <= '9') ? \ ((x) - '0') : (toupper(x) - 'A' + 10))/* The reverse of the above: convert a HEX digit in the [0, 15] range to an ASCII character representing it. The A-F characters are always in upper case. */#define XDIGIT_TO_XCHAR(x) (((x) < 10) ? ((x) + '0') : ((x) - 10 + 'A'))#define ARRAY_SIZE(array) (sizeof (array) / sizeof (*(array)))const static unsigned char urlchr_table[256] = { U, U, U, U, U, U, U, U, /* NUL SOH STX ETX EOT ENQ ACK BEL */ U, U, U, U, U, U, U, U, /* BS HT LF VT FF CR SO SI */ U, U, U, U, U, U, U, U, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */ U, U, U, U, U, U, U, U, /* CAN EM SUB ESC FS GS RS US */ U, 0, U, U, 0, U, R, 0, /* SP ! " # $ % & ' */ 0, 0, 0, R, 0, 0, 0, R, /* ( ) * + , - . 
/ */ 0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */ 0, 0, U, R, U, R, U, R, /* 8 9 : ; < = > ? */ RU, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */ 0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */ 0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */ 0, 0, 0, U, U, U, U, 0, /* X Y Z [ \ ] ^ _ */ U, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */ 0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */ 0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */ 0, 0, 0, U, U, U, U, U, /* x y z { | } ~ DEL */ U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,};/* Returns 1 if the URL begins with a protocol (supported or unsupported), 0 otherwise. */int has_proto(const char *url){ char **s; for (s = protostrings; *s; s++) if (strncasecmp(url, *s, strlen(*s)) == 0) return 1; return 0;}/* Skip the username and password, if present here. The function should be called *not* with the complete URL, but with the part right after the protocol. If no username and password are found, return 0. */int skip_uname(const char *url){ const char *p; const char *q = NULL; for (p = url; *p && *p != '/'; p++) if (*p == '@') q = p; /* If a `@' was found before the first occurrence of `/', skip it. */ if (q != NULL) return q - url + 1; else return 0;}/* Decodes the forms %xy in a URL to the character the hexadecimal code of which is xy. xy are hexadecimal digits from [0123456789ABCDEF] (case-insensitive). If x or y are not hex-digits or `%' precedes `\0', the sequence is inserted literally. 
*/void decode_string(char *s){ char *t = s; /* t - tortoise */ char *h = s; /* h - hare */ for (; *h; h++, t++) { if (*h != '%')// if(1) { copychar: *t = *h; } else { /* Do nothing if '%' is not followed by two hex digits. */ if (!*(h + 1) || !*(h + 2) || !(isxdigit(*(h + 1)) && isxdigit(*(h + 2)))) goto copychar; *t = (XCHAR_TO_XDIGIT(*(h + 1)) << 4) + XCHAR_TO_XDIGIT(*(h + 2)); h += 2; } } *t = '\0';}/* Like encode_string, but return S if there are no unsafe chars. */char *encode_string_maybe(const char *s){ const char *p1; char *p2, *newstr; int newlen; int addition = 0; /*Changes Grendel: (*p1!='%') added */ for (p1 = s; *p1; p1++) if ((*p1!='%') && UNSAFE_CHAR(*p1)) addition += 2; /* Two more characters (hex digits) */ if (!addition) return (char *) s; newlen = (p1 - s) + addition; newstr = (char *) kmalloc(newlen + 1); p1 = s; p2 = newstr; while (*p1) { // if (UNSAFE_CHAR(*p1))if ((*p1!='%') && UNSAFE_CHAR(*p1))/* if(0)*/ { const unsigned char c = *p1++; *p2++ = '%'; *p2++ = XDIGIT_TO_XCHAR(c >> 4); *p2++ = XDIGIT_TO_XCHAR(c & 0xf); } else *p2++ = *p1++; } *p2 = '\0'; assert(p2 - newstr == newlen); return newstr;}/* Encode the unsafe characters (as determined by UNSAFE_CHAR) in a given string, returning a malloc-ed %XX encoded string. */char *encode_string(const char *s){ char *encoded = encode_string_maybe(s); if (encoded != s) return encoded; else return kstrdup(s);}/* Encode unsafe characters in PTR to %xx. If such encoding is done, the old value of PTR is freed and PTR is made to point to the newly allocated storage. */#define ENCODE(ptr) do { \ char *e_new = encode_string_maybe (ptr); \ if (e_new != ptr) \ { \ kfree (ptr); \ ptr = e_new; \ } \} while (0)/* Returns the protocol type if URL's protocol is supported, or URLUNKNOWN if not. 
*/uerr_t urlproto(const char *url){ int i; for (i = 0; i < ARRAY_SIZE(sup_protos); i++) if (!strncasecmp(url, sup_protos[i].name, strlen(sup_protos[i].name))) return sup_protos[i].ind; for (i = 0; url[i] && url[i] != ':' && url[i] != '/'; i++); if (url[i] == ':') { for (++i; url[i] && url[i] != '/'; i++) if (!isdigit(url[i])) return URLBADPORT; if (url[i - 1] == ':') return URLFTP; else return URLHTTP; } else return URLHTTP;}/* If PATH ends with `;type=X', return the character X. */char process_ftp_type(char *path){ int len = strlen(path); if (len >= 7 && !memcmp(path + len - 7, ";type=", 6)) { path[len - 7] = '\0'; return path[len - 1]; } else return '\0';}/* Canonicalize PATH, and return a new path. The new path differs from PATH in that: Multple `/'s are collapsed to a single `/'. Leading `./'s and trailing `/.'s are removed. Trailing `/'s are removed. Non-leading `../'s and trailing `..'s are handled by removing portions of the path. E.g. "a/b/c/./../d/.." will yield "a/b". This function originates from GNU Bash. Changes for Wget: Always use '/' as stub_char. Don't check for local things using canon_stat. Change the original string instead of strdup-ing. React correctly when beginning with `./' and `../'. */void path_simplify(char *path){ register int i, start, ddot; char stub_char; if (!*path) return; /*stub_char = (*path == '/') ? '/' : '.'; */ stub_char = '/'; /* Addition: Remove all `./'-s preceding the string. If `../'-s precede, put `/' in front and remove them too. */ i = 0; ddot = 0; while (1) { if (path[i] == '.' && path[i + 1] == '/') i += 2; else if (path[i] == '.' && path[i + 1] == '.' && path[i + 2] == '/') { i += 3; ddot = 1; } else break; } if (i) strcpy(path, path + i - ddot); /* Replace single `.' or `..' with `/'. */ if ((path[0] == '.' && path[1] == '\0') || (path[0] == '.' && path[1] == '.' && path[2] == '\0')) { path[0] = stub_char;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -