📄 url.c
字号:
/***************************************************************************/
/*    This code is part of WWW grabber called pavuk                        */
/*    Copyright (c) 1997 - 2001 Stefan Ondrejicka                          */
/*    Distributed under GPL 2 or later                                     */
/***************************************************************************/

#include "config.h"

#include <assert.h>
#include <ctype.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <limits.h>
#include <errno.h>

#include "gui.h"
#include "http.h"
#include "ftp.h"
#include "gopher.h"
#include "url.h"
#include "html.h"
#include "tools.h"
#include "authinfo.h"
#include "tr.h"
#include "dinfo.h"
#include "form.h"
#include "gui_api.h"
#include "lfname.h"

static char *url_decode_html(const char *, int);

/* here can you specify characters, */
/* which are unsafe in file names   */
#ifdef __CYGWIN__
#define FS_UNSAFE_CHARACTERS "\\:*?\"<>|"
#endif

/* for hexadecimal encoding */
static const char hexa[] = "0123456789ABCDEF";

/* Value (0..15) of a single hexadecimal digit character. */
/* The argument is evaluated more than once.              */
#define HEXASC2HEXNR(x) (((x) >= '0' && (x) <= '9') ? \
  ((x) - '0') : (tl_ascii_toupper(x) - 'A' + 10))

/* Value of the two hex digits at x[1] and x[2]; x[0] is skipped         */
/* (presumably x points at the '%' of a %XX escape - verify at callers). */
/* The whole expansion and the argument are parenthesized so the macro   */
/* stays correct when used inside a larger expression.                   */
#define HEX2CHAR(x) \
  ((HEXASC2HEXNR(*((x) + 1)) << 4) + HEXASC2HEXNR(*((x) + 2)))

/*
 * Table of known URL schemes.  This file reads the fields .id, .urlid
 * and .default_port, and indexes the table directly with a url's type
 * (prottable[type].default_port), so the entry order has to match the
 * numeric values of the protocol ids.
 * NOTE(review): the meaning of the remaining columns is declared in
 * url.h, which is not visible here - confirm before changing them.
 * NOTE(review): entries whose second field (urlid) is NULL can never be
 * returned by url_scheme_to_schemeid(); that includes URLT_FROMPARENT -
 * confirm this is intended.
 */
const protinfo prottable[] = {
  {URLT_UNKNOWN, NULL, "unknown", NULL, 0, FALSE},
  {URLT_HTTP, "http", "http", "http://", 80, TRUE},
#ifdef USE_SSL
  {URLT_HTTPS, "https", "https", "https://", 443, TRUE},
#else
  {URLT_HTTPS, "https", "https", "https://", 443, FALSE},
#endif
  {URLT_FTP, "ftp", "ftp", "ftp://", 21, TRUE},
#ifdef USE_SSL
  {URLT_FTPS, "ftps", "ftps", "ftps://", 21, TRUE},
#else
  {URLT_FTPS, "ftps", "ftps", "ftps://", 21, FALSE},
#endif
  {URLT_FILE, NULL, "file", "file://", 0, TRUE},
  {URLT_GOPHER, "gopher", "gopher", "gopher://", 70, TRUE},
  {URLT_FROMPARENT, NULL, "//", "//", 80, TRUE}
};

#define _STRCLS_LOWER "abcdefghijklmnopqrstuvwxyz"
#define _STRCLS_UPER  "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
#define _STRCLS_DIGIT "0123456789"

/*
 * Extract the scheme part of urlstr.
 *
 * When urlstr contains a ':' and starts with a letter, the text before
 * the first ':' is returned (passed through lowerstr()), provided it
 * consists only of letters, digits, '+', '-' and '.'; otherwise NULL.
 * When urlstr has no ':' or does not start with a letter, "//" is
 * returned if urlstr begins with "//", else NULL.
 *
 * The result is heap allocated; url_parse() releases it with _free().
 * urlstr must not be NULL.
 */
char *url_parse_scheme(char *urlstr)
{
  char *p;
  char *retv = NULL;

  if((p = strchr(urlstr, ':')) && tl_ascii_isalpha(*urlstr))
  {
    int l1 = strspn(urlstr, _STRCLS_LOWER _STRCLS_UPER _STRCLS_DIGIT "+-.");

    /* every character in front of the ':' must be a scheme character */
    if(l1 == (p - urlstr))
    {
      retv = tl_strndup(urlstr, l1);
      lowerstr(retv);
    }
  }
  else
  {
    if(urlstr[0] == '/' && urlstr[1] == '/')
      retv = strdup("//");
  }

  return retv;
}

/*
 * Extract the authority component from the part of an URL that follows
 * "scheme:".  If urlschpart starts with "//", a heap allocated copy of
 * everything up to (not including) the first '/', '?', '#' or ';' is
 * returned - possibly an empty string.  Otherwise NULL is returned.
 */
static char *url_parse_authority(char *urlschpart)
{
  char *retv = NULL;

  if(urlschpart[0] == '/' && urlschpart[1] == '/')
  {
    int l1 = strcspn(urlschpart + 2, "/?#;");

    retv = tl_strndup(urlschpart + 2, l1);
  }

  return retv;
}

/*
 * Split an authority string of the form [user[:password]@]host[:port].
 *
 * authority - string to split, must not be NULL
 * user      - receives the user name or NULL; may itself be NULL, in
 *             which case no user info is looked for and any "user@"
 *             text stays part of the host
 * password  - receives the password or NULL; may itself be NULL
 * host      - receives the host name, passed through lowerstr()
 * port      - receives the value _atoi() yields for the text after the
 *             last ':' of the host part, or 0 when there is no ':'
 *
 * All strings stored through the out parameters are newly allocated.
 * Always returns 0.
 */
static int url_split_authority(char *authority, char **user, char **password,
  char **host, unsigned short *port)
{
  char *p, *p2;

  if(user)
    *user = NULL;
  if(password)
    *password = NULL;
  *host = NULL;
  *port = 0;

  if(user && (p = strrchr(authority, '@')))
  {
    p2 = strchr(authority, ':');
    if(p2 && p2 < p)
    {
      *user = tl_strndup(authority, p2 - authority);
      /* password is optional for the caller, exactly like user */
      if(password)
        *password = tl_strndup(p2 + 1, p - p2 - 1);
    }
    else
    {
      *user = tl_strndup(authority, p - authority);
    }
    p++;                        /* host part starts behind the '@' */
  }
  else
    p = authority;

  if((p2 = strrchr(p, ':')))
  {
    *host = tl_strndup(p, p2 - p);
    *port = _atoi(p2 + 1);
  }
  else
  {
    *host = tl_strdup(p);
  }

  lowerstr(*host);

  return 0;
}

/*
 * Split the path part of an URL into path, query string and anchor.
 *
 * urlpath - text following the authority, must not be NULL
 * path    - receives the text in front of the first '?' / '#' delimiter
 *           that is looked for, or NULL when that text is empty
 * query   - receives the text after the first '?', passed through
 *           url_decode_html(); pass NULL to not look for a query at all
 * anchor  - receives the text after the first '#'; pass NULL to not
 *           look for an anchor at all
 *
 * When both delimiters occur, the component that comes first ends where
 * the second one starts.  A path beginning with '/' is run through
 * _strtrchr(path, '\\', '/') and get_abs_file_path().
 * NOTE(review): presumably these turn backslashes into slashes and
 * normalize the path into a new allocation - confirm in tools.c.
 *
 * Always returns 0.
 */
static int url_split_path(char *urlpath, char **path, char **query,
  char **anchor)
{
  char *p = NULL, *p1 = NULL, *p2 = NULL;

  *path = NULL;
  if(query)
    *query = NULL;
  if(anchor)
    *anchor = NULL;

  if(anchor)
    p1 = strchr(urlpath, '#');
  if(query)
    p2 = strchr(urlpath, '?');

  /* p ends up pointing at the first delimiter, i.e. at the end of path */
  if(p1 && p2)
  {
    if(p1 > p2)
    {
      /* path?query#anchor */
      *anchor = tl_strdup(p1 + 1);
      *query = url_decode_html(p2 + 1, p1 - (p2 + 1));
      p = p2;
    }
    else
    {
      /* path#anchor?query */
      *query = url_decode_html(p2 + 1, strlen(p2 + 1));
      *anchor = tl_strndup(p1 + 1, p2 - (p1 + 1));
      p = p1;
    }
  }
  else if(p1)
  {
    *anchor = tl_strdup(p1 + 1);
    p = p1;
  }
  else if(p2)
  {
    *query = url_decode_html(p2 + 1, strlen(p2 + 1));
    p = p2;
  }

  if(p)
  {
    if(p != urlpath)
      *path = tl_strndup(urlpath, p - urlpath);
  }
  else if(*urlpath)
  {
    *path = tl_strdup(urlpath);
  }

  /* absolute paths are normalized, relative ones are kept verbatim */
  if(*path && **path == '/')
  {
    p = *path;
    *path = get_abs_file_path(_strtrchr(p, '\\', '/'));
    free(p);
  }

  return 0;
}

/*
 * Map a scheme name (as returned by url_parse_scheme()) to its protocol
 * id.  Only prottable entries with a non-NULL urlid take part in the
 * case-sensitive comparison; URLT_UNKNOWN is returned when nothing
 * matches.  scheme must not be NULL.
 */
protocol url_scheme_to_schemeid(char *scheme)
{
  size_t i;

  for(i = 0; i < NUM_ELEM(prottable); i++)
  {
    if(prottable[i].urlid && !strcmp(prottable[i].urlid, scheme))
    {
      return prottable[i].id;
    }
  }

  return URLT_UNKNOWN;
}

/*
 * If a path is relative and starts // we need to get the type from
 * the parent, which only the caller can do.  This function is called
 * by the caller of url_parse when url_parse has returned type = URLT_FROMPARENT
 * and the parent can figure out the path.  It basically does all the work
 * that url_parse would do once it knew the scheme.
* however, we start with the urlstr in url->p.unsup.urlstr rather * than as an argument */static void url_finishpath(url * url){ char *authority = NULL; char *p; if(url->type == URLT_FROMPARENT) url->type = URLT_UNKNOWN; if(url->type == URLT_UNKNOWN) return; /* can't help here */ p = url->p.unsup.urlstr; authority = url_parse_authority(p); if(authority) p += strlen(authority) + 2; if(authority && *authority) { switch (url->type) { case URLT_FROMPARENT: break; case URLT_HTTP: case URLT_HTTPS: url_split_authority(authority, &(url->p.http.user), &(url->p.http.password), &(url->p.http.host), &(url->p.http.port)); if(!url->p.http.port) url->p.http.port = prottable[url->type].default_port; url_split_path(p, &(url->p.http.document), &(url->p.http.searchstr), &(url->p.http.anchor_name)); if(!url->p.http.document) url->p.http.document = tl_strdup("/"); break; case URLT_FTP: case URLT_FTPS: url_split_authority(authority, &(url->p.ftp.user), &(url->p.ftp.password), &(url->p.ftp.host), &(url->p.ftp.port)); if(!url->p.ftp.port) url->p.ftp.port = prottable[url->type].default_port; url_split_path(p, &url->p.ftp.path, NULL, &url->p.ftp.anchor_name); if(!url->p.ftp.path) url->p.ftp.path = tl_strdup("/"); if(p && p[0] == '/' && p[1] == '/') { char *pp = tl_str_concat(NULL, "/", url->p.ftp.path, NULL); _free(url->p.ftp.path); url->p.ftp.path = pp; } if((p = strrchr(url->p.ftp.path, ';')) && !strncasecmp(p, ";type=", 6)) *p = '\0'; url->p.ftp.dir = tl_is_dirname(url->p.ftp.path) != 0; break; case URLT_GOPHER: url_split_authority(authority, NULL, NULL, &(url->p.gopher.host), &(url->p.gopher.port)); if(!url->p.gopher.port) url->p.gopher.port = prottable[url->type].default_port; if(*(p + 1)) url->p.gopher.selector = tl_strdup(p + 1); else url->p.gopher.selector = tl_strdup("1"); break; case URLT_FILE: url_split_path(p, &(url->p.file.filename), &(url->p.file.searchstr), &(url->p.file.anchor_name)); if(!url->p.file.filename) url->p.file.filename = tl_strdup(""); break; default: return; } } 
if(!authority || !*authority) { switch (url->type) { case URLT_FILE: case URLT_FTP: case URLT_FTPS: case URLT_HTTP: case URLT_HTTPS: url->type = URLT_FILE; url_split_path(p, &(url->p.file.filename), &(url->p.file.searchstr), &(url->p.file.anchor_name)); if(!url->p.file.filename) url->p.file.filename = tl_strdup(""); break; default: url->type = URLT_UNKNOWN; return; break; } } _free(authority); return;}url *url_parse(char *urlstr){ char *scheme = NULL; char *authority = NULL; char *p; url ret_url; ret_url.type = URLT_UNKNOWN; ret_url.status = 0; ret_url.parent_url = NULL; ret_url.moved_to = NULL; ret_url.ref_cnt = 1; ret_url.level = 0; ret_url.extension = NULL; ret_url.local_name = NULL;#ifdef WITH_TREE#ifdef I_FACE ret_url.prop = NULL; ret_url.tree_nfo = NULL;#endif#endif#ifdef HAVE_MT pthread_mutex_init(&ret_url.lock, NULL);#endif p = urlstr; if(p) scheme = url_parse_scheme(urlstr); if(scheme) { ret_url.type = url_scheme_to_schemeid(scheme); /* If the string starts with // then we */ /* don't know the scheme type so we have */ /* to wait for the parent to set it. 
*/ if(ret_url.type == URLT_FROMPARENT) { ret_url.p.unsup.urlstr = tl_strdup(urlstr); authority = url_parse_authority(urlstr); } else { /* We do know the scheme type, so move past it */ /* and get the 'authority' */ p += strlen(scheme) + 1; authority = url_parse_authority(p); } if(authority) p += strlen(authority) + 2; if(authority && *authority) { switch (ret_url.type) { case URLT_FROMPARENT: break; case URLT_HTTP: case URLT_HTTPS: url_split_authority(authority, &ret_url.p.http.user, &ret_url.p.http.password, &ret_url.p.http.host, &ret_url.p.http.port); if(!ret_url.p.http.port) ret_url.p.http.port = prottable[ret_url.type].default_port; url_split_path(p, &ret_url.p.http.document, &ret_url.p.http.searchstr, &ret_url.p.http.anchor_name); if(!ret_url.p.http.document) ret_url.p.http.document = tl_strdup("/"); break; case URLT_FTP: case URLT_FTPS: url_split_authority(authority, &ret_url.p.ftp.user, &ret_url.p.ftp.password, &ret_url.p.ftp.host, &ret_url.p.ftp.port); if(!ret_url.p.ftp.port) ret_url.p.ftp.port = prottable[ret_url.type].default_port; url_split_path(p, &ret_url.p.ftp.path, NULL, &ret_url.p.ftp.anchor_name); if(!ret_url.p.ftp.path) ret_url.p.ftp.path = tl_strdup("/"); if(p && p[0] == '/' && p[1] == '/') { char *pp = tl_str_concat(NULL, "/", ret_url.p.ftp.path, NULL); _free(ret_url.p.ftp.path); ret_url.p.ftp.path = pp; } if((p = strrchr(ret_url.p.ftp.path, ';')) && !strncasecmp(p, ";type=", 6)) *p = '\0'; ret_url.p.ftp.dir = tl_is_dirname(ret_url.p.ftp.path) != 0; break; case URLT_GOPHER: url_split_authority(authority, NULL, NULL, &ret_url.p.gopher.host, &ret_url.p.gopher.port); if(!ret_url.p.gopher.port) ret_url.p.gopher.port = prottable[ret_url.type].default_port; if(*(p + 1)) ret_url.p.gopher.selector = tl_strdup(p + 1); else ret_url.p.gopher.selector = tl_strdup("1"); break; case URLT_FILE: url_split_path(p, &ret_url.p.file.filename, &ret_url.p.file.searchstr, &ret_url.p.file.anchor_name); if(!ret_url.p.file.filename) ret_url.p.file.filename = 
tl_strdup(""); break; default: ret_url.p.unsup.urlstr = tl_strdup(urlstr); break; } } } if(!scheme || !authority || !*authority) { if(!scheme) ret_url.type = URLT_FILE; switch (ret_url.type) { case URLT_FILE: case URLT_FTP: case URLT_FTPS: case URLT_HTTP: case URLT_HTTPS: case URLT_FROMPARENT: ret_url.type = URLT_FILE; url_split_path(p, &ret_url.p.file.filename, &ret_url.p.file.searchstr, &ret_url.p.file.anchor_name); if(!ret_url.p.file.filename) ret_url.p.file.filename = tl_strdup(""); break; default: ret_url.type = URLT_UNKNOWN; ret_url.p.unsup.urlstr = tl_strdup(urlstr); break; } } _free(authority); _free(scheme); return new_url(&ret_url);}url *url_dup_url(url * src){ url dst; dst.type = src->type; dst.parent_url = NULL; dst.moved_to = NULL; dst.level = src->level; dst.ref_cnt = 1; dst.status = src->status & (URL_INLINE_OBJ | URL_STYLE | URL_ISHTML | URL_NORECURSE | URL_FORM_ACTION | URL_ISSCRIPT | URL_ISSTARTING); dst.extension = NULL; dst.local_name = NULL;#ifdef WITH_TREE#ifdef I_FACE dst.prop = NULL; dst.tree_nfo = NULL;#endif#endif#ifdef HAVE_MT pthread_mutex_init(&dst.lock, NULL);#endif switch (dst.type) { case URLT_FILE: dst.p.file.filename = tl_strdup(src->p.file.filename); dst.p.file.searchstr = tl_strdup(src->p.file.searchstr); dst.p.file.anchor_name = tl_strdup(src->p.file.anchor_name); break;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -