⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 url.c

📁 这是一个同样来自贝尔实验室的和UNIX有着渊源的操作系统, 其简洁的设计和实现易于我们学习和理解
💻 C
📖 第 1 页 / 共 2 页
字号:
/*
 * This is a URL parser, written to parse "Common Internet Scheme" URL
 * syntax as described in RFC1738 and updated by RFC2396.  Only absolute URLs
 * are supported, using "server-based" naming authorities in the schemes.
 * Support for literal IPv6 addresses is included, per RFC2732.
 *
 * Current "known" schemes: http, https, ftp, file.
 *
 * We can do all the parsing operations without Runes since URLs are
 * defined to be composed of US-ASCII printable characters.
 * See RFC1738, RFC2396.
 */

#include <u.h>
#include <libc.h>
#include <ctype.h>
#include <regexp.h>
#include <plumb.h>
#include <thread.h>
#include <fcall.h>
#include <9p.h>
#include "dat.h"
#include "fns.h"

/* When non-zero, the routines in this file print traces on fd 2 */
int urldebug;

/* If set, relative paths with leading ".." segments will have them trimmed */
#define RemoveExtraRelDotDots	0
#define ExpandCurrentDocUrls	1

/*
 * Names of the schemes we recognise.  ischeme() returns an index into
 * this table; slot 0 is deliberately empty and never matches.
 */
static char*
schemestrtab[] =
{
	nil,
	"http",
	"https",
	"ftp",
	"file",
};

/*
 * Look up scheme name s (exact, case-sensitive comparison) in
 * schemestrtab.  Returns the index of the matching entry, or
 * USunknown when no entry matches.
 */
static int
ischeme(char *s)
{
	char **entry, **end;

	end = schemestrtab + nelem(schemestrtab);
	for(entry = schemestrtab; entry < end; entry++){
		if(*entry == nil)
			continue;	/* placeholder slot */
		if(strcmp(s, *entry) == 0)
			return entry - schemestrtab;
	}
	return USunknown;
}

/*
 * URI splitting regexp is from RFC2396, Appendix B:
 *		^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
 *		 12            3  4          5       6  7        8 9
 *
 * Example: "http://www.ics.uci.edu/pub/ietf/uri/#Related"
 * $2 = scheme			"http"
 * $4 = authority		"www.ics.uci.edu"
 * $5 = path			"/pub/ietf/uri/"
 * $7 = query			<undefined>
 * $9 = fragment		"Related"
 */

/*
 * RFC2396, Sec 3.1, contains:
 *
 * Scheme names consist of a sequence of characters beginning with a
 * lower case letter and followed by any combination of lower case
 * letters, digits, plus ("+"), period ("."), or hyphen ("-").  For
 * resiliency, programs interpreting URI should treat upper case letters
 * as equivalent to lower case in scheme names (e.g., allow "HTTP" as
 * well as "http").
*//* * For server-based naming authorities (RFC2396 Sec 3.2.2): *    server        = [ [ userinfo "@" ] hostport ] *    userinfo      = *( unreserved | escaped | *                      ";" | ":" | "&" | "=" | "+" | "$" | "," ) *    hostport      = host [ ":" port ] *    host          = hostname | IPv4address *    hostname      = *( domainlabel "." ) toplabel [ "." ] *    domainlabel   = alphanum | alphanum *( alphanum | "-" ) alphanum *    toplabel      = alpha | alpha *( alphanum | "-" ) alphanum *    IPv4address   = 1*digit "." 1*digit "." 1*digit "." 1*digit *    port          = *digit * *  The host is a domain name of a network host, or its IPv4 address as a *  set of four decimal digit groups separated by ".".  Literal IPv6 *  addresses are not supported. * * Note that literal IPv6 address support is outlined in RFC2732: *    host          = hostname | IPv4address | IPv6reference *    ipv6reference = "[" IPv6address "]"		(RFC2373) * * Since hostnames and numbers will have to be resolved by the OS anyway, * we don't have to parse them too pedantically (counting '.'s, checking  * for well-formed literal IP addresses, etc.). * * In FTP/file paths, we reject most ";param"s and querys.  In HTTP paths, * we just pass them through. * * Instead of letting a "path" be 0-or-more characters as RFC2396 suggests,  * we'll say it's 1-or-more characters, 0-or-1 times.  This way, an absent * path yields a nil substring match, instead of an empty one. * * We're more restrictive than RFC2396 indicates with "userinfo" strings, * insisting they have the form "[user[:password]]".  This may need to * change at some point, however. 
*//* RE character-class components -- these go in brackets */#define PUNCT			"\\-_.!~*'()"#define RES			";/?:@&=+$,"#define ALNUM		"a-zA-Z0-9"#define HEX			"0-9a-fA-F"#define UNRES			ALNUM PUNCT/* RE components; _N => has N parenthesized subexpressions when expanded */#define ESCAPED_1			"(%[" HEX "][" HEX "])"#define URIC_2			"([" RES UNRES "]|" ESCAPED_1 ")"#define URICNOSLASH_2		"([" UNRES ";?:@&=+$,]|" ESCAPED_1 ")"#define USERINFO_2		"([" UNRES ";:&=+$,]|" ESCAPED_1 ")"#define PCHAR_2			"([" UNRES ":@&=+$,]|" ESCAPED_1 ")"#define PSEGCHAR_3		"([/;]|" PCHAR_2 ")"typedef struct Retab Retab;struct Retab{	char	*str;	Reprog	*prog;	int		size;	int		ind[5];};enum{	REsplit = 0,	REscheme,	REunknowndata,	REauthority,	REhost,	REuserinfo,	REabspath,	REquery,	REfragment,	REhttppath,	REftppath,	REfilepath,	MaxResub=	20,};Retab retab[] =	/* view in constant width Font */{[REsplit]	"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]+)?(\\?([^#]*))?(#(.*))?$", nil, 0,	/* |-scheme-|      |-auth.-|  |path--|    |query|     |--|frag */	{  2,              4,         5,          7,          9},[REscheme]	"^[a-z][a-z0-9+-.]*$", nil, 0,	{ 0, },[REunknowndata]	"^" URICNOSLASH_2 URIC_2 "*$", nil, 0,	{ 0, },[REauthority]	"^(((" USERINFO_2 "*)@)?(((\\[[^\\]@]+\\])|([^:\\[@]+))(:([0-9]*))?)?)?$", nil, 0,	/* |----user info-----|  |--------host----------------|  |-port-| */	{  2,                    7,                              12, },[REhost]	"^(([a-zA-Z0-9\\-.]+)|(\\[([a-fA-F0-9.:]+)\\]))$", nil, 0,	/* |--regular host--|     |-IPv6 literal-| */	{  2,                     4, },[REuserinfo]	"^(([^:]*)(:([^:]*))?)$", nil, 0,	/* |user-|  |pass-| */	{  2,       4, },[REabspath]	"^/" PSEGCHAR_3 "*$", nil, 0,	{ 0, },[REquery]	"^" URIC_2 "*$", nil, 0,	{ 0, },[REfragment]	"^" URIC_2 "*$", nil, 0,	{ 0, },[REhttppath]	"^.*$", nil, 0,	{ 0, },[REftppath]	"^(.+)(;[tT][yY][pP][eE]=([aAiIdD]))?$", nil, 0,	/*|--|-path              |ftptype-| */	{ 1,                     3, }, [REfilepath]	"^.*$", nil, 0,	{ 0, },};static 
intcountleftparen(char *s){	int n;	n = 0;	for(; *s; s++)		if(*s == '(')			n++;	return n;}voidiniturl(void){	int i, j;	for(i=0; i<nelem(retab); i++){		retab[i].prog = regcomp(retab[i].str);		if(retab[i].prog == nil)			sysfatal("recomp(%s): %r", retab[i].str);		retab[i].size = countleftparen(retab[i].str)+1;		for(j=0; j<nelem(retab[i].ind); j++)			if(retab[i].ind[j] >= retab[i].size)				sysfatal("bad index in regexp table: retab[%d].ind[%d] = %d >= %d",					i, j, retab[i].ind[j], retab[i].size);		if(MaxResub < retab[i].size)			sysfatal("MaxResub too small: %d < %d", MaxResub, retab[i].size);	}}typedef struct SplitUrl SplitUrl;struct SplitUrl{	struct {		char *s;		char *e;	} url, scheme, authority, path, query, fragment;};/* * Implements the algorithm in RFC2396 sec 5.2 step 6. * Returns number of chars written, excluding NUL terminator. * dest is known to be >= strlen(base)+rel_len. */static voidmerge_relative_path(char *base, char *rel_st, int rel_len, char *dest){	char *s, *p, *e, *pdest;	pdest = dest;	/* 6a: start with base, discard last segment */	if(base){		/* Empty paths don't match in our scheme; 'base' should be nil */		assert(base[0] == '/');		e = strrchr(base, '/');		e++;		memmove(pdest, base, e-base);		pdest += e-base;	}else{		/* Artistic license on my part */		*pdest++ = '/';	}	/* 6b: append relative component */	if(rel_st){		memmove(pdest, rel_st, rel_len);		pdest += rel_len;	}	/* 6c: remove any occurrences of "./" as a complete segment */	s = dest;	*pdest = '\0';	while(e = strstr(s, "./")){		if((e == dest) || (*(e-1) == '/')){ 			memmove(e, e+2, pdest+1-(e+2));	/* +1 for NUL */			pdest -= 2;		}else			s = e+1;	}	/* 6d: remove a trailing "." as a complete segment */	if(pdest>dest && *(pdest-1)=='.' 
&& 	  (pdest==dest+1 || *(pdest-2)=='/'))		*--pdest = '\0';	/* 6e: remove occurences of "seg/../", where seg != "..", left->right */	s = dest+1;	while(e = strstr(s, "/../")){		p = e - 1;		while(p >= dest && *p != '/')			p--;		if(memcmp(p, "/../", 4) != 0){			memmove(p+1, e+4, pdest+1-(e+4));			pdest -= (e+4) - (p+1);		}else			s = e+1;	}	/* 6f: remove a trailing "seg/..", where seg isn't ".."  */	if(pdest-3 > dest && memcmp(pdest-3, "/..", 3)==0){		p = pdest-3 - 1;		while(p >= dest && *p != '/')			p--;		if(memcmp(p, "/../", 4) != 0){			pdest = p+1;			*pdest = '\0';		}	}	/* 6g: leading ".." segments are errors -- we'll just blat them out. */	if(RemoveExtraRelDotDots){		p = dest;		if (p[0] == '/')			p++;		s = p;		while(s[0]=='.' && s[1]=='.' && (s[2]==0 || s[2]=='/'))			s += 3;		if(s > p){			memmove(p, s, pdest+1-s);			pdest -= s-p;		}	}	USED(pdest);	if(urldebug)		fprint(2, "merge_relative_path: '%s' + '%.*s' -> '%s'\n", base, rel_len, 			rel_st, dest);}/* * See RFC2396 sec 5.2 for info on resolving relative URIs to absolute form. * * If successful, this just ends up freeing and replacing "u->url". 
*/static intresolve_relative(SplitUrl *su, Url *base, Url *u){	char *url, *path;	char *purl, *ppath;	int currentdoc, ulen, plen;	if(base == nil){		werrstr("relative URI given without base");		return -1;	}	if(base->scheme == nil){		werrstr("relative URI given with no scheme");		return -1;	}	if(base->ischeme == USunknown){		werrstr("relative URI given with unknown scheme");		return -1;	}	if(base->ischeme == UScurrent){		werrstr("relative URI given with incomplete base");		return -1;	}	assert(su->scheme.s == nil);	/* Sec 5.2 step 2 */	currentdoc = 0;	if(su->path.s==nil && su->scheme.s==nil && su->authority.s==nil && su->query.s==nil){		/* Reference is to current document */		if(urldebug)			fprint(2, "url %s is relative to current document\n", u->url);		u->ischeme = UScurrent;		if(!ExpandCurrentDocUrls)			return 0;		currentdoc = 1;	}		/* Over-estimate the maximum lengths, for allocation purposes */	/* (constants are for separators) */	plen = 1;	if(base->path)		plen += strlen(base->path);	if(su->path.s)		plen += 1 + (su->path.e - su->path.s);	ulen = 0;	ulen += strlen(base->scheme) + 1;	if(su->authority.s)		ulen += 2 + (su->authority.e - su->authority.s);	else		ulen += 2 + ((base->authority) ? 
strlen(base->authority) : 0);	ulen += plen;	if(su->query.s)		ulen += 1 + (su->query.e - su->query.s);	else if(currentdoc && base->query)		ulen += 1 + strlen(base->query);	if(su->fragment.s)		ulen += 1 + (su->fragment.e - su->fragment.s);	else if(currentdoc && base->fragment)		ulen += 1 + strlen(base->fragment);	url = emalloc(ulen+1);	path = emalloc(plen+1);	url[0] = '\0';	purl = url;	path[0] = '\0';	ppath = path;	if(su->authority.s || (su->path.s && (su->path.s[0] == '/'))){		/* Is a "network-path" or "absolute-path"; don't merge with base path */		/* Sec 5.2 steps 4,5 */		if(su->path.s){			memmove(ppath, su->path.s, su->path.e - su->path.s);			ppath += su->path.e - su->path.s;			*ppath = '\0';		}	}else if(currentdoc){		/* Is a current-doc reference; just copy the path from the base URL */		if(base->path){			strcpy(ppath, base->path);			ppath += strlen(ppath);		}		USED(ppath);	}else{		/* Is a relative-path reference; we have to merge it */		/* Sec 5.2 step 6 */		merge_relative_path(base->path,			su->path.s, su->path.e - su->path.s, ppath);	}	/* Build new URL from pieces, inheriting from base where needed */	strcpy(purl, base->scheme);	purl += strlen(purl);	*purl++ = ':';	if(su->authority.s){		strcpy(purl, "//");		purl += strlen(purl);		memmove(purl, su->authority.s, su->authority.e - su->authority.s);		purl += su->authority.e - su->authority.s;	}else if(base->authority){		strcpy(purl, "//");		purl += strlen(purl);		strcpy(purl, base->authority);		purl += strlen(purl);	}	assert((path[0] == '\0') || (path[0] == '/'));	strcpy(purl, path);	purl += strlen(purl);	/*	 * The query and fragment are not inherited from the base,	 * except in case of "current document" URLs, which inherit any query	 * and may inherit the fragment.	 
*/	if(su->query.s){		*purl++ = '?';		memmove(purl, su->query.s, su->query.e - su->query.s);		purl += su->query.e - su->query.s;	}else if(currentdoc && base->query){		*purl++ = '?';		strcpy(purl, base->query);		purl += strlen(purl);	}	if(su->fragment.s){		*purl++ = '#';		memmove(purl, su->query.s, su->query.e - su->query.s);		purl += su->fragment.e - su->fragment.s;	}else if(currentdoc && base->fragment){		*purl++ = '#';		strcpy(purl, base->fragment);		purl += strlen(purl);	}	USED(purl);	if(urldebug)		fprint(2, "resolve_relative: '%s' + '%s' -> '%s'\n", base->url, u->url, url);	free(u->url);	u->url = url;	free(path);	return 0;}intregx(Reprog *prog, char *s, Resub *m, int nm){	int i;	if(s == nil)		s = m[0].sp;	/* why is this necessary? */	i = regexec(prog, s, m, nm);/*	if(i >= 0)		for(j=0; j<nm; j++)			fprint(2, "match%d: %.*s\n", j, utfnlen(m[j].sp, m[j].ep-m[j].sp), m[j].sp);*/	return i;}static intismatch(int i, char *s, char *desc){	Resub m[1];	m[0].sp = m[0].ep = nil;	if(!regx(retab[i].prog, s, m, 1)){		werrstr("malformed %s: %q", desc, s);		return 0;	}	return 1;}static intspliturl(char *url, SplitUrl *su){	Resub m[MaxResub];	Retab *t;	/*	 * Newlines are not valid in a URI, but regexp(2) treats them specially 	 * so it's best to make sure there are none before proceeding.	 */	if(strchr(url, '\n')){		werrstr("newline in URI");		return -1;	}	/*	 * Because we use NUL-terminated strings, as do many client and server	 * implementations, an escaped NUL ("%00") will quite likely cause problems	 * when unescaped.  We can check for such a sequence once before examining 	 * the components because, per RFC2396 sec. 2.4.1 - 2.4.2, '%' is reserved	 * in URIs to _always_ indicate escape sequences.  Something like "%2500"	 * will still get by, but that's legitimate, and if it ends up causing

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -