📄 uri.l

📁 在linux下的crawler程序,来自北大天网tiny search engine spider
💻 L
📖 第 1 页 / 共 3 页
字号:
上一页 1 23
	int len;	stack_t *stack;	const char *curpos;	int seglen;	const char *next_slash;	int i;	/* This merging algorithm is different from RFC 2396, which uses string,	 * while this algorithm uses stack. */	if (!(stack = stack_create(STACK_INITIAL_SIZE)))		return -1;	/* The "base_path" and the "rel_path" are divided into segments and push	 * all these segments and their length into the stack. If a segment	 * is ".", ignore it; if a segment is "..", pop one segment out. */	len = 0;	for (i = 0; i < 2; i++)	{		/* Both "rel_path" and "base_path" can be NULL. */		if (curpos = base_path)		{			while (next_slash = strchr(curpos, '/'))			{				if (strncmp(curpos, "../", next_slash - curpos + 1) == 0)				{					if (stack_height(stack) > sizeof (char *) + sizeof (int) ||						!stack_empty(stack) && stack_top(int, stack) != 1)					{						len -= stack_pop(int, stack);						stack_pop(const char *, stack);					}				}				else if (strncmp(curpos, "./", next_slash - curpos + 1) != 0)				{					len += next_slash - curpos + 1;					if (stack_push(curpos, const char *, stack) < 0 ||						stack_push(next_slash - curpos + 1, int, stack) < 0)					{						stack_destroy(stack);						return -1;					}				}				curpos = next_slash + 1;			}				base_path = rel_path;		}	}	/* This part deals with the "filename", which may be empty, may be "..",	 * may be ".", and may be something else like "index.html". */	if (curpos)	{		if (strcmp(curpos, "..") == 0)		{			if (stack_height(stack) > sizeof (char *) + sizeof (int) ||				!stack_empty(stack) && stack_top(int, stack) != 1)			{				len -= stack_pop(int, stack);				stack_pop(const char *, stack);			}		}		else if (strcmp(curpos, ".") != 0)		{			len += strlen(curpos);			if (stack_push(curpos, const char *, stack) < 0 ||				stack_push(strlen(curpos), int, stack) < 0)			{				stack_destroy(stack);				return -1;			}		}	}	/* Example:	 * rel_path: "../././../game/../document/rfc/rfc2616.pdf"	 * base_path: "/pub/incoming/./software/linux/nasm.tar.gz",	 * Now the stack is:	 *	 *	+---------------+		<-- stack top	 *	|	11			|	 *	+---------------|	 *	|	rfc2616.pdf	|	 *	+---------------|	 *	|	4			|	 *	|---------------|	 *	|	rfc/		|	 *	|---------------|	 *	|	9			|	 *	|---------------|	 *	|	document/	|	 *	|---------------|	 *	|	9			|	 *	|---------------|	 *	|	incoming/	|	 *	|---------------|	 *	|	4			|	 *	|---------------|	 *	|	pub/		|	 *	|---------------|	 *	|	1			|	 *	|---------------|	 *	|	/			|	 *	+---------------+		<-- stack base	 *	 * len = 1 + 4 + 9 + 9 + 4 + 11 = ??	 *	 * Note that we do NOT copy the segments into the stack, we just push the	 * pointers into the stack.	 *	 * All the information we need to compose the result path has been here.	 */	/* The result path is an "empty path". We should turn it into "no path".	 * "no path" is allowed while "empty path" is illegal. */	if (len == 0)		*result = NULL;	else if (*result = (char *)malloc((len + 1) * sizeof (char)))	{		*result = *result + len;		**result = '\0';		while (!stack_empty(stack))		{			seglen = stack_pop(int, stack);			*result -= seglen;			memcpy(*result, stack_pop(const char *, stack), seglen);		}	}	else		len = -1;	stack_destroy(stack);	return len;}int uri_merge(const struct uri *rel_uri, const struct uri *base_uri,			  struct uri *result){	int len, n;	/* I am lazy. */	#define __STRDUP(str) \	({																	\		int __n;														\		char *__res;													\		if (str)														\		{																\			__n = strlen(str);											\			if (__res = strdupn(str, __n))								\				len += __n;												\			else														\				break;													\		}																\		else															\			__res = NULL;												\		__res;															\	})	/* The following macro is sooooooo big but it's only extended twice	 * and does not matter much. */	#define __AUTH_DUP(auth) \	({																	\		struct authority *__res;										\		if (auth)														\		{																\			if (__res = (struct authority *)							\						malloc(sizeof (struct authority)))				\			{															\				AUTH_INIT(__res, (auth)->type);							\				if ((auth)->type == AT_SERVER)							\				{														\					if (__res->userinfo = __STRDUP((auth)->userinfo))	\						len++;											\					__res->host = __STRDUP((auth)->host);				\					if (__res->port = __STRDUP((auth)->port))			\						len++;											\				}														\				else													\					__res->reg_name = __STRDUP((auth)->reg_name);		\				len += 2;												\			}															\			else														\				break;													\		}																\		else															\			__res = NULL;												\		__res;															\	})	URI_INIT(result);	len = 0;	do {		/* If the relative URI has a scheme, take it; else take the scheme		 * of the base URI. */		if (rel_uri->scheme)		{			result->scheme = __STRDUP(rel_uri->scheme);			len++;		}		else if (result->scheme = __STRDUP(base_uri->scheme))			len++;		/* If the relative URI has a scheme or an authority, take it's		 * authority; else take the authority of the base URI. */		if (rel_uri->scheme || rel_uri->authority)			result->authority = __AUTH_DUP(rel_uri->authority);		else			result->authority = __AUTH_DUP(base_uri->authority);		/* If the relative URI has a scheme or an authority or an absolute		 * path, take it's path; else if the relative URI does not have a		 * path, take the base URI's path; else if base URI has a path,		 * merge the relative URI's path with the base URI's path, and take		 * the result; else if the base URI has no path, merge the relative		 * URI's path with path "/" and take the result; no else. */		if (rel_uri->scheme || rel_uri->authority ||									rel_uri->path && *rel_uri->path == '/')			result->path = __STRDUP(rel_uri->path);		else if (!rel_uri->path)			result->path = __STRDUP(base_uri->path);		else if ((n = __path_merge(rel_uri->path, base_uri->path ?								   base_uri->path : "/", &result->path)) >= 0)			len += n;		else			break;		/* Query is taken from relative URI. */		if (result->query = __STRDUP(rel_uri->query))			len++;		/* Fragment is taken from relative URI. */		if (result->fragment = __STRDUP(rel_uri->fragment))			len++;		return len;	} while (0);	#undef __AUTH_DUP	#undef __STRDUP	uri_destroy(result);	return -1;}/* Recombine a URI structure into a URI string. "flags" indicates what * component(s) would you like to appear in the result string. Note that * the result string is NOT necessarily a legal URI string (When you mask * some components) though the second argument has the name "uristr". */int uri_combine(const struct uri *uri, char *uristr, unsigned int n, int flags){	char *curpos = uristr;	char *end = curpos + n;	do {		if (flags & C_SCHEME && uri->scheme)		{			n = strlen(uri->scheme);			if (curpos + n + 1 < end)			{				MEMCPY_PLUS(curpos, uri->scheme, n);				*curpos++ = ':';			}			else				break;		}		if (flags & C_AUTHORITY && uri->authority)		{			if (curpos + 2 < end)			{				*curpos++ = '/';				*curpos++ = '/';			}			else				break;			if (uri->authority->type == AT_SERVER)			{				if (flags & C_USERINFO && uri->authority->userinfo)				{					n = strlen(uri->authority->userinfo);					if (curpos + n + 1 < end)					{						MEMCPY_PLUS(curpos, uri->authority->userinfo, n);						*curpos++ = '@';					}					else						break;				}				if (flags & C_HOST && uri->authority->host)				{					n = strlen(uri->authority->host);					if (curpos + n < end)						MEMCPY_PLUS(curpos, uri->authority->host, n);					else						break;				}				if (flags & C_PORT && uri->authority->port)				{					n = strlen(uri->authority->port);					if (curpos + n + 1 < end)					{						*curpos++ = ':';						MEMCPY_PLUS(curpos, uri->authority->port, n);					}					else						break;				}			}			else if (flags & C_REG_NAME && uri->authority->reg_name)			{				n = strlen(uri->authority->reg_name);				if (curpos + n < end)					MEMCPY_PLUS(curpos, uri->authority->reg_name, n);				else					break;			}		}		if (flags & C_PATH && uri->path)		{			n = strlen(uri->path);			if (curpos + n < end)				MEMCPY_PLUS(curpos, uri->path, n);			else				break;		}		if (flags & C_QUERY && uri->query)		{			n = strlen(uri->query);			if (curpos + n + 1 < end)			{				*curpos++ = '?';				MEMCPY_PLUS(curpos, uri->query, n);			}			else				break;		}		if (flags & C_FRAGMENT && uri->fragment)		{			n = strlen(uri->fragment);			if (curpos + n + 1 < end)			{				*curpos++ = '#';				MEMCPY_PLUS(curpos, uri->fragment, n);			}			else				break;		}		if (curpos < end)			*curpos = '\0';		else			break;		return curpos - uristr;	} while (0);	errno = ENOSPC;	return -1;}/* Turn some bytes into a string of escaped form. */int uri_escape(const char *bytes, int len, char *escstr, unsigned int n){	const char *tmp = bytes + len;	char *curpos = escstr;	char *end = escstr + n;	while (1)	{		if (bytes == tmp)		{			if (curpos < end)			{				*curpos = '\0';				return curpos - escstr;			}			else				break;		}		if (is_uri_chr(*bytes) && curpos < end)			*curpos++ = *bytes;		else if (curpos + 2 < end)		{			sprintf(curpos, "%%%X%X",					(unsigned char)*bytes >> 4, *bytes & 0x0f);			curpos += 3;		}		else			break;		bytes++;	}	errno = ENOSPC;	return -1;}
上一页 1 23
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -