📄 uri.l

📁 this is very good for teacher
💻 L
📖 第 1 页 / 共 2 页
字号:
12 下一页
/*
 * uri.l -- Routines dealing with URI, mainly parsing and merging.
 * Created: Xie Han, OS lab of Peking University. <me@pku.edu>
 *
 * The grammar of URI is quite simple, so yacc is not needed.
 *
 * Created: Sep 25 04:15am 2003. version 0.1.1
 *		# Can read URI from standard input, parse it and print every
 *		  part of the URI.
 *
 * Updated: Sep 25 05:10pm 2003. version 0.1.2
 *
 * Updated: Sep 25 10:30pm 2003. version 0.1.3
 *		# A lot of changes in the program structure, much clearer.
 *		# Now the result is written into a uri structure.
 *		# A little bug. String "?" is accepted as a valid URI.
 *
 * Updated: Sep 26 02:16am 2003. version 0.1.4
 *		# 2 states are added to solve the bug in version 0.1.3.
 *		# The uri structure was redesigned.
 *
 * Updated: Sep 27 05:21am 2003. version 1.0.0!!!!
 *		# All states are changed from exclusive to shared to make
 *		  the lex program more compatible because some lexers do not
 *		  support exclusive start conditions.
 *		# "unput" is substituted by "yyless" when putting back excess
 *		  text, because the latter costs less.
 *		# 4 parsing functions are completed, each of which has different
 *		  characteristics.
 *		# Function "uri_rally_parse" is added, which cooperates with
 *		  other lex programs.
 *
 * Updated: Sep 28 1:00pm 2003. version 1.1.0
 *		# The function "uri_rel_to_abs" is under construction.
 *		# The URI structure is changed again: "path_type" field is removed,
 *		  and "abs_path", "rel_path", "opaque_path" are substituted by
 *		  "path" because the original design is too sophisticated.
 *
 * Updated: National Day 5:06am 2003. version 1.1.1
 *		# The function "__uri_path_merge" which merges the base path and
 *		  the relative path is finished and tested and proved to be a
 *		  nice job! This function is the core of merging a base URI and
 *		  a relative URI into an absolute URI.
 *		# The previous function "uri_rel_to_abs" is commented out. It's
 *		  terrible.
 *
 * Updated: National Day 5:00pm 2003. version 1.1.2
 *		# Now we distinguish between "no" and "empty". Related fields
 *		  include "userinfo", "port", "query", and "fragment". The previous
 *		  version treats "no" and "empty" equally, which is not right.
 *
 * Updated: Oct 2 2:42am 2003. version 1.1.3
 *		# Solve the conflict of server and reg_name by adding a start
 *		  condition and some ugly codes that are normally not reached.
 *		# The "uri_rel_to_abs" is rewritten but has not been tested.
 *
 * Updated: Oct 4 6:33pm 2003. version 1.2.0
 *		# Function "uri_path_merge" now returns the length of the result
 *		  path while the result path is returned by a pointer.
 *		# Function "uri_rel_to_abs" is renamed "uri_merge", and it will
 *		  return the string length of the result URI.
 *		# Function "uri_recombine" is coded, which recombines a URI
 *		  structure into a URI string, and the string length is returned.
 *		# The URI merging algorithm in RFC 2396 is found not flawless, and
 *		  the problem is fixed in "uri_merge" with not very graceful means.
 *
 * Updated: Oct 5 2:09am 2003. version 1.2.1
 *		# From this version function "uri_parse_string" is implemented by
 *		  "uri_parse_bytes" which from now on is dependent on
 *		  "uri_parse_buffer".
 *		# "uri_validate_string" and "uri_validate_bytes" were added. Their
 *		  names explain what they do.
 *		# Key word "static" was taken off from the front of function
 *		  "__uri_path_merge" while it is not declared in the .h file. I
 *		  mean to make it a secret interface ^_^
 *
 * Updated: Oct 5 4:43am 2003. version 1.2.2
 *		# Cross the first change log of version 1.2.1!
 *		# "uri_validate_string" is no longer implemented by
 *		  "uri_validate_bytes".
 *
 * Updated: Oct 9 4:49am 2003. version 1.2.3
 *		# Function "uri_rally_parse" no longer exists, because it cannot
 *		  work. The original desire that it can cooperate with other lex
 *		  programs is indeed out of the question.
 *		# Macro URI_INIT changed slightly.
 *
 * Updated: Oct 10 4:28am 2003. version 1.2.4
 *		# The arguments sequence of "rel_xxx" and "base_xxx" of functions
 *		  "__uri_path_merge" and "uri_merge" are exchanged. "rel_xxx" from
 *		  now on precedes "base_xxx".
 *
 * Updated: Oct 11 3:31am 2003. version 1.2.5
 *		# URI merging algorithm is changed again. We check if the path of
 *		  the relative URI is defined before merging path, and if it's not
 *		  defined, we take the base URI's path directly rather than merging
 *		  the paths. I think that the URI merging algorithm is now at the
 *		  edge of perfect. (The recommended merging algorithm of RFC 2396
 *		  has a lot of problems.)
 *		# The rule of "<REG_NAME>{reg_name}" is optimized, because "strcat"
 *		  is slow.
 *
 * Updated: Oct 12 4:49am 2003. version 1.2.6
 *		# "goto" statements are substituted by "do ... while (0)" and
 *		  "break" in function "uri_merge" and "uri_recombine". No real
 *		  improvement is made but showing off continues.
 *
 * Updated: Oct 12 4:55pm 2003. version 1.2.7
 *		# URIs with Chinese characters are supported in an obviously
 *		  not perfect way: the "unreserved" BNF allows non-US-ASCII
 *		  characters, i.e., characters with value that ranges from
 *		  128 to 255.
 *
 * Updated: Oct 13 1:06am 2003. version 1.2.8
 *		# Some code of "__uri_path_merge" was rewritten because the
 *		  interface of stack module changed.
 *		# NOTE!!! NULL can no longer be passed to "__uri_path_merge" as
 *		  a relative path or a base path, and the function from now on
 *		  won't set NULL as the result path. If the result path is
 *		  empty, the "abs_path" will point to a pointer that points to
 *		  an empty string ("\0"). So you can always "free(*abs_path)"
 *		  if "__uri_path_merge" succeeded.
 *
 * Updated: Oct 15 1:13am 2003. version 1.4.0
 *		# The update of 1.x.x version continues because version 2.0.x
 *		  runs slower than 1.x.x (half speed), though 2.0.x is written
 *		  in much shorter and very nice codes.
 *		# The new uri structure is derived from 1.3.x. 1.3.x is a test
 *		  version which has been updated into 2.0.0. Indeed, 1.4.x is
 *		  compatible with 2.0.x completely, but 2.0.x is NOT compatible
 *		  with 1.4.x because 2.0.x has only one parsing function:
 *		  "uri_parse_bytes". I am unwilling to say that 2.0.x is a
 *		  failure because its code is very nice. It's slow because
 *		  it scans the string more than once: the first time to determine
 *		  the length of the URI, and the second time to break down every
 *		  component. And when breaking down authority it must scan the
 *		  authority again to solve the conflict of server and reg_name.
 *		  Indeed the authority is scanned 3 times.
 *		# "__uri_path_merge", "uri_destroy", "uri_merge", "uri_recombine"
 *		  are taken from 2.0.0 because the uri structure changed.
 *		# "__uri_path_merge" once again supports "no" path. Forget the
 *		  second change log of version 1.2.8.
 *		# Some state names changed.
 *
 * Updated: Oct 15 7:51am 2003. version 1.4.1
 *		# Fix a bug in "__uri_path_merge": when encountered a "../",
 *		  the stack won't pop when the stack top segment is "/". This
 *		  is not right because "/" is not necessarily the root path.
 *		  Now, when encountered a "../", the stack won't pop when and
 *		  only when the stack is empty or the stack has 1 segment and
 *		  it's a "/", i.e., when the following is FALSE:
 *			stack_height(stack) > sizeof (char *) + sizeof (int) ||
 *			!stack_empty(stack) && stack_peep(stack, int) != 1
 *		# Now the parsing process is very neat with "state stack"
 *		  introduced. All URIs will pass all states when being parsed. I
 *		  think it nicer than the 2.0.x parsing process. The performance
 *		  is almost equal to 1.4.0, twice the performance of 2.0.x.
 *		# Some internal macros and functions change. Some parentheses
 *		  are added. From now on "{" follows "do" immediately if the
 *		  corresponding "while" is "while (0)". The aim is to let the
 *		  readers know that indeed it's not an iteration.
 *
 * Updated: Oct 18 11:43am 2003. version 1.4.2
 *		# Fix the bug that port can exist without host. If no host found,
 *		  we should BEGIN REG_NAME instead of BEGIN PORT.
 *		# The statement in the second change log of version 1.4.1 that
 *		  "All URIs will pass all states when being parsed." is not
 *		  true. Forget it.
 *		# We now check the return value of "yy_scan_xxx" and
 *		  "yy_create_xxx" because there's no guarantee that they would
 *		  not fail. (Only "yy_scan_buffer" may fail practically.)
 *
 * Updated: Oct 18 22:52pm 2003. version 1.4.3
 *		# Function "uri_length_string" and "uri_length_bytes" are added.
 *		  They return the same value as "uri_parse_string" and
 *		  "uri_parse_bytes", but they do not break down the URI so they
 *		  are faster (hoped). And, from now on "uri_validate_string" and
 *		  "uri_validate_bytes" are implemented by these two functions
 *		  respectively.
 *		# State "FINISH" is renamed "ACCEPT", meaning a URI string is
 *		  accepted.
 *
 * Updated: Oct 20 1:28am 2003. version 1.4.4
 *		# New version uses stack 2.2.x, and now we must tell the stack
 *		  what data type we wanna push or pop. A slight mistake in
 *		  "__uri_path_merge" is fixed: when pushing or popping a segment
 *		  name, the data type should be "const char *", not "char *".
 *
 * Updated: Oct 20 6:10am 2003. version 1.4.5
 *		# Some code blocks are moved to make the codes look clear.
 *		# Data type of "stack" variable in function "__uri_path_merge" is
 *		  changed to "STACK *", because module stack was updated again...
 *		  Now we use stack 2.4.x.
 */

/* The following are the BNFs of URI-reference, taken from RFC 2396. 
*/
URI-reference	({absoluteURI}|{relativeURI})?("#"{fragment})?
absoluteURI		{scheme}":"({hier_part}|{opaque_part})
relativeURI		({net_path}|{abs_path}|{rel_path})("?"{query})?
hier_part		({net_path}|{abs_path})("?"{query})?
opaque_part		{uric_no_slash}{uric}*
uric_no_slash	{unreserved}|{escaped}|";"|"?"|":"|"@"|"&"|"="|"+"|"$"|","
net_path		"//"{authority}{abs_path}?
abs_path		"/"{path_segments}
rel_path		{rel_segment}{abs_path}?
rel_segment		({unreserved}|{escaped}|";"|"@"|"&"|"="|"+"|"$"|",")+
scheme			{alpha}({alpha}|{digit}|"+"|"-"|".")*
authority		{server}|{reg_name}
reg_name		({unreserved}|{escaped}|"$"|","|";"|":"|"@"|"&"|"="|"+")+
server			(({userinfo}"@")?{hostport})?
userinfo		({unreserved}|{escaped}|";"|":"|"&"|"="|"+"|"$"|",")*
hostport		{host}(":"{port})?
host			{hostname}|{IPv4address}
hostname		({domainlabel}".")*{toplabel}"."?
domainlabel		{alphanum}|{alphanum}({alphanum}|"-")*{alphanum}
toplabel		{alpha}|{alpha}({alphanum}|"-")*{alphanum}
IPv4address		{digit}+"."{digit}+"."{digit}+"."{digit}+
port			{digit}*
path			({abs_path}|{opaque_part})?
path_segments	{segment}("/"{segment})*
segment			{pchar}*(";"{param})*
param			{pchar}*
pchar			{unreserved}|{escaped}|":"|"@"|"&"|"="|"+"|"$"|","
query			{uric}*
fragment		{uric}*
uric			{reserved}|{unreserved}|{escaped}
reserved		";"|"/"|"?"|":"|"@"|"&"|"="|"+"|"$"|","
unreserved		{alphanum}|{mark}
mark			"-"|"_"|"."|"!"|"~"|"*"|"'"|"("|")"
escaped			"%"{hex}{hex}
/* {digit} must be braced: a bare `digit` would match the literal text "digit",
 * so escapes containing decimal digits (e.g. "%20") would never be recognised. */
hex				{digit}|[A-Fa-f]
alphanum		{alpha}|{digit}
alpha			{lowalpha}|{upalpha}
lowalpha		[a-z]
upalpha			[A-Z]
digit			[0-9]

%option stack

/* One start condition per URI component the scanner is currently looking for;
 * ACCEPT means a URI string has been accepted. */
%s SCHEME REL_PATH AUTHORITY USERINFO HOST PORT REG_NAME ABS_PATH
%s OPAQUE_PART QUERY FRAGMENT ACCEPT

%{
#include <errno.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <stack.h>
#include <misc.h>
#include "uri.h"

/*
 * Set the authority's type to `at` and reset the pointer fields that belong
 * to that type to NULL: userinfo/host/port for AT_SERVER, reg_name otherwise.
 */
#define URI_AUTH_INIT(auth, at) \
do {											\
	(auth)->type = (at);						\
	if ((auth)->type == AT_SERVER)				\
	{											\
		(auth)->userinfo = NULL;				\
		(auth)->host = NULL;					\
		(auth)->port = NULL;					\
	}											\
	else										\
		(auth)->reg_name = NULL;				\
} while (0)

/*
 * Release one component string. free(NULL) is a no-op, so no guard is
 * needed; expanding to a single call (rather than a bare `if`) also keeps
 * the macro safe in front of an `else`.
 */
#define URI_FREE(ptr)	free(ptr)

/*
 * Free the strings owned by an authority, chosen by its type. The authority
 * structure itself is not freed here.
 */
#define URI_AUTH_DESTROY(auth) \
do {											\
	if ((auth)->type == AT_SERVER)				\
	{											\
		URI_FREE((auth)->userinfo);				\
		URI_FREE((auth)->host);					\
		URI_FREE((auth)->port);					\
	}											\
	else										\
		URI_FREE((auth)->reg_name);				\
} while (0)

/* Number of input characters matched so far; every rule adds its yyleng. */
static int __length;
/* The uri structure that the rules below fill in. */
static struct uri *__uri;
%}

%%

	/* Scheme: "{scheme}:" followed by "/" or a uric_no_slash character.
	 * The trailing ':' is matched but not stored. AUTHORITY is pushed so
	 * that later rules can tell, by popping, that a scheme was present. */
<SCHEME>{scheme}:/\/|{uric_no_slash}	{
	__uri->scheme = strdupn(yytext, yyleng - 1);
	if (__uri->scheme == NULL)
	{
		uri_destroy(__uri);
		return -1;
	}
	__length += yyleng;
	yy_push_state(AUTHORITY);
}

	/* No scheme: put the input back and try a relative path. */
<SCHEME>.|\n		|
<SCHEME><<EOF>>		{
	yyless(0);
	BEGIN REL_PATH;
}

	/* Relative path; a query may follow. */
<REL_PATH>{rel_path}	{
	__uri->path = strdupn(yytext, yyleng);
	if (__uri->path == NULL)
	{
		uri_destroy(__uri);
		return -1;
	}
	__length += yyleng;
	BEGIN QUERY;
}

	/* No relative path: look for an authority next. */
<REL_PATH>.|\n		|
<REL_PATH><<EOF>>	{
	yyless(0);
	yy_push_state(AUTHORITY);
}

	/* Authority and abs_path have conflict! If the following is "//",
	 * we assume that it's an authority; if the following is "/", it's
	 * an abs_path. */
<AUTHORITY>"//"		{
	yy_pop_state();
	__uri->authority = (struct authority *)malloc(sizeof (struct authority));
	if (__uri->authority == NULL)
	{
		uri_destroy(__uri);
		return -1;
	}
	/* Start out assuming a server; the REG_NAME rule corrects this
	 * if the text turns out to be a reg_name. */
	URI_AUTH_INIT(__uri->authority, AT_SERVER);
	__length += yyleng;
	BEGIN USERINFO;
}

	/* No "//": there is no authority. AUTHORITY stays on the state stack
	 * so that the ABS_PATH rules can detect this case. */
<AUTHORITY>.|\n		|
<AUTHORITY><<EOF>>	{
	yyless(0);
	yy_push_state(ABS_PATH);
}

	/* Userinfo, stored without the trailing '@'. */
<USERINFO>{userinfo}@	{
	__uri->authority->userinfo = strdupn(yytext, yyleng - 1);
	if (__uri->authority->userinfo == NULL)
	{
		uri_destroy(__uri);
		return -1;
	}
	__length += yyleng;
	BEGIN HOST;
}

<USERINFO>.|\n		|
<USERINFO><<EOF>>	{
	yyless(0);
	BEGIN HOST;
}

<HOST>{host}		{
	__uri->authority->host = strdupn(yytext, yyleng);
	if (__uri->authority->host == NULL)
	{
		uri_destroy(__uri);
		return -1;
	}
	__length += yyleng;
	BEGIN PORT;
}

	/* No host found: a port cannot exist without a host, so skip PORT. */
<HOST>.|\n			|
<HOST><<EOF>>		{
	yyless(0);
	BEGIN REG_NAME;
}

	/* Port, stored without the leading ':' (it may be an empty string). */
<PORT>:{port}		{
	__uri->authority->port = strdupn(yytext + 1, yyleng - 1);
	if (__uri->authority->port == NULL)
	{
		uri_destroy(__uri);
		return -1;
	}
	__length += yyleng;
	BEGIN REG_NAME;
}

<PORT>.|\n			|
<PORT><<EOF>>		{
	yyless(0);
	BEGIN REG_NAME;
}

<REG_NAME>{reg_name}	{
	/* We have assumed that the authority is a server, but it seems that
	 * we are wrong: it's a reg_name. We should join the userinfo, host
	 * and the port together with this yytext into a reg_name. This case
	 * seldom happens, almost never. */
	struct authority *auth = __uri->authority;
	size_t userinfo_len = auth->userinfo ? strlen(auth->userinfo) : 0;
	size_t host_len = auth->host ? strlen(auth->host) : 0;
	size_t port_len = auth->port ? strlen(auth->port) : 0;
	size_t total = (size_t)yyleng + host_len;
	char *reg_name, *curpos;

	/* The '@' and ':' separators were dropped when the parts were stored,
	 * so account for one extra character for each part that is present. */
	if (auth->userinfo)
		total += userinfo_len + 1;
	if (auth->port)
		total += port_len + 1;

	reg_name = (char *)malloc(total + 1);
	if (reg_name == NULL)
	{
		uri_destroy(__uri);
		return -1;
	}

	curpos = reg_name;
	if (auth->userinfo)
	{
		MEMCPY_PLUS(curpos, auth->userinfo, userinfo_len);
		*curpos++ = '@';
	}
	if (auth->host)
	{
		MEMCPY_PLUS(curpos, auth->host, host_len);
	}
	if (auth->port)
	{
		*curpos++ = ':';
		MEMCPY_PLUS(curpos, auth->port, port_len);
	}
	MEMCPY_PLUS(curpos, yytext, yyleng);
	*curpos = '\0';

	/* Drop the server parts and turn the authority into a reg_name. */
	URI_AUTH_DESTROY(auth);
	URI_AUTH_INIT(auth, AT_REG_NAME);
	auth->reg_name = reg_name;
	/* The earlier parts were already counted by their own rules. */
	__length += yyleng;
	yy_push_state(ABS_PATH);
}

<REG_NAME>.|\n		|
<REG_NAME><<EOF>>	{
	yyless(0);
	yy_push_state(ABS_PATH);
}

	/* Absolute path. Unwind the state stack first: one pop returns to the
	 * state that pushed ABS_PATH, and if that was AUTHORITY (no authority
	 * present) a second pop discards it as well. */
<ABS_PATH>{abs_path}	{
	yy_pop_state();
	if (YY_START == AUTHORITY)
		yy_pop_state();
	__uri->path = strdupn(yytext, yyleng);
	if (__uri->path == NULL)
	{
		uri_destroy(__uri);
		return -1;
	}
	__length += yyleng;
	BEGIN QUERY;
}

<ABS_PATH>.|\n		|
<ABS_PATH><<EOF>>	{
	yyless(0);
	yy_pop_state();
	/* The previous state is "AUTHORITY" means the URI has NO authority. */
	if (YY_START == AUTHORITY)
	{
		yy_pop_state();
		/* The previous state is "SCHEME" means the URI HAS a scheme.
		 * It's a little confusing. */
		if (YY_START == SCHEME)
			BEGIN OPAQUE_PART;
		else
			BEGIN FRAGMENT;
	}
	else
		BEGIN QUERY;
}

	/* Opaque part of an absolute URI. The BNF above allows "#"{fragment}
	 * after any absoluteURI, including the opaque form, so continue in
	 * FRAGMENT instead of jumping straight to ACCEPT. */
<OPAQUE_PART>{opaque_part}	{
	__uri->path = strdupn(yytext, yyleng);
	if (__uri->path == NULL)
	{
		uri_destroy(__uri);
		return -1;
	}
	__length += yyleng;
	BEGIN FRAGMENT;
}

<OPAQUE_PART>.|\n		|
<OPAQUE_PART><<EOF>>	{
	yyless(0);
	BEGIN FRAGMENT;
}

	/* Query, stored without the leading '?'. {query} already consumes any
	 * further '?' characters, so nothing is left for this state to match;
	 * move on to FRAGMENT directly. */
<QUERY>\?{query}	{
	__uri->query = strdupn(yytext + 1, yyleng - 1);
	if (__uri->query == NULL)
	{
		uri_destroy(__uri);
		return -1;
	}
	__length += yyleng;
	BEGIN FRAGMENT;
}

<QUERY>.|\n			|
<QUERY><<EOF>>		{
	yyless(0);
	BEGIN FRAGMENT;
}

	/* Fragment, stored without the leading '#'. */
<FRAGMENT>#{fragment}	{
	if (__uri->fragment = strdupn(yytext + 1, yyleng - 1))
	{
		__length += yyleng;
		BEGIN ACCEPT;
	}
	else
	{
		uri_destroy(__uri);
		return -1;
	}
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -