📄 uri.l

📁 在linux下的crawler程序,来自北大天网tiny search engine spider
💻 L
📖 第 1 页 / 共 3 页
字号:
12 3 下一页
/* * uri.l -- Routines dealing with URI, mainly parsing and merging. * Created: Xie Han, OS lab of Peking University. <me@pku.edu> * * The grammar of URI is quite simple, so yacc is not needed. * * Created: Sep 25 04:15am 2003. version 0.1.1 *		# Can read URI from standard input, parse it and print every *		  part of the URI. * * Updated: Sep 25 05:10pm 2003. version 0.1.2 * * Updated: Sep 25 10:30pm 2003. version 0.1.3 *		# A lot of changes in the program structure, much more clear. *		# Now the result is written into a uri structure. *		# A little bug. String "?" is accepted as a valid URI. * * Updated: Sep 26 02:16am 2003. version 0.1.4 *		# 2 states are added to solve the bug in version 0.1.3. *		# The uri structure was redesigned. * * Updated: Sep 27 05:21am 2003. version 1.0.0!!!! *		# All states are changed from exclusive to shared to make *		  the lex program more compatible because some lexers do not *		  support exclusive start condition. *		# "unput" is substituted by "yyless" when putting back excess *		  text, because the latter costs less. *		# 4 parsing functions are completed, each of which has different *		  characteristics. *		# Function "uri_rally_parse" is added, which cooperates with *		  other lex programs. * * Updated: Sep 28 1:00pm 2003. version 1.1.0 *		# The function "uri_rel_to_abs" is under construction. *		# The URI structure is changed again: "path_type" field is removed, *		  and "abs_path", "rel_path", "opaque_path" are substituted by *		  "path" because the original design is too sophisticated. * * Updated: National Day 5:06am 2003. version 1.1.1 *		# The function "__uri_path_merge" which merges the base path and *		  the relative path is finished and tested and proved to be a *		  nice job! This function is the core of merging a base URI and *		  a relative URI into an absolute URI. *		# The previous function "uri_rel_to_abs" is commented out. It's *		  terrible. * * Updated: National Day 5:00pm 2003. 
version 1.1.2 *		# Now we distinguish between "no" and "empty". Related fields *		  include "userinfo", "port", "query", and "fragment". The previous *		  version treats "no" and "empty" equally, which is not right. * * Updated: Oct 2 2:42am 2003. version 1.1.3 *		# Solve the conflict of server and reg_name by adding a start *		  condition and some ugly code that is normally not reached. *		# The "uri_rel_to_abs" is rewritten but has not been tested. * * Updated: Oct 4 6:33pm 2003. version 1.2.0 *		# Function "uri_path_merge" now returns the length of the result *		  path while the result path is returned by a pointer. *		# Function "uri_rel_to_abs" is renamed "uri_merge", and it will *		  return the string length of the result URI. *		# Function "uri_recombine" is coded, which recombines a URI *		  structure into a URI string, and the string length is returned. *		# The URI merging algorithm in RFC 2396 is found not flawless, and *		  the problem is fixed in "uri_merge" with not very graceful means. * * Updated: Oct 5 2:09am 2003. version 1.2.1 *		# From this version function "uri_parse_string" is implemented by *		  "uri_parse_bytes" which from now on is dependent on *		  "uri_parse_buffer". *		# "uri_validate_string" and "uri_validate_bytes" were added. Their *		  names explain what they do. *		# Keyword "static" was taken off from the front of function *		  "__uri_path_merge" while it is not declared in the .h file. I *		  mean to make it a secret interface ^_^ * * Updated: Oct 5 4:43am 2003. version 1.2.2 *		# Cross the first change log of version 1.2.1! *		# "uri_validate_string" is no longer implemented by *		  "uri_validate_bytes". * * Updated: Oct 9 4:49am 2003. version 1.2.3 *		# Function "uri_rally_parse" no longer exists, because it cannot *		  work. The original desire that it can cooperate with other lex *		  programs is indeed out of the question. *		# Macro URI_INIT changed slightly. * * Updated: Oct 10 4:28am 2003. 
version 1.2.4 *		# The arguments sequence of "rel_xxx" and "base_xxx" of functions *		  "__uri_path_merge" and "uri_merge" are exchanged. "rel_xxx" from *		  now on precedes "base_xxx". * * Updated: Oct 11 3:31am 2003. version 1.2.5 *		# URI merging algorithm is changed again. We check if the path of *		  the relative URI is defined before merging path, and if it's not *		  defined, we take the base URI's path directly rather than merging *		  the paths. I think that the URI merging algorithm is now at the *		  edge of perfect. (The recommended merging algorithm of RFC 2396 *		  has a lot of problems.) *		# The rule of "<REG_NAME>{reg_name}" is optimized, because "strcat" *		  is slow. * * Updated: Oct 12 4:49am 2003. version 1.2.6 *		# "goto" statements are substituted by "do ... while (0)" and *		  "break" in function "uri_merge" and "uri_recombine". No real *		  improvement is made but showing off continues. * * Updated: Oct 12 4:55pm 2003. version 1.2.7 *		# URIs with Chinese characters are supported in an obviously *		  not perfect way: the "unreserved" BNF allows non-US-ASCII *		  characters, i.e., characters with value that ranges from *		  128 to 255. * * Updated: Oct 13 1:06am 2003. version 1.2.8 *		# Some code of "__uri_path_merge" was rewritten because the *		  interface of stack module changed. *		# NOTE!!! NULL can no longer be passed to "__uri_path_merge" as *		  a relative path or a base path, and the function from now on *		  won't set NULL as the result path. If the result path is *		  empty, the "abs_path" will point to a pointer that points to *		  an empty string ("\0"). So you can always "free(*abs_path)" *		  if "__uri_path_merge" succeeded. * * Updated: Oct 15 1:13am 2003. version 1.4.0 *		# The update of 1.x.x version continues because version 2.0.x *		  runs slower than 1.x.x (half speed), though 2.0.x is written *		  in much shorter and very nice codes. *		# The new uri structure is derived from 1.3.x. 
1.3.x is a test *		  version which has been updated into 2.0.0. Indeed, 1.4.x is *		  compatible with 2.0.x completely, but 2.0.x is NOT compatible *		  with 1.4.x because 2.0.x has only one parsing function: *		  "uri_parse_bytes". I am unwilling to say that 2.0.x is a *		  failure because its code is very nice. It's slow because *		  it scans the string more than once: first time determine the *		  length of the URI, and second time break down every components. *		  And when breaking down authority it must scan the authority *		  again to solve the conflict of server and reg_name. Indeed the *		  authority is scanned 3 times. *		# "__uri_path_merge", "uri_destroy", "uri_merge", "uri_recombine" *		  are taken from 2.0.0 because the uri structure changed. *		# "__uri_path_merge" once again supports "no" path. Forget the *		  second change log of version 1.2.8. *		# Some state names changed. * * Updated: Oct 15 7:51am 2003. version 1.4.1 *		# Fix a bug in "__uri_path_merge": when encountered a "../", *		  the stack won't pop when the stack top segment is "/". This *		  is not right because "/" is not necessarily the root path. *		  Now, when encountered a "../", the stack won't pop when and *		  only when the stack is empty or the stack has 1 segment and *		  it's a "/", i.e., when the following is FALSE: *			stack_height(stack) > sizeof (char *) + sizeof (int) || *			!stack_empty(stack) && stack_peep(int, stack) != 1 *		# Now the parsing process is very neat with "state stack" *		  introduced. All URIs will pass all states when being parsed. I *		  think it nicer than the 2.0.x parsing process. The performance *		  is almost equal to 1.4.0, twice the performance of 2.0.x. *		# Some internal macros and functions change. Some parentheses *		  are added. From now on "{" follows "do" immediately if the *		  corresponding "while" is "while (0)". The aim is to let the *		  readers know that indeed it's not an iteration. * * Updated: Oct 18 11:43am 2003. 
version 1.4.2 *		# Fix the bug that port can exist without host. If no host found, *		  we should BEGIN REG_NAME instead of BEGIN PORT. *		# The statement in the second change log of version 1.4.1 that *		  "All URIs will pass all states when being parsed." is not the *		  truth. Forget it. *		# We now check the return value of "yy_scan_xxx" and *		  "yy_create_xxx" because there's no guarantee that they would *		  not fail. (Only "yy_scan_buffer" may fail practically.) * * Updated: Oct 18 22:52pm 2003. version 1.4.3 *		# Function "uri_length_string" and "uri_length_bytes" are added. *		  They return the same value as "uri_parse_string" and *		  "uri_parse_bytes", but they do not break down the URI so they *		  are faster (hoped). And, from now on "uri_validate_string" and *		  "uri_validate_bytes" are implemented by these two functions *		  respectively. *		# State "FINISH" is renamed "ACCEPT", meaning a URI string is *		  accepted. * * Updated: Oct 20 1:28am 2003. version 1.4.4 *		# New version uses stack 2.2.x, and now we must tell the stack *		  what data type we wanna push or pop. A slight mistake in *		  "__uri_path_merge" is fixed: pushing or popping a segment name, *		  the data type should be "const char *", not "char *". * * Updated: Oct 20 6:10am 2003. version 1.4.5 *		# Some code blocks are moved to make the codes look clear. * * Updated: Oct 23 9:31pm 2003. version 1.4.6 *		# I forgot to destroy the stack when the result path is "no path". *		  Fixed. *		# Some names of internal functions and macros are changed to make *		  everything simpler. "__uri_path_merge" -> "__path_merge", *		  "URI_AUTH_INIT" -> "AUTH_INIT", "__URI_STRDUP" -> "__STRDUP", *		  "__URI_AUTH_DUP" -> "__AUTH_DUP". *		# Fix the bug that opaque_part cannot have a fragment. *		# No longer supports Non-US-ASCII characters in URI string. Because *		  the way that we used is imperfect. * * Updated: Oct 27 4:13am 2003. version 1.5.0 *		# Hmmmmmmmmm. 
A better way to support non-ASCII characters is out. *		  Function "uri_escape" is added to turn some bytes into a string *		  in escaped form, i.e., all non-ASCII characters and excluded *		  ASCII characters are turned into %xx. "xx" is the hex value of *		  that character. *		# Macro uri_isexcl is added for users to examine whether a *		  character is an excluded ASCII character. But notice that this *		  macro is as dangerous as every "isxxx" macro provided by *		  "ctype.h". Make sure that the character's value ranges from *		  0 - 127, or your program may crash. * * Updated: Oct 28 4:49am 2003. version 1.5.1 *		# Macro "uri_isexcl" is no longer available for users, because it *		  is not perfect. *		# Two arguments are added to function "uri_escape". *		  @const char *ignore: Octets that should be treated as if they *		  do not exist; *		  @int ignlen: How many octets in argument "ignore". Note that *		  "ignore" is NOT necessarily a '\0' terminated string. * * Updated: Oct 30 7:09am 2003. version 1.5.2 *		# Function "uri_recombine" is updated by "uri_combine". The latter *		  take one more argument: "int flags" indicating which components *		  you are interested and want to appear in the URI string. "flags" *		  can be any C_xxx macro or their "|" combination. * * Updated: Nov 4 4:30am 2003. version 1.6.0 *		# Function "uri_escape" can no longer ignore some characters. *		  Arguments "ignore" and "ignlen" are removed. * * Updated: Nov 4 3:45am 2003. version 1.6.1 *		# Macro "is_uri_chr" is added for users to check whether a character *		  allowed in a URI string. *		# Function "uri_escape" is rewritten using macro "is_uri_chr". *//* The following are the BNFs generating URI-reference, taken from RFC 2396. 
*/
URI-reference	({absoluteURI}|{relativeURI})?("#"{fragment})?
absoluteURI		{scheme}":"({hier_part}|{opaque_part})
relativeURI		({net_path}|{abs_path}|{rel_path})("?"{query})?
hier_part		({net_path}|{abs_path})("?"{query})?
opaque_part		{uric_no_slash}{uric}*
uric_no_slash	{unreserved}|{escaped}|";"|"?"|":"|"@"|"&"|"="|"+"|"$"|","
net_path		"//"{authority}{abs_path}?
abs_path		"/"{path_segments}
rel_path		{rel_segment}{abs_path}?
rel_segment		({unreserved}|{escaped}|";"|"@"|"&"|"="|"+"|"$"|",")+
scheme			{alpha}({alpha}|{digit}|"+"|"-"|".")*
authority		{server}|{reg_name}
reg_name		({unreserved}|{escaped}|"$"|","|";"|":"|"@"|"&"|"="|"+")+
server			(({userinfo}"@")?{hostport})?
userinfo		({unreserved}|{escaped}|";"|":"|"&"|"="|"+"|"$"|",")*
hostport		{host}(":"{port})?
host			{hostname}|{IPv4address}
hostname		({domainlabel}".")*{toplabel}"."?
domainlabel		{alphanum}|{alphanum}({alphanum}|"-")*{alphanum}
toplabel		{alpha}|{alpha}({alphanum}|"-")*{alphanum}
IPv4address		{digit}+"."{digit}+"."{digit}+"."{digit}+
port			{digit}*
path			({abs_path}|{opaque_part})?
path_segments	{segment}("/"{segment})*
segment			{pchar}*(";"{param})*
param			{pchar}*
pchar			{unreserved}|{escaped}|":"|"@"|"&"|"="|"+"|"$"|","
query			{uric}*
fragment		{uric}*
uric			{reserved}|{unreserved}|{escaped}
reserved		";"|"/"|"?"|":"|"@"|"&"|"="|"+"|"$"|","
unreserved		{alphanum}|{mark}
mark			"-"|"_"|"."|"!"|"~"|"*"|"'"|"("|")"
escaped			"%"{hex}{hex}
hex				{digit}|[A-Fa-f]
alphanum		{alpha}|{digit}
alpha			{lowalpha}|{upalpha}
lowalpha		[a-z]
upalpha			[A-Z]
digit			[0-9]

%option stack

%s SCHEME REL_PATH AUTHORITY USERINFO HOST PORT REG_NAME ABS_PATH
%s OPAQUE_PART QUERY FRAGMENT ACCEPT

%{
#include <errno.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <list.h>
#include <stack.h>
#include <misc.h>
#include "uri.h"

/* Set every component pointer of the uri structure to NULL. */
#define URI_INIT(uri) \
do {											\
	(uri)->scheme = NULL;						\
	(uri)->authority = NULL;					\
	(uri)->path = NULL;							\
	(uri)->query = NULL;						\
	(uri)->fragment = NULL;						\
} while (0)

/* Record the authority type `at` in `auth` and set the pointers that
 * belong to that type to NULL: userinfo/host/port for AT_SERVER,
 * reg_name for any other type. */
#define AUTH_INIT(auth, at) \
do {											\
	if (((auth)->type = (at)) == AT_SERVER)		\
	{											\
		(auth)->userinfo = NULL;				\
		(auth)->host = NULL;					\
		(auth)->port = NULL;					\
	}											\
	else										\
		(auth)->reg_name = NULL;				\
} while (0)

/* Release the strings held by `auth`, selected by its type. The
 * authority structure itself is not freed here. FREE_NOT_NULL is not
 * defined in this file (presumably it comes from misc.h). */
#define URI_AUTH_DESTROY(auth) \
do {											\
	if ((auth)->type == AT_SERVER)				\
	{											\
		FREE_NOT_NULL((auth)->userinfo);		\
		FREE_NOT_NULL((auth)->host);			\
		FREE_NOT_NULL((auth)->port);			\
	}											\
	else										\
		FREE_NOT_NULL((auth)->reg_name);		\
} while (0)

/* Scanner-wide state shared by the rule actions below: __length is
 * increased by yyleng each time a component is accepted, and __uri is
 * the structure the components are stored into. Both are assumed to be
 * set up by the parsing entry points before the scanner runs (not
 * visible in this part of the file). */
static int __length;
static struct uri *__uri;
%}

%%

	/* A scheme followed by ":" is only taken when the next character is
	 * "/" or a uric_no_slash character (trailing context, not consumed).
	 * The matched text ends with ":", so yyleng - 1 is the scheme length. */
<SCHEME>{scheme}":"/"/"|{uric_no_slash}		{
	__uri->scheme = strdupn(yytext, yyleng - 1);
	if (__uri->scheme == NULL)
	{
		uri_destroy(__uri);
		return -1;
	}

	__length += yyleng;
	yy_push_state(AUTHORITY);
}

	/* No scheme here: put the character back and try a relative path. */
<SCHEME>.|\n		{
	yyless(0);
	BEGIN REL_PATH;
}

<SCHEME><<EOF>>		BEGIN REL_PATH;

<REL_PATH>{rel_path}	{
	__uri->path = strdupn(yytext, yyleng);
	if (__uri->path == NULL)
	{
		uri_destroy(__uri);
		return -1;
	}

	__length += yyleng;
	BEGIN QUERY;
}

	/* Not a rel_path: put the character back and try an authority. */
<REL_PATH>.|\n		{
	yyless(0);
	yy_push_state(AUTHORITY);
}

<REL_PATH><<EOF>>	yy_push_state(AUTHORITY);

	/* Authority and abs_path have conflict! If the following is "//",
	 * we assume that it's an authority; if the following is "/", it's
	 * an abs_path. */
<AUTHORITY>"//"		{
	/* The pushed state is popped before the allocation, so it has
	 * already been discarded on the failure path as well. */
	yy_pop_state();
	__uri->authority = (struct authority *)malloc(sizeof (struct authority));
	if (__uri->authority == NULL)
	{
		uri_destroy(__uri);
		return -1;
	}

	AUTH_INIT(__uri->authority, AT_SERVER);
	__length += yyleng;
	BEGIN USERINFO;
}

	/* No "//": put the character back and continue with ABS_PATH. */
<AUTHORITY>.|\n		{
	yyless(0);
	yy_push_state(ABS_PATH);
}
12 3 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -