⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 uri.l

📁 Linux TSE 源代码! 十分宝贵
💻 L
📖 第 1 页 / 共 2 页
字号:
/**
 * uri.l -- Routines dealing with URI, mainly parsing and merging.
 * Created: Xie Han, net lab of Peking University. <e@pku.edu.cn>
 *
 * This is the first module of the web crawler. Used widely.
 * Created: Sep 25 04:15am 2003. version 0.1.1
 * Last updated: Oct 13 04:15am 2005. version 1.6.3
 */

/* The following are the BNF rules generating URI-reference, taken from
 * RFC 2396. Each line is a flex name definition; the rules section refers
 * to them as {name}. */

URI-reference	({absoluteURI}|{relativeURI})?("#"{fragment})?
absoluteURI		{scheme}":"({hier_part}|{opaque_part})
relativeURI		({net_path}|{abs_path}|{rel_path})("?"{query})?
hier_part		({net_path}|{abs_path})("?"{query})?
opaque_part		{uric_no_slash}{uric}*
uric_no_slash	{unreserved}|{escaped}|";"|"?"|":"|"@"|"&"|"="|"+"|"$"|","
net_path		"//"{authority}{abs_path}?
abs_path		"/"{path_segments}
rel_path		{rel_segment}{abs_path}?
rel_segment		({unreserved}|{escaped}|";"|"@"|"&"|"="|"+"|"$"|",")+
scheme			{alpha}({alpha}|{digit}|"+"|"-"|".")*
authority		{server}|{reg_name}
reg_name		({unreserved}|{escaped}|"$"|","|";"|":"|"@"|"&"|"="|"+")+
server			(({userinfo}"@")?{hostport})?
userinfo		({unreserved}|{escaped}|";"|":"|"&"|"="|"+"|"$"|",")*
hostport		{host}(":"{port})?
host			{hostname}|{IPv4address}
hostname		({domainlabel}".")*{toplabel}"."?
domainlabel		{alphanum}|{alphanum}({alphanum}|"-")*{alphanum}
toplabel		{alpha}|{alpha}({alphanum}|"-")*{alphanum}
IPv4address		{digit}+"."{digit}+"."{digit}+"."{digit}+
port			{digit}*
path			({abs_path}|{opaque_part})?
path_segments	{segment}("/"{segment})*
segment			{pchar}*(";"{param})*
param			{pchar}*
pchar			{unreserved}|{escaped}|":"|"@"|"&"|"="|"+"|"$"|","
query			{uric}*
fragment		{uric}*
uric			{reserved}|{unreserved}|{escaped}
reserved		";"|"/"|"?"|":"|"@"|"&"|"="|"+"|"$"|","
unreserved		{alphanum}|{mark}
mark			"-"|"_"|"."|"!"|"~"|"*"|"'"|"("|")"
escaped			"%"{hex}{hex}
hex				{digit}|[A-Fa-f]
alphanum		{alpha}|{digit}
alpha			{lowalpha}|{upalpha}
lowalpha		[a-z]
upalpha			[A-Z]
digit			[0-9]

/* The start-condition stack is required by the yy_push_state() and
 * yy_pop_state() calls in the rules section. */
%option stack

/* One start condition per URI component. __uri_parse() enters SCHEME and
 * the rules walk through the components in order until ACCEPT returns. */
%s SCHEME REL_PATH AUTHORITY USERINFO HOST PORT REG_NAME ABS_PATH
%s OPAQUE_PART QUERY FRAGMENT ACCEPT

%{
#include <errno.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <stack.h>
#include "uri.h"

/* Set all five component pointers of a struct uri to NULL. */
#define URI_INIT(uri) \
do {											\
	(uri)->scheme = NULL;						\
	(uri)->authority = NULL;					\
	(uri)->path = NULL;							\
	(uri)->query = NULL;						\
	(uri)->fragment = NULL;						\
} while (0)

/* Store `at` as the authority type, then NULL the fields that belong to
 * that type: userinfo/host/port for AT_SERVER, reg_name otherwise. */
#define AUTH_INIT(auth, at) \
do {											\
	if (((auth)->type = (at)) == AT_SERVER)		\
	{											\
		(auth)->userinfo = NULL;				\
		(auth)->host = NULL;					\
		(auth)->port = NULL;					\
	}											\
	else										\
		(auth)->reg_name = NULL;				\
} while (0)

/* Free the strings owned by an authority according to its type. The
 * authority structure itself is not freed and the pointers are not reset. */
#define AUTH_DESTROY(auth) \
do {											\
	if ((auth)->type == AT_SERVER)				\
	{											\
		free((auth)->userinfo);					\
		free((auth)->host);						\
		free((auth)->port);						\
	}											\
	else										\
		free((auth)->reg_name);					\
} while (0)

/* Number of input characters consumed by the components matched so far;
 * this is the value returned from the ACCEPT rules. */
static int __length;

/* The uri structure being filled in by the parse in progress. */
static struct uri *__uri;

/* Copy the first n bytes at s into a newly malloc'ed, '\0'-terminated
 * buffer of n + 1 bytes. Returns NULL if the allocation fails; otherwise
 * the caller is responsible for freeing the result. */
char *__memtostr(const void *s, int n)
{
	char *str = (char *)malloc((n + 1) * sizeof (char));

	if (str)
	{
		memcpy(str, s, n);
		*(str + n) = '\0';
	}
	return str;
}
%}

%%

	/* Conventions shared by the rules below:
	 *  - A matched component is copied with __memtostr() and its length is
	 *    added to __length. If the copy fails, uri_destroy() is called on
	 *    the partially built uri and yylex() returns -1.
	 *  - Each state has a ".|\n" fallback that pushes the character back
	 *    with yyless(0) and moves on, so the component is optional.
	 *  - Each state has an <<EOF>> rule making the same transition without
	 *    yyless(), since nothing was matched. */

<SCHEME>{scheme}":"		{
	/* yyleng - 1 drops the trailing ':' from the stored scheme. */
	if (__uri->scheme = __memtostr(yytext, yyleng - 1))
	{
		__length += yyleng;
		/* Pushing records SCHEME on the stack so the ABS_PATH fallback
		 * can later tell that a scheme was present. */
		yy_push_state(AUTHORITY);
	}
	else
	{
		uri_destroy(__uri);
		return -1;
	}
}

<SCHEME>.|\n		{
	/* No scheme: rescan the same input as a relative path. */
	yyless(0);
	BEGIN REL_PATH;
}

<SCHEME><<EOF>>		BEGIN REL_PATH;

<REL_PATH>{rel_path}	{
	if (__uri->path = __memtostr(yytext, yyleng))
	{
		__length += yyleng;
		BEGIN QUERY;
	}
	else
	{
		uri_destroy(__uri);
		return -1;
	}
}

<REL_PATH>.|\n		{
	/* Not a rel_path: try an authority, leaving REL_PATH on the stack. */
	yyless(0);
	yy_push_state(AUTHORITY);
}

<REL_PATH><<EOF>>	yy_push_state(AUTHORITY);

	/* Authority and abs_path have conflict! If the following is "//",
	 * we assume that it's an authority; if the following is "/", it's
	 * an abs_path. */
<AUTHORITY>"//"		{
	/* Discard the state saved when AUTHORITY was entered. */
	yy_pop_state();
	__uri->authority = (struct authority *)malloc(sizeof (struct authority));
	if (__uri->authority)
	{
		/* Assume a server-style authority first; the REG_NAME rule
		 * converts it if that assumption turns out to be wrong. */
		AUTH_INIT(__uri->authority, AT_SERVER);
		__length += yyleng;
		BEGIN USERINFO;
	}
	else
	{
		uri_destroy(__uri);
		return -1;
	}
}

<AUTHORITY>.|\n		{
	/* No "//": keep AUTHORITY on the stack so the ABS_PATH rules can see
	 * that the URI has no authority. */
	yyless(0);
	yy_push_state(ABS_PATH);
}

<AUTHORITY><<EOF>>	yy_push_state(ABS_PATH);

<USERINFO>{userinfo}"@"		{
	/* yyleng - 1 drops the trailing '@'. */
	if (__uri->authority->userinfo = __memtostr(yytext, yyleng - 1))
	{
		__length += yyleng;
		BEGIN HOST;
	}
	else
	{
		uri_destroy(__uri);
		return -1;
	}
}

<USERINFO>.|\n		{
	yyless(0);
	BEGIN HOST;
}

<USERINFO><<EOF>>	BEGIN HOST;

<HOST>{host}		{
	if (__uri->authority->host = __memtostr(yytext, yyleng))
	{
		__length += yyleng;
		BEGIN PORT;
	}
	else
	{
		uri_destroy(__uri);
		return -1;
	}
}

<HOST>.|\n			{
	yyless(0);
	BEGIN REG_NAME;
}

<HOST><<EOF>>		BEGIN REG_NAME;

<PORT>":"{port}		{
	/* Skip the leading ':'; {port} may be empty, which stores "". */
	if (__uri->authority->port = __memtostr(yytext + 1, yyleng - 1))
	{
		__length += yyleng;
		BEGIN REG_NAME;
	}
	else
	{
		uri_destroy(__uri);
		return -1;
	}
}

<PORT>.|\n			{
	yyless(0);
	BEGIN REG_NAME;
}

<PORT><<EOF>>		BEGIN REG_NAME;

<REG_NAME>{reg_name}	{
	/* We have assumed that the authority is a server, but it seems that
	 * we are wrong: it's a reg_name. We should join the userinfo, host
	 * and the port together with this yytext into a reg_name. */
	char *reg_name;
	int len = yyleng;
	char *curpos;

	/* Compute the joined length; the "+ 1" terms account for the '@' and
	 * ':' separators written back below. */
	if (__uri->authority->userinfo)
		len += strlen(__uri->authority->userinfo) + 1;
	if (__uri->authority->host)
		len += strlen(__uri->authority->host);
	if (__uri->authority->port)
		len += strlen(__uri->authority->port) + 1;

	if (reg_name = (char *)malloc((len + 1) * sizeof (char)))
	{
		/* Rebuild "userinfo@host:port" followed by the text just matched. */
		curpos = reg_name;
		if (__uri->authority->userinfo)
		{
			len = strlen(__uri->authority->userinfo);
			memcpy(curpos, __uri->authority->userinfo, len);
			curpos += len;
			*curpos++ = '@';
		}
		if (__uri->authority->host)
		{
			len = strlen(__uri->authority->host);
			memcpy(curpos, __uri->authority->host, len);
			curpos += len;
		}
		if (__uri->authority->port)
		{
			*curpos++ = ':';
			len = strlen(__uri->authority->port);
			memcpy(curpos, __uri->authority->port, len);
			curpos += len;
		}
		len = strlen(yytext);
		memcpy(curpos, yytext, len);
		curpos += len;
		*curpos = '\0';
		/* Release the server-style fields and switch the authority over
		 * to the reg_name representation. */
		AUTH_DESTROY(__uri->authority);
		AUTH_INIT(__uri->authority, AT_REG_NAME);
		__uri->authority->reg_name = reg_name;
		/* Only the newly matched text is counted here; the userinfo, host
		 * and port rules already added their own lengths. */
		__length += yyleng;
		yy_push_state(ABS_PATH);
	}
	else
	{
		uri_destroy(__uri);
		return -1;
	}
}

<REG_NAME>.|\n		{
	yyless(0);
	yy_push_state(ABS_PATH);
}

<REG_NAME><<EOF>>	yy_push_state(ABS_PATH);

<ABS_PATH>{abs_path}	{
	/* Return to the state ABS_PATH was pushed from. If that was AUTHORITY,
	 * also pop the state AUTHORITY itself was pushed from, so the stack is
	 * balanced before leaving for QUERY. */
	yy_pop_state();
	if (YY_START == AUTHORITY)
		yy_pop_state();
	if (__uri->path = __memtostr(yytext, yyleng))
	{
		__length += yyleng;
		BEGIN QUERY;
	}
	else
	{
		uri_destroy(__uri);
		return -1;
	}
}

<ABS_PATH>.|\n		|
<ABS_PATH><<EOF>>	{
	/* When encountered an EOF we can not yyless. */
	if (yyleng == 1)
		yyless(0);
	yy_pop_state();
	/* The previous state is "AUTHORITY" indicates the URI
	 * has NO authority. */
	if (YY_START == AUTHORITY)
	{
		yy_pop_state();
		/* The previous state is "SCHEME" indicates the URI
		 * HAS a scheme. It's a little confusing. */
		if (YY_START == SCHEME)
			BEGIN OPAQUE_PART;
		else
			BEGIN FRAGMENT;
	}
	else
		BEGIN QUERY;
}

<OPAQUE_PART>{opaque_part}	{
	/* The opaque part is stored in the path field. */
	if (__uri->path = __memtostr(yytext, yyleng))
	{
		__length += yyleng;
		BEGIN FRAGMENT;
	}
	else
	{
		uri_destroy(__uri);
		return -1;
	}
}

<OPAQUE_PART>.|\n		{
	yyless(0);
	BEGIN FRAGMENT;
}

<OPAQUE_PART><<EOF>>	BEGIN FRAGMENT;

<QUERY>"?"{query}	{
	/* Skip the leading '?'. */
	if (__uri->query = __memtostr(yytext + 1, yyleng - 1))
	{
		__length += yyleng;
		BEGIN FRAGMENT;
	}
	else
	{
		uri_destroy(__uri);
		return -1;
	}
}

<QUERY>.|\n			{
	yyless(0);
	BEGIN FRAGMENT;
}

<QUERY><<EOF>>		BEGIN FRAGMENT;

<FRAGMENT>"#"{fragment}		{
	/* Skip the leading '#'. */
	if (__uri->fragment = __memtostr(yytext + 1, yyleng - 1))
	{
		__length += yyleng;
		BEGIN ACCEPT;
	}
	else
	{
		uri_destroy(__uri);
		return -1;
	}
}

<FRAGMENT>.|\n		{
	yyless(0);
	BEGIN ACCEPT;
}

<FRAGMENT><<EOF>>	BEGIN ACCEPT;

<ACCEPT>.|\n		{
	/* Leave the character that follows the URI unconsumed and report how
	 * many characters the URI occupied. */
	yyless(0);
	return __length;
}

<ACCEPT><<EOF>>		return __length;

	/* NOTE(review): __uri_parse() always begins in SCHEME, so within the
	 * code visible here nothing enters INITIAL; these rules only apply if
	 * yylex() is called without going through __uri_parse(). */
<INITIAL>{URI-reference}	return yyleng;

<INITIAL>.|\n		{
	yyless(0);
	return 0;
}

<INITIAL><<EOF>>	return 0;

%%

/* Called by the scanner at end of input; returning 1 tells it that there
 * is no further input to scan. */
int yywrap(void)
{
	return 1;
}

/* Lookup table indexed by ASCII code (0x00-0x7f): gives the numeric value
 * 0-15 of a hexadecimal digit character ('0'-'9', 'A'-'F', 'a'-'f') and
 * '\0' for every other character. Note that '0' also maps to 0, so the
 * table cannot by itself tell '0' apart from a non-hex character. */
char __hex2char[] = {
/*  00 nul  01 soh   02 stx  03 etx   04 eot  05 enq   06 ack  07 bel   */
    '\0',   '\0',    '\0',   '\0',    '\0',   '\0',    '\0',   '\0',
/*  08 bs   09 ht    0a nl   0b vt    0c np   0d cr    0e so   0f si    */
    '\0',   '\0',    '\0',   '\0',    '\0',   '\0',    '\0',   '\0',
/*  10 dle  11 dc1   12 dc2  13 dc3   14 dc4  15 nak   16 syn  17 etb   */
    '\0',   '\0',    '\0',   '\0',    '\0',   '\0',    '\0',   '\0',
/*  18 can  19 em    1a sub  1b esc   1c fs   1d gs    1e rs   1f us    */
    '\0',   '\0',    '\0',   '\0',    '\0',   '\0',    '\0',   '\0',
/*  20 sp   21 !     22 "    23 #     24 $    25 %     26 &    27 '     */
    '\0',   '\0',    '\0',   '\0',    '\0',   '\0',    '\0',   '\0',
/*  28 (    29 )     2a *    2b +     2c ,    2d -     2e .    2f /     */
    '\0',   '\0',    '\0',   '\0',    '\0',   '\0',    '\0',   '\0',
/*  30 0    31 1     32 2    33 3     34 4    35 5     36 6    37 7     */
    0,      1,       2,      3,       4,      5,       6,      7,
/*  38 8    39 9     3a :    3b ;     3c <    3d =     3e >    3f ?     */
    8,      9,       '\0',   '\0',    '\0',   '\0',    '\0',   '\0',
/*  40 @    41 A     42 B    43 C     44 D    45 E     46 F    47 G     */
    '\0',   10,      11,     12,      13,     14,      15,     '\0',
/*  48 H    49 I     4a J    4b K     4c L    4d M     4e N    4f O     */
    '\0',   '\0',    '\0',   '\0',    '\0',   '\0',    '\0',   '\0',
/*  50 P    51 Q     52 R    53 S     54 T    55 U     56 V    57 W     */
    '\0',   '\0',    '\0',   '\0',    '\0',   '\0',    '\0',   '\0',
/*  58 X    59 Y     5a Z    5b [     5c \    5d ]     5e ^    5f _     */
    '\0',   '\0',    '\0',   '\0',    '\0',   '\0',    '\0',   '\0',
/*  60 `    61 a     62 b    63 c     64 d    65 e     66 f    67 g     */
    '\0',   10,      11,     12,      13,     14,      15,     '\0',
/*  68 h    69 i     6a j    6b k     6c l    6d m     6e n    6f o     */
    '\0',   '\0',    '\0',   '\0',    '\0',   '\0',    '\0',   '\0',
/*  70 p    71 q     72 r    73 s     74 t    75 u     76 v    77 w     */
    '\0',   '\0',    '\0',   '\0',    '\0',   '\0',    '\0',   '\0',
/*  78 x    79 y     7a z    7b {     7c |    7d }     7e ~    7f del   */
    '\0',   '\0',    '\0',   '\0',    '\0',   '\0',    '\0',   '\0',
};

/* Inverse mapping: index 0-15 gives the upper-case hexadecimal digit. */
char __char2hex[] = "0123456789ABCDEF";

/* 16 bytes = 128 bits. Read most-significant-bit first, bit N corresponds
 * to ASCII code N: the set bits cover the alphanumerics, the RFC 2396
 * reserved and mark characters and '#', while controls, space, '"', '%',
 * '<', '>', '[', '\', ']', '^', '`', '{', '|', '}' and DEL are clear.
 * NOTE(review): the code that consults this table is outside this view;
 * confirm the bit order against it. */
char __uri_chr[] = {
	0x00, 0x00, 0x00, 0x00,
	0x5b, 0xff, 0xff, 0xf5,
	0xff, 0xff, 0xff, 0xe1,
	0x7f, 0xff, 0xff, 0xe2
};

/* Run the scanner once over the current flex buffer, filling in *uri.
 * Resets the shared parse state (__uri, __length), clears every component
 * of *uri, starts in SCHEME and returns whatever yylex() returns: the
 * number of characters the URI occupied, or -1 on allocation failure. */
static int __uri_parse(struct uri *uri)
{
	__uri = uri;
	__length = 0;
	URI_INIT(__uri);
	BEGIN SCHEME;
	return yylex();
}

/* Scan a string ('\0' terminated) and return the length of the uri.
 * Return negative number when and only when failed to allocate memory.
 *
 * string: the text to scan; a flex buffer is created for it with
 *         yy_scan_string() and deleted again before returning.
 * uri:    receives the parsed components. It is only initialized if the
 *         buffer was created; otherwise -1 is returned and *uri is left
 *         untouched. */
int uri_parse_string(const char *string, struct uri *uri)
{
	YY_BUFFER_STATE buf;
	int n = -1;

	if (buf = yy_scan_string(string))
	{
		yy_switch_to_buffer(buf);
		n = __uri_parse(uri);
		yy_delete_buffer(buf);
	}
	return n;
}

/* Scan some memory bytes. */
int uri_parse_bytes(const char *bytes, int len, struct uri *uri)
{
	YY_BUFFER_STATE buf;
	int n = -1;

	if (buf = yy_scan_bytes(bytes, len))
	{
		yy_switch_to_buffer(buf);
		n = __uri_parse(uri);

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -