📄 uri.l

📁 在linux下的crawler程序,来自北大天网tiny search engine spider
💻 L
📖 第 1 页 / 共 3 页
字号:
	/* Authority sub-states.  The components are tried in a fixed order:
	 * USERINFO, then HOST, then PORT, then REG_NAME.  When a component's
	 * pattern does not match, the catch-all rule of that state hands the
	 * character back with yyless(0) and the scanner moves to the next state
	 * without consuming anything.
	 *
	 * strdupn() is defined outside this section; it appears to copy the
	 * first n bytes of its argument into fresh memory and to return NULL on
	 * failure -- TODO confirm.  Every rule below treats a NULL result as
	 * fatal: the partially built URI is destroyed and yylex() returns -1. */

	/* Input ended while still in AUTHORITY: switch to path handling and
	 * keep AUTHORITY on the start-condition stack so that the ABS_PATH
	 * rules can tell where they were entered from. */
<AUTHORITY><<EOF>>	yy_push_state(ABS_PATH);
<USERINFO>{userinfo}"@"		{
	/* The match includes the trailing '@'; "yyleng - 1" keeps it out of
	 * the stored copy, while __length counts the whole match. */
	if (__uri->authority->userinfo = strdupn(yytext, yyleng - 1))
	{
		__length += yyleng;
		BEGIN HOST;
	}
	else
	{
		uri_destroy(__uri);
		return -1;
	}
}
<USERINFO>.|\n		{
	/* No userinfo: give the character back and try the host. */
	yyless(0);
	BEGIN HOST;
}
<USERINFO><<EOF>>	BEGIN HOST;
<HOST>{host}		{
	if (__uri->authority->host = strdupn(yytext, yyleng))
	{
		__length += yyleng;
		BEGIN PORT;
	}
	else
	{
		uri_destroy(__uri);
		return -1;
	}
}
<HOST>.|\n			{
	/* No host: skip PORT as well and go directly to REG_NAME. */
	yyless(0);
	BEGIN REG_NAME;
}
<HOST><<EOF>>		BEGIN REG_NAME;
<PORT>":"{port}		{
	/* "yytext + 1" / "yyleng - 1" skip the leading ':' in the stored
	 * copy; __length still counts it. */
	if (__uri->authority->port = strdupn(yytext + 1, yyleng - 1))
	{
		__length += yyleng;
		BEGIN REG_NAME;
	}
	else
	{
		uri_destroy(__uri);
		return -1;
	}
}
<PORT>.|\n			{
	yyless(0);
	BEGIN REG_NAME;
}
<PORT><<EOF>>		BEGIN REG_NAME;
<REG_NAME>{reg_name}	{
	/* We assumed that the authority is a server, but that turned out to be
	 * wrong: it is a reg_name.  The userinfo, host and port collected so
	 * far are joined with this yytext into a single reg_name string.  This
	 * case happens very rarely, almost never. */
	char *reg_name, *curpos;
	/* Total size: this match, plus each stored component, plus one byte
	 * for the '@' after userinfo and one for the ':' before port. */
	int len = yyleng;
	if (__uri->authority)
	{
		if (__uri->authority->userinfo)
			len += strlen(__uri->authority->userinfo) + 1;
		if (__uri->authority->host)
			len += strlen(__uri->authority->host);
		if (__uri->authority->port)
			len += strlen(__uri->authority->port) + 1;
	}
	/* One extra byte for the terminating '\0'. */
	if (reg_name = (char *)malloc((len + 1) * sizeof (char)))
	{
		/* MEMCPY_PLUS is defined outside this section; it is used here as
		 * "copy n bytes to curpos and advance curpos past them" -- TODO
		 * confirm against the macro definition. */
		curpos = reg_name;
		if (__uri->authority)
		{
			if (__uri->authority->userinfo)
			{
				MEMCPY_PLUS(curpos, __uri->authority->userinfo,
							strlen(__uri->authority->userinfo));
				*curpos++ = '@';
			}
			if (__uri->authority->host)
				MEMCPY_PLUS(curpos, __uri->authority->host,
							strlen(__uri->authority->host));
			if (__uri->authority->port)
			{
				*curpos++ = ':';
				MEMCPY_PLUS(curpos, __uri->authority->port,
							strlen(__uri->authority->port));
			}
		}
		MEMCPY_PLUS(curpos, yytext, yyleng);
		*curpos = '\0';
		/* Drop the server-style fields and re-initialise the authority as
		 * a reg_name authority that owns the freshly built string.
		 * NOTE(review): __uri->authority is NULL-checked above but is used
		 * unconditionally from here on; URI_AUTH_DESTROY and AUTH_INIT are
		 * not visible in this section -- confirm it cannot be NULL here. */
		URI_AUTH_DESTROY(__uri->authority);
		AUTH_INIT(__uri->authority, AT_REG_NAME);
		__uri->authority->reg_name = reg_name;
		/* Only this match is added: the earlier components were already
		 * counted by the rules that matched them. */
		__length += yyleng;
		yy_push_state(ABS_PATH);
	}
	else
	{
		uri_destroy(__uri);
		return -1;
	}
}
<REG_NAME>.|\n		{
	yyless(0);
	yy_push_state(ABS_PATH);
}
<REG_NAME><<EOF>>	yy_push_state(ABS_PATH);
<ABS_PATH>{abs_path}	{
	/* ABS_PATH is always entered with yy_push_state(), so the pop returns
	 * to the state it was entered from.  If that state is AUTHORITY, one
	 * more pop discards the entry below it as well. */
	yy_pop_state();
	if (YY_START == AUTHORITY)
		yy_pop_state();
	if (__uri->path = strdupn(yytext, yyleng))
	{
		__length += yyleng;
		BEGIN QUERY;
	}
	else
	{
		uri_destroy(__uri);
		return -1;
	}
}
<ABS_PATH>.|\n		|
<ABS_PATH><<EOF>>	{
	/* This action is shared by the one-character catch-all rule and the
	 * EOF rule.  On EOF we must not call yyless (original author's note);
	 * the yyleng == 1 test is meant to single out the catch-all match. */
	if (yyleng == 1)
		yyless(0);
	yy_pop_state();
	/* If the previous state is "AUTHORITY", the URI has NO authority. */
	if (YY_START == AUTHORITY)
	{
		yy_pop_state();
		/* If the state before that is "SCHEME", the URI HAS a scheme, so
		 * what follows is an opaque part; otherwise only a fragment can
		 * follow.  It is a little confusing. */
		if (YY_START == SCHEME)
			BEGIN OPAQUE_PART;
		else
			BEGIN FRAGMENT;
	}
	else
		BEGIN QUERY;
}
<OPAQUE_PART>{opaque_part}	{
	/* The opaque part is stored in the same "path" field as an abs_path. */
	if (__uri->path = strdupn(yytext, yyleng))
	{
		__length += yyleng;
		BEGIN FRAGMENT;
	}
	else
	{
		uri_destroy(__uri);
		return -1;
	}
}
<OPAQUE_PART>.|\n		{
	yyless(0);
	BEGIN FRAGMENT;
}
<OPAQUE_PART><<EOF>>	BEGIN FRAGMENT;
<QUERY>"?"{query}	{
	/* The leading '?' is counted in __length but not stored. */
	if (__uri->query = strdupn(yytext + 1, yyleng - 1))
	{
		__length += yyleng;
		BEGIN FRAGMENT;
	}
	else
	{
		uri_destroy(__uri);
		return -1;
	}
}
<QUERY>.|\n			{
	yyless(0);
	BEGIN FRAGMENT;
}
<QUERY><<EOF>>		BEGIN FRAGMENT;
<FRAGMENT>"#"{fragment}		{
	/* The leading '#' is counted in __length but not stored. */
	if (__uri->fragment = strdupn(yytext + 1, yyleng - 1))
	{
		__length += yyleng;
		BEGIN ACCEPT;
	}
	else
	{
		uri_destroy(__uri);
		return -1;
	}
}
<FRAGMENT>.|\n		{
	yyless(0);
	BEGIN ACCEPT;
}
<FRAGMENT><<EOF>>	BEGIN ACCEPT;
<ACCEPT>.|\n		{
	/* Parsing is finished: leave the next character unread and report how
	 * many bytes the URI occupied. */
	yyless(0);
	return __length;
}
<ACCEPT><<EOF>>		return __length;
	/* INITIAL is used only by __uri_length(): it measures a URI reference
	 * without building a struct uri.  The result is the length of the match,
	 * or 0 when the input does not start with one. */
<INITIAL>{URI-reference}	return yyleng;
<INITIAL>.|\n		{
	yyless(0);
	return 0;
}
<INITIAL><<EOF>>	return 0;
%%
/* Tell the scanner that there is no further input once EOF is reached. */
int yywrap(void)
{
	return 1;
}

/* 16-byte lookup table.  It is not referenced in this part of the file;
 * presumably a bitmap with one bit per 7-bit character code that is consulted
 * by code elsewhere -- TODO confirm the bit order and meaning. */
char __uri_chr[] =
{
	0x00, 0x00, 0x00, 0x00,
	0x5b, 0xff, 0xff, 0xf5,
	0xff, 0xff, 0xff, 0xe1,
	0x7f, 0xff, 0xff, 0xe2
};

/* Common driver for the uri_parse_* functions.  Publishes "uri" through the
 * file-level __uri pointer, resets the running length, initialises the
 * structure with URI_INIT, starts the scanner in the SCHEME state and returns
 * whatever yylex() returns.  For the rules visible here that is __length on
 * acceptance or -1 after a failed allocation.  The caller must already have
 * selected the input buffer. */
static int __uri_parse(struct uri *uri)
{
	__uri = uri;
	__length = 0;
	URI_INIT(__uri);
	BEGIN SCHEME;
	return yylex();
}

/* Scan a file and return the length of the URI.  Returns a negative number
 * when and only when memory allocation failed.  The scanner buffer created
 * for the file is deleted again before returning. */
int uri_parse_file(FILE *file, struct uri *uri)
{
	YY_BUFFER_STATE buf;
	int n = -1;
	yyin = file;
	if (buf = yy_create_buffer(yyin, YY_BUF_SIZE))
	{
		yy_switch_to_buffer(buf);
		n = __uri_parse(uri);
		yy_delete_buffer(buf);
	}
	return n;
}

/* Scan a string ('\0' terminated).  Returns the result of __uri_parse(), or
 * -1 if no scanner buffer could be set up for the string. */
int uri_parse_string(const char *string, struct uri *uri)
{
	YY_BUFFER_STATE buf;
	int n = -1;
	if (buf = yy_scan_string(string))
	{
		yy_switch_to_buffer(buf);
		n = __uri_parse(uri);
		yy_delete_buffer(buf);
	}
	return n;
}

/* Scan "len" bytes of memory starting at "bytes".  Returns the result of
 * __uri_parse(), or -1 if no scanner buffer could be set up. */
int uri_parse_bytes(const char *bytes, int len, struct uri *uri)
{
	YY_BUFFER_STATE buf;
	int n = -1;
	if (buf = yy_scan_bytes(bytes, len))
	{
		yy_switch_to_buffer(buf);
		n = __uri_parse(uri);
		yy_delete_buffer(buf);
	}
	return n;
}

/* Scan some memory bytes in place.  The last two bytes of the memory MUST be
 * '\0', or the function returns -1 to indicate failure.  This function
 * performs better than "uri_parse_bytes", but note that there is NO "const"
 * keyword before the "base" argument: the content of the memory may be
 * changed by the scanner. */
int uri_parse_buffer(char *base, unsigned int size, struct uri *uri)
{
	YY_BUFFER_STATE buf;
	int n = -1;
	if (buf = yy_scan_buffer(base, size))
	{
		yy_switch_to_buffer(buf);
		n = __uri_parse(uri);
		yy_delete_buffer(buf);
	}
	return n;
}

/* Release everything a struct uri points to: scheme, authority (its contents
 * via URI_AUTH_DESTROY, then the authority object itself), path, query and
 * fragment.  The struct uri itself is not freed.  FREE_NOT_NULL and
 * URI_AUTH_DESTROY are defined outside this section; whether they reset the
 * pointers afterwards cannot be seen from here.  uri->authority is not reset
 * after free() in this function. */
void uri_destroy(struct uri *uri)
{
	FREE_NOT_NULL(uri->scheme);
	if (uri->authority)
	{
		URI_AUTH_DESTROY(uri->authority);
		free(uri->authority);
	}
	FREE_NOT_NULL(uri->path);
	FREE_NOT_NULL(uri->query);
	FREE_NOT_NULL(uri->fragment);
}

/* Common driver for the uri_length_* functions: starts the scanner in the
 * INITIAL state and returns the yylex() result, i.e. the length of the URI
 * reference at the start of the current buffer, or 0 if there is none. */
int __uri_length(void)
{
	BEGIN INITIAL;
	return yylex();
}

/* Length of the URI reference at the start of a '\0' terminated string.
 * Returns -1 if no scanner buffer could be set up. */
int uri_length_string(const char *string)
{
	YY_BUFFER_STATE buf;
	int n = -1;
	if (buf = yy_scan_string(string))
	{
		yy_switch_to_buffer(buf);
		n = __uri_length();
		yy_delete_buffer(buf);
	}
	return n;
}

/* Length of the URI reference at the start of a block of "len" bytes.
 * Returns -1 if no scanner buffer could be set up. */
int uri_length_bytes(const char *bytes, int len)
{
	YY_BUFFER_STATE buf;
	int n = -1;
	if (buf = yy_scan_bytes(bytes, len))
	{
		yy_switch_to_buffer(buf);
		n = __uri_length();
		yy_delete_buffer(buf);
	}
	return n;
}

/* Validate a string ('\0' terminated).  Returns 1 when the recognised URI
 * reference spans the whole string, 0 when it does not, and the negative
 * value from uri_length_string() unchanged. */
int uri_validate_string(const char *string)
{
	int n = uri_length_string(string);
	return n >= 0 ? n == strlen(string) : n;
}

/* Validate some memory bytes.  Returns 1 when the recognised URI reference
 * spans all "len" bytes, 0 when it does not, and the negative value from
 * uri_length_bytes() unchanged. */
int uri_validate_bytes(const char *bytes, int len)
{
	int n = uri_length_bytes(bytes, len);
	return n >= 0 ? n == len : n;
}

/* Merge two paths.  It sounds easy but is in fact quite troublesome once you
 * take everything into consideration.  This is the core of merging two URIs.
 * The function allocates memory for you, which is NOT a very good programming
 * style and should not be imitated. */
int __path_merge(const char *rel_path, const char *base_path, char **result)
{
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -