📄 uri.l
字号:
/*
 * uri.l -- Routines dealing with URI, mainly parsing and merging.
 * Created: Xie Han, OS lab of Peking University. <me@pku.edu>
 *
 * The grammar of URI is quite simple, so yacc is not needed.
 *
 * Created: Sep 25 04:15am 2003. version 0.1.1
 *     # Can read URI from standard input, parse it and print every
 *       part of the URI.
 *
 * Updated: Sep 25 05:10pm 2003. version 0.1.2
 *
 * Updated: Sep 25 10:30pm 2003. version 0.1.3
 *     # A lot of changes in the program structure, much clearer.
 *     # Now the result is written into a uri structure.
 *     # A little bug. String "?" is accepted as a valid URI.
 *
 * Updated: Sep 26 02:16am 2003. version 0.1.4
 *     # 2 states are added to solve the bug in version 0.1.3.
 *     # The uri structure was redesigned.
 *
 * Updated: Sep 27 05:21am 2003. version 1.0.0!!!!
 *     # All states are changed from exclusive to shared to make
 *       the lex program more compatible because some lexers do not
 *       support exclusive start conditions.
 *     # "unput" is substituted by "yyless" when putting back excess
 *       text, because the latter costs less.
 *     # 4 parsing functions are completed, each of which has different
 *       characteristics.
 *     # Function "uri_rally_parse" is added, which cooperates with
 *       other lex programs.
 *
 * Updated: Sep 28 1:00pm 2003. version 1.1.0
 *     # The function "uri_rel_to_abs" is under construction.
 *     # The URI structure is changed again: "path_type" field is removed,
 *       and "abs_path", "rel_path", "opaque_path" are substituted by
 *       "path" because the original design is too sophisticated.
 *
 * Updated: National Day 5:06am 2003. version 1.1.1
 *     # The function "__uri_path_merge" which merges the base path and
 *       the relative path is finished and tested and proved to be a
 *       nice job! This function is the core of merging a base URI and
 *       a relative URI into an absolute URI.
 *     # The previous function "uri_rel_to_abs" is commented out. It's
 *       terrible.
 *
 * Updated: National Day 5:00pm 2003. version 1.1.2
 *     # Now we distinguish between "no" and "empty".
 *       Related fields include "userinfo", "port", "query", and
 *       "fragment". The previous version treats "no" and "empty"
 *       equally, which is not right.
 *
 * Updated: Oct 2 2:42am 2003. version 1.1.3
 *     # Solve the conflict of server and reg_name by adding a start
 *       condition and some ugly code that is normally not reached.
 *     # The "uri_rel_to_abs" is rewritten but has not been tested.
 *
 * Updated: Oct 4 6:33pm 2003. version 1.2.0
 *     # Function "uri_path_merge" now returns the length of the result
 *       path while the result path is returned by a pointer.
 *     # Function "uri_rel_to_abs" is renamed "uri_merge", and it will
 *       return the string length of the result URI.
 *     # Function "uri_recombine" is coded, which recombines a URI
 *       structure into a URI string, and the string length is returned.
 *     # The URI merging algorithm in RFC 2396 is found not flawless, and
 *       the problem is fixed in "uri_merge" with not very graceful means.
 *
 * Updated: Oct 5 2:09am 2003. version 1.2.1
 *     # From this version function "uri_parse_string" is implemented by
 *       "uri_parse_bytes" which from now on is dependent on
 *       "uri_parse_buffer".
 *     # "uri_validate_string" and "uri_validate_bytes" were added. Their
 *       names explain what they do.
 *     # Keyword "static" was taken off from the front of function
 *       "__uri_path_merge" while it is not declared in the .h file. I
 *       mean to make it a secret interface ^_^
 *
 * Updated: Oct 5 4:43am 2003. version 1.2.2
 *     # Cross out the first change log of version 1.2.1!
 *     # "uri_validate_string" is no longer implemented by
 *       "uri_validate_bytes".
 *
 * Updated: Oct 9 4:49am 2003. version 1.2.3
 *     # Function "uri_rally_parse" no longer exists, because it cannot
 *       work. The original desire that it can cooperate with other lex
 *       programs is indeed out of the question.
 *     # Macro URI_INIT changed slightly.
 *
 * Updated: Oct 10 4:28am 2003. version 1.2.4
 *     # The argument order of "rel_xxx" and "base_xxx" of functions
 *       "__uri_path_merge" and "uri_merge" is exchanged. "rel_xxx" from
 *       now on precedes "base_xxx".
 *
 * Updated: Oct 11 3:31am 2003. version 1.2.5
 *     # URI merging algorithm is changed again. We check if the path of
 *       the relative URI is defined before merging path, and if it's not
 *       defined, we take the base URI's path directly rather than merging
 *       the paths. I think that the URI merging algorithm is now at the
 *       edge of perfect. (The recommended merging algorithm of RFC 2396
 *       has a lot of problems.)
 *     # The rule of "<REG_NAME>{reg_name}" is optimized, because "strcat"
 *       is slow.
 *
 * Updated: Oct 12 4:49am 2003. version 1.2.6
 *     # "goto" statements are substituted by "do ... while (0)" and
 *       "break" in function "uri_merge" and "uri_recombine". No real
 *       improvement is made but showing off continues.
 *
 * Updated: Oct 12 4:55pm 2003. version 1.2.7
 *     # URIs with Chinese characters are supported in an obviously
 *       not perfect way: the "unreserved" BNF allows non-US-ASCII
 *       characters, i.e., characters with value that ranges from
 *       128 to 255.
 *
 * Updated: Oct 13 1:06am 2003. version 1.2.8
 *     # Some code of "__uri_path_merge" was rewritten because the
 *       interface of stack module changed.
 *     # NOTE!!! NULL can no longer be passed to "__uri_path_merge" as
 *       a relative path or a base path, and the function from now on
 *       won't set NULL as the result path. If the result path is
 *       empty, the "abs_path" will point to a pointer that points to
 *       an empty string ("\0"). So you can always "free(*abs_path)"
 *       if "__uri_path_merge" succeeded.
 *
 * Updated: Oct 15 1:13am 2003. version 1.4.0
 *     # The update of 1.x.x version continues because version 2.0.x
 *       runs slower than 1.x.x (half speed), though 2.0.x is written
 *       in much shorter and very nice code.
 *     # The new uri structure is derived from 1.3.x. 1.3.x is a test
 *       version which has been updated into 2.0.0. Indeed, 1.4.x is
 *       compatible with 2.0.x completely, but 2.0.x is NOT compatible
 *       with 1.4.x because 2.0.x has only one parsing function:
 *       "uri_parse_bytes".
 *       I am unwilling to say that 2.0.x is a
 *       failure because its code is very nice. It's slow because
 *       it scans the string more than once: the first time to determine
 *       the length of the URI, and the second time to break down every
 *       component. And when breaking down authority it must scan the
 *       authority again to solve the conflict of server and reg_name.
 *       Indeed the authority is scanned 3 times.
 *     # "__uri_path_merge", "uri_destroy", "uri_merge", "uri_recombine"
 *       are taken from 2.0.0 because the uri structure changed.
 *     # "__uri_path_merge" once again supports "no" path. Forget the
 *       second change log of version 1.2.8.
 *     # Some state names changed.
 *
 * Updated: Oct 15 7:51am 2003. version 1.4.1
 *     # Fix a bug in "__uri_path_merge": when encountering a "../",
 *       the stack won't pop when the stack top segment is "/". This
 *       is not right because "/" is not necessarily the root path.
 *       Now, when encountering a "../", the stack won't pop when and
 *       only when the stack is empty or the stack has 1 segment and
 *       it's a "/", i.e., when the following is FALSE:
 *           stack_height(stack) > sizeof (char *) + sizeof (int) ||
 *           !stack_empty(stack) && stack_peep(stack, int) != 1
 *     # Now the parsing process is very neat with "state stack"
 *       introduced. All URIs will pass all states when being parsed. I
 *       think it nicer than the 2.0.x parsing process. The performance
 *       is almost equal to 1.4.0, twice the performance of 2.0.x.
 *     # Some internal macros and functions change. Some parentheses
 *       are added. From now on "{" follows "do" immediately if the
 *       corresponding "while" is "while (0)". The aim is to let the
 *       readers know that indeed it's not an iteration.
 *
 * Updated: Oct 18 11:43am 2003. version 1.4.2
 *     # Fix the bug that port can exist without host. If no host found,
 *       we should BEGIN REG_NAME instead of BEGIN PORT.
 *     # The statement in the second change log of version 1.4.1 that
 *       "All URIs will pass all states when being parsed." is not
 *       true. Forget it.
 *     # We now check the return value of "yy_scan_xxx" and
 *       "yy_create_xxx" because there's no guarantee that they would
 *       not fail. (Only "yy_scan_buffer" may fail practically.)
 *
 * Updated: Oct 18 22:52pm 2003. version 1.4.3
 *     # Functions "uri_length_string" and "uri_length_bytes" are added.
 *       They return the same value as "uri_parse_string" and
 *       "uri_parse_bytes", but they do not break down the URI so they
 *       are faster (hoped). And, from now on "uri_validate_string" and
 *       "uri_validate_bytes" are implemented by these two functions
 *       respectively.
 *     # State "FINISH" is renamed "ACCEPT", meaning a URI string is
 *       accepted.
 *
 * Updated: Oct 20 1:28am 2003. version 1.4.4
 *     # New version uses stack 2.2.x, and now we must tell the stack
 *       what data type we wanna push or pop. A slight mistake in
 *       "__uri_path_merge" is fixed: when pushing or popping a segment
 *       name, the data type should be "const char *", not "char *".
 *
 * Updated: Oct 20 6:10am 2003. version 1.4.5
 *     # Some code blocks are moved to make the code look clear.
 *     # Data type of "stack" variable in function "__uri_path_merge" is
 *       changed to "STACK *", because module stack was updated again...
 *       Now we use stack 2.4.x.
 */

/* The following are the BNFs of URI-reference, taken from RFC 2396.
*/URI-reference ({absoluteURI}|{relativeURI})?("#"{fragment})?absoluteURI {scheme}":"({hier_part}|{opaque_part})relativeURI ({net_path}|{abs_path}|{rel_path})("?"{query})?hier_part ({net_path}|{abs_path})("?"{query})?opaque_part {uric_no_slash}{uric}*uric_no_slash {unreserved}|{escaped}|";"|"?"|":"|"@"|"&"|"="|"+"|"$"|","net_path "//"{authority}{abs_path}?abs_path "/"{path_segments}rel_path {rel_segment}{abs_path}?rel_segment ({unreserved}|{escaped}|";"|"@"|"&"|"="|"+"|"$"|",")+scheme {alpha}({alpha}|{digit}|"+"|"-"|".")*authority {server}|{reg_name}reg_name ({unreserved}|{escaped}|"$"|","|";"|":"|"@"|"&"|"="|"+")+server (({userinfo}"@")?{hostport})?userinfo ({unreserved}|{escaped}|";"|":"|"&"|"="|"+"|"$"|",")*hostport {host}(":"{port})?host {hostname}|{IPv4address}hostname ({domainlabel}".")*{toplabel}"."?domainlabel {alphanum}|{alphanum}({alphanum}|"-")*{alphanum}toplabel {alpha}|{alpha}({alphanum}|"-")*{alphanum}IPv4address {digit}+"."{digit}+"."{digit}+"."{digit}+port {digit}*path ({abs_path}|{opaque_part})?path_segments {segment}("/"{segment})*segment {pchar}*(";"{param})*param {pchar}*pchar {unreserved}|{escaped}|":"|"@"|"&"|"="|"+"|"$"|","query {uric}*fragment {uric}*uric {reserved}|{unreserved}|{escaped}reserved ";"|"/"|"?"|":"|"@"|"&"|"="|"+"|"$"|","unreserved {alphanum}|{mark}mark "-"|"_"|"."|"!"|"~"|"*"|"'"|"("|")"escaped "%"{hex}{hex}hex digit|[A-Fa-f]alphanum {alpha}|{digit}alpha {lowalpha}|{upalpha}lowalpha [a-z]upalpha [A-Z]digit [0-9]%option stack%s SCHEME REL_PATH AUTHORITY USERINFO HOST PORT REG_NAME ABS_PATH%s OPAQUE_PART QUERY FRAGMENT ACCEPT%{#include <errno.h>#include <stdlib.h>#include <stdio.h>#include <string.h>#include <stack.h>#include <misc.h>#include "uri.h"#define URI_AUTH_INIT(auth, at) \do { \ if (((auth)->type = (at)) == AT_SERVER) \ { \ (auth)->userinfo = NULL; \ (auth)->host = NULL; \ (auth)->port = NULL; \ } \ else \ (auth)->reg_name = NULL; \} while (0)#define URI_FREE(ptr) if (ptr) free(ptr)#define URI_AUTH_DESTROY(auth) \do { 
\ if ((auth)->type == AT_SERVER) \ { \ URI_FREE((auth)->userinfo); \ URI_FREE((auth)->host); \ URI_FREE((auth)->port); \ } \ else \ URI_FREE((auth)->reg_name); \} while (0)static int __length;static struct uri *__uri;%}%%<SCHEME>{scheme}:/\/|{uric_no_slash} { if (__uri->scheme = strdupn(yytext, yyleng - 1)) { __length += yyleng; yy_push_state(AUTHORITY); } else { uri_destroy(__uri); return -1; }}<SCHEME>.|\n |<SCHEME><<EOF>> { yyless(0); BEGIN REL_PATH;}<REL_PATH>{rel_path} { if (__uri->path = strdupn(yytext, yyleng)) { __length += yyleng; BEGIN QUERY; } else { uri_destroy(__uri); return -1; }}<REL_PATH>.|\n |<REL_PATH><<EOF>> { yyless(0); yy_push_state(AUTHORITY);} /* Authority and abs_path have conflict! If the following is "//", * we assume that it's an authority; if the following is "/", it's * an abs_path. */<AUTHORITY>"//" { yy_pop_state(); __uri->authority = (struct authority *)malloc(sizeof (struct authority)); if (__uri->authority) { URI_AUTH_INIT(__uri->authority, AT_SERVER); __uri->authority->type = AT_SERVER; __length += yyleng; BEGIN USERINFO; } else { uri_destroy(__uri); return -1; }}<AUTHORITY>.|\n |<AUTHORITY><<EOF>> { yyless(0); yy_push_state(ABS_PATH);}<USERINFO>{userinfo}@ { if (__uri->authority->userinfo = strdupn(yytext, yyleng - 1)) { __length += yyleng; BEGIN HOST; } else { uri_destroy(__uri); return -1; }}<USERINFO>.|\n |<USERINFO><<EOF>> { yyless(0); BEGIN HOST;}<HOST>{host} { if (__uri->authority->host = strdupn(yytext, yyleng)) { __length += yyleng; BEGIN PORT; } else { uri_destroy(__uri); return -1; }}<HOST>.|\n |<HOST><<EOF>> { yyless(0); BEGIN REG_NAME;}<PORT>:{port} { if (__uri->authority->port = strdupn(yytext + 1, yyleng - 1)) { __length += yyleng; BEGIN REG_NAME; } else { uri_destroy(__uri); return -1; }}<PORT>.|\n |<PORT><<EOF>> { yyless(0); BEGIN REG_NAME;}<REG_NAME>{reg_name} { /* We have assumed that the authority is a server, but it seems that * we are wrong: it's a reg_name. 
We should join the userinfo, host * and the port together with this yytext into a reg_name. This case * seldom happens, almost never. */ char *reg_name, *curpos; int len = yyleng; if (__uri->authority->userinfo) len += strlen(__uri->authority->userinfo) + 1; if (__uri->authority->host) len += strlen(__uri->authority->host); if (__uri->authority->port) len += strlen(__uri->authority->port) + 1; if (reg_name = (char *)malloc((len + 1) * sizeof (char))) { curpos = reg_name; if (__uri->authority->userinfo) { MEMCPY_PLUS(curpos, __uri->authority->userinfo, strlen(__uri->authority->userinfo)); *curpos++ = '@'; } if (__uri->authority->host) MEMCPY_PLUS(curpos, __uri->authority->host, strlen(__uri->authority->host)); if (__uri->authority->port) { *curpos++ = ':'; MEMCPY_PLUS(curpos, __uri->authority->port, strlen(__uri->authority->port)); } MEMCPY_PLUS(curpos, yytext, yyleng); *curpos = '\0'; URI_AUTH_DESTROY(__uri->authority); URI_AUTH_INIT(__uri->authority, AT_REG_NAME); __uri->authority->reg_name = reg_name; __length += yyleng; yy_push_state(ABS_PATH); } else { uri_destroy(__uri); return -1; }}<REG_NAME>.|\n |<REG_NAME><<EOF>> { yyless(0); yy_push_state(ABS_PATH);}<ABS_PATH>{abs_path} { yy_pop_state(); if (YY_START == AUTHORITY) yy_pop_state(); if (__uri->path = strdupn(yytext, yyleng)) { __length += yyleng; BEGIN QUERY; } else { uri_destroy(__uri); return -1; }}<ABS_PATH>.|\n |<ABS_PATH><<EOF>> { yyless(0); yy_pop_state(); /* The previous state is "AUTHORITY" means the URI has NO authority. */ if (YY_START == AUTHORITY) { yy_pop_state(); /* The previous state is "SCHEME" means the URI HAS a scheme. * It's a little confusing. 
*/ if (YY_START == SCHEME) BEGIN OPAQUE_PART; else BEGIN FRAGMENT; } else BEGIN QUERY;}<OPAQUE_PART>{opaque_part} { if (__uri->path = strdupn(yytext, yyleng)) { __length += yyleng; BEGIN ACCEPT; } else { uri_destroy(__uri); return -1; }}<OPAQUE_PART>.|\n |<OPAQUE_PART><<EOF>> { yyless(0); BEGIN FRAGMENT;}<QUERY>\?{query} { if (__uri->query = strdupn(yytext + 1, yyleng - 1)) { __length += yyleng; BEGIN QUERY; } else { uri_destroy(__uri); return -1; }}<QUERY>.|\n |<QUERY><<EOF>> { yyless(0); BEGIN FRAGMENT;}<FRAGMENT>#{fragment} { if (__uri->fragment = strdupn(yytext + 1, yyleng - 1)) { __length += yyleng; BEGIN ACCEPT; } else { uri_destroy(__uri); return -1; }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -