📄 xmerl_regexp.erl

📁 OTP是开放电信平台的简称
💻 ERL
📖 第 1 页 / 共 4 页
字号:
reg4([], Sc) -> {epsilon,Sc,[]}.char($\\, [O1,O2,O3|S]) when  O1 >= $0, O1 =< $7, O2 >= $0, O2 =< $7, O3 >= $0, O3 =< $7 ->    {(O1*8 + O2)*8 + O3 - 73*$0,S};char($\\, [C|S]) -> {escape_char(C),S};char($\\, []) -> parse_error({unterminated,"\\"});char(C, S) -> {C,S}.escape_char($n) -> $\n;				%\n = LFescape_char($r) -> $\r;				%\r = CRescape_char($t) -> $\t;				%\t = TABescape_char($v) -> $\v;				%\v = VTescape_char($b) -> $\b;				%\b = BSescape_char($f) -> $\f;				%\f = FFescape_char($e) -> $\e;				%\e = ESCescape_char($s) -> $\s;				%\s = SPACEescape_char($d) -> $\d;				%\d = DELescape_char(C) -> C.char_class([$]|S0]) ->    {Cc,S1} = char_class(S0, [$]]),    {pack_cc(Cc),S1};char_class(S0) ->    {Cc,S1} = char_class(S0, []),    {pack_cc(Cc),S1}.pack_cc(Cc0) ->    %% First sort the list.    Cc1 = lists:usort(fun ({Cf1,_}, {Cf2,_}) -> Cf1 < Cf2;			  ({Cf1,_}, C) -> Cf1 < C;			  (C, {Cf,_}) -> C < Cf;			  (C1, C2) -> C1 =< C2		      end, Cc0),    pack_cc1(Cc1).pack_cc1([{Cf1,Cl1},{Cf2,Cl2}|Cc]) when Cl1 >= Cf2, Cl1 =< Cl2 ->    pack_cc1([{Cf1,Cl2}|Cc]);pack_cc1([{Cf1,Cl1},{Cf2,Cl2}|Cc]) when Cl1 >= Cf2, Cl1 >= Cl2 ->    pack_cc1([{Cf1,Cl1}|Cc]);pack_cc1([{Cf1,Cl1},{Cf2,Cl2}|Cc]) when Cl1+1 == Cf2 ->    pack_cc1([{Cf1,Cl2}|Cc]);pack_cc1([{Cf,Cl},C|Cc]) when Cl >= C -> pack_cc1([{Cf,Cl}|Cc]);pack_cc1([{Cf,Cl},C|Cc]) when Cl+1 == C -> pack_cc1([{Cf,C}|Cc]);pack_cc1([C,{Cf,Cl}|Cc]) when C == Cf-1 -> pack_cc1([{C,Cl}|Cc]);pack_cc1([C1,C2|Cc]) when C1+1 == C2 -> pack_cc1([{C1,C2}|Cc]);pack_cc1([C|Cc]) -> [C|pack_cc1(Cc)];pack_cc1([]) -> [].char_class("[:" ++ S0, Cc0) ->			%Start of POSIX char class    case posix_cc(S0, Cc0) of	{Cc1,":]" ++ S1} -> char_class(S1, Cc1);	{_,_S1} -> parse_error({posix_cc,"[:" ++ S0})    end;char_class([C1|S0], Cc) when C1 /= $] ->    case char(C1, S0) of	{Cf,[$-,C2|S1]} when C2 /= $] ->	    case char(C2, S1) of		{Cl,S2} when Cf < Cl -> char_class(S2, [{Cf,Cl}|Cc]); 		{_Cl,_S2} -> parse_error({char_class,[C1|S0]})	    end;	{C,S1} -> char_class(S1, [C|Cc])    end;char_class(S, Cc) -> {Cc,S}.%% posix_cc(String, CharClass) -> {NewCharClass,RestString}.%%  Handle POSIX character classes, use Latin-1 character set.posix_cc("alnum" ++ S, Cc) ->    {[{$0,$9},{$A,$Z},{192,214},{216,223},{$a,$z},{224,246},{248,255}|Cc],S};posix_cc("alpha" ++ S, Cc) ->    {[{$A,$Z},{192,214},{216,223},{$a,$z},{224,246},{248,255}|Cc],S};posix_cc("blank" ++ S, Cc) -> {[$\s,$\t,160|Cc],S};posix_cc("cntrl" ++ S, Cc) -> {[{0,31},{127,159}|Cc],S};posix_cc("digit" ++ S, Cc) -> {[{$0,$9}|Cc],S};posix_cc("graph" ++ S, Cc) -> {[{33,126},{161,255}|Cc],S};posix_cc("lower" ++ S, Cc) -> {[{$a,$z},{224,246},{248,255}|Cc],S};posix_cc("print" ++ S, Cc) -> {[{32,126},{160,255}|Cc],S};posix_cc("punct" ++ S, Cc) -> {[{$!,$/},{$:,$?},{${,$~},{161,191}|Cc],S};posix_cc("space" ++ S, Cc) -> {[$\s,$\t,$\f,$\r,$\v,160|Cc],S};posix_cc("upper" ++ S, Cc) -> {[{$A,$Z},{192,214},{216,223}|Cc],S};posix_cc("xdigit" ++ S, Cc) -> {[{$a,$f},{$A,$F},{$0,$9}|Cc],S};posix_cc(S, _Cc) -> parse_error({posix_cc,"[:" ++ S}).interval_range(Cs0) ->    case number(Cs0) of	{none,Cs1} -> {none,none,Cs1};	{N,[$,|Cs1]} ->	    case number(Cs1) of		{none,Cs2} -> {N,any,Cs2};		{M,Cs2} -> {N,M,Cs2}	    end;	{N,Cs1} -> {N,none,Cs1}    end.number([C|Cs]) when C >= $0, C =< $9 ->    number(Cs, C - $0);number(Cs) -> {none,Cs}.number([C|Cs], Acc) when C >= $0, C =< $9 ->    number(Cs, 10*Acc + (C - $0));number(Cs, Acc) -> {Acc,Cs}.parse_error(E) -> throw({error,E}).%char_string([C|S]) when C /= $" -> char_string(S, C);%char_string(S) -> {epsilon,S}.%char_string([C|S0], L) when C /= $" ->%    char_string(S0, {concat,L,C});%char_string(S, L) -> {L,S}.%% re_apply(String, StartPos, RegExp) ->%%      {match,RestPos,Rest,SubExprs} | nomatch.%%%%  Apply the (parse of the) regular expression RegExp to String.  If%%  there is a match return the position of the remaining string and%%  the string if else return 'nomatch'.%%%%  StartPos should be the real start position as it is used to decide%%  if we are at the beginning of the string.re_apply(S, St, {RE,Sc}) ->    Subs = erlang:make_tuple(Sc, none),		%Make a sub-regexp table.    Res = re_apply(RE, [], S, St, Subs),    %% io:format("~p x ~p -> ~p\n", [RE,S,Res]),    Res.re_apply(epsilon, More, S, P, Subs) ->		%This always matches    re_apply_more(More, S, P, Subs);re_apply({'or',RE1,RE2}, More, S, P, Subs) ->    re_apply_or(re_apply(RE1, More, S, P, Subs),		re_apply(RE2, More, S, P, Subs));re_apply({concat,RE1,RE2}, More, S0, P, Subs) ->    re_apply(RE1, [RE2|More], S0, P, Subs);re_apply({literal,[C|Lcs]}, More, [C|S], P, Subs) ->    re_apply_lit(Lcs, More, S, P+1, Subs);	%Have matched first charre_apply({kclosure,RE}, More, S0, P0, Subs0) ->    %% Greedy so try RE first, no difference here actually.    Loop = case re_apply(RE, [], S0, P0, Subs0) of	       {match,P0,_S1,_Subs1} ->		%0 length match, don't loop!		   nomatch;	       {match,P1,S1,Subs1} ->		   re_apply_more([{kclosure,RE}|More], S1, P1, Subs1);	       nomatch -> nomatch;	       never_match -> never_match	   end,    re_apply_or(Loop, re_apply_more(More, S0, P0, Subs0));re_apply({pclosure,RE}, More, S, P, Subs) ->    re_apply(RE, [{kclosure,RE}|More], S, P, Subs);re_apply({optional,RE}, More, S, P, Subs) ->    %% Greedy so try RE first, no difference here actually.    re_apply_or(re_apply(RE, More, S, P, Subs),		re_apply_more(More, S, P, Subs));re_apply({iclosure,RE,N,M}, More, S, P, Subs) when N > 0 ->    re_apply(RE, [{iclosure,RE,N-1,M}|More], S, P, Subs);re_apply({iclosure,RE,0,M}, More, S, P, Subs) ->    Exp = expand_opt(RE, M),    re_apply(Exp, More, S, P, Subs);re_apply({subexpr,N,RE}, More, S, P, Subs) ->    re_apply(RE, [{endsub,N,P}|More], S, P, Subs);re_apply({endsub,N,St}, More, S, P, Subs0) ->    Subs1 = setelement(N, Subs0, {St,P-St}),	%Record sub-expr    re_apply_more(More, S, P, Subs1);re_apply(bos, More, S, 1, Subs) -> re_apply_more(More, S, 1, Subs);re_apply(bos, _More, _S, _, _) -> never_match;re_apply(eos, More, [$\n], P, Subs) -> re_apply_more(More, [], P, Subs);re_apply(eos, More, [], P, Subs) -> re_apply_more(More, [], P, Subs);re_apply({char_class,Cc}, More, [C|S], P, Subs) ->    case in_char_class(C, Cc) of	true -> re_apply_more(More, S, P+1, Subs);	false -> nomatch    end;re_apply({comp_class,Cc}, More, [C|S], P, Subs) ->    case in_char_class(C, Cc) of	true -> nomatch;	false -> re_apply_more(More, S, P+1, Subs)    end;re_apply(C, More, [C|S], P, Subs) when integer(C) ->    re_apply_more(More, S, P+1, Subs);re_apply(_RE, _More, _S, _P, _Subs) ->    %% io:format("~p : ~p\n", [_RE,_S]),    nomatch.%% re_apply_more([RegExp], String, Length, SubsExprs) ->%%      {match,RestPos,Rest,SubExprs} | nomatch.re_apply_more([RE|More], S, P, Subs) -> re_apply(RE, More, S, P, Subs);re_apply_more([], S, P, Subs) -> {match,P,S,Subs}.%% re_apply_lit(Literal, More, String, Position, SubExprs) ->%%      {match,RestPos,Rest,SubExprs} | nomatch.re_apply_lit([C|Lit], More, [C|Cs], P, Subs) ->    re_apply_lit(Lit, More, Cs, P+1, Subs);re_apply_lit([], More, Cs, P, Subs) ->    re_apply_more(More, Cs, P, Subs);re_apply_lit(_Lit, _More, _Cs, _P, _Subs) ->    nomatch.%% expand_iclosure(RE, N, M) -> RE.expand_iclosure(RE, 0, M) -> expand_opt(RE, M);expand_iclosure(RE, N, M) ->    {concat,RE,expand_iclosure(RE, N-1, M)}.%% expand_opt(RegExp, Count) -> RE.%% Handle all the cases.expand_opt(_RE, none) -> epsilon;expand_opt(RE, any) -> {kclosure,RE};expand_opt(_RE, 0) -> epsilon;expand_opt(RE, 1) -> {optional,RE};expand_opt(RE, N) ->    {optional,{concat,RE,expand_opt(RE, N-1)}}.%% find_prefix(PrefixStr, SourceStr)%% if PrefixStr is a prefix of Str then return {ok,RemainingStr}%% otherwise return false%% find_prefix([C|Prest], [C|Rest]) ->%%     find_prefix(Prest, Rest);%% find_prefix([], Rest) -> {yes,Rest};%% find_prefix(_, _) -> no.%% in_char_class(Char, Class) -> bool().in_char_class(C, [{C1,C2}|_Cc]) when C >= C1, C =< C2 -> true;in_char_class(C, [C|_Cc]) -> true;in_char_class(C, [_|Cc]) -> in_char_class(C, Cc);in_char_class(_C, []) -> false.%% re_apply_or(Match1, Match2, SubExprs) ->%%      {match,RestPos,Rest,SubExprs} | nomatch.%%  If we want the best match then choose the longest match, else just%%  choose one by trying sequentially.re_apply_or(M1={match,P1,_,_},{match,P2,_,_}) when P1 >= P2 -> M1;re_apply_or({match,_,_,_},  M2={match,_,_,_}) -> M2;re_apply_or(never_match, R2) -> R2;re_apply_or(R1, never_match) -> R1;re_apply_or(nomatch, R2) -> R2;re_apply_or(R1, nomatch) -> R1.%% Record definitions for the NFA, DFA and compiler.-record(nfa_state, {no,edges=[],accept=no}).-record(dfa_state, {no,nfa=[],trans=[],accept=no}).-record(c_state, {no,trans=[],tmin=0,smin=none,tmax=0,smax=none,		  accept=false,spec=[]}).%% We use standard methods, Thompson's construction and subset%% construction, to create first an NFA and then a DFA from the%% regexps. A non-standard feature is that we work with sets of%% character ranges (crs) instead sets of characters. This is most%% noticeable when constructing DFAs. The major benefit is that we can%% handle characters from any set, not just limited ASCII or 8859,%% even 16/32 bit unicode.%%%% The whole range of characters is 0-maxchar, where maxchar is a BIG%% number. We don't make any assumptions about the size of maxchar, it%% is just bigger than any character.%%%% Using character ranges makes describing many regexps very simple,%% for example the regexp "." just becomes the range%% [{0-9},{11-maxchar}].%% make_nfa(RegExpActions) -> {ok,{NFA,StartState}} | {error,E}.%% Build a complete nfa from a list of {RegExp,Action}. The NFA field%% accept has values {yes,Action}|no. The NFA is a list of states.make_nfa(REAs0) ->    case parse_reas(REAs0) of	{ok,REAs1} ->	    {NFA,Start} = build_combined_nfa(REAs1),	    {ok,{NFA,Start}};	{error,E} -> {error,E}    end.%% make_dfa(RegExpActions) -> {ok,{DFA,StartState}} | {error,E}.%% make_dfa(RegExpActions, LowestState) -> {ok,{DFA,StartState}} | {error,E}.%% Build a complete dfa from a list of {RegExp,Action}. The DFA field%% accept has values {yes,Action}|no. If multiple Regexps can result%% in same match string then RegExpActions list define priority.make_dfa(REAs) -> make_dfa(REAs, 0).make_dfa(REAs0, Low) ->    case parse_reas(REAs0) of	{ok,REAs1} ->	    {NFA,Start0} = build_combined_nfa(REAs1),	    {DFA0,Start1} = build_dfa(NFA, Start0),	    {DFA,Start} = minimise_dfa(DFA0, Start1, Low),	    {ok,{DFA,Start}};	{error,E} -> {error,E}    end.parse_reas(REAs) -> parse_reas(REAs, []).parse_reas([{{regexp,{R,_Sc}},A}|REAs], S) ->	%Already parsed    parse_reas(REAs, [{R,A}|S]);parse_reas([{RegExp,A}|REAs], S) ->    case parse(RegExp) of	{ok,{regexp,{R,_Sc}}} -> parse_reas(REAs, [{R,A}|S]);	{error,E} -> {error,E}    end;parse_reas([], Stack) -> {ok,reverse(Stack)}.%% build_combined_nfa(RegExpActionList) -> {NFA,StartState}.%%  Build the combined NFA using Thompson's construction straight out%%  of the book. Build the separate NFAs in the same order as the%%  rules so that the accepting have ascending states have ascending%%  state numbers.  Start numbering the states from 1 as we put the%%  states in a tuple with the state number as the index.build_combined_nfa(REAs) ->    {NFA,Starts,Next} = build_nfa_list(REAs, [], [], 1),    F = #nfa_state{no=Next,edges=epsilon_trans(Starts),accept=no},    {[F|NFA],Next}.build_nfa_list([{RE,Action}|REAs], NFA0, Starts, Next0) ->    {NFA1,Next1,Start} = build_nfa(RE, Next0, Action),    build_nfa_list(REAs, NFA1 ++ NFA0, [Start|Starts], Next1);build_nfa_list([], NFA, Starts, Next) ->    {NFA,reverse(Starts),Next}.epsilon_trans(Sts) -> [ {epsilon,S} || S <- Sts ].%% build_nfa(RegExp, NextState, Action) -> {NFA,NextFreeState,StartState}.%%  When building the NFA states for a ??? we don't build the end%%  state, just allocate a State for it and return this state%%  number. This allows us to avoid building unnecessary states for%%  concatenation which would then have to be removed by overwriting%%  an existing state.build_nfa(RE, Next, Action) ->    {NFA,N,E} = build_nfa(RE, Next+1, Next, []),    {[#nfa_state{no=E,accept={yes,Action}}|NFA],N,Next}.%% build_nfa(RegExp, NextState, StartState, NFA) -> {NFA,NextState,EndState}.%%  The NFA is a list of nfa_state is no predefined order. The state%%  number of the returned EndState is already allocated!build_nfa({'or',RE1,RE2}, N0, S, NFA0) ->    {NFA1,N1,E1} = build_nfa(RE1, N0+1, N0, NFA0),    {NFA2,N2,E2} = build_nfa(RE2, N1+1, N1, NFA1),    E = N2,    {[#nfa_state{no=S,edges=[{epsilon,N0},{epsilon,N1}]},      #nfa_state{no=E1,edges=[{epsilon,E}]},      #nfa_state{no=E2,edges=[{epsilon,E}]}|NFA2],     N2+1,E};build_nfa({literal,[]}, N, S, NFA) ->    {NFA,N,S};build_nfa({literal,[C|Cs]}, N0, S, NFA0) ->    {NFA1,N1,E1} = build_nfa(C, N0, S, NFA0),    build_nfa({literal,Cs}, N1, E1, NFA1);build_nfa({concat,RE1,RE2}, N0, S, NFA0) ->    {NFA1,N1,E1} = build_nfa(RE1, N0, S, NFA0),    {NFA2,N2,E2} = build_nfa(RE2, N1, E1, NFA1),
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -