📄 xmerl_regexp.erl
字号:
{NFA2,N2,E2};build_nfa({kclosure,RE}, N0, S, NFA0) -> {NFA1,N1,E1} = build_nfa(RE, N0+1, N0, NFA0), E = N1, {[#nfa_state{no=S,edges=[{epsilon,N0},{epsilon,E}]}, #nfa_state{no=E1,edges=[{epsilon,N0},{epsilon,E}]}|NFA1], N1+1,E};build_nfa({pclosure,RE}, N0, S, NFA0) -> {NFA1,N1,E1} = build_nfa(RE, N0+1, N0, NFA0), E = N1, {[#nfa_state{no=S,edges=[{epsilon,N0}]}, #nfa_state{no=E1,edges=[{epsilon,N0},{epsilon,E}]}|NFA1], N1+1,E};build_nfa({optional,RE}, N0, S, NFA0) -> {NFA1,N1,E1} = build_nfa(RE, N0+1, N0, NFA0), E = N1, {[#nfa_state{no=S,edges=[{epsilon,N0},{epsilon,E}]}, #nfa_state{no=E1,edges=[{epsilon,E}]}|NFA1], N1+1,E};build_nfa({iclosure,RE,I1,I2}, N, S, NFA) -> Exp = expand_iclosure(RE, I1, I2), build_nfa(Exp, N, S, NFA);build_nfa({char_class,Cc}, N, S, NFA) -> {[#nfa_state{no=S,edges=[{nfa_char_class(Cc),N}]}|NFA],N+1,N};build_nfa({comp_class,Cc}, N, S, NFA) -> {[#nfa_state{no=S,edges=[{nfa_comp_class(Cc),N}]}|NFA],N+1,N};build_nfa(epsilon, N, S, NFA) -> {NFA,N,S};build_nfa({group,RE}, N, S, NFA) -> %%% FIXME %%%%%%% build_nfa(RE, N, S, NFA);build_nfa({subexpr,_N,RE}, N, S, NFA) -> %%% FIXME %%%%%%% build_nfa(RE, N, S, NFA);build_nfa(bos, N, S, NFA) -> {[#nfa_state{no=S,edges=[{[bos],N}]}|NFA],N+1,N};build_nfa(eos, N, S, NFA) -> {[#nfa_state{no=S,edges=[{[eos],N}]}|NFA],N+1,N};%%{[#nfa_state{no=S,edges=[{[eos],N}]}|NFA],N+1,N};build_nfa(C, N, S, NFA) when integer(C) -> {[#nfa_state{no=S,edges=[{[{C,C}],N}]}|NFA],N+1,N}.nfa_char_class(Cc) -> Crs = lists:foldl(fun({C1,C2}, Set) -> add_element({C1,C2}, Set); (C, Set) -> add_element({C,C}, Set) end, [], Cc), %% io:fwrite("cc: ~p\n", [Crs]), pack_crs(Crs).pack_crs([{C1,C2}=Cr,{C3,C4}|Crs]) when C1 =< C3, C2 >= C4 -> %% C1 C2 %% C3 C4 pack_crs([Cr|Crs]);pack_crs([{C1,C2},{C3,C4}|Crs]) when C2 >= C3, C2 < C4 -> %% C1 C2 %% C3 C4 pack_crs([{C1,C4}|Crs]);pack_crs([{C1,C2},{C3,C4}|Crs]) when C2 + 1 == C3 -> %% C1 C2 %% C3 C4 pack_crs([{C1,C4}|Crs]);pack_crs([Cr|Crs]) -> [Cr|pack_crs(Crs)];pack_crs([]) -> [].nfa_comp_class(Cc) -> Crs = nfa_char_class(Cc), %% io:fwrite("comp: ~p\n", [Crs]), comp_crs(Crs, 0).comp_crs([{C1,C2}|Crs], Last) -> [{Last,C1-1}|comp_crs(Crs, C2+1)];comp_crs([], Last) -> [{Last,maxchar}].%% build_dfa(NFA, NfaStartState) -> {DFA,DfaStartState}.%% Build a DFA from an NFA using "subset construction". The major%% difference from the book is that we keep the marked and unmarked%% DFA states in seperate lists. New DFA states are added to the%% unmarked list and states are marked by moving them to the marked%% list. We assume that the NFA accepting state numbers are in%% ascending order for the rules and use ordsets to keep this order.build_dfa(NFA0, Start) -> %% We want NFA as sorted tuple for fast access, assume lowest state 1. NFA1 = list_to_tuple(keysort(#nfa_state.no, NFA0)), D = #dfa_state{no=0,nfa=eclosure([Start], NFA1),accept=no}, {build_dfa([D], 1, [], NFA1),0}.%% build_dfa([UnMarked], NextState, [Marked], NFA) -> DFA.%% Traverse the unmarked states. Temporarily add the current unmarked%% state to the marked list before calculating translation, this is%% to avoid adding too many duplicate states. Add it properly to the%% marked list afterwards with correct translations.build_dfa([U|Us0], N0, Ms, NFA) -> {Ts,Us1,N1} = build_dfa(U#dfa_state.nfa, Us0, N0, [], [U|Ms], NFA), M = U#dfa_state{trans=Ts,accept=accept(U#dfa_state.nfa, NFA)}, build_dfa(Us1, N1, [M|Ms], NFA);build_dfa([], _N, Ms, _NFA) -> Ms.%% build_dfa([NfaState], [Unmarked], NextState, [Transition], [Marked], NFA) ->%% {Transitions,UnmarkedStates,NextState}.%% Foreach NFA state set calculate the legal translations. N.B. must%% search *BOTH* the unmarked and marked lists to check if DFA state%% already exists. As the range of characters is potentially VERY%% large we cannot explicitly test all characters. Instead we first%% calculate the set of all disjoint character ranges which are%% possible candidates to the set of NFA states.build_dfa(Set, Us, N, Ts, Ms, NFA) -> %% List of all transition sets. Crs0 = [Cr || S <- Set, {Crs,_St} <- (element(S, NFA))#nfa_state.edges, list(Crs), Cr <- Crs ], Crs1 = lists:usort(Crs0), %Must remove duplicates! %% Build list of disjoint test ranges. Test = disjoint_crs(Crs1), %% io:fwrite("bd: ~p\n ~p\n ~p\n ~p\n", [Set,Crs0,Crs1,Test]), build_dfa(Test, Set, Us, N, Ts, Ms, NFA).%% disjoint_crs([CharRange]) -> [CharRange].%% Take a sorted list of char ranges and make a sorted list of%% disjoint char ranges. No new char range extends past an existing%% char range.disjoint_crs([{_C1,C2}=Cr1,{C3,_C4}=Cr2|Crs]) when C2 < C3 -> %% C1 C2 %% C3 C4 [Cr1|disjoint_crs([Cr2|Crs])];disjoint_crs([{C1,C2},{C3,C4}|Crs]) when C1 == C3 -> %% C1 C2 %% C3 C4 [{C1,C2}|disjoint_crs(add_element({C2+1,C4}, Crs))];disjoint_crs([{C1,C2},{C3,C4}|Crs]) when C1 < C3, C2 >= C3, C2 < C4 -> %% C1 C2 %% C3 C4 [{C1,C3-1}|disjoint_crs(union([{C3,C2},{C2+1,C4}], Crs))];disjoint_crs([{C1,C2},{C3,C4}|Crs]) when C1 < C3, C2 == C4 -> %% C1 C2 %% C3 C4 [{C1,C3-1}|disjoint_crs(add_element({C3,C4}, Crs))];disjoint_crs([{C1,C2},{C3,C4}|Crs]) when C1 < C3, C2 > C4 -> %% C1 C2 %% C3 C4 [{C1,C3-1}|disjoint_crs(union([{C3,C4},{C4+1,C2}], Crs))];disjoint_crs([Cr|Crs]) -> [Cr|disjoint_crs(Crs)];disjoint_crs([]) -> [].build_dfa([Cr|Crs], Set, Us, N, Ts, Ms, NFA) -> case eclosure(move(Set, Cr, NFA), NFA) of S when S /= [] -> case keysearch(S, #dfa_state.nfa, Us) of {value,#dfa_state{no=T}} -> build_dfa(Crs, Set, Us, N, [{Cr,T}|Ts], Ms, NFA); false -> case keysearch(S, #dfa_state.nfa, Ms) of {value,#dfa_state{no=T}} -> build_dfa(Crs, Set, Us, N, [{Cr,T}|Ts], Ms, NFA); false -> U = #dfa_state{no=N,nfa=S}, build_dfa(Crs, Set, [U|Us], N+1, [{Cr,N}|Ts], Ms, NFA) end end; [] -> build_dfa(Crs, Set, Us, N, Ts, Ms, NFA) end;build_dfa([], _Set, Us, N, Ts, _Ms, _NFA) -> {Ts,Us,N}. %% eclosure([State], NFA) -> [State].%% move([State], Char, NFA) -> [State].%% These are straight out of the book. As eclosure uses ordsets then%% the generated state sets are in ascending order.eclosure(Sts, NFA) -> eclosure(Sts, NFA, []).eclosure([St|Sts], NFA, Ec) -> #nfa_state{edges=Es} = element(St, NFA), eclosure([ N || {epsilon,N} <- Es, not is_element(N, Ec) ] ++ Sts, NFA, add_element(St, Ec));eclosure([], _NFA, Ec) -> Ec.move(Sts, Cr, NFA) -> [ St || N <- Sts, {Crs,St} <- (element(N, NFA))#nfa_state.edges, list(Crs),%% begin%% io:fwrite("move1: ~p\n", [{Sts,Cr,Crs,in_crs(Cr,Crs)}]),%% true%% end, in_crs(Cr, Crs) ].in_crs({C1,C2}, [{C3,C4}|_Crs]) when C1 >= C3, C2 =< C4 -> true;in_crs(Cr, [Cr|_Crs]) -> true; %Catch bos and eos.in_crs(Cr, [_|Crs]) -> in_crs(Cr, Crs);in_crs(_Cr, []) -> false.%% accept([State], NFA) -> true | false.%% Scan down the state list until we find an accepting state.accept([St|Sts], NFA) -> case element(St, NFA) of #nfa_state{accept={yes,A}} -> {yes,A}; #nfa_state{accept=no} -> accept(Sts, NFA) end;accept([], _NFA) -> no.%% minimise_dfa(DFA, StartState, FirstState) -> {DFA,StartState}.%% Minimise the DFA by removing equivalent states. We consider a%% state if both the transitions and the their accept state is the%% same. First repeatedly run throught the DFA state list removing%% equivalent states and updating remaining transitions with%% remaining equivalent state numbers. When no more reductions are%% possible then pack the remaining state numbers to get consecutive%% states.minimise_dfa(DFA0, Start, N) -> case min_dfa(DFA0) of {DFA1,[]} -> %No reduction! {DFA2,Rs} = pack_dfa(DFA1, N), {min_update(DFA2, Rs),min_new_state(Start, Rs)}; {DFA1,Rs} -> minimise_dfa(min_update(DFA1, Rs), min_new_state(Start, Rs), N) end.min_dfa(DFA) -> min_dfa(DFA, [], []).min_dfa([D|DFA0], Rs0, MDFA) -> {DFA1,Rs1} = min_delete(DFA0, D#dfa_state.trans, D#dfa_state.accept, D#dfa_state.no, Rs0, []), min_dfa(DFA1, Rs1, [D|MDFA]);min_dfa([], Rs, MDFA) -> {MDFA,Rs}.min_delete([#dfa_state{no=N,trans=T,accept=A}|DFA], T, A, NewN, Rs, MDFA) -> min_delete(DFA, T, A, NewN, [{N,NewN}|Rs], MDFA);min_delete([D|DFA], T, A, NewN, Rs, MDFA) -> min_delete(DFA, T, A, NewN, Rs, [D|MDFA]);min_delete([], _T, _A, _NewN, Rs, MDFA) -> {MDFA,Rs}.min_update(DFA, Rs) -> [ D#dfa_state{trans=min_update_trans(D#dfa_state.trans, Rs)} || D <- DFA ].min_update_trans(Tr, Rs) -> [ {C,min_new_state(S, Rs)} || {C,S} <- Tr ].min_new_state(Old, [{Old,New}|_Reds]) -> New;min_new_state(Old, [_R|Reds]) -> min_new_state(Old, Reds);min_new_state(Old, []) -> Old.pack_dfa(DFA, N) -> pack_dfa(DFA, N, [], []).pack_dfa([D|DFA], NewN, Rs, PDFA) -> pack_dfa(DFA, NewN+1, [{D#dfa_state.no,NewN}|Rs], [D#dfa_state{no=NewN}|PDFA]);pack_dfa([], _NewN, Rs, PDFA) -> {PDFA,Rs}.%% comp_apply(String, StartPos, DFAReg) -> {match,RestPos,Rest} | nomatch.%% Apply the DFA of a regular expression to a string. If%% there is a match return the position of the remaining string and%% the string if else return 'nomatch'.%%%% StartPos should be the real start position as it is used to decide%% if we are at the beginning of the string.comp_apply(Cs, P, {DFA,Start,_Fail}) -> comp_apply(element(Start, DFA), Cs, P, DFA, nomatch).comp_apply(#c_state{spec=[]}=St, Cs, P, DFA, Accept) -> comp_apply_tr(St, Cs, P, DFA, Accept);comp_apply(#c_state{spec=Sp}=St, Cs, P, DFA, Accept) -> comp_apply_sp(St, Cs, P, DFA, Accept, Sp).comp_apply_tr(#c_state{trans=none,accept=A}, Cs, P, _DFA, Accept) -> %% End state. accept_value(A, Cs, P, Accept);comp_apply_tr(#c_state{trans=Tr,tmin=Tmin,smin=Smin,tmax=Tmax,smax=Smax,accept=A}, [C|Cs]=Cs0, P, DFA, Accept) -> %% Get the next state number to go to. NextSt = if C =< Tmin -> Smin; %Below transition table C >= Tmax -> Smax; %Above transition table true -> %Otherwise use table element(C - Tmin, Tr) end, comp_apply(element(NextSt, DFA), Cs, P+1, DFA, accept_value(A, Cs0, P, Accept));comp_apply_tr(#c_state{trans=_Tr,accept=A}, [], P, _DFA, Accept) -> accept_value(A, [], P, Accept).comp_apply_sp(_St, Cs, 1, DFA, Accept, [{bos,S}|_]) -> comp_apply(element(S, DFA), Cs, 1, DFA, Accept);comp_apply_sp(_St, [$\n], P, DFA, Accept, [{eos,S}|_]) -> comp_apply(element(S, DFA), [], P, DFA, Accept);comp_apply_sp(_St, [], P, DFA, Accept, [{eos,S}|_]) -> comp_apply(element(S, DFA), [], P, DFA, Accept);comp_apply_sp(St, Cs, P, DFA, Accept, [_|Sp]) -> comp_apply_sp(St, Cs, P, DFA, Accept, Sp);comp_apply_sp(St, Cs, P, DFA, Accept, []) -> comp_apply_tr(St, Cs, P, DFA, Accept). accept_value(true, Cs, P, _Accept) -> {match,P,Cs};accept_value(false, _Cs, _P, Accept) -> Accept.%% compile(RegExp) -> {ok,RE} | {error,E}.%% Parse the regexp described in the string RegExp.compile(RegExp) -> case make_dfa([{RegExp,yes}], 2) of {ok,{DFA0,Start}} -> Fail = 1, DFA1 = [#dfa_state{no=Fail,accept=no,trans=[]}|DFA0], DFA = tuplelise_dfa(DFA1, 1), {ok,{comp_regexp,{DFA,Start,Fail}}}; {error,E} -> {error,E} end.%% tuplelise_dfa(DFAstates, NoAcceptState) -> {{CompState},FirstState}.tuplelise_dfa(DFA0, NoAccept) -> DFA1 = map(fun (#dfa_state{no=N,trans=Ts,accept=A}) -> {Tr,Tmin,Smin,Tmax,Smax,Sp} = build_trans(Ts, NoAccept), #c_state{no=N,trans=Tr,tmin=Tmin,smin=Smin, tmax=Tmax,smax=Smax, accept=fix_accept(A),spec=Sp} end, DFA0), list_to_tuple(keysort(#dfa_state.no, DFA1)).build_trans(Ts0, NoAccept) -> %% Split transitions into character ranges and specials. {Ts1,Sp1} = foldl(fun ({{_,_},_}=T, {Ts,Sp}) -> {[T|Ts],Sp}; ({_,_}=T, {Ts,Sp}) -> {Ts,[T|Sp]} end, {[],[]}, Ts0), if Ts1 == [] -> {none,none,none,none,none,Sp1}; true -> %% Have transitions, convert to tuple. Ts2 = keysort(1, Ts1), {Tmin,Smin,Ts3} = min_trans(Ts2, NoAccept), %% io:fwrite("exptr: ~p\n", [{Ts3,Tmin}]), {Trans,Tmax,Smax} = expand_trans(Ts3, Tmin, NoAccept), {list_to_tuple(Trans),Tmin,Smin,Tmax,Smax,Sp1} end. min_trans([{{0,C2},S}|Crs], _Def) -> {C2,S,Crs};min_trans([{{C1,_C2},_S}|_]=Crs, Def) -> {C1-1,Def,Crs}.expand_trans([{{C1,maxchar},S}], Last, Def) -> Trs = duplicate(C1-(Last+1), Def), {Trs,C1,S};expand_trans([{{C1,C2},S}], Last, Def) -> Trs = duplicate(C1-(Last+1), Def) ++ duplicate(C2-C1+1, S), {Trs,C2+1,Def};expand_trans([{{C1,C2},S}|Crs], Last, Def) -> {Trs0,Tmax,Smax} = expand_trans(Crs, C2, Def), Trs1 = duplicate(C1-(Last+1), Def) ++ duplicate(C2-C1+1, S) ++ Trs0, {Trs1,Tmax,Smax}.fix_accept({yes,_}) -> true;fix_accept(no) -> false.
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -