📄 xmerl_regexp.erl
字号:
reg4([], Sc) -> {epsilon,Sc,[]}.char($\\, [O1,O2,O3|S]) when O1 >= $0, O1 =< $7, O2 >= $0, O2 =< $7, O3 >= $0, O3 =< $7 -> {(O1*8 + O2)*8 + O3 - 73*$0,S};char($\\, [C|S]) -> {escape_char(C),S};char($\\, []) -> parse_error({unterminated,"\\"});char(C, S) -> {C,S}.escape_char($n) -> $\n; %\n = LFescape_char($r) -> $\r; %\r = CRescape_char($t) -> $\t; %\t = TABescape_char($v) -> $\v; %\v = VTescape_char($b) -> $\b; %\b = BSescape_char($f) -> $\f; %\f = FFescape_char($e) -> $\e; %\e = ESCescape_char($s) -> $\s; %\s = SPACEescape_char($d) -> $\d; %\d = DELescape_char(C) -> C.char_class([$]|S0]) -> {Cc,S1} = char_class(S0, [$]]), {pack_cc(Cc),S1};char_class(S0) -> {Cc,S1} = char_class(S0, []), {pack_cc(Cc),S1}.pack_cc(Cc0) -> %% First sort the list. Cc1 = lists:usort(fun ({Cf1,_}, {Cf2,_}) -> Cf1 < Cf2; ({Cf1,_}, C) -> Cf1 < C; (C, {Cf,_}) -> C < Cf; (C1, C2) -> C1 =< C2 end, Cc0), pack_cc1(Cc1).pack_cc1([{Cf1,Cl1},{Cf2,Cl2}|Cc]) when Cl1 >= Cf2, Cl1 =< Cl2 -> pack_cc1([{Cf1,Cl2}|Cc]);pack_cc1([{Cf1,Cl1},{Cf2,Cl2}|Cc]) when Cl1 >= Cf2, Cl1 >= Cl2 -> pack_cc1([{Cf1,Cl1}|Cc]);pack_cc1([{Cf1,Cl1},{Cf2,Cl2}|Cc]) when Cl1+1 == Cf2 -> pack_cc1([{Cf1,Cl2}|Cc]);pack_cc1([{Cf,Cl},C|Cc]) when Cl >= C -> pack_cc1([{Cf,Cl}|Cc]);pack_cc1([{Cf,Cl},C|Cc]) when Cl+1 == C -> pack_cc1([{Cf,C}|Cc]);pack_cc1([C,{Cf,Cl}|Cc]) when C == Cf-1 -> pack_cc1([{C,Cl}|Cc]);pack_cc1([C1,C2|Cc]) when C1+1 == C2 -> pack_cc1([{C1,C2}|Cc]);pack_cc1([C|Cc]) -> [C|pack_cc1(Cc)];pack_cc1([]) -> [].char_class("[:" ++ S0, Cc0) -> %Start of POSIX char class case posix_cc(S0, Cc0) of {Cc1,":]" ++ S1} -> char_class(S1, Cc1); {_,_S1} -> parse_error({posix_cc,"[:" ++ S0}) end;char_class([C1|S0], Cc) when C1 /= $] -> case char(C1, S0) of {Cf,[$-,C2|S1]} when C2 /= $] -> case char(C2, S1) of {Cl,S2} when Cf < Cl -> char_class(S2, [{Cf,Cl}|Cc]); {_Cl,_S2} -> parse_error({char_class,[C1|S0]}) end; {C,S1} -> char_class(S1, [C|Cc]) end;char_class(S, Cc) -> {Cc,S}.%% posix_cc(String, CharClass) -> {NewCharClass,RestString}.%% Handle POSIX character classes, use Latin-1 character set.posix_cc("alnum" ++ S, Cc) -> {[{$0,$9},{$A,$Z},{192,214},{216,223},{$a,$z},{224,246},{248,255}|Cc],S};posix_cc("alpha" ++ S, Cc) -> {[{$A,$Z},{192,214},{216,223},{$a,$z},{224,246},{248,255}|Cc],S};posix_cc("blank" ++ S, Cc) -> {[$\s,$\t,160|Cc],S};posix_cc("cntrl" ++ S, Cc) -> {[{0,31},{127,159}|Cc],S};posix_cc("digit" ++ S, Cc) -> {[{$0,$9}|Cc],S};posix_cc("graph" ++ S, Cc) -> {[{33,126},{161,255}|Cc],S};posix_cc("lower" ++ S, Cc) -> {[{$a,$z},{224,246},{248,255}|Cc],S};posix_cc("print" ++ S, Cc) -> {[{32,126},{160,255}|Cc],S};posix_cc("punct" ++ S, Cc) -> {[{$!,$/},{$:,$?},{${,$~},{161,191}|Cc],S};posix_cc("space" ++ S, Cc) -> {[$\s,$\t,$\f,$\r,$\v,160|Cc],S};posix_cc("upper" ++ S, Cc) -> {[{$A,$Z},{192,214},{216,223}|Cc],S};posix_cc("xdigit" ++ S, Cc) -> {[{$a,$f},{$A,$F},{$0,$9}|Cc],S};posix_cc(S, _Cc) -> parse_error({posix_cc,"[:" ++ S}).interval_range(Cs0) -> case number(Cs0) of {none,Cs1} -> {none,none,Cs1}; {N,[$,|Cs1]} -> case number(Cs1) of {none,Cs2} -> {N,any,Cs2}; {M,Cs2} -> {N,M,Cs2} end; {N,Cs1} -> {N,none,Cs1} end.number([C|Cs]) when C >= $0, C =< $9 -> number(Cs, C - $0);number(Cs) -> {none,Cs}.number([C|Cs], Acc) when C >= $0, C =< $9 -> number(Cs, 10*Acc + (C - $0));number(Cs, Acc) -> {Acc,Cs}.parse_error(E) -> throw({error,E}).%char_string([C|S]) when C /= $" -> char_string(S, C);%char_string(S) -> {epsilon,S}.%char_string([C|S0], L) when C /= $" ->% char_string(S0, {concat,L,C});%char_string(S, L) -> {L,S}.%% re_apply(String, StartPos, RegExp) ->%% {match,RestPos,Rest,SubExprs} | nomatch.%%%% Apply the (parse of the) regular expression RegExp to String. If%% there is a match return the position of the remaining string and%% the string if else return 'nomatch'.%%%% StartPos should be the real start position as it is used to decide%% if we are at the beginning of the string.re_apply(S, St, {RE,Sc}) -> Subs = erlang:make_tuple(Sc, none), %Make a sub-regexp table. Res = re_apply(RE, [], S, St, Subs), %% io:format("~p x ~p -> ~p\n", [RE,S,Res]), Res.re_apply(epsilon, More, S, P, Subs) -> %This always matches re_apply_more(More, S, P, Subs);re_apply({'or',RE1,RE2}, More, S, P, Subs) -> re_apply_or(re_apply(RE1, More, S, P, Subs), re_apply(RE2, More, S, P, Subs));re_apply({concat,RE1,RE2}, More, S0, P, Subs) -> re_apply(RE1, [RE2|More], S0, P, Subs);re_apply({literal,[C|Lcs]}, More, [C|S], P, Subs) -> re_apply_lit(Lcs, More, S, P+1, Subs); %Have matched first charre_apply({kclosure,RE}, More, S0, P0, Subs0) -> %% Greedy so try RE first, no difference here actually. Loop = case re_apply(RE, [], S0, P0, Subs0) of {match,P0,_S1,_Subs1} -> %0 length match, don't loop! nomatch; {match,P1,S1,Subs1} -> re_apply_more([{kclosure,RE}|More], S1, P1, Subs1); nomatch -> nomatch; never_match -> never_match end, re_apply_or(Loop, re_apply_more(More, S0, P0, Subs0));re_apply({pclosure,RE}, More, S, P, Subs) -> re_apply(RE, [{kclosure,RE}|More], S, P, Subs);re_apply({optional,RE}, More, S, P, Subs) -> %% Greedy so try RE first, no difference here actually. re_apply_or(re_apply(RE, More, S, P, Subs), re_apply_more(More, S, P, Subs));re_apply({iclosure,RE,N,M}, More, S, P, Subs) when N > 0 -> re_apply(RE, [{iclosure,RE,N-1,M}|More], S, P, Subs);re_apply({iclosure,RE,0,M}, More, S, P, Subs) -> Exp = expand_opt(RE, M), re_apply(Exp, More, S, P, Subs);re_apply({subexpr,N,RE}, More, S, P, Subs) -> re_apply(RE, [{endsub,N,P}|More], S, P, Subs);re_apply({endsub,N,St}, More, S, P, Subs0) -> Subs1 = setelement(N, Subs0, {St,P-St}), %Record sub-expr re_apply_more(More, S, P, Subs1);re_apply(bos, More, S, 1, Subs) -> re_apply_more(More, S, 1, Subs);re_apply(bos, _More, _S, _, _) -> never_match;re_apply(eos, More, [$\n], P, Subs) -> re_apply_more(More, [], P, Subs);re_apply(eos, More, [], P, Subs) -> re_apply_more(More, [], P, Subs);re_apply({char_class,Cc}, More, [C|S], P, Subs) -> case in_char_class(C, Cc) of true -> re_apply_more(More, S, P+1, Subs); false -> nomatch end;re_apply({comp_class,Cc}, More, [C|S], P, Subs) -> case in_char_class(C, Cc) of true -> nomatch; false -> re_apply_more(More, S, P+1, Subs) end;re_apply(C, More, [C|S], P, Subs) when integer(C) -> re_apply_more(More, S, P+1, Subs);re_apply(_RE, _More, _S, _P, _Subs) -> %% io:format("~p : ~p\n", [_RE,_S]), nomatch.%% re_apply_more([RegExp], String, Length, SubsExprs) ->%% {match,RestPos,Rest,SubExprs} | nomatch.re_apply_more([RE|More], S, P, Subs) -> re_apply(RE, More, S, P, Subs);re_apply_more([], S, P, Subs) -> {match,P,S,Subs}.%% re_apply_lit(Literal, More, String, Position, SubExprs) ->%% {match,RestPos,Rest,SubExprs} | nomatch.re_apply_lit([C|Lit], More, [C|Cs], P, Subs) -> re_apply_lit(Lit, More, Cs, P+1, Subs);re_apply_lit([], More, Cs, P, Subs) -> re_apply_more(More, Cs, P, Subs);re_apply_lit(_Lit, _More, _Cs, _P, _Subs) -> nomatch.%% expand_iclosure(RE, N, M) -> RE.expand_iclosure(RE, 0, M) -> expand_opt(RE, M);expand_iclosure(RE, N, M) -> {concat,RE,expand_iclosure(RE, N-1, M)}.%% expand_opt(RegExp, Count) -> RE.%% Handle all the cases.expand_opt(_RE, none) -> epsilon;expand_opt(RE, any) -> {kclosure,RE};expand_opt(_RE, 0) -> epsilon;expand_opt(RE, 1) -> {optional,RE};expand_opt(RE, N) -> {optional,{concat,RE,expand_opt(RE, N-1)}}.%% find_prefix(PrefixStr, SourceStr)%% if PrefixStr is a prefix of Str then return {ok,RemainingStr}%% otherwise return false%% find_prefix([C|Prest], [C|Rest]) ->%% find_prefix(Prest, Rest);%% find_prefix([], Rest) -> {yes,Rest};%% find_prefix(_, _) -> no.%% in_char_class(Char, Class) -> bool().in_char_class(C, [{C1,C2}|_Cc]) when C >= C1, C =< C2 -> true;in_char_class(C, [C|_Cc]) -> true;in_char_class(C, [_|Cc]) -> in_char_class(C, Cc);in_char_class(_C, []) -> false.%% re_apply_or(Match1, Match2, SubExprs) ->%% {match,RestPos,Rest,SubExprs} | nomatch.%% If we want the best match then choose the longest match, else just%% choose one by trying sequentially.re_apply_or(M1={match,P1,_,_},{match,P2,_,_}) when P1 >= P2 -> M1;re_apply_or({match,_,_,_}, M2={match,_,_,_}) -> M2;re_apply_or(never_match, R2) -> R2;re_apply_or(R1, never_match) -> R1;re_apply_or(nomatch, R2) -> R2;re_apply_or(R1, nomatch) -> R1.%% Record definitions for the NFA, DFA and compiler.-record(nfa_state, {no,edges=[],accept=no}).-record(dfa_state, {no,nfa=[],trans=[],accept=no}).-record(c_state, {no,trans=[],tmin=0,smin=none,tmax=0,smax=none, accept=false,spec=[]}).%% We use standard methods, Thompson's construction and subset%% construction, to create first an NFA and then a DFA from the%% regexps. A non-standard feature is that we work with sets of%% character ranges (crs) instead sets of characters. This is most%% noticeable when constructing DFAs. The major benefit is that we can%% handle characters from any set, not just limited ASCII or 8859,%% even 16/32 bit unicode.%%%% The whole range of characters is 0-maxchar, where maxchar is a BIG%% number. We don't make any assumptions about the size of maxchar, it%% is just bigger than any character.%%%% Using character ranges makes describing many regexps very simple,%% for example the regexp "." just becomes the range%% [{0-9},{11-maxchar}].%% make_nfa(RegExpActions) -> {ok,{NFA,StartState}} | {error,E}.%% Build a complete nfa from a list of {RegExp,Action}. The NFA field%% accept has values {yes,Action}|no. The NFA is a list of states.make_nfa(REAs0) -> case parse_reas(REAs0) of {ok,REAs1} -> {NFA,Start} = build_combined_nfa(REAs1), {ok,{NFA,Start}}; {error,E} -> {error,E} end.%% make_dfa(RegExpActions) -> {ok,{DFA,StartState}} | {error,E}.%% make_dfa(RegExpActions, LowestState) -> {ok,{DFA,StartState}} | {error,E}.%% Build a complete dfa from a list of {RegExp,Action}. The DFA field%% accept has values {yes,Action}|no. If multiple Regexps can result%% in same match string then RegExpActions list define priority.make_dfa(REAs) -> make_dfa(REAs, 0).make_dfa(REAs0, Low) -> case parse_reas(REAs0) of {ok,REAs1} -> {NFA,Start0} = build_combined_nfa(REAs1), {DFA0,Start1} = build_dfa(NFA, Start0), {DFA,Start} = minimise_dfa(DFA0, Start1, Low), {ok,{DFA,Start}}; {error,E} -> {error,E} end.parse_reas(REAs) -> parse_reas(REAs, []).parse_reas([{{regexp,{R,_Sc}},A}|REAs], S) -> %Already parsed parse_reas(REAs, [{R,A}|S]);parse_reas([{RegExp,A}|REAs], S) -> case parse(RegExp) of {ok,{regexp,{R,_Sc}}} -> parse_reas(REAs, [{R,A}|S]); {error,E} -> {error,E} end;parse_reas([], Stack) -> {ok,reverse(Stack)}.%% build_combined_nfa(RegExpActionList) -> {NFA,StartState}.%% Build the combined NFA using Thompson's construction straight out%% of the book. Build the separate NFAs in the same order as the%% rules so that the accepting have ascending states have ascending%% state numbers. Start numbering the states from 1 as we put the%% states in a tuple with the state number as the index.build_combined_nfa(REAs) -> {NFA,Starts,Next} = build_nfa_list(REAs, [], [], 1), F = #nfa_state{no=Next,edges=epsilon_trans(Starts),accept=no}, {[F|NFA],Next}.build_nfa_list([{RE,Action}|REAs], NFA0, Starts, Next0) -> {NFA1,Next1,Start} = build_nfa(RE, Next0, Action), build_nfa_list(REAs, NFA1 ++ NFA0, [Start|Starts], Next1);build_nfa_list([], NFA, Starts, Next) -> {NFA,reverse(Starts),Next}.epsilon_trans(Sts) -> [ {epsilon,S} || S <- Sts ].%% build_nfa(RegExp, NextState, Action) -> {NFA,NextFreeState,StartState}.%% When building the NFA states for a ??? we don't build the end%% state, just allocate a State for it and return this state%% number. This allows us to avoid building unnecessary states for%% concatenation which would then have to be removed by overwriting%% an existing state.build_nfa(RE, Next, Action) -> {NFA,N,E} = build_nfa(RE, Next+1, Next, []), {[#nfa_state{no=E,accept={yes,Action}}|NFA],N,Next}.%% build_nfa(RegExp, NextState, StartState, NFA) -> {NFA,NextState,EndState}.%% The NFA is a list of nfa_state is no predefined order. The state%% number of the returned EndState is already allocated!build_nfa({'or',RE1,RE2}, N0, S, NFA0) -> {NFA1,N1,E1} = build_nfa(RE1, N0+1, N0, NFA0), {NFA2,N2,E2} = build_nfa(RE2, N1+1, N1, NFA1), E = N2, {[#nfa_state{no=S,edges=[{epsilon,N0},{epsilon,N1}]}, #nfa_state{no=E1,edges=[{epsilon,E}]}, #nfa_state{no=E2,edges=[{epsilon,E}]}|NFA2], N2+1,E};build_nfa({literal,[]}, N, S, NFA) -> {NFA,N,S};build_nfa({literal,[C|Cs]}, N0, S, NFA0) -> {NFA1,N1,E1} = build_nfa(C, N0, S, NFA0), build_nfa({literal,Cs}, N1, E1, NFA1);build_nfa({concat,RE1,RE2}, N0, S, NFA0) -> {NFA1,N1,E1} = build_nfa(RE1, N0, S, NFA0), {NFA2,N2,E2} = build_nfa(RE2, N1, E1, NFA1),
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -