📄 xmerl_regexp.erl
字号:
%% ``The contents of this file are subject to the Erlang Public License,
%% Version 1.1, (the "License"); you may not use this file except in
%% compliance with the License. You should have received a copy of the
%% Erlang Public License along with this software. If not, it can be
%% retrieved via the world wide web at http://www.erlang.org/.
%%
%% Software distributed under the License is distributed on an "AS IS"
%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
%% the License for the specific language governing rights and limitations
%% under the License.
%%
%% The Initial Developer of the Original Code is Ericsson Utvecklings AB.
%% Portions created by Ericsson are Copyright 1999, Ericsson Utvecklings
%% AB. All Rights Reserved.''
%%
%%     $Id$
%%
-module(xmerl_regexp).

%% This module provides a basic set of regular expression functions
%% for strings. The functions provided are taken from AWK.
%%
%% Note that we interpret the syntax tree of a regular expression
%% directly instead of converting it to an NFA and then interpreting
%% that. This method seems to go significantly faster.

-export([sh_to_awk/1,parse/1,format_error/1,match/2,first_match/2,matches/2]).
-export([sub/3,gsub/3,split/2,sub_match/2,sub_first_match/2]).
-export([make_nfa/1,make_dfa/1,make_dfa/2,compile/1]).

-import(string, [substr/2,substr/3]).
-import(lists, [reverse/1,reverse/2,last/1,duplicate/2,seq/2]).
-import(lists, [member/2,keysearch/3,keysort/2,map/2,foldl/3]).
-import(ordsets, [is_element/2,add_element/2,union/2,subtract/2]).

%%-compile([export_all]).

-export([setup/1,compile_proc/2]).

%% setup(RegExp) -> Result.
%%  Rewrite the multi-character escapes handled by setup/2, anchor the
%%  expression with a leading `^' and a trailing `$', and compile it in
%%  a helper process. Result is whatever compile/1 returned. If the
%%  helper dies without replying, or has not replied within 2000 ms,
%%  the result of parse/1 on the rewritten expression is returned
%%  instead.

setup(RE0) ->
    RE = setup(RE0, [$^]),
    %% Monitor the helper so that a crash inside compile/1 is noticed at
    %% once instead of only after the full timeout.
    {Pid,MRef} = spawn_monitor(?MODULE, compile_proc, [self(),RE]),
    receive
        {ok,Result} ->
            erlang:demonitor(MRef, [flush]),
            Result;
        {'DOWN',MRef,process,Pid,_Reason} ->
            parse(RE)
    after 2000 ->
            exit(Pid, force),
            %% Wait until the helper is really gone; anything it sent
            %% before dying is then already in our mailbox, so a late
            %% reply can be discarded instead of leaking to the caller.
            receive {'DOWN',MRef,process,Pid,_} -> ok end,
            receive {ok,_Late} -> ok after 0 -> ok end,
            parse(RE)
    end.

%% compile_proc(From, RegExp) -> {ok,Result}.
%%  Body of the helper process started by setup/1: compile RegExp and
%%  send {ok,Result} to From.

compile_proc(From, RE) ->
    Res = compile(RE),
    From ! {ok,Res}.

%% setup(RegExp, Acc) -> RegExp1.
%%  Replace the escapes \d \D \s \S \i \I \c \C with explicit bracket
%%  expressions and append the closing `$'. Acc holds the output in
%%  REVERSE order, which is why every replacement literal below is
%%  written backwards (e.g. "]9-0[" becomes "[0-9]").

setup([$\\,$d|S],Acc) -> setup(S,"]9-0[" ++Acc);
setup([$\\,$D|S],Acc) -> setup(S,"]9-0^[" ++Acc);
setup([$\\,$s|S],Acc) -> setup(S,"]s\\t\\n\\r\\[" ++Acc);
%% Reads "[^\r\n\t\s]" once reversed. The previous literal reversed to
%% "[^r\n\t\s\]", which escaped the closing bracket and lost the `\'
%% in front of `r'.
setup([$\\,$S|S],Acc) -> setup(S,"]s\\t\\n\\r\\^[" ++Acc);
setup([$\\,$i|S],Acc) -> setup(S,"]z-aZ-A_:[" ++Acc);	%% Only Latin-1 now
setup([$\\,$I|S],Acc) -> setup(S,"]z-aZ-A_:^[" ++Acc);
setup([$\\,$c|S],Acc) -> setup(S,"]9-0z-aZ-A_:."++[183]++"-[" ++Acc);
setup([$\\,$C|S],Acc) -> setup(S,"]9-0z-aZ-A_:."++[183]++"-^[" ++Acc);
%% TODO: the escapes below are not implemented; they are passed through
%% unchanged.
%%   \w  \W
%%   \p{X} / \P{X} for the categories X:
%%     L   Letter, Any
%%     Lu  Letter, Uppercase
%%     Ll  Letter, Lowercase
%%     Lt  Letter, Titlecase
%%     Lm  Letter, Modifier
%%     Lo  Letter, Other
%%     M   Mark, Any
%%     Mn  Mark, Nonspacing
%%     Mc  Mark, Spacing Combining
%%     Me  Mark, Enclosing
%%     N   Number, Any
%%     Nd  Number, Decimal Digit
%%     Nl  Number, Letter
%%     No  Number, Other
%%     P   Punctuation, Any
%%     Pc  Punctuation, Connector
%%     Pd  Punctuation, Dash
%%     Ps  Punctuation, Open
%%     Pe  Punctuation, Close
%%     Pi  Punctuation, Initial quote (may behave like Ps or Pe,
%%         depending on usage)
%%     Pf  Punctuation, Final quote (may behave like Ps or Pe,
%%         depending on usage)
%%     Po  Punctuation, Other
%%     S   Symbol, Any
%%     Sm  Symbol, Math
%%     Sc  Symbol, Currency
%%     Sk  Symbol, Modifier
%%     So  Symbol, Other
%%     Z   Separator, Any
%%     Zs  Separator, Space
%%     Zl  Separator, Line
%%     Zp  Separator, Paragraph
%%     C   Other, Any
%%     Cc  Other, Control
%%     Cf  Other, Format
%%     Cs  Other, Surrogate (not supported by schema recommendation)
%%     Co  Other, Private Use
%%     Cn  Other, Not assigned (no characters in the file have this
%%         property)
%% Any other escape pair is copied as a unit, so that the second
%% character of an escaped backslash ("\\" followed by e.g. `d') is not
%% mistaken for the start of a class escape.
setup([$\\,C|S],Acc) -> setup(S, [C,$\\|Acc]);
setup([A|S], Acc) -> setup(S, [A|Acc]);
setup([],Acc) -> lists:reverse([$$|Acc]).

%% sh_to_awk(ShellRegExp)
%%  Convert a sh style regexp into a full AWK one. The main difficulty is
%%  getting character sets right as the conventions are different.

sh_to_awk(Sh) -> "^(" ++ sh_to_awk_1(Sh).       %Fix the beginning

%% sh_to_awk_1/1 translates text outside a bracket expression.
sh_to_awk_1([$*|Sh]) ->                         %This matches any string
    ".*" ++ sh_to_awk_1(Sh);
sh_to_awk_1([$?|Sh]) ->                         %This matches any character
    [$.|sh_to_awk_1(Sh)];
sh_to_awk_1([$[,$^,$]|Sh]) ->                   %This takes careful handling
    "\\^" ++ sh_to_awk_1(Sh);
%% Must move '^' to end.
sh_to_awk_1("[^" ++ Sh) -> [$[|sh_to_awk_2(Sh, true)];
sh_to_awk_1("[!" ++ Sh) -> "[^" ++ sh_to_awk_2(Sh, false);
sh_to_awk_1([$[|Sh]) -> [$[|sh_to_awk_2(Sh, false)];
sh_to_awk_1([C|Sh]) ->
    %% Unspecialise everything else which is not an escape character.
    case sh_special_char(C) of
        true -> [$\\,C|sh_to_awk_1(Sh)];
        false -> [C|sh_to_awk_1(Sh)]
    end;
sh_to_awk_1([]) -> ")$".                        %Fix the end

%% sh_to_awk_2/2 handles the first character of a bracket expression,
%% where a `]' is copied as a member instead of closing the set.
%% UpArrow is true when a leading `^' was removed and still has to be
%% emitted at the end of the set.
sh_to_awk_2([$]|Sh], UpArrow) -> [$]|sh_to_awk_3(Sh, UpArrow)];
sh_to_awk_2(Sh, UpArrow) -> sh_to_awk_3(Sh, UpArrow).

%% sh_to_awk_3/2 copies the rest of a bracket expression up to the
%% closing `]', emitting the postponed `^' just before it.
sh_to_awk_3([$]|Sh], true) -> "^]" ++ sh_to_awk_1(Sh);
sh_to_awk_3([$]|Sh], false) -> [$]|sh_to_awk_1(Sh)];
sh_to_awk_3([C|Sh], UpArrow) -> [C|sh_to_awk_3(Sh, UpArrow)];
sh_to_awk_3([], true) -> [$^|sh_to_awk_1([])];
sh_to_awk_3([], false) -> sh_to_awk_1([]).

%% -type sh_special_char(char()) -> bool().
%%  Test if a character is a special character.

sh_special_char($|) -> true;
sh_special_char($*) -> true;
sh_special_char($+) -> true;
sh_special_char($?) -> true;
sh_special_char($() -> true;
sh_special_char($)) -> true;
sh_special_char($\\) -> true;
sh_special_char($^) -> true;
sh_special_char($$) -> true;
sh_special_char($.) -> true;
sh_special_char($[) -> true;
sh_special_char($]) -> true;
sh_special_char($") -> true;
sh_special_char(_C) -> false.

%% parse(RegExp) -> {ok,RE} | {error,E}.
%%  Parse the regexp described in the string RegExp.

parse(S) ->
    %% Only throws are intercepted; a runtime error inside reg/2 now
    %% propagates with its own reason instead of surfacing as a
    %% case_clause on an {'EXIT',_} tuple.
    Outcome = try reg(S, 0)
              catch
                  throw:Thrown -> Thrown
              end,
    case Outcome of
        {R,Sc,[]} -> {ok,{regexp,{R,Sc}}};
        {_R,_Sc,[C|_]} -> {error,{illegal,[C]}};
        {error,E} -> {error,E}
    end.

%% format_error(Error) -> String.
%%  Turn the E of an {error,E} result into printable (deep) chardata.

format_error({interval_range,What}) ->
    ["illegal interval range ",io_lib:write_string(What)];
format_error({illegal,What}) -> ["illegal character `",What,"'"];
format_error({unterminated,What}) -> ["unterminated `",What,"'"];
format_error({posix_cc,What}) ->
    ["illegal POSIX character class ",io_lib:write_string(What)];
format_error({char_class,What}) ->
    ["illegal character class ",io_lib:write_string(What)].

%% match(String, RegExp) -> {match,Start,Length} | nomatch | {error,E}.
%%  Find the longest match of RegExp in String. RegExp is either a
%%  string, which is parsed first, or an already parsed {regexp,RE} or
%%  compiled {comp_regexp,RE} term. Start is counted from 1.

match(S, RegExp) when is_list(RegExp) ->
    case parse(RegExp) of
        {ok,RE} -> match(S, RE);
        {error,E} -> {error,E}
    end;
match(S, {regexp,RE}) ->
    case match_re(RE, S, 1, 0, -1) of
        {Start,Len} when Len >= 0 ->
            {match,Start,Len};
        {_Start,_Len} -> nomatch
    end;
match(S, {comp_regexp,RE}) ->
    case match_comp(RE, S, 1, 0, -1) of
        {Start,Len} when Len >= 0 ->
            {match,Start,Len};
        {_Start,_Len} -> nomatch
    end.

%% match_re(RE, String, Pos, BestStart, BestLen) -> {Start,Len}.
%%  Try RE at every position of String, keeping the longest match seen
%%  so far (the earliest one when lengths are equal). A Len of -1 means
%%  that nothing matched.

match_re(RE, [_|Cs]=S0, P0, Mst, Mlen) ->
    case re_apply(S0, P0, RE) of
        {match,P1,_S1,_Subs} ->
            Len = P1-P0,
            if Len > Mlen -> match_re(RE, Cs, P0+1, P0, Len);
               true -> match_re(RE, Cs, P0+1, Mst, Mlen)
            end;
        nomatch -> match_re(RE, Cs, P0+1, Mst, Mlen);
        never_match -> {Mst,Mlen}               %No need to go on
    end;
match_re(_RE, _S, _P, Mst, Mlen) -> {Mst,Mlen}.

%% match_comp(RE, String, Pos, BestStart, BestLen) -> {Start,Len}.
%%  Same search as match_re/5, for a compiled regexp.

match_comp(RE, [_|Cs]=S0, P0, Mst, Mlen) ->
    case comp_apply(S0, P0, RE) of
        {match,P1,_S1} ->
            Len = P1-P0,
            if Len > Mlen -> match_comp(RE, Cs, P0+1, P0, Len);
               true -> match_comp(RE, Cs, P0+1, Mst, Mlen)
            end;
        nomatch -> match_comp(RE, Cs, P0+1, Mst, Mlen)
    end;
match_comp(_RE, _S, _P, Mst, Mlen) -> {Mst,Mlen}.

%% Disabled alternative formulation built on first_match_re/3 and
%% first_match_comp/3:
%%
%% match_re(RE, S0, Pos0, Mst, Mlen) ->
%%     case first_match_re(RE, S0, Pos0) of
%%         {St,Len,_} ->                        %Found a match
%%             Pos1 = St + 1,                   %Where to start next match
%%             S1 = lists:nthtail(Pos1-Pos0, S0),
%%             if Len > Mlen -> match_re(RE, S1, Pos1, St, Len);
%%                true -> match_re(RE, S1, Pos1, Mst, Mlen)
%%             end;
%%         nomatch -> {Mst,Mlen}
%%     end.
%%
%% match_comp(RE, S0, Pos0, Mst, Mlen) ->
%%     case first_match_comp(RE, S0, Pos0) of
%%         {St,Len} ->                          %Found a match
%%             Pos1 = St + 1,                   %Where to start next match
%%             S1 = lists:nthtail(Pos1-Pos0, S0),
%%             if Len > Mlen -> match_comp(RE, S1, Pos1, St, Len);
%%                true -> match_comp(RE, S1, Pos1, Mst, Mlen)
%%             end;
%%         nomatch -> {Mst,Mlen}
%%     end.

%% first_match(String, RegExp) -> {match,Start,Length} | nomatch | {error,E}.
%%  Find the first match of RegExp in String. RegExp is either a
%%  string, which is parsed first, or an already parsed {regexp,RE} or
%%  compiled {comp_regexp,RE} term. Start is counted from 1.

first_match(S, RegExp) when is_list(RegExp) ->
    case parse(RegExp) of
        {ok,RE} -> first_match(S, RE);
        {error,E} -> {error,E}
    end;
first_match(S, {regexp,RE}) ->
    case first_match_re(RE, S, 1) of
        {Start,Len,_} -> {match,Start,Len};
        nomatch -> nomatch
    end;
first_match(S, {comp_regexp,RE}) ->
    case first_match_comp(RE, S, 1) of
        {Start,Len} -> {match,Start,Len};
        nomatch -> nomatch
    end.

%% first_match_re(RE, String, Pos) -> {Start,Len,Subs} | nomatch.
%%  Try RE at successive positions and stop at the first one that
%%  matches, or as soon as re_apply/3 reports never_match.

first_match_re(RE, S, St) when S /= [] ->
    case re_apply(S, St, RE) of
        {match,P,_Rest,Subs} -> {St,P-St,Subs};
        nomatch -> first_match_re(RE, tl(S), St+1);
        never_match -> nomatch
    end;
first_match_re(_RE, [], _St) -> nomatch.

%% first_match_comp(RE, String, Pos) -> {Start,Len} | nomatch.
%%  Same search as first_match_re/3, for a compiled regexp.

first_match_comp(RE, S, St) when S /= [] ->
    case comp_apply(S, St, RE) of
        {match,P,_Rest} -> {St,P-St};
        nomatch -> first_match_comp(RE, tl(S), St+1)
    end;
first_match_comp(_RE, [], _St) -> nomatch.

%% matches(String, RegExp) -> {match,[{Start,Length}]} | {error,E}.
%%  Return all the non-overlapping matches of RegExp in String.
%%  NOTE(review): this definition continues past the end of the
%%  excerpt under review; the visible part is reproduced unchanged.

matches(S, RegExp) when list(RegExp) ->
    case parse(RegExp) of
        {ok,RE} -> matches(S, RE);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -