📄 xmerl_regexp.erl

📁 OTP是开放电信平台的简称
💻 ERL
📖 第 1 页 / 共 4 页
字号:
12 3 4 下一页
%% ``The contents of this file are subject to the Erlang Public License,
%% Version 1.1, (the "License"); you may not use this file except in
%% compliance with the License. You should have received a copy of the
%% Erlang Public License along with this software. If not, it can be
%% retrieved via the world wide web at http://www.erlang.org/.
%%
%% Software distributed under the License is distributed on an "AS IS"
%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
%% the License for the specific language governing rights and limitations
%% under the License.
%%
%% The Initial Developer of the Original Code is Ericsson Utvecklings AB.
%% Portions created by Ericsson are Copyright 1999, Ericsson Utvecklings
%% AB. All Rights Reserved.''
%%
%%     $Id$
%%
-module(xmerl_regexp).

%% This module provides a basic set of regular expression functions
%% for strings. The functions provided are taken from AWK.
%%
%% Note that we interpret the syntax tree of a regular expression
%% directly instead of converting it to an NFA and then interpreting
%% that. This method seems to go significantly faster.

-export([sh_to_awk/1,parse/1,format_error/1,match/2,first_match/2,matches/2]).
-export([sub/3,gsub/3,split/2,sub_match/2,sub_first_match/2]).
-export([make_nfa/1,make_dfa/1,make_dfa/2,compile/1]).

-import(string, [substr/2,substr/3]).
-import(lists, [reverse/1,reverse/2,last/1,duplicate/2,seq/2]).
-import(lists, [member/2,keysearch/3,keysort/2,map/2,foldl/3]).
-import(ordsets, [is_element/2,add_element/2,union/2,subtract/2]).

%%-compile([export_all]).

-export([setup/1,compile_proc/2]).

%% setup(RegExp) -> Result.
%%  Expand the multi-character escapes handled by setup/2, anchor the
%%  expression between `^' and `$', and compile it with compile/1 in a
%%  helper process. If the helper has not answered within 2000 ms, or
%%  if it dies before answering, the result of parse/1 on the expanded
%%  expression is returned instead.
%%
%%  The reply is tagged with a fresh reference so that an unrelated
%%  {ok,_} message already sitting in the caller's mailbox can never be
%%  mistaken for the compilation result, and the helper is monitored
%%  so that a crash is noticed at once rather than after the timeout.

setup(RE0) ->
    RE = setup(RE0, [$^]),
    Parent = self(),
    Tag = make_ref(),
    {Pid, MRef} = spawn_monitor(fun() -> Parent ! {Tag, compile(RE)} end),
    receive
        {Tag, Result} ->
            erlang:demonitor(MRef, [flush]),
            Result;
        {'DOWN', MRef, process, Pid, _Reason} ->
            parse(RE)
    after 2000 ->
        exit(Pid, kill),
        %% The 'DOWN' message is ordered after anything the helper
        %% managed to send, so once it has arrived a late reply (if
        %% any) is already in the mailbox and can be discarded.
        receive {'DOWN', MRef, process, Pid, _} -> ok end,
        receive {Tag, _Late} -> ok after 0 -> ok end,
        parse(RE)
    end.

%% compile_proc(From, RegExp) -> {ok,Result}.
%%  Compile RegExp with compile/1 and send {ok,Result} to From.
%%  setup/1 no longer uses this function; it is kept, unchanged,
%%  because it is exported.

compile_proc(From, RE) ->
    Res = compile(RE),
    From ! {ok,Res}.

%% setup(RegExp, Acc) -> ExpandedRegExp.
%%  Replace the multi-character escapes \d \D \s \S \i \I \c \C with
%%  equivalent bracket expressions. Acc holds the output in reverse
%%  order; the final clause appends `$' and reverses it. Each expansion
%%  is written forwards and pushed onto Acc with lists:reverse/2.
%%
%%  The \S expansion used to come out as `[^r\n\t\s\]' (a literal `r'
%%  and an escaped closing bracket); it now mirrors the \s expansion.

setup([$\\,$d|S], Acc) -> setup(S, lists:reverse("[0-9]", Acc));
setup([$\\,$D|S], Acc) -> setup(S, lists:reverse("[^0-9]", Acc));
setup([$\\,$s|S], Acc) -> setup(S, lists:reverse("[\\r\\n\\t\\s]", Acc));
setup([$\\,$S|S], Acc) -> setup(S, lists:reverse("[^\\r\\n\\t\\s]", Acc));
%% Only Latin-1 now
setup([$\\,$i|S], Acc) -> setup(S, lists:reverse("[:_A-Za-z]", Acc));
setup([$\\,$I|S], Acc) -> setup(S, lists:reverse("[^:_A-Za-z]", Acc));
setup([$\\,$c|S], Acc) ->
    setup(S, lists:reverse("[-" ++ [183] ++ ".:_A-Za-z0-9]", Acc));
setup([$\\,$C|S], Acc) ->
    setup(S, lists:reverse("[^-" ++ [183] ++ ".:_A-Za-z0-9]", Acc));
%% fixme: the following escapes are not expanded yet and are passed
%% through untouched by the clause below:
%%   \w and \W
%%   \p{X} and \P{X} for each of the Unicode categories
%%     L  Letter, Any               Lu Letter, Uppercase
%%     Ll Letter, Lowercase         Lt Letter, Titlecase
%%     Lm Letter, Modifier          Lo Letter, Other
%%     M  Mark, Any                 Mn Mark, Nonspacing
%%     Mc Mark, Spacing Combining   Me Mark, Enclosing
%%     N  Number, Any               Nd Number, Decimal Digit
%%     Nl Number, Letter            No Number, Other
%%     P  Punctuation, Any          Pc Punctuation, Connector
%%     Pd Punctuation, Dash         Ps Punctuation, Open
%%     Pe Punctuation, Close        Po Punctuation, Other
%%     Pi Punctuation, Initial quote (may behave like Ps or Pe,
%%        depending on usage)
%%     Pf Punctuation, Final quote (may behave like Ps or Pe,
%%        depending on usage)
%%     S  Symbol, Any               Sm Symbol, Math
%%     Sc Symbol, Currency          Sk Symbol, Modifier
%%     So Symbol, Other
%%     Z  Separator, Any            Zs Separator, Space
%%     Zl Separator, Line           Zp Separator, Paragraph
%%     C  Other, Any                Cc Other, Control
%%     Cf Other, Format             Co Other, Private Use
%%     Cs Other, Surrogate (not supported by schema recommendation)
%%     Cn Other, Not assigned (no characters in the file have this
%%        property)
setup([A|S], Acc) -> setup(S, [A|Acc]);
setup([], Acc) -> lists:reverse([$$|Acc]).

%% sh_to_awk(ShellRegExp)
%%  Convert a sh style regexp into a full AWK one. The main difficulty is
%%  getting character sets right as the conventions are different.

sh_to_awk(Sh) -> "^(" ++ sh_to_awk_1(Sh).        %Fix the beginning

%% sh_to_awk_1/1 translates text outside a bracket expression.
sh_to_awk_1([$*|Sh]) ->                          %This matches any string
    ".*" ++ sh_to_awk_1(Sh);
sh_to_awk_1([$?|Sh]) ->                          %This matches any character
    [$.|sh_to_awk_1(Sh)];
sh_to_awk_1([$[,$^,$]|Sh]) ->                    %This takes careful handling
    "\\^" ++ sh_to_awk_1(Sh);
%% Must move '^' to end.
sh_to_awk_1("[^" ++ Sh) -> [$[|sh_to_awk_2(Sh, true)];
sh_to_awk_1("[!" ++ Sh) -> "[^" ++ sh_to_awk_2(Sh, false);
sh_to_awk_1([$[|Sh]) -> [$[|sh_to_awk_2(Sh, false)];
sh_to_awk_1([C|Sh]) ->
    %% Unspecialise everything else which is not an escape character.
    case sh_special_char(C) of
        true -> [$\\,C|sh_to_awk_1(Sh)];
        false -> [C|sh_to_awk_1(Sh)]
    end;
sh_to_awk_1([]) -> ")$".                         %Fix the end

%% sh_to_awk_2/2 handles the first character of a bracket expression,
%% where a `]' is copied as an ordinary member. UpArrow is true when a
%% leading `^' was removed and has to be re-inserted at the end.
sh_to_awk_2([$]|Sh], UpArrow) -> [$]|sh_to_awk_3(Sh, UpArrow)];
sh_to_awk_2(Sh, UpArrow) -> sh_to_awk_3(Sh, UpArrow).

%% sh_to_awk_3/2 copies the rest of a bracket expression up to the
%% closing `]' (or the end of input), re-inserting the `^' if needed.
sh_to_awk_3([$]|Sh], true) -> "^]" ++ sh_to_awk_1(Sh);
sh_to_awk_3([$]|Sh], false) -> [$]|sh_to_awk_1(Sh)];
sh_to_awk_3([C|Sh], UpArrow) -> [C|sh_to_awk_3(Sh, UpArrow)];
sh_to_awk_3([], true) -> [$^|sh_to_awk_1([])];
sh_to_awk_3([], false) -> sh_to_awk_1([]).

%% -type sh_special_char(char()) -> bool().
%%  Test if a character is a special character.

sh_special_char($|) -> true;
sh_special_char($*) -> true;
sh_special_char($+) -> true;
sh_special_char($?) -> true;
sh_special_char($() -> true;
sh_special_char($)) -> true;
sh_special_char($\\) -> true;
sh_special_char($^) -> true;
sh_special_char($$) -> true;
sh_special_char($.) -> true;
sh_special_char($[) -> true;
sh_special_char($]) -> true;
sh_special_char($") -> true;
sh_special_char(_C) -> false.

%% parse(RegExp) -> {ok,RE} | {error,E}.
%%  Parse the regexp described in the string RegExp. Succeeds only if
%%  reg/2 consumes the whole string; a left-over character is reported
%%  as {illegal,[Char]}.
%%
%%  NOTE(review): reg/2 is defined outside this part of the file; the
%%  {error,E} clause presumably matches a term thrown by it. The
%%  old-style `catch' is kept so that this mapping is unchanged.

parse(S) ->
    case catch reg(S, 0) of
        {R,Sc,[]} -> {ok,{regexp,{R,Sc}}};
        {_R,_Sc,[C|_]} -> {error,{illegal,[C]}};
        {error,E} -> {error,E}
    end.

%% format_error(Error) -> String.
%%  Turn an error term into a (deep) list describing it.

format_error({interval_range,What}) ->
    ["illegal interval range ",io_lib:write_string(What)];
format_error({illegal,What}) -> ["illegal character `",What,"'"];
format_error({unterminated,What}) -> ["unterminated `",What,"'"];
format_error({posix_cc,What}) ->
    ["illegal POSIX character class ",io_lib:write_string(What)];
format_error({char_class,What}) ->
    ["illegal character class ",io_lib:write_string(What)].

%% match(String, RegExp) -> {match,Start,Length} | nomatch | {error,E}.
%%  Find the longest match of RegExp in String. RegExp is either a
%%  string, which is parsed first, or an already parsed {regexp,RE} /
%%  compiled {comp_regexp,RE} term.

match(S, RegExp) when is_list(RegExp) ->
    case parse(RegExp) of
        {ok,RE} -> match(S, RE);
        {error,E} -> {error,E}
    end;
match(S, {regexp,RE}) ->
    %% The search starts with length -1, meaning "nothing found yet".
    case match_re(RE, S, 1, 0, -1) of
        {Start,Len} when Len >= 0 ->
            {match,Start,Len};
        {_Start,_Len} -> nomatch
    end;
match(S, {comp_regexp,RE}) ->
    case match_comp(RE, S, 1, 0, -1) of
        {Start,Len} when Len >= 0 ->
            {match,Start,Len};
        {_Start,_Len} -> nomatch
    end.

%% match_re(RE, String, Pos, BestStart, BestLen) -> {Start,Len}.
%%  Try RE at every start position of String, keeping the longest
%%  match seen so far. The comparison is strict, so among matches of
%%  equal length the earliest one is kept.

match_re(RE, [_|Cs]=S0, P0, Mst, Mlen) ->
    case re_apply(S0, P0, RE) of
        {match,P1,_S1,_Subs} ->
            Len = P1-P0,
            if Len > Mlen -> match_re(RE, Cs, P0+1, P0, Len);
               true -> match_re(RE, Cs, P0+1, Mst, Mlen)
            end;
        nomatch -> match_re(RE, Cs, P0+1, Mst, Mlen);
        never_match -> {Mst,Mlen}                %No need to go on
    end;
match_re(_RE, _S, _P, Mst, Mlen) -> {Mst,Mlen}.

%% match_comp/5 is the same search as match_re/5, for a compiled
%% regexp applied with comp_apply/3.

match_comp(RE, [_|Cs]=S0, P0, Mst, Mlen) ->
    case comp_apply(S0, P0, RE) of
        {match,P1,_S1} ->
            Len = P1-P0,
            if Len > Mlen -> match_comp(RE, Cs, P0+1, P0, Len);
               true -> match_comp(RE, Cs, P0+1, Mst, Mlen)
            end;
        nomatch -> match_comp(RE, Cs, P0+1, Mst, Mlen)
    end;
match_comp(_RE, _S, _P, Mst, Mlen) -> {Mst,Mlen}.

%% first_match(String, RegExp) -> {match,Start,Length} | nomatch | {error,E}.
%%  Find the first match of RegExp in String.

first_match(S, RegExp) when is_list(RegExp) ->
    case parse(RegExp) of
        {ok,RE} -> first_match(S, RE);
        {error,E} -> {error,E}
    end;
first_match(S, {regexp,RE}) ->
    case first_match_re(RE, S, 1) of
        {Start,Len,_} -> {match,Start,Len};
        nomatch -> nomatch
    end;
first_match(S, {comp_regexp,RE}) ->
    case first_match_comp(RE, S, 1) of
        {Start,Len} -> {match,Start,Len};
        nomatch -> nomatch
    end.

%% first_match_re(RE, String, Pos) -> {Start,Len,Subs} | nomatch.
%%  Step through String from position Pos and return the first place
%%  where RE matches, together with the sub-matches from re_apply/3.

first_match_re(RE, S, St) when S /= [] ->
    case re_apply(S, St, RE) of
        {match,P,_Rest,Subs} -> {St,P-St,Subs};
        nomatch -> first_match_re(RE, tl(S), St+1);
        never_match -> nomatch
    end;
first_match_re(_RE, [], _St) -> nomatch.

%% first_match_comp(RE, String, Pos) -> {Start,Len} | nomatch.
%%  As first_match_re/3, for a compiled regexp.

first_match_comp(RE, S, St) when S /= [] ->
    case comp_apply(S, St, RE) of
        {match,P,_Rest} -> {St,P-St};
        nomatch -> first_match_comp(RE, tl(S), St+1)
    end;
first_match_comp(_RE, [], _St) -> nomatch.

%% matches(String, RegExp) -> {match,[{Start,Length}]} | {error,E}.
%%  Return all the non-overlapping matches of RegExp in String.

matches(S, RegExp) when is_list(RegExp) ->
    case parse(RegExp) of
        {ok,RE} -> matches(S, RE);
12 3 4 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -