xmerl_ucs.erl

来自「OTP是开放电信平台的简称」· ERL 代码 · 共 556 行 · 第 1/2 页

ERL
556
字号
%%% UTF-16 support
%%% Possible errors encoding UTF-16
%%%   - Non-character values (something other than 0 .. 2^31-1)
%%%   - Surrogate-pair code in string.
%%%   - 16#FFFE or 16#FFFF character in string.
%%% NB: the UCS replacement char (U+FFFD) will be quietly substituted
%%% for unrepresentable chars (i.e. those geq to 2^20+2^16).
%%% Possible errors decoding UTF-16:
%%%   - Unmatched surrogate-pair code in string.
%%%   - 16#FFFE or 16#FFFF character in string.

%% char_to_utf16be(Ch) -> [Byte]
%%  Encode one code point as big-endian UTF-16 bytes.
%%  Returns two bytes for 0..16#FFFD (excluding surrogates), four bytes
%%  (a surrogate pair) for 16#10000..16#10FFFF, and the bytes of U+FFFD
%%  for 16#110000..16#7FFFFFFF.
%%  Surrogate codes, 16#FFFE, 16#FFFF and values above 16#7FFFFFFF match
%%  no `if' branch and therefore raise if_clause.
char_to_utf16be(Ch) when is_integer(Ch), Ch >= 0 ->
    if
        Ch =< 16#FFFF ->
            if
                Ch < 16#D800; Ch >= 16#E000, Ch < 16#FFFE ->
                    [Ch bsr 8, Ch band 16#FF]
            end;
        Ch < 16#110000 ->
            %% Encode with surrogate pair: X is the 20-bit offset; its top
            %% 10 bits go into the high surrogate, the low 10 bits into
            %% the low surrogate.
            X = Ch - 16#10000,
            [16#D8 + (X bsr 18),
             (X bsr 10) band 16#FF,
             16#DC + ((X bsr 8) band 3),
             X band 16#FF];
        Ch =< 16#7FFFFFFF ->
            %% Unrepresentable char: use REPLACEMENT CHARACTER (U+FFFD)
            [16#FF, 16#FD]
    end.

%% from_utf16be(Binary, Acc, Tail) -> [UnicodeChar] | {error,not_utf16be}
%%  Decode big-endian UTF-16. Acc holds the code points decoded so far in
%%  reverse order; Tail is appended after them in the result.
%%  A 16#FFFE or 16#FFFF unit raises if_clause; any other malformed input
%%  (odd trailing byte, unmatched surrogate) is printed and reported as
%%  {error,not_utf16be}.
from_utf16be(<<Ch:16/big-unsigned-integer, Rest/binary>>, Acc, Tail)
  when Ch < 16#D800; Ch > 16#DFFF ->
    if
        Ch < 16#FFFE ->
            from_utf16be(Rest, [Ch|Acc], Tail)
    end;
from_utf16be(<<Hi:16/big-unsigned-integer, Lo:16/big-unsigned-integer,
               Rest/binary>>, Acc, Tail)
  when Hi >= 16#D800, Hi < 16#DC00, Lo >= 16#DC00, Lo =< 16#DFFF ->
    %% Surrogate pair
    Ch = ((Hi band 16#3FF) bsl 10) + (Lo band 16#3FF) + 16#10000,
    from_utf16be(Rest, [Ch|Acc], Tail);
from_utf16be(<<>>, Acc, Tail) ->
    lists:reverse(Acc, Tail);
from_utf16be(Bin, Acc, Tail) ->
    io:format("ucs Error: Bin=~p~n     Acc=~p~n     Tail=~p~n",[Bin,Acc,Tail]),
    {error,not_utf16be}.

%% char_to_utf16le(Ch) -> [Byte]
%%  Little-endian counterpart of char_to_utf16be/1: the same 16-bit units
%%  with the two bytes of each unit swapped. Raises if_clause for the same
%%  inputs.
char_to_utf16le(Ch) when is_integer(Ch), Ch >= 0 ->
    if
        Ch =< 16#FFFF ->
            if
                Ch < 16#D800; Ch >= 16#E000, Ch < 16#FFFE ->
                    [Ch band 16#FF, Ch bsr 8]
            end;
        Ch < 16#110000 ->
            %% Encode with surrogate pair
            X = Ch - 16#10000,
            [(X bsr 10) band 16#FF,
             16#D8 + (X bsr 18),
             X band 16#FF,
             16#DC + ((X bsr 8) band 3)];
        Ch =< 16#7FFFFFFF ->
            %% Unrepresentable char: use REPLACEMENT CHARACTER (U+FFFD)
            [16#FD, 16#FF]
    end.

%% from_utf16le(Binary, Acc, Tail) -> [UnicodeChar] | {error,not_utf16le}
%%  Little-endian counterpart of from_utf16be/3, with the same accumulator
%%  convention and the same failure behaviour.
from_utf16le(<<Ch:16/little-unsigned-integer, Rest/binary>>, Acc, Tail)
  when Ch < 16#D800; Ch > 16#DFFF ->
    if
        Ch < 16#FFFE ->
            from_utf16le(Rest, [Ch|Acc], Tail)
    end;
from_utf16le(<<Hi:16/little-unsigned-integer, Lo:16/little-unsigned-integer,
               Rest/binary>>, Acc, Tail)
  when Hi >= 16#D800, Hi < 16#DC00, Lo >= 16#DC00, Lo =< 16#DFFF ->
    %% Surrogate pair
    Ch = ((Hi band 16#3FF) bsl 10) + (Lo band 16#3FF) + 16#10000,
    from_utf16le(Rest, [Ch|Acc], Tail);
from_utf16le(<<>>, Acc, Tail) ->
    lists:reverse(Acc, Tail);
from_utf16le(Bin, Acc, Tail) ->
    io:format("ucs Error: Bin=~p~n     Acc=~p~n     Tail=~p~n",[Bin,Acc,Tail]),
    {error,not_utf16le}.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% UTF-8 support
%%% Possible errors encoding UTF-8:
%%%   - Non-character values (something other than 0 .. 2^31-1).
%%%   - Surrogate pair code in string.
%%%   - 16#FFFE or 16#FFFF character in string.
%%% Possible errors decoding UTF-8:
%%%   - 10xxxxxx or 1111111x as initial byte.
%%%   - Insufficient number of 10xxxxxx octets following an initial octet of
%%%     multi-octet sequence.
%%%   - Non-canonical encoding used.
%%%   - Surrogate-pair code encoded as UTF-8.
%%%   - 16#FFFE or 16#FFFF character in string.

%% char_to_utf8(Ch) -> [Byte]
%%  Encode one code point in 0..16#7FFFFFFF as a UTF-8 sequence of one to
%%  six bytes. Surrogate codes, 16#FFFE, 16#FFFF and values of 16#80000000
%%  or more match no `if' branch and therefore raise if_clause.
char_to_utf8(Ch) when is_integer(Ch), Ch >= 0 ->
    if
        Ch < 128 ->
            %% 0yyyyyyy
            [Ch];
        Ch < 16#800 ->
            %% 110xxxxy 10yyyyyy
            [16#C0 + (Ch bsr 6),
             128 + (Ch band 16#3F)];
        Ch < 16#10000 ->
            %% 1110xxxx 10xyyyyy 10yyyyyy
            if
                Ch < 16#D800; Ch > 16#DFFF, Ch < 16#FFFE ->
                    [16#E0 + (Ch bsr 12),
                     128 + ((Ch bsr 6) band 16#3F),
                     128 + (Ch band 16#3F)]
            end;
        Ch < 16#200000 ->
            %% 11110xxx 10xxyyyy 10yyyyyy 10yyyyyy
            [16#F0 + (Ch bsr 18),
             128 + ((Ch bsr 12) band 16#3F),
             128 + ((Ch bsr 6) band 16#3F),
             128 + (Ch band 16#3F)];
        Ch < 16#4000000 ->
            %% 111110xx 10xxxyyy 10yyyyyy 10yyyyyy 10yyyyyy
            [16#F8 + (Ch bsr 24),
             128 + ((Ch bsr 18) band 16#3F),
             128 + ((Ch bsr 12) band 16#3F),
             128 + ((Ch bsr 6) band 16#3F),
             128 + (Ch band 16#3F)];
        Ch < 16#80000000 ->
            %% 1111110x 10xxxxyy 10yyyyyy 10yyyyyy 10yyyyyy 10yyyyyy
            [16#FC + (Ch bsr 30),
             128 + ((Ch bsr 24) band 16#3F),
             128 + ((Ch bsr 18) band 16#3F),
             128 + ((Ch bsr 12) band 16#3F),
             128 + ((Ch bsr 6) band 16#3F),
             128 + (Ch band 16#3F)]
    end.

%% expand_utf8([Byte]) -> {[UnicodeChar],NumberOfBadBytes}
%%  Expand UTF8 byte sequences to ISO 10646/Unicode
%%  characters. Any illegal bytes are removed and the number of
%%  bad bytes are returned.
%%
%%  Reference:
%%     RFC 3629: "UTF-8, a transformation format of ISO 10646".
expand_utf8(Str) ->
    expand_utf8_1(Str, [], 0).

%% expand_utf8_1(Bytes, Acc, Bad) -> {[UnicodeChar],Bad1}
%%  Worker for expand_utf8/1. Acc collects decoded characters in reverse
%%  order; Bad counts rejections. A well-formed sequence whose value is
%%  below the minimum for its length (non-canonical encoding) is dropped
%%  as a whole and counted once; any other unexpected byte is skipped
%%  individually and counted once.
expand_utf8_1([C|Cs], Acc, Bad) when C < 16#80 ->
    %% Plain Ascii character.
    expand_utf8_1(Cs, [C|Acc], Bad);
expand_utf8_1([C1,C2|Cs], Acc, Bad) when C1 band 16#E0 =:= 16#C0,
                                         C2 band 16#C0 =:= 16#80 ->
    case ((C1 band 16#1F) bsl 6) bor (C2 band 16#3F) of
        C when 16#80 =< C ->
            expand_utf8_1(Cs, [C|Acc], Bad);
        _ ->
            %% Bad range.
            expand_utf8_1(Cs, Acc, Bad+1)
    end;
expand_utf8_1([C1,C2,C3|Cs], Acc, Bad) when C1 band 16#F0 =:= 16#E0,
                                            C2 band 16#C0 =:= 16#80,
                                            C3 band 16#C0 =:= 16#80 ->
    case ((((C1 band 16#0F) bsl 6) bor (C2 band 16#3F)) bsl 6) bor
        (C3 band 16#3F) of
        C when 16#800 =< C ->
            expand_utf8_1(Cs, [C|Acc], Bad);
        _ ->
            %% Bad range.
            expand_utf8_1(Cs, Acc, Bad+1)
    end;
expand_utf8_1([C1,C2,C3,C4|Cs], Acc, Bad) when C1 band 16#F8 =:= 16#F0,
                                               C2 band 16#C0 =:= 16#80,
                                               C3 band 16#C0 =:= 16#80,
                                               C4 band 16#C0 =:= 16#80 ->
    %% The guard already forces bit 3 of C1 to zero, so masking with
    %% 16#0F keeps only the three payload bits.
    case ((((((C1 band 16#0F) bsl 6) bor (C2 band 16#3F)) bsl 6) bor
        (C3 band 16#3F)) bsl 6) bor (C4 band 16#3F) of
        C when 16#10000 =< C ->
            expand_utf8_1(Cs, [C|Acc], Bad);
        _ ->
            %% Bad range.
            expand_utf8_1(Cs, Acc, Bad+1)
    end;
expand_utf8_1([_|Cs], Acc, Bad) ->
    %% Ignore bad character.
    expand_utf8_1(Cs, Acc, Bad+1);
expand_utf8_1([], Acc, Bad) ->
    {lists:reverse(Acc),Bad}.

%%% ----------------------------------------------------------------------------
%%% Translation to/from any IANA defined character set, given that a mapping
%%% exists. Don't care about validating valid subsets of Unicode

%% to_unicode(Input, Charset) -> Result
%%  Convert Input from the character set named by the atom Charset.
%%  For the US-ASCII, UTF-1, ISO 646 basic and ISO 8859-1 aliases the input
%%  is returned unchanged; the UCS-2, UCS-4, UTF-16 and UTF-8 names are
%%  handed to the matching from_* decoder. Any other Charset makes the
%%  calling process exit with {bad_character_code,Input,Charset}.
to_unicode(Input,Cs) when Cs=='ansi_x3.4-1968';Cs=='iso-ir-6';
                          Cs=='ansi_x3.4-1986';Cs=='iso_646.irv:1991';
                          Cs=='ascii';Cs=='iso646-us';Cs=='us-ascii';Cs=='us';
                          Cs=='ibm367';Cs=='cp367';Cs=='csascii' -> % US-ASCII
    Input;
to_unicode(Input,Cs) when Cs=='iso-10646-utf-1';Cs=='csiso10646utf1' ->
    Input;
to_unicode(Input,Cs) when Cs=='iso_646.basic:1983';Cs=='ref';
                          Cs=='csiso646basic1983' ->
    Input;
to_unicode(Input,Cs) when Cs=='iso_8859-1:1987';Cs=='iso-ir-100';
                          Cs=='iso_8859-1';Cs=='iso-8859-1';Cs=='latin1';
                          Cs=='l1';Cs=='ibm819';
                          Cs=='cp819';Cs=='csisolatin1' ->
    Input;
% to_unicode(Input,Cs) when Cs=='mnemonic';Cs=='"mnemonic+ascii+38';
%                           Cs=='mnem';Cs=='"mnemonic+ascii+8200' ->
%     from_mnemonic(Input);
to_unicode(Input,Cs) when Cs=='iso-10646-ucs-2';Cs=='csunicode' ->
    from_ucs2be(Input); % Guess byteorder
to_unicode(Input,Cs) when Cs=='iso-10646-ucs-4';Cs=='csucs4' ->
    from_ucs4be(Input); % Guess byteorder
to_unicode(Input,Cs) when Cs=='utf-16be';Cs=='utf-16' ->
    from_utf16be(Input);
to_unicode(Input,'utf-16le') ->
    from_utf16le(Input);
to_unicode(Input,'utf-8') ->
    from_utf8(Input);
to_unicode(Input,Charset) ->
    exit({bad_character_code,Input,Charset}).
    %ucs_data:to_unicode(Input,Charset).

%%% Tests if Char is in Charset.
%%% Do this by trying to convert it into unicode, if possible a mapping was
%%% found and we are ok.

%% is_incharset(In, Charset) -> true | false | {error,unsupported_charset}
%%  In is either a single character (integer) or a list of characters.
%%  For the US-ASCII, UTF-1, ISO 646 basic and ISO 8859-1 aliases the
%%  matching is_* predicate is applied to the character, or to every
%%  element of the list. For every other Charset the input is run through
%%  to_unicode/2 and the outcome of that conversion decides the answer.
%%  An In that is neither an integer nor a list raises if_clause (aliased
%%  charsets) or function_clause (all others).
is_incharset(In,Cs) when Cs=='ansi_x3.4-1968';Cs=='iso-ir-6';
                         Cs=='ansi_x3.4-1986';Cs=='iso_646.irv:1991';
                         Cs=='ascii';Cs=='iso646-us';Cs=='us-ascii';Cs=='us';
                         Cs=='ibm367';Cs=='cp367';Cs=='csascii' -> % US-ASCII
    if
        is_integer(In) -> is_ascii(In);
        is_list(In) -> test_charset(fun is_ascii/1, In)
    end;
is_incharset(In,Cs) when Cs=='iso-10646-utf-1';Cs=='csiso10646utf1' ->
    if
        is_integer(In) -> is_unicode(In);
        is_list(In) -> test_charset(fun is_unicode/1, In)
    end;
is_incharset(In,Cs) when Cs=='iso_646.basic:1983';Cs=='ref';
                         Cs=='csiso646basic1983' ->
    if
        is_integer(In) -> is_iso646_basic(In);
        is_list(In) -> test_charset(fun is_iso646_basic/1, In)
    end;
is_incharset(In,Cs) when Cs=='iso_8859-1:1987';Cs=='iso-ir-100';
                         Cs=='iso_8859-1';Cs=='iso-8859-1';
                         Cs=='latin1';Cs=='l1';Cs=='ibm819';
                         Cs=='cp819';Cs=='csisolatin1' ->
    if
        is_integer(In) -> is_latin1(In);
        is_list(In) -> test_charset(fun is_latin1/1, In)
    end;
is_incharset(In,Charset) when is_integer(In) ->
    case to_unicode([In],Charset) of
        %% NOTE(review): to_unicode/2 exits on unknown charsets instead of
        %% returning this tuple; the branch is kept in case a decoder
        %% defined outside this section produces it -- confirm.
        {error,unsupported_charset} ->
            {error,unsupported_charset};
        {error,_} ->
            false;
        [Int] when is_integer(Int) ->
            true
    end;
is_incharset(In,Charset) when is_list(In) ->
    case to_unicode(In,Charset) of
        {error,unsupported_charset} ->
            {error,unsupported_charset};
        {error,_} ->
            false;
        %% Any successfully converted list counts as "in charset". Matching
        %% only a one-element list here made every multi-character or empty
        %% input crash with case_clause.
        Converted when is_list(Converted) ->
            true
    end.

%% test_charset(Fun, Input) -> true | false
%%  True when the predicate Fun holds for every element of the list Input.
test_charset(Fun,Input) ->
    lists:all(Fun, Input).

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?