xmerl_ucs.erl
来自「OTP是开放电信平台的简称」· ERL 代码 · 共 556 行 · 第 1/2 页
ERL
556 行
%%% UTF-16 support
%%% Possible errors encoding UTF-16
%%% - Non-character values (something other than 0 .. 2^31-1)
%%% - Surrogate-pair code in string.
%%% - 16#FFFE or 16#FFFF character in string.
%%% NB: the UCS replacement char (U+FFFD) will be quietly substituted
%%% for unrepresentable chars (i.e. those geq to 2^20+2^16).
%%% Possible errors decoding UTF-16:
%%% - Unmatched surrogate-pair code in string.
%%% - 16#FFFE or 16#FFFF character in string.

%% char_to_utf16be(Ch) -> [Byte]
%%  Encodes one non-negative integer code point as big-endian UTF-16 bytes.
%%  Code points in 16#10000..16#10FFFF become a surrogate pair (4 bytes);
%%  values in 16#110000..16#7FFFFFFF are replaced by U+FFFD.
%%  Fails with if_clause for surrogate code points, 16#FFFE, 16#FFFF and
%%  values above 16#7FFFFFFF (no branch of the `if' matches them).
char_to_utf16be(Ch) when is_integer(Ch), Ch >= 0 ->
    if
        Ch =< 16#FFFF ->
            if
                Ch < 16#D800; Ch >= 16#E000, Ch < 16#FFFE ->
                    [Ch bsr 8, Ch band 16#FF]
            end;
        Ch < 16#110000 ->
            %% Encode with surrogate pair
            X = Ch - 16#10000,
            [16#D8 + (X bsr 18),
             (X bsr 10) band 16#FF,
             16#DC + ((X bsr 8) band 3),
             X band 16#FF];
        Ch =< 16#7FFFFFFF ->
            %% Unrepresentable char: use REPLACEMENT CHARACTER (U+FFFD)
            [16#FF, 16#FD]
    end.

%% from_utf16be(Bin, Acc, Tail) -> [UnicodeChar] | {error,not_utf16be}
%%  Decodes a big-endian UTF-16 binary. Acc holds the characters decoded so
%%  far in reverse order; Tail is appended after the decoded characters.
%%  A 16#FFFE or 16#FFFF unit fails with if_clause; any other input that
%%  matches none of the decoding clauses is printed with io:format/2 and
%%  reported as {error,not_utf16be}.
from_utf16be(<<Ch:16/big-unsigned-integer, Rest/binary>>, Acc, Tail)
  when Ch < 16#D800; Ch > 16#DFFF ->
    if
        Ch < 16#FFFE ->
            from_utf16be(Rest, [Ch|Acc], Tail)
    end;
from_utf16be(<<Hi:16/big-unsigned-integer, Lo:16/big-unsigned-integer,
               Rest/binary>>, Acc, Tail)
  when Hi >= 16#D800, Hi < 16#DC00, Lo >= 16#DC00, Lo =< 16#DFFF ->
    %% Surrogate pair
    Ch = ((Hi band 16#3FF) bsl 10) + (Lo band 16#3FF) + 16#10000,
    from_utf16be(Rest, [Ch|Acc], Tail);
from_utf16be(<<>>, Acc, Tail) ->
    lists:reverse(Acc, Tail);
from_utf16be(Bin, Acc, Tail) ->
    io:format("ucs Error: Bin=~p~n     Acc=~p~n     Tail=~p~n",[Bin,Acc,Tail]),
    {error,not_utf16be}.

%% char_to_utf16le(Ch) -> [Byte]
%%  Little-endian counterpart of char_to_utf16be/1: the same 16-bit units
%%  with the two bytes of each unit swapped, and the same failure cases.
char_to_utf16le(Ch) when is_integer(Ch), Ch >= 0 ->
    if
        Ch =< 16#FFFF ->
            if
                Ch < 16#D800; Ch >= 16#E000, Ch < 16#FFFE ->
                    [Ch band 16#FF, Ch bsr 8]
            end;
        Ch < 16#110000 ->
            %% Encode with surrogate pair
            X = Ch - 16#10000,
            [(X bsr 10) band 16#FF,
             16#D8 + (X bsr 18),
             X band 16#FF,
             16#DC + ((X bsr 8) band 3)];
        Ch =< 16#7FFFFFFF ->
            %% Unrepresentable char: use REPLACEMENT CHARACTER (U+FFFD)
            [16#FD, 16#FF]
    end.

%% from_utf16le(Bin, Acc, Tail) -> [UnicodeChar] | {error,not_utf16le}
%%  Little-endian counterpart of from_utf16be/3, with the same accumulator
%%  and tail conventions and the same failure behaviour.
from_utf16le(<<Ch:16/little-unsigned-integer, Rest/binary>>, Acc, Tail)
  when Ch < 16#D800; Ch > 16#DFFF ->
    if
        Ch < 16#FFFE ->
            from_utf16le(Rest, [Ch|Acc], Tail)
    end;
from_utf16le(<<Hi:16/little-unsigned-integer, Lo:16/little-unsigned-integer,
               Rest/binary>>, Acc, Tail)
  when Hi >= 16#D800, Hi < 16#DC00, Lo >= 16#DC00, Lo =< 16#DFFF ->
    %% Surrogate pair
    Ch = ((Hi band 16#3FF) bsl 10) + (Lo band 16#3FF) + 16#10000,
    from_utf16le(Rest, [Ch|Acc], Tail);
from_utf16le(<<>>, Acc, Tail) ->
    lists:reverse(Acc, Tail);
from_utf16le(Bin, Acc, Tail) ->
    io:format("ucs Error: Bin=~p~n     Acc=~p~n     Tail=~p~n",[Bin,Acc,Tail]),
    {error,not_utf16le}.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% UTF-8 support
%%% Possible errors encoding UTF-8:
%%% - Non-character values (something other than 0 .. 2^31-1).
%%% - Surrogate pair code in string.
%%% - 16#FFFE or 16#FFFF character in string.
%%% Possible errors decoding UTF-8:
%%% - 10xxxxxx or 1111111x as initial byte.
%%% - Insufficient number of 10xxxxxx octets following an initial octet of
%%%   multi-octet sequence.
%%% - Non-canonical encoding used.
%%% - Surrogate-pair code encoded as UTF-8.
%%% - 16#FFFE or 16#FFFF character in string.

%% char_to_utf8(Ch) -> [Byte]
%%  Encodes one non-negative integer below 16#80000000 as 1 to 6 UTF-8
%%  bytes (the 5- and 6-byte forms cover values above 16#1FFFFF).
%%  Fails with if_clause for surrogate code points, 16#FFFE, 16#FFFF and
%%  values of 16#80000000 or more.
char_to_utf8(Ch) when is_integer(Ch), Ch >= 0 ->
    if
        Ch < 128 ->
            %% 0yyyyyyy
            [Ch];
        Ch < 16#800 ->
            %% 110xxxxy 10yyyyyy
            [16#C0 + (Ch bsr 6),
             128+(Ch band 16#3F)];
        Ch < 16#10000 ->
            %% 1110xxxx 10xyyyyy 10yyyyyy
            if
                Ch < 16#D800; Ch > 16#DFFF, Ch < 16#FFFE ->
                    [16#E0 + (Ch bsr 12),
                     128+((Ch bsr 6) band 16#3F),
                     128+(Ch band 16#3F)]
            end;
        Ch < 16#200000 ->
            %% 11110xxx 10xxyyyy 10yyyyyy 10yyyyyy
            [16#F0+(Ch bsr 18),
             128+((Ch bsr 12) band 16#3F),
             128+((Ch bsr 6) band 16#3F),
             128+(Ch band 16#3F)];
        Ch < 16#4000000 ->
            %% 111110xx 10xxxyyy 10yyyyyy 10yyyyyy 10yyyyyy
            [16#F8+(Ch bsr 24),
             128+((Ch bsr 18) band 16#3F),
             128+((Ch bsr 12) band 16#3F),
             128+((Ch bsr 6) band 16#3F),
             128+(Ch band 16#3F)];
        Ch < 16#80000000 ->
            %% 1111110x 10xxxxyy 10yyyyyy 10yyyyyy 10yyyyyy 10yyyyyy
            [16#FC+(Ch bsr 30),
             128+((Ch bsr 24) band 16#3F),
             128+((Ch bsr 18) band 16#3F),
             128+((Ch bsr 12) band 16#3F),
             128+((Ch bsr 6) band 16#3F),
             128+(Ch band 16#3F)]
    end.

%%
%% expand_utf8([Byte]) -> {[UnicodeChar],NumberOfBadBytes}
%%  Expand UTF8 byte sequences to ISO 10646/Unicode
%%  characters. Any illegal bytes are removed and the number of
%%  bad bytes are returned.
%%
%%  The bad counter is incremented once for every byte that cannot start
%%  a well-formed sequence, and once for every complete sequence that is
%%  rejected because of the value it decodes to: an overlong
%%  (non-canonical) form, a surrogate code point (16#D800..16#DFFF) or a
%%  value above 16#10FFFF. A rejected sequence is dropped as a whole.
%%
%%  Reference:
%%     RFC 3629: "UTF-8, a transformation format of ISO 10646".
expand_utf8(Str) ->
    expand_utf8_1(Str, [], 0).

%% expand_utf8_1(Bytes, Acc, Bad) -> {[UnicodeChar],Bad}
%%  Worker for expand_utf8/1. Acc collects decoded characters in reverse
%%  order; Bad is the running count described above.
expand_utf8_1([C|Cs], Acc, Bad) when C < 16#80 ->
    %% Plain Ascii character.
    expand_utf8_1(Cs, [C|Acc], Bad);
expand_utf8_1([C1,C2|Cs], Acc, Bad) when C1 band 16#E0 =:= 16#C0,
                                         C2 band 16#C0 =:= 16#80 ->
    case ((C1 band 16#1F) bsl 6) bor (C2 band 16#3F) of
        C when 16#80 =< C ->
            expand_utf8_1(Cs, [C|Acc], Bad);
        _ ->
            %% Bad range.
            expand_utf8_1(Cs, Acc, Bad+1)
    end;
expand_utf8_1([C1,C2,C3|Cs], Acc, Bad) when C1 band 16#F0 =:= 16#E0,
                                            C2 band 16#C0 =:= 16#80,
                                            C3 band 16#C0 =:= 16#80 ->
    case ((((C1 band 16#0F) bsl 6) bor (C2 band 16#3F)) bsl 6) bor
        (C3 band 16#3F) of
        %% Canonical three-byte value that is not a surrogate code point
        %% (RFC 3629 prohibits encoding 16#D800..16#DFFF).
        C when 16#800 =< C, C < 16#D800; C > 16#DFFF ->
            expand_utf8_1(Cs, [C|Acc], Bad);
        _ ->
            %% Bad range.
            expand_utf8_1(Cs, Acc, Bad+1)
    end;
expand_utf8_1([C1,C2,C3,C4|Cs], Acc, Bad) when C1 band 16#F8 =:= 16#F0,
                                               C2 band 16#C0 =:= 16#80,
                                               C3 band 16#C0 =:= 16#80,
                                               C4 band 16#C0 =:= 16#80 ->
    case ((((((C1 band 16#0F) bsl 6) bor (C2 band 16#3F)) bsl 6) bor
           (C3 band 16#3F)) bsl 6) bor (C4 band 16#3F) of
        %% Canonical four-byte value inside the Unicode range; RFC 3629
        %% limits UTF-8 to 16#10FFFF.
        C when 16#10000 =< C, C =< 16#10FFFF ->
            expand_utf8_1(Cs, [C|Acc], Bad);
        _ ->
            %% Bad range.
            expand_utf8_1(Cs, Acc, Bad+1)
    end;
expand_utf8_1([_|Cs], Acc, Bad) ->
    %% Ignore bad character.
    expand_utf8_1(Cs, Acc, Bad+1);
expand_utf8_1([], Acc, Bad) ->
    {lists:reverse(Acc),Bad}.

%%% ----------------------------------------------------------------------------
%%% Translation to/from any IANA defined character set, given that a mapping
%%% exists.
%%% Don't care about validating valid subsets of Unicode

%% to_unicode(Input, Charset) -> Unicode
%%  Converts Input, encoded in the character set named by the atom
%%  Charset, to Unicode. For the ASCII, UTF-1, ISO 646 basic and
%%  Latin-1 families Input is returned unchanged; the UCS and UTF
%%  families are handed to the matching from_* decoder. Any other
%%  Charset terminates the caller with
%%  exit({bad_character_code,Input,Charset}).
to_unicode(Input, Charset) ->
    case Charset of
        %% US-ASCII
        'ansi_x3.4-1968'     -> Input;
        'iso-ir-6'           -> Input;
        'ansi_x3.4-1986'     -> Input;
        'iso_646.irv:1991'   -> Input;
        'ascii'              -> Input;
        'iso646-us'          -> Input;
        'us-ascii'           -> Input;
        'us'                 -> Input;
        'ibm367'             -> Input;
        'cp367'              -> Input;
        'csascii'            -> Input;
        %% UTF-1
        'iso-10646-utf-1'    -> Input;
        'csiso10646utf1'     -> Input;
        %% ISO 646 basic
        'iso_646.basic:1983' -> Input;
        'ref'                -> Input;
        'csiso646basic1983'  -> Input;
        %% ISO 8859-1 (Latin-1)
        'iso_8859-1:1987'    -> Input;
        'iso-ir-100'         -> Input;
        'iso_8859-1'         -> Input;
        'iso-8859-1'         -> Input;
        'latin1'             -> Input;
        'l1'                 -> Input;
        'ibm819'             -> Input;
        'cp819'              -> Input;
        'csisolatin1'        -> Input;
        %% The mnemonic family is disabled:
        % to_unicode(Input,Cs) when Cs=='mnemonic';Cs=='"mnemonic+ascii+38';
        %                           Cs=='mnem';Cs=='"mnemonic+ascii+8200' ->
        %     from_mnemonic(Input);
        %% UCS-2 and UCS-4: guess byteorder
        'iso-10646-ucs-2'    -> from_ucs2be(Input);
        'csunicode'          -> from_ucs2be(Input);
        'iso-10646-ucs-4'    -> from_ucs4be(Input);
        'csucs4'             -> from_ucs4be(Input);
        %% UTF-16; plain 'utf-16' is treated as big-endian
        'utf-16be'           -> from_utf16be(Input);
        'utf-16'             -> from_utf16be(Input);
        'utf-16le'           -> from_utf16le(Input);
        'utf-8'              -> from_utf8(Input);
        _ ->
            exit({bad_character_code,Input,Charset})
    end.
%ucs_data:to_unicode(Input,Charset).%%% Tests if Char is in Charset.%%% Do this by trying to convert it into unicode, if possible a mapping was%%% found and we are ok.is_incharset(In,Cs) when Cs=='ansi_x3.4-1968';Cs=='iso-ir-6'; Cs=='ansi_x3.4-1986';Cs=='iso_646.irv:1991'; Cs=='ascii';Cs=='iso646-us';Cs=='us-ascii';Cs=='us'; Cs=='ibm367';Cs=='cp367';Cs=='csascii' -> % US-ASCII if integer(In) -> is_ascii(In); list(In) -> test_charset(fun is_ascii/1,In) end;is_incharset(In,Cs) when Cs=='iso-10646-utf-1';Cs=='csiso10646utf1' -> if integer(In) -> is_unicode(In); list(In) -> test_charset(fun is_unicode/1, In) end;is_incharset(In,Cs) when Cs=='iso_646.basic:1983';Cs=='ref'; Cs=='csiso646basic1983' -> if integer(In) -> is_iso646_basic(In); list(In) -> test_charset(fun is_iso646_basic/1, In) end;is_incharset(In,Cs) when Cs=='iso_8859-1:1987';Cs=='iso-ir-100'; Cs=='iso_8859-1';Cs=='iso-8859-1'; Cs=='latin1';Cs=='l1';Cs=='ibm819'; Cs=='cp819';Cs=='csisolatin1' -> if integer(In) -> is_latin1(In); list(In) -> test_charset(fun is_latin1/1, In) end;is_incharset(In,Charset) when integer(In) -> case to_unicode([In],Charset) of {error,unsupported_charset} -> {error,unsupported_charset}; {error,_} -> false; [Int] when integer(Int) -> true end;is_incharset(In,Charset) when list(In) -> case to_unicode(In,Charset) of {error,unsupported_charset} -> {error,unsupported_charset}; {error,_} -> false; [Int] when integer(Int) -> true end.test_charset(Fun,Input) -> case lists:all(Fun, Input) of true -> true; _ -> false end.
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?