xmerl_ucs.erl
来自「OTP是开放电信平台的简称」· ERL 代码 · 共 556 行 · 第 1/2 页
ERL
556 行
%%% -*- Erlang -*-
%%%-------------------------------------------------------------------
%%% Author: Lon Willett <Lon.Willett@sse.ie>
%%%
%%% Description: Some minimal support for encoding, decoding, and
%%% manipulating strings of ISO-10646 characters (i.e. Unicode).
%%%-------------------------------------------------------------------
%%% Note:
%%% - The ucs server must be started before any call to:
%%%   to_unicode/2, from_unicode/2, getMIB/1, getCharset/1 and all the charset
%%%   test predicates. The server will currently NOT start automatically even
%%%   if this is not the case.

%% NOTICE: This is just an excerpt of the original ucs application

-module(xmerl_ucs).
-vsn('0.3').
-author('Lon.Willett@sse.ie').
-modified_by('johan.blom@mobilearts.se').
-compile([verbose,report_warnings,warn_unused_vars]).

%%% Conversion to/from IANA recognised character sets
-export([to_unicode/2]).

%%% Miscellaneous predicates
-export([is_iso10646/1, is_unicode/1, is_bmpchar/1, is_latin1/1, is_ascii/1,
         is_visible_latin1/1, is_visible_ascii/1, is_iso646_basic/1,
         is_incharset/2]).

%%% Conversion to/from RFC-1345 style mnemonic strings consisting
%%% of subsets of ISO-10646 with "escape" sequences.
%-export([from_mnemonic/1, from_mnemonic/2]).

%%% UCS-2, UCS-4, UTF-16, and UTF-8 encoding and decoding
-export([to_ucs2be/1,from_ucs2be/1, from_ucs2be/2]).
-export([to_ucs2le/1,from_ucs2le/1, from_ucs2le/2]).
-export([to_ucs4be/1,from_ucs4be/1, from_ucs4be/2]).
-export([to_ucs4le/1,from_ucs4le/1, from_ucs4le/2]).
-export([to_utf16be/1, from_utf16be/1, from_utf16be/2]).
-export([to_utf16le/1, from_utf16le/1, from_utf16le/2]).
-export([to_utf8/1, from_utf8/1]).

%%% NB: Non-canonical UTF-8 encodings and incorrectly used
%%% surrogate-pair codes are disallowed by this code. There are
%%% important security implications concerning them.
%%% DO NOT REMOVE THE VARIOUS GUARDS AND TESTS THAT ENFORCE THIS POLICY.

%%% Test if Ch is a legitimate ISO-10646 character code.
%%% Returns true for integers in 0 .. 16#7FFFFFFF that are neither
%%% surrogates (16#D800 .. 16#DFFF) nor 16#FFFE / 16#FFFF; false for
%%% everything else, including non-integers.
is_iso10646(Ch) when is_integer(Ch), Ch >= 0 ->
    if Ch < 16#D800 -> true;
       Ch < 16#E000 -> false;       % Surrogates
       Ch < 16#FFFE -> true;
       Ch =< 16#FFFF -> false;      % FFFE and FFFF (not characters)
       Ch =< 16#7FFFFFFF -> true;
       true -> false
    end;
is_iso10646(_) -> false.

%%% Test if Ch is a legitimate ISO-10646 character code capable of
%%% being encoded in a UTF-16 string.
%%% Non-numbers fail the comparison guard or the integer test in
%%% is_iso10646/1, so they yield false.
is_unicode(Ch) when Ch < 16#110000 -> is_iso10646(Ch);
is_unicode(_) -> false.

%%% Test if Ch is a legitimate ISO-10646 character code belonging to
%%% the basic multi-lingual plane (BMP).
is_bmpchar(Ch) when is_integer(Ch), Ch >= 0 ->
    if Ch < 16#D800 -> true;
       Ch < 16#E000 -> false;       % Surrogates
       Ch < 16#FFFE -> true;
       true -> false
    end;
is_bmpchar(_) -> false.

%%% Test for legitimate Latin-1 code
is_latin1(Ch) when is_integer(Ch), Ch >= 0, Ch =< 255 -> true;
is_latin1(_) -> false.

%%% Test for legitimate ASCII code
is_ascii(Ch) when is_integer(Ch), Ch >= 0, Ch =< 127 -> true;
is_ascii(_) -> false.

%%% Test for char an element of ISO-646.basic set
is_iso646_basic(Ch) when is_integer(Ch), Ch >= $\s ->
    if Ch =< $Z ->
            %% Everything in this range except $# $$ and $@
            if Ch > $$ -> Ch =/= $@;
               true -> Ch < $#
            end;
       %% Only $_ and $a .. $z in range above $Z
       Ch > $z -> false;
       Ch >= $a -> true;
       true -> Ch =:= $_
    end;
is_iso646_basic(_) -> false.

%%% Test for char a visible Latin-1 char, i.e. a non-control Latin-1 char,
%%% excepting non-break space (but including space).
is_visible_latin1(Ch) when is_integer(Ch), Ch >= $\s ->
    if Ch =< $~ -> true;
       Ch >= 161 -> Ch =< 255;
       %% 127 .. 160 (DEL, the C1 controls and non-break space): without
       %% this branch the 'if' had no matching clause and raised if_clause
       %% instead of answering false.
       true -> false
    end;
is_visible_latin1(_) -> false.

%%% Test for char a visible ASCII char, i.e.
%%% a non-control ASCII char (including space).
is_visible_ascii(Ch) when is_integer(Ch), Ch >= $\s -> Ch =< $~;
is_visible_ascii(_) -> false.


%%% Public encoding/decoding entry points.
%%%
%%% to_X/1   - takes a single character code or a list; list elements are
%%%            encoded recursively with lists:flatmap/2, so nested lists
%%%            are accepted.  The per-character work is delegated to the
%%%            matching char_to_X/1.
%%% from_X/1 - takes a binary, or anything list_to_binary/1 accepts, and
%%%            delegates to from_X/3 with an empty accumulator and tail.
%%% from_X/2 - same as from_X/1, but passes Tail on to from_X/3.

%%% UCS-4, big and little endian versions, encoding and decoding
to_ucs4be(List) when is_list(List) -> lists:flatmap(fun to_ucs4be/1, List);
to_ucs4be(Ch) -> char_to_ucs4be(Ch).

from_ucs4be(Bin) when is_binary(Bin) -> from_ucs4be(Bin,[],[]);
from_ucs4be(List) -> from_ucs4be(list_to_binary(List),[],[]).

from_ucs4be(Bin,Tail) when is_binary(Bin) -> from_ucs4be(Bin,[],Tail);
from_ucs4be(List,Tail) -> from_ucs4be(list_to_binary(List),[],Tail).

to_ucs4le(List) when is_list(List) -> lists:flatmap(fun to_ucs4le/1, List);
to_ucs4le(Ch) -> char_to_ucs4le(Ch).

from_ucs4le(Bin) when is_binary(Bin) -> from_ucs4le(Bin,[],[]);
from_ucs4le(List) -> from_ucs4le(list_to_binary(List),[],[]).

from_ucs4le(Bin,Tail) when is_binary(Bin) -> from_ucs4le(Bin,[],Tail);
from_ucs4le(List,Tail) -> from_ucs4le(list_to_binary(List),[],Tail).


%%% UCS-2, big and little endian versions, encoding and decoding
to_ucs2be(List) when is_list(List) -> lists:flatmap(fun to_ucs2be/1, List);
to_ucs2be(Ch) -> char_to_ucs2be(Ch).

from_ucs2be(Bin) when is_binary(Bin) -> from_ucs2be(Bin,[],[]);
from_ucs2be(List) -> from_ucs2be(list_to_binary(List),[],[]).

from_ucs2be(Bin,Tail) when is_binary(Bin) -> from_ucs2be(Bin,[],Tail);
from_ucs2be(List,Tail) -> from_ucs2be(list_to_binary(List),[],Tail).

to_ucs2le(List) when is_list(List) -> lists:flatmap(fun to_ucs2le/1, List);
to_ucs2le(Ch) -> char_to_ucs2le(Ch).

from_ucs2le(Bin) when is_binary(Bin) -> from_ucs2le(Bin,[],[]);
from_ucs2le(List) -> from_ucs2le(list_to_binary(List),[],[]).

from_ucs2le(Bin,Tail) when is_binary(Bin) -> from_ucs2le(Bin,[],Tail);
from_ucs2le(List,Tail) -> from_ucs2le(list_to_binary(List),[],Tail).


%%% UTF-16, big and little endian versions, encoding and decoding
to_utf16be(List) when is_list(List) -> lists:flatmap(fun to_utf16be/1, List);
to_utf16be(Ch) -> char_to_utf16be(Ch).

from_utf16be(Bin) when is_binary(Bin) -> from_utf16be(Bin,[],[]);
from_utf16be(List) -> from_utf16be(list_to_binary(List),[],[]).

from_utf16be(Bin,Tail) when is_binary(Bin) -> from_utf16be(Bin,[],Tail);
from_utf16be(List,Tail) -> from_utf16be(list_to_binary(List),[],Tail).

to_utf16le(List) when is_list(List) -> lists:flatmap(fun to_utf16le/1, List);
to_utf16le(Ch) -> char_to_utf16le(Ch).

from_utf16le(Bin) when is_binary(Bin) -> from_utf16le(Bin,[],[]);
from_utf16le(List) -> from_utf16le(list_to_binary(List),[],[]).

from_utf16le(Bin,Tail) when is_binary(Bin) -> from_utf16le(Bin,[],Tail);
from_utf16le(List,Tail) -> from_utf16le(list_to_binary(List),[],Tail).


%%% UTF-8 encoding and decoding
to_utf8(List) when is_list(List) -> lists:flatmap(fun to_utf8/1, List);
to_utf8(Ch) -> char_to_utf8(Ch).

%%% Decode UTF-8 given as a binary or a list of bytes.
%%% expand_utf8/1 is defined outside this excerpt; judging by the match
%%% below it returns {Result, NumBadChar}.  Any non-zero bad-character
%%% count makes the calling process exit with
%%% {ucs,{bad_utf8_character_code}}.
from_utf8(Bin) when is_binary(Bin) -> from_utf8(binary_to_list(Bin));
from_utf8(List) ->
    case expand_utf8(List) of
        {Result,0} -> Result;
        {_Res,_NumBadChar} ->
            exit({ucs,{bad_utf8_character_code}})
    end.


%%% UCS-4 support
%%% Possible errors encoding UCS-4:
%%% - Non-character values (something other than 0 .. 2^31-1)
%%% - Surrogate-pair code in string.
%%% - 16#FFFE or 16#FFFF character in string.
%%% Possible errors decoding UCS-4:
%%% - Element out of range (i.e.
%%% the "sign" bit is set).
%%% - Surrogate-pair code in string.
%%% - 16#FFFE or 16#FFFF character in string.

%%% Encode one character as four bytes, most significant byte first.
%%% Crashes with badmatch unless is_iso10646(Ch) holds.
char_to_ucs4be(Ch) ->
    true = is_iso10646(Ch),
    binary_to_list(<<Ch:32/big>>).

%%% Decode big-endian UCS-4.  Characters are pushed onto the accumulator
%%% in reverse; at end of input the accumulator is reversed onto Tail.
%%% A negative (sign bit set), surrogate, 16#FFFE or 16#FFFF value makes
%%% the process exit with {bad_character_code, Ch}.  Leftover bytes that do
%%% not form a whole 32-bit unit are printed and reported as
%%% {error, not_ucs4be}.
from_ucs4be(<<Ch:32/big-signed-integer, _More/binary>>, _Decoded, _Tail)
  when Ch < 0;
       Ch >= 16#D800, Ch < 16#E000;
       Ch =:= 16#FFFE;
       Ch =:= 16#FFFF ->
    exit({bad_character_code, Ch});
from_ucs4be(<<Ch:32/big-signed-integer, More/binary>>, Decoded, Tail) ->
    from_ucs4be(More, [Ch | Decoded], Tail);
from_ucs4be(<<>>, Decoded, Tail) ->
    lists:reverse(Decoded, Tail);
from_ucs4be(Leftover, Decoded, Tail) ->
    io:format("ucs Error: Bin=~p~n Acc=~p~n Tail=~p~n", [Leftover, Decoded, Tail]),
    {error, not_ucs4be}.

%%% Encode one character as four bytes, least significant byte first.
%%% Crashes with badmatch unless is_iso10646(Ch) holds.
char_to_ucs4le(Ch) ->
    true = is_iso10646(Ch),
    binary_to_list(<<Ch:32/little>>).

%%% Little-endian counterpart of from_ucs4be/3; same checks, and
%%% {error, not_ucs4le} for leftover bytes.
from_ucs4le(<<Ch:32/little-signed-integer, _More/binary>>, _Decoded, _Tail)
  when Ch < 0;
       Ch >= 16#D800, Ch < 16#E000;
       Ch =:= 16#FFFE;
       Ch =:= 16#FFFF ->
    exit({bad_character_code, Ch});
from_ucs4le(<<Ch:32/little-signed-integer, More/binary>>, Decoded, Tail) ->
    from_ucs4le(More, [Ch | Decoded], Tail);
from_ucs4le(<<>>, Decoded, Tail) ->
    lists:reverse(Decoded, Tail);
from_ucs4le(Leftover, Decoded, Tail) ->
    io:format("ucs Error: Bin=~p~n Acc=~p~n Tail=~p~n", [Leftover, Decoded, Tail]),
    {error, not_ucs4le}.


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% UCS-2 support
%%% FIXME! Don't know how to encode UCS-2!!
%%% Currently I just encode as UCS-4, but strips the 16 higher bits.

%%% Encode one character as two bytes, most significant byte first.
%%% Crashes with badmatch unless is_iso10646(Ch) holds.
%%% NOTE(review): code points above 16#FFFF pass that check and are
%%% truncated to their low 16 bits; is_bmpchar/1 would reject them
%%% instead -- confirm with callers before tightening.
char_to_ucs2be(Ch) ->
    true = is_iso10646(Ch),
    [(Ch bsr 8) band 16#FF, Ch band 16#FF].

%%% Decode big-endian UCS-2.  Characters are pushed onto Acc in reverse; at
%%% end of input Acc is reversed onto Tail.
%%% The 16-bit unit is read as UNSIGNED: read as signed, every value from
%%% 16#8000 up came out negative, so valid characters were rejected and
%%% the surrogate / FFFE / FFFF tests below could never fire.
%%% A surrogate, 16#FFFE or 16#FFFF makes the process exit with
%%% {bad_character_code, Ch}.  A trailing odd byte is printed and reported
%%% as {error, not_ucs2be}.
from_ucs2be(<<Ch:16/big-unsigned-integer, Rest/binary>>,Acc,Tail) ->
    if Ch >= 16#D800, Ch < 16#E000; Ch =:= 16#FFFE; Ch =:= 16#FFFF ->
            exit({bad_character_code,Ch});
       true ->
            from_ucs2be(Rest,[Ch|Acc],Tail)
    end;
from_ucs2be(<<>>,Acc,Tail) ->
    lists:reverse(Acc,Tail);
from_ucs2be(Bin,Acc,Tail) ->
    io:format("ucs Error: Bin=~p~n Acc=~p~n Tail=~p~n",[Bin,Acc,Tail]),
    {error,not_ucs2be}.

%%% Encode one character as two bytes, least significant byte first.
%%% This is the byte-swapped form of char_to_ucs2be/1; the previous code
%%% emitted bits 16..31 of the code point, which is always [0,0] for a BMP
%%% character.
char_to_ucs2le(Ch) ->
    true = is_iso10646(Ch),
    [Ch band 16#FF, (Ch bsr 8) band 16#FF].

%%% Little-endian counterpart of from_ucs2be/3; same checks, and
%%% {error, not_ucs2le} for a trailing odd byte.
%%% The recursion stays in from_ucs2le/3 (it used to continue in
%%% from_ucs4le/3, which decoded the rest of the input as 32-bit units).
from_ucs2le(<<Ch:16/little-unsigned-integer, Rest/binary>>,Acc,Tail) ->
    if Ch >= 16#D800, Ch < 16#E000; Ch =:= 16#FFFE; Ch =:= 16#FFFF ->
            exit({bad_character_code,Ch});
       true ->
            from_ucs2le(Rest,[Ch|Acc],Tail)
    end;
from_ucs2le(<<>>,Acc,Tail) ->
    lists:reverse(Acc,Tail);
from_ucs2le(Bin,Acc,Tail) ->
    io:format("ucs Error: Bin=~p~n Acc=~p~n Tail=~p~n",[Bin,Acc,Tail]),
    {error,not_ucs2le}.


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?