xmerl_ucs.erl

来自「OTP是开放电信平台的简称」· ERL 代码 · 共 556 行 · 第 1/2 页
ERL
556 行
%%% -*- Erlang -*-
%%%-------------------------------------------------------------------
%%% Author: Lon Willett <Lon.Willett@sse.ie>
%%%
%%% Description: Some minimal support for encoding, decoding, and
%%% manipulating strings of ISO-10646 characters (i.e. Unicode).
%%%-------------------------------------------------------------------
%%% Note:
%%% - The ucs server must be started before any call to:
%%%   to_unicode/2, from_unicode/2, getMIB/1, getCharset/1 and all the charset
%%%   test predicates. The server will currently NOT start automatically even
%%%   if this is not the case.

%% NOTICE: This is just an excerpt of the original ucs application
-module(xmerl_ucs).
-vsn('0.3').
-author('Lon.Willett@sse.ie').
-modified_by('johan.blom@mobilearts.se').
-compile([verbose,report_warnings,warn_unused_vars]).

%%% Conversion to/from IANA recognised character sets
-export([to_unicode/2]).

%%% Miscellaneous predicates
-export([is_iso10646/1, is_unicode/1, is_bmpchar/1, is_latin1/1, is_ascii/1,
         is_visible_latin1/1, is_visible_ascii/1, is_iso646_basic/1,
         is_incharset/2]).

%%% Conversion to/from RFC-1345 style mnemonic strings consisting
%%% of subsets of ISO-10646 with "escape" sequences.
%-export([from_mnemonic/1, from_mnemonic/2]).

%%% UCS-2, UCS-4, UTF-16, and UTF-8 encoding and decoding
-export([to_ucs2be/1,from_ucs2be/1, from_ucs2be/2]).
-export([to_ucs2le/1,from_ucs2le/1, from_ucs2le/2]).
-export([to_ucs4be/1,from_ucs4be/1, from_ucs4be/2]).
-export([to_ucs4le/1,from_ucs4le/1, from_ucs4le/2]).
-export([to_utf16be/1, from_utf16be/1, from_utf16be/2]).
-export([to_utf16le/1, from_utf16le/1, from_utf16le/2]).
-export([to_utf8/1, from_utf8/1]).

%%% NB: Non-canonical UTF-8 encodings and incorrectly used
%%% surrogate-pair codes are disallowed by this code.  There are
%%% important security implications concerning them.
%%% DO NOT REMOVE THE VARIOUS GUARDS AND TESTS THAT ENFORCE THIS POLICY.

%%% Test if Ch is a legitimate ISO-10646 character code.
%%% True for integers 0..16#7FFFFFFF except the surrogate range
%%% 16#D800..16#DFFF and the non-characters 16#FFFE and 16#FFFF.
%%% Any other term (negative, too large, non-integer) yields false.
is_iso10646(Ch) when is_integer(Ch), Ch >= 0 ->
    if Ch  < 16#D800 -> true;
       Ch  < 16#E000 -> false;          % Surrogates
       Ch  < 16#FFFE -> true;
       Ch =< 16#FFFF -> false;          % FFFE and FFFF (not characters)
       Ch =< 16#7FFFFFFF -> true;
       true -> false
    end;
is_iso10646(_) -> false.

%%% Test if Ch is a legitimate ISO-10646 character code capable of
%%% being encoded in a UTF-16 string, i.e. it is below 16#110000 and
%%% accepted by is_iso10646/1.
is_unicode(Ch) when Ch < 16#110000 -> is_iso10646(Ch);
is_unicode(_) -> false.

%%% Test if Ch is a legitimate ISO-10646 character code belonging to
%%% the basic multi-lingual plane (BMP): 0..16#FFFD without the
%%% surrogate range.
is_bmpchar(Ch) when is_integer(Ch), Ch >= 0 ->
    if Ch < 16#D800 -> true;
       Ch < 16#E000 -> false;           % Surrogates
       Ch < 16#FFFE -> true;
       true -> false
    end;
is_bmpchar(_) -> false.

%%% Test for legitimate Latin-1 code (integer 0..255)
is_latin1(Ch) when is_integer(Ch), Ch >= 0, Ch =< 255 -> true;
is_latin1(_) -> false.

%%% Test for legitimate ASCII code (integer 0..127)
is_ascii(Ch) when is_integer(Ch), Ch >= 0, Ch =< 127 -> true;
is_ascii(_) -> false.

%%% Test for char an element of ISO-646.basic set
is_iso646_basic(Ch) when is_integer(Ch), Ch >= $\s ->
    if Ch =< $Z ->
            %% Everything in this range except $# $$ and $@
            if Ch > $$ -> Ch =/= $@;
               true -> Ch < $#
            end;
       %% Only $_ and $a .. $z in range above $Z
       Ch > $z -> false;
       Ch >= $a -> true;
       true -> Ch =:= $_
    end;
is_iso646_basic(_) ->
    false.

%%% Test for char a visible Latin-1 char, i.e. a non-control Latin-1 char,
%%% excepting non-break space (but including space).
is_visible_latin1(Ch) when is_integer(Ch), Ch >= $\s ->
    if Ch =< $~ -> true;
       Ch >= 161 -> Ch =< 255;
       %% 127..160 (DEL, the C1 controls and non-break space).  Without
       %% this branch the `if' raised if_clause for those codes instead
       %% of answering false.
       true -> false
    end;
is_visible_latin1(_) ->
    false.

%%% Test for char a visible ASCII char, i.e.
%%% a non-control ASCII char (including space).
%%% True only for integers in the range $\s .. $~.
is_visible_ascii(Ch) when is_integer(Ch), Ch >= $\s -> Ch =< $~;
is_visible_ascii(_) -> false.


%%% UCS-4, big and little endian versions, encoding and decoding
%%%
%%% to_ucs4XX/1 takes a single character code or a (possibly nested)
%%% list of them and returns a flat list produced by char_to_ucs4XX/1.
%%% from_ucs4XX/1,2 take a binary, or a term accepted by
%%% list_to_binary/1, and hand it to from_ucs4XX/3 with an empty
%%% accumulator; Tail (default []) is passed through as third argument.
to_ucs4be(List) when is_list(List) -> lists:flatmap(fun to_ucs4be/1, List);
to_ucs4be(Ch) -> char_to_ucs4be(Ch).

from_ucs4be(Bin) when is_binary(Bin) -> from_ucs4be(Bin,[],[]);
from_ucs4be(List) -> from_ucs4be(list_to_binary(List),[],[]).

from_ucs4be(Bin,Tail) when is_binary(Bin) -> from_ucs4be(Bin,[],Tail);
from_ucs4be(List,Tail) -> from_ucs4be(list_to_binary(List),[],Tail).

to_ucs4le(List) when is_list(List) -> lists:flatmap(fun to_ucs4le/1, List);
to_ucs4le(Ch) -> char_to_ucs4le(Ch).

from_ucs4le(Bin) when is_binary(Bin) -> from_ucs4le(Bin,[],[]);
from_ucs4le(List) -> from_ucs4le(list_to_binary(List),[],[]).

from_ucs4le(Bin,Tail) when is_binary(Bin) -> from_ucs4le(Bin,[],Tail);
from_ucs4le(List,Tail) -> from_ucs4le(list_to_binary(List),[],Tail).


%%% UCS-2, big and little endian versions, encoding and decoding
%%% (same calling conventions as the UCS-4 functions above)
to_ucs2be(List) when is_list(List) -> lists:flatmap(fun to_ucs2be/1, List);
to_ucs2be(Ch) -> char_to_ucs2be(Ch).

from_ucs2be(Bin) when is_binary(Bin) -> from_ucs2be(Bin,[],[]);
from_ucs2be(List) -> from_ucs2be(list_to_binary(List),[],[]).

from_ucs2be(Bin,Tail) when is_binary(Bin) -> from_ucs2be(Bin,[],Tail);
from_ucs2be(List,Tail) -> from_ucs2be(list_to_binary(List),[],Tail).

to_ucs2le(List) when is_list(List) -> lists:flatmap(fun to_ucs2le/1, List);
to_ucs2le(Ch) -> char_to_ucs2le(Ch).

from_ucs2le(Bin) when is_binary(Bin) -> from_ucs2le(Bin,[],[]);
from_ucs2le(List) -> from_ucs2le(list_to_binary(List),[],[]).

from_ucs2le(Bin,Tail) when is_binary(Bin) -> from_ucs2le(Bin,[],Tail);
from_ucs2le(List,Tail) -> from_ucs2le(list_to_binary(List),[],Tail).


%%% UTF-16, big and little endian versions, encoding and decoding
%%% (same calling conventions; char_to_utf16XX/1 and from_utf16XX/3
%%% are defined outside this part of the file)
to_utf16be(List) when is_list(List) -> lists:flatmap(fun to_utf16be/1, List);
to_utf16be(Ch) -> char_to_utf16be(Ch).

from_utf16be(Bin) when is_binary(Bin) -> from_utf16be(Bin,[],[]);
from_utf16be(List) -> from_utf16be(list_to_binary(List),[],[]).

from_utf16be(Bin,Tail) when is_binary(Bin) -> from_utf16be(Bin,[],Tail);
from_utf16be(List,Tail) -> from_utf16be(list_to_binary(List),[],Tail).

to_utf16le(List) when is_list(List) -> lists:flatmap(fun to_utf16le/1, List);
to_utf16le(Ch) -> char_to_utf16le(Ch).

from_utf16le(Bin) when is_binary(Bin) -> from_utf16le(Bin,[],[]);
from_utf16le(List) -> from_utf16le(list_to_binary(List),[],[]).

from_utf16le(Bin,Tail) when is_binary(Bin) -> from_utf16le(Bin,[],Tail);
from_utf16le(List,Tail) -> from_utf16le(list_to_binary(List),[],Tail).


%%% UTF-8 encoding and decoding
%%%
%%% to_utf8/1 follows the same list/character convention as above.
%%% from_utf8/1 converts a binary to a list first, then calls
%%% expand_utf8/1 (defined outside this part of the file).  A result
%%% of {Result,0} yields Result; any other count in the second element
%%% makes the caller exit with {ucs,{bad_utf8_character_code}}.
to_utf8(List) when is_list(List) -> lists:flatmap(fun to_utf8/1, List);
to_utf8(Ch) -> char_to_utf8(Ch).

from_utf8(Bin) when is_binary(Bin) -> from_utf8(binary_to_list(Bin));
from_utf8(List) ->
    case expand_utf8(List) of
        {Result,0} -> Result;
        {_Res,_NumBadChar} ->
            exit({ucs,{bad_utf8_character_code}})
    end.


%%% UCS-4 support
%%% Possible errors encoding UCS-4:
%%%     - Non-character values (something other than 0 .. 2^31-1)
%%%     - Surrogate-pair code in string.
%%%     - 16#FFFE or 16#FFFF character in string.
%%% Possible errors decoding UCS-4:
%%%     - Element out of range (i.e.
%%% the "sign" bit is set).
%%%     - Surrogate-pair code in string.
%%%     - 16#FFFE or 16#FFFF character in string.

%%% Encode one character code as its four UCS-4 bytes, most significant
%%% byte first.  Fails with badmatch unless is_iso10646/1 accepts Ch.
char_to_ucs4be(Ch) ->
    true = is_iso10646(Ch),
    binary_to_list(<<Ch:32/big-unsigned-integer>>).

%%% Decode a big-endian UCS-4 binary.  Decoded is the reversed list of
%%% characters read so far; the final result is those characters in
%%% order followed by Tail.  A negative (sign bit set), surrogate,
%%% 16#FFFE or 16#FFFF value exits with {bad_character_code,Code}.
%%% Input that is not a whole number of 32-bit units prints a
%%% diagnostic and returns {error,not_ucs4be}.
from_ucs4be(<<Code:32/big-signed-integer, _/binary>>, _Decoded, _Tail)
  when Code < 0;
       Code >= 16#D800, Code < 16#E000;
       Code =:= 16#FFFE;
       Code =:= 16#FFFF ->
    exit({bad_character_code,Code});
from_ucs4be(<<Code:32/big-signed-integer, More/binary>>, Decoded, Tail) ->
    from_ucs4be(More, [Code|Decoded], Tail);
from_ucs4be(<<>>, Decoded, Tail) ->
    lists:reverse(Decoded, Tail);
from_ucs4be(Leftover, Decoded, Tail) ->
    io:format("ucs Error: Bin=~p~n     Acc=~p~n     Tail=~p~n",
              [Leftover,Decoded,Tail]),
    {error,not_ucs4be}.

%%% Encode one character code as its four UCS-4 bytes, least
%%% significant byte first.  Fails with badmatch unless is_iso10646/1
%%% accepts Ch.
char_to_ucs4le(Ch) ->
    true = is_iso10646(Ch),
    binary_to_list(<<Ch:32/little-unsigned-integer>>).

%%% Little-endian counterpart of from_ucs4be/3; the fallback returns
%%% {error,not_ucs4le}.
from_ucs4le(<<Code:32/little-signed-integer, _/binary>>, _Decoded, _Tail)
  when Code < 0;
       Code >= 16#D800, Code < 16#E000;
       Code =:= 16#FFFE;
       Code =:= 16#FFFF ->
    exit({bad_character_code,Code});
from_ucs4le(<<Code:32/little-signed-integer, More/binary>>, Decoded, Tail) ->
    from_ucs4le(More, [Code|Decoded], Tail);
from_ucs4le(<<>>, Decoded, Tail) ->
    lists:reverse(Decoded, Tail);
from_ucs4le(Leftover, Decoded, Tail) ->
    io:format("ucs Error: Bin=~p~n     Acc=~p~n     Tail=~p~n",
              [Leftover,Decoded,Tail]),
    {error,not_ucs4le}.


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% UCS-2 support
%%% FIXME! Don't know how to encode UCS-2!!
%%% Currently the character is encoded like UCS-4 with the 16 higher
%%% bits stripped.
%%% NOTE(review): characters above 16#FFFF are therefore silently
%%% truncated when encoding; this long-standing behaviour is kept
%%% as is.

%%% Encode one character code as two bytes, most significant first.
%%% Fails with badmatch unless is_iso10646/1 accepts Ch.
char_to_ucs2be(Ch) ->
    true = is_iso10646(Ch),
    [(Ch bsr 8) band 16#FF,
     Ch band 16#FF].

%%% Decode a big-endian UCS-2 binary.  Acc is the reversed list of
%%% characters read so far; the result is those characters in order
%%% followed by Tail.  A surrogate, 16#FFFE or 16#FFFF unit exits with
%%% {bad_character_code,Ch}.  Input with a trailing odd byte (or a
%%% non-binary) prints a diagnostic and returns {error,not_ucs2be}.
%%%
%%% The unit is read unsigned: read as a signed 16-bit integer, every
%%% value from 16#8000 upwards came out negative, so legitimate
%%% characters were rejected and the surrogate/FFFE/FFFF tests could
%%% never match.
from_ucs2be(<<Ch:16/big-unsigned-integer, Rest/binary>>,Acc,Tail) ->
    if Ch >= 16#D800, Ch < 16#E000; Ch =:= 16#FFFE; Ch =:= 16#FFFF ->
            exit({bad_character_code,Ch});
       true ->
            from_ucs2be(Rest,[Ch|Acc],Tail)
    end;
from_ucs2be(<<>>,Acc,Tail) ->
    lists:reverse(Acc,Tail);
from_ucs2be(Bin,Acc,Tail) ->
    io:format("ucs Error: Bin=~p~n     Acc=~p~n     Tail=~p~n",[Bin,Acc,Tail]),
    {error,not_ucs2be}.

%%% Encode one character code as two bytes, least significant first.
%%% Fails with badmatch unless is_iso10646/1 accepts Ch.
%%% (Previously this emitted bits 16..31 of Ch, i.e. [0,0] for every
%%% BMP character.)
char_to_ucs2le(Ch) ->
    true = is_iso10646(Ch),
    [Ch band 16#FF,
     (Ch bsr 8) band 16#FF].

%%% Little-endian counterpart of from_ucs2be/3; the fallback returns
%%% {error,not_ucs2le}.  The recursion stays in from_ucs2le/3; it used
%%% to continue in from_ucs4le/3, decoding the remainder as UCS-4.
from_ucs2le(<<Ch:16/little-unsigned-integer, Rest/binary>>,Acc,Tail) ->
    if Ch >= 16#D800, Ch < 16#E000; Ch =:= 16#FFFE; Ch =:= 16#FFFF ->
            exit({bad_character_code,Ch});
       true ->
            from_ucs2le(Rest,[Ch|Acc],Tail)
    end;
from_ucs2le(<<>>,Acc,Tail) ->
    lists:reverse(Acc,Tail);
from_ucs2le(Bin,Acc,Tail) ->
    io:format("ucs Error: Bin=~p~n     Acc=~p~n     Tail=~p~n",[Bin,Acc,Tail]),
    {error,not_ucs2le}.


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
xmerl_ucs.erl - 源码说明

本页面展示了「OTP是开放电信平台的简称」中的 xmerl_ucs.erl 源码文件，采用 ERL 编程语言编写，共 556 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫开发者社区收录了大量与OTP相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?