📄 collate.pm
字号:
package Unicode::Collate;BEGIN { unless ("A" eq pack('U', 0x41)) { die "Unicode::Collate cannot stringify a Unicode code point\n"; }}use 5.006;use strict;use warnings;use Carp;use File::Spec;no warnings 'utf8';our $VERSION = '0.52';our $PACKAGE = __PACKAGE__;my @Path = qw(Unicode Collate);my $KeyFile = "allkeys.txt";# Perl's booleanuse constant TRUE => 1;use constant FALSE => "";use constant NOMATCHPOS => -1;# A coderef to get combining class imported from Unicode::Normalize# (i.e. \&Unicode::Normalize::getCombinClass).# This is also used as a HAS_UNICODE_NORMALIZE flag.my $CVgetCombinClass;# Supported Levelsuse constant MinLevel => 1;use constant MaxLevel => 4;# Minimum weights at level 2 and 3, respectivelyuse constant Min2Wt => 0x20;use constant Min3Wt => 0x02;# Shifted weight at 4th leveluse constant Shift4Wt => 0xFFFF;# A boolean for Variable and 16-bit weights at 4 levels of Collation Element# PROBLEM: The Default Unicode Collation Element Table# has weights over 0xFFFF at the 4th level.# The tie-breaking in the variable weights# other than "shift" (as well as "shift-trimmed") is unreliable.use constant VCE_TEMPLATE => 'Cn4';# A sort key: 16-bit weights# See also the PROBLEM on VCE_TEMPLATE above.use constant KEY_TEMPLATE => 'n*';# Level separator in a sort key:# i.e. pack(KEY_TEMPLATE, 0)use constant LEVEL_SEP => "\0\0";# As Unicode code point separator for hash keys.# A joined code point string (denoted by JCPS below)# like "65;768" is used for internal processing# instead of Perl's Unicode string like "\x41\x{300}",# as the native code point is different from the Unicode code point# on EBCDIC platform.# This character must not be included in any stringified# representation of an integer.use constant CODE_SEP => ';';# boolean values of variable weightsuse constant NON_VAR => 0; # Non-Variable characteruse constant VAR => 1; # Variable character# specific code pointsuse constant Hangul_LBase => 0x1100;use constant Hangul_LIni => 0x1100;use constant Hangul_LFin => 0x1159;use constant Hangul_LFill => 0x115F;use constant Hangul_VBase => 0x1161;use constant Hangul_VIni => 0x1160; # from Vowel Filleruse constant Hangul_VFin => 0x11A2;use constant Hangul_TBase => 0x11A7; # from "no-final" codepointuse constant Hangul_TIni => 0x11A8;use constant Hangul_TFin => 0x11F9;use constant Hangul_TCount => 28;use constant Hangul_NCount => 588;use constant Hangul_SBase => 0xAC00;use constant Hangul_SIni => 0xAC00;use constant Hangul_SFin => 0xD7A3;use constant CJK_UidIni => 0x4E00;use constant CJK_UidFin => 0x9FA5;use constant CJK_UidF41 => 0x9FBB;use constant CJK_ExtAIni => 0x3400;use constant CJK_ExtAFin => 0x4DB5;use constant CJK_ExtBIni => 0x20000;use constant CJK_ExtBFin => 0x2A6D6;use constant BMP_Max => 0xFFFF;# Logical_Order_Exception in PropList.txtmy $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];sub UCA_Version { "14" }sub Base_Unicode_Version { "4.1.0" }######sub pack_U { return pack('U*', @_);}sub unpack_U { return unpack('U*', shift(@_).pack('U*'));}######my (%VariableOK);@VariableOK{ qw/ blanked non-ignorable shifted shift-trimmed / } = (); # keys lowercasedour @ChangeOK = qw/ alternate backwards level normalization rearrange katakana_before_hiragana upper_before_lower overrideHangul overrideCJK preprocess UCA_Version hangul_terminator variable /;our @ChangeNG = qw/ entry mapping table maxlength ignoreChar ignoreName undefChar undefName variableTable versionTable alternateTable backwardsTable forwardsTable rearrangeTable derivCode normCode rearrangeHash backwardsFlag /;# The hash key 'ignored' is deleted at v 0.21.# The hash key 'isShift' is deleted at v 0.23.# The hash key 'combining' is deleted at v 0.24.# The hash key 'entries' is deleted at v 0.30.# The hash key 'L3_ignorable' is deleted at v 0.40.sub version { my $self = shift; return $self->{versionTable} || 'unknown';}my (%ChangeOK, %ChangeNG);@ChangeOK{ @ChangeOK } = ();@ChangeNG{ @ChangeNG } = ();sub change { my $self = shift; my %hash = @_; my %old; if (exists $hash{variable} && exists $hash{alternate}) { delete $hash{alternate}; } elsif (!exists $hash{variable} && exists $hash{alternate}) { $hash{variable} = $hash{alternate}; } foreach my $k (keys %hash) { if (exists $ChangeOK{$k}) { $old{$k} = $self->{$k}; $self->{$k} = $hash{$k}; } elsif (exists $ChangeNG{$k}) { croak "change of $k via change() is not allowed!"; } # else => ignored } $self->checkCollator(); return wantarray ? %old : $self;}sub _checkLevel { my $level = shift; my $key = shift; # 'level' or 'backwards' MinLevel <= $level or croak sprintf "Illegal level %d (in value for key '%s') lower than %d.", $level, $key, MinLevel; $level <= MaxLevel or croak sprintf "Unsupported level %d (in value for key '%s') higher than %d.", $level, $key, MaxLevel;}my %DerivCode = ( 8 => \&_derivCE_8, 9 => \&_derivCE_9, 11 => \&_derivCE_9, # 11 == 9 14 => \&_derivCE_14,);sub checkCollator { my $self = shift; _checkLevel($self->{level}, "level"); $self->{derivCode} = $DerivCode{ $self->{UCA_Version} } or croak "Illegal UCA version (passed $self->{UCA_Version})."; $self->{variable} ||= $self->{alternate} || $self->{variableTable} || $self->{alternateTable} || 'shifted'; $self->{variable} = $self->{alternate} = lc($self->{variable}); exists $VariableOK{ $self->{variable} } or croak "$PACKAGE unknown variable parameter name: $self->{variable}"; if (! defined $self->{backwards}) { $self->{backwardsFlag} = 0; } elsif (! ref $self->{backwards}) { _checkLevel($self->{backwards}, "backwards"); $self->{backwardsFlag} = 1 << $self->{backwards}; } else { my %level; $self->{backwardsFlag} = 0; for my $b (@{ $self->{backwards} }) { _checkLevel($b, "backwards"); $level{$b} = 1; } for my $v (sort keys %level) { $self->{backwardsFlag} += 1 << $v; } } defined $self->{rearrange} or $self->{rearrange} = []; ref $self->{rearrange} or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF"; # keys of $self->{rearrangeHash} are $self->{rearrange}. $self->{rearrangeHash} = undef; if (@{ $self->{rearrange} }) { @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = (); } $self->{normCode} = undef; if (defined $self->{normalization}) { eval { require Unicode::Normalize }; $@ and croak "Unicode::Normalize is required to normalize strings"; $CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass; if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default $self->{normCode} = \&Unicode::Normalize::NFD; } elsif ($self->{normalization} ne 'prenormalized') { my $norm = $self->{normalization}; $self->{normCode} = sub { Unicode::Normalize::normalize($norm, shift); }; eval { $self->{normCode}->("") }; # try $@ and croak "$PACKAGE unknown normalization form name: $norm"; } } return;}sub new{ my $class = shift; my $self = bless { @_ }, $class; # If undef is passed explicitly, no file is read. $self->{table} = $KeyFile if ! exists $self->{table}; $self->read_table() if defined $self->{table}; if ($self->{entry}) { while ($self->{entry} =~ /([^\n]+)/g) { $self->parseEntry($1); } } $self->{level} ||= MaxLevel; $self->{UCA_Version} ||= UCA_Version(); $self->{overrideHangul} = FALSE if ! exists $self->{overrideHangul}; $self->{overrideCJK} = FALSE if ! exists $self->{overrideCJK}; $self->{normalization} = 'NFD' if ! exists $self->{normalization}; $self->{rearrange} = $self->{rearrangeTable} || ($self->{UCA_Version} <= 11 ? $DefaultRearrange : []) if ! exists $self->{rearrange}; $self->{backwards} = $self->{backwardsTable} if ! exists $self->{backwards}; $self->checkCollator(); return $self;}sub read_table { my $self = shift; my($f, $fh); foreach my $d (@INC) { $f = File::Spec->catfile($d, @Path, $self->{table}); last if open($fh, $f); $f = undef; } if (!defined $f) { $f = File::Spec->catfile(@Path, $self->{table}); croak("$PACKAGE: Can't locate $f in \@INC (\@INC contains: @INC)"); } while (my $line = <$fh>) { next if $line =~ /^\s*#/; unless ($line =~ s/^\s*\@//) { $self->parseEntry($line); next; } # matched ^\s*\@ if ($line =~ /^version\s*(\S*)/) { $self->{versionTable} ||= $1; } elsif ($line =~ /^variable\s+(\S*)/) { # since UTS #10-9 $self->{variableTable} ||= $1; } elsif ($line =~ /^alternate\s+(\S*)/) { # till UTS #10-8 $self->{alternateTable} ||= $1; } elsif ($line =~ /^backwards\s+(\S*)/) { push @{ $self->{backwardsTable} }, $1; } elsif ($line =~ /^forwards\s+(\S*)/) { # parhaps no use push @{ $self->{forwardsTable} }, $1; } elsif ($line =~ /^rearrange\s+(.*)/) { # (\S*) is NG push @{ $self->{rearrangeTable} }, _getHexArray($1); } } close $fh;}#### get $line, parse it, and write an entry in $self##sub parseEntry{ my $self = shift; my $line = shift; my($name, $entry, @uv, @key); return if $line !~ /^\s*[0-9A-Fa-f]/; # removes comment and gets name $name = $1 if $line =~ s/[#%]\s*(.*)//; return if defined $self->{undefName} && $name =~ /$self->{undefName}/; # gets element my($e, $k) = split /;/, $line; croak "Wrong Entry: <charList> must be separated by ';' from <collElement>" if ! $k; @uv = _getHexArray($e); return if !@uv; $entry = join(CODE_SEP, @uv); # in JCPS if (defined $self->{undefChar} || defined $self->{ignoreChar}) { my $ele = pack_U(@uv); # regarded as if it were not entried in the table return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/; # replaced as completely ignorable $k = '[.0000.0000.0000.0000]' if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/; } # replaced as completely ignorable $k = '[.0000.0000.0000.0000]' if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/; my $is_L3_ignorable = TRUE; foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient. my @wt = _getHexArray($arr); push @key, pack(VCE_TEMPLATE, $var, @wt); $is_L3_ignorable = FALSE if $wt[0] || $wt[1] || $wt[2]; # Conformance Test for 3.1.1 and 4.0.0 shows Level 3 ignorable # is completely ignorable. # For expansion, an entry $is_L3_ignorable # if and only if "all" CEs are [.0000.0000.0000]. } $self->{mapping}{$entry} = $is_L3_ignorable ? [] : \@key; if (@uv > 1) { (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv) and $self->{maxlength}{$uv[0]} = @uv; }}#### VCE = _varCE(variable term, VCE)##sub _varCE{ my $vbl = shift; my $vce = shift; if ($vbl eq 'non-ignorable') { return $vce; }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -