📄 collate.pm
字号:
#### "hhhh hhhh hhhh" to (dddd, dddd, dddd)##sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }## $code *must* be in Hangul syllable.# Check it before you enter here.#sub _decompHangul { my $code = shift; my $si = $code - Hangul_SBase; my $li = int( $si / Hangul_NCount); my $vi = int(($si % Hangul_NCount) / Hangul_TCount); my $ti = $si % Hangul_TCount; return ( Hangul_LBase + $li, Hangul_VBase + $vi, $ti ? (Hangul_TBase + $ti) : (), );}sub _isIllegal { my $code = shift; return ! defined $code # removed || ($code < 0 || 0x10FFFF < $code) # out of range || (($code & 0xFFFE) == 0xFFFE) # ??FFF[EF] (cf. utf8.c) || (0xD800 <= $code && $code <= 0xDFFF) # unpaired surrogates || (0xFDD0 <= $code && $code <= 0xFDEF) # other non-characters ;}# Hangul Syllable Typesub getHST { my $u = shift; return Hangul_LIni <= $u && $u <= Hangul_LFin || $u == Hangul_LFill ? "L" : Hangul_VIni <= $u && $u <= Hangul_VFin ? "V" : Hangul_TIni <= $u && $u <= Hangul_TFin ? "T" : Hangul_SIni <= $u && $u <= Hangul_SFin ? ($u - Hangul_SBase) % Hangul_TCount ? "LVT" : "LV" : "";}#### bool _nonIgnorAtLevel(arrayref weights, int level)##sub _nonIgnorAtLevel($$){ my $wt = shift; return if ! defined $wt; my $lv = shift; return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE;}#### bool _eqArray(## arrayref of arrayref[weights] source,## arrayref of arrayref[weights] substr,## int level)## * comparison of graphemes vs graphemes.## @$source >= @$substr must be true (check it before call this);##sub _eqArray($$$){ my $source = shift; my $substr = shift; my $lev = shift; for my $g (0..@$substr-1){ # Do the $g'th graphemes have the same number of AV weigths? return if @{ $source->[$g] } != @{ $substr->[$g] }; for my $w (0..@{ $substr->[$g] }-1) { for my $v (0..$lev-1) { return if $source->[$g][$w][$v] != $substr->[$g][$w][$v]; } } } return 1;}#### (int position, int length)## int position = index(string, substring, position, [undoc'ed grobal])#### With "grobal" (only for the list context),## returns list of arrayref[position, length].##sub index{ my $self = shift; my $str = shift; my $len = length($str); my $subE = $self->splitEnt(shift); my $pos = @_ ? shift : 0; $pos = 0 if $pos < 0; my $grob = shift; my $lev = $self->{level}; my $v2i = $self->{UCA_Version} >= 9 && $self->{variable} ne 'non-ignorable'; if (! @$subE) { my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos; return $grob ? map([$_, 0], $temp..$len) : wantarray ? ($temp,0) : $temp; } $len < $pos and return wantarray ? () : NOMATCHPOS; my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE); @$strE or return wantarray ? () : NOMATCHPOS; my(@strWt, @iniPos, @finPos, @subWt, @g_ret); my $last_is_variable; for my $vwt (map $self->getWt($_), @$subE) { my($var, @wt) = unpack(VCE_TEMPLATE, $vwt); my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev); # "Ignorable (L1, L2) after Variable" since track. v. 9 if ($v2i) { if ($var) { $last_is_variable = TRUE; } elsif (!$wt[0]) { # ignorable $to_be_pushed = FALSE if $last_is_variable; } else { $last_is_variable = FALSE; } } if (@subWt && !$var && !$wt[0]) { push @{ $subWt[-1] }, \@wt if $to_be_pushed; } else { push @subWt, [ \@wt ]; } } my $count = 0; my $end = @$strE - 1; $last_is_variable = FALSE; # reuse for (my $i = 0; $i <= $end; ) { # no $i++ my $found_base = 0; # fetch a grapheme while ($i <= $end && $found_base == 0) { for my $vwt ($self->getWt($strE->[$i][0])) { my($var, @wt) = unpack(VCE_TEMPLATE, $vwt); my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev); # "Ignorable (L1, L2) after Variable" since track. v. 9 if ($v2i) { if ($var) { $last_is_variable = TRUE; } elsif (!$wt[0]) { # ignorable $to_be_pushed = FALSE if $last_is_variable; } else { $last_is_variable = FALSE; } } if (@strWt && !$var && !$wt[0]) { push @{ $strWt[-1] }, \@wt if $to_be_pushed; $finPos[-1] = $strE->[$i][2]; } elsif ($to_be_pushed) { push @strWt, [ \@wt ]; push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1]; $finPos[-1] = NOMATCHPOS if $found_base; push @finPos, $strE->[$i][2]; $found_base++; } # else ===> no-op } $i++; } # try to match while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) { if ($iniPos[0] != NOMATCHPOS && $finPos[$#subWt] != NOMATCHPOS && _eqArray(\@strWt, \@subWt, $lev)) { my $temp = $iniPos[0] + $pos; if ($grob) { push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]]; splice @strWt, 0, $#subWt; splice @iniPos, 0, $#subWt; splice @finPos, 0, $#subWt; } else { return wantarray ? ($temp, $finPos[$#subWt] - $iniPos[0]) : $temp; } } shift @strWt; shift @iniPos; shift @finPos; } } return $grob ? @g_ret : wantarray ? () : NOMATCHPOS;}#### scalarref to matching part = match(string, substring)##sub match{ my $self = shift; if (my($pos,$len) = $self->index($_[0], $_[1])) { my $temp = substr($_[0], $pos, $len); return wantarray ? $temp : \$temp; # An lvalue ref \substr should be avoided, # since its value is affected by modification of its referent. } else { return; }}#### arrayref matching parts = gmatch(string, substring)##sub gmatch{ my $self = shift; my $str = shift; my $sub = shift; return map substr($str, $_->[0], $_->[1]), $self->index($str, $sub, 0, 'g');}#### bool subst'ed = subst(string, substring, replace)##sub subst{ my $self = shift; my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE; if (my($pos,$len) = $self->index($_[0], $_[1])) { if ($code) { my $mat = substr($_[0], $pos, $len); substr($_[0], $pos, $len, $code->($mat)); } else { substr($_[0], $pos, $len, $_[2]); } return TRUE; } else { return FALSE; }}#### int count = gsubst(string, substring, replace)##sub gsubst{ my $self = shift; my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE; my $cnt = 0; # Replacement is carried out from the end, then use reverse. for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) { if ($code) { my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]); substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat)); } else { substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]); } $cnt++; } return $cnt;}1;__END__=head1 NAMEUnicode::Collate - Unicode Collation Algorithm=head1 SYNOPSIS use Unicode::Collate; #construct $Collator = Unicode::Collate->new(%tailoring); #sort @sorted = $Collator->sort(@not_sorted); #compare $result = $Collator->cmp($a, $b); # returns 1, 0, or -1. # If %tailoring is false (i.e. empty), # $Collator should do the default collation.=head1 DESCRIPTIONThis module is an implementation of Unicode Technical Standard #10(a.k.a. UTS #10) - Unicode Collation Algorithm (a.k.a. UCA).=head2 Constructor and TailoringThe C<new> method returns a collator object. $Collator = Unicode::Collate->new( UCA_Version => $UCA_Version, alternate => $alternate, # deprecated: use of 'variable' is recommended. backwards => $levelNumber, # or \@levelNumbers entry => $element, hangul_terminator => $term_primary_weight, ignoreName => qr/$ignoreName/, ignoreChar => qr/$ignoreChar/, katakana_before_hiragana => $bool, level => $collationLevel, normalization => $normalization_form, overrideCJK => \&overrideCJK, overrideHangul => \&overrideHangul, preprocess => \&preprocess, rearrange => \@charList, table => $filename, undefName => qr/$undefName/, undefChar => qr/$undefChar/, upper_before_lower => $bool, variable => $variable, );=over 4=item UCA_VersionIf the tracking version number of UCA is given,behavior of that tracking version is emulated on collating.If omitted, the return value of C<UCA_Version()> is used.C<UCA_Version()> should return the latest tracking version supported.The supported tracking version: 8, 9, 11, or 14. UCA Unicode Standard DUCET (@version) --------------------------------------------------- 8 3.1 3.0.1 (3.0.1d9) 9 3.1 with Corrigendum 3 3.1.1 (3.1.1) 11 4.0 4.0.0 (4.0.0) 14 4.1.0 4.1.0 (4.1.0)Note: Recent UTS #10 renames "Tracking Version" to "Revision."=item alternate-- see 3.2.2 Alternate Weighting, version 8 of UTS #10For backward compatibility, C<alternate> (old name) can be usedas an alias for C<variable>.=item backwards-- see 3.1.2 French Accents, UTS #10. backwards => $levelNumber or \@levelNumbersWeights in reverse order; ex. level 2 (diacritic ordering) in French.If omitted, forwards at all the levels.=item entry-- see 3.1 Linguistic Features; 3.2.1 File Format, UTS #10.If the same character (or a sequence of characters) existsin the collation element table through C<table>,mapping to collation elements is overrided.If it does not exist, the mapping is defined additionally. entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)0063 0068 ; [.0E6A.0020.0002.0063] # ch0043 0068 ; [.0E6A.0020.0007.0043] # Ch0043 0048 ; [.0E6A.0020.0008.0043] # CH006C 006C ; [.0F4C.0020.0002.006C] # ll004C 006C ; [.0F4C.0020.0007.004C] # Ll004C 004C ; [.0F4C.0020.0008.004C] # LL00F1 ; [.0F7B.0020.0002.00F1] # n-tilde006E 0303 ; [.0F7B.0020.0002.00F1] # n-tilde00D1 ; [.0F7B.0020.0008.00D1] # N-tilde004E 0303 ; [.0F7B.0020.0008.00D1] # N-tildeENTRY entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)00E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e>00C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E>ENTRYB<NOTE:> The code point in the UCA file format (before C<';'>)B<must> be a Unicode code point (defined as hexadecimal),but not a native code point.So C<0063> must always denote C<U+0063>,but not a character of C<"\x63">.Weighting may vary depending on collation element table.So ensure the weights defined in C<entry> will be consistent withthose in the collation element table loaded via C<table>.
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -