📄 collate.pm
字号:
my ($var, @wt) = unpack VCE_TEMPLATE, $vce; if ($var) { return pack(VCE_TEMPLATE, $var, 0, 0, 0, $vbl eq 'blanked' ? $wt[3] : $wt[0]); } elsif ($vbl eq 'blanked') { return $vce; } else { return pack(VCE_TEMPLATE, $var, @wt[0..2], $vbl eq 'shifted' && $wt[0]+$wt[1]+$wt[2] ? Shift4Wt : 0); }}sub viewSortKey{ my $self = shift; $self->visualizeSortKey($self->getSortKey(@_));}sub visualizeSortKey{ my $self = shift; my $view = join " ", map sprintf("%04X", $_), unpack(KEY_TEMPLATE, shift); if ($self->{UCA_Version} <= 8) { $view =~ s/ ?0000 ?/|/g; } else { $view =~ s/\b0000\b/|/g; } return "[$view]";}#### arrayref of JCPS = splitEnt(string to be collated)## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, true)##sub splitEnt{ my $self = shift; my $wLen = $_[1]; my $code = $self->{preprocess}; my $norm = $self->{normCode}; my $map = $self->{mapping}; my $max = $self->{maxlength}; my $reH = $self->{rearrangeHash}; my $ver9 = $self->{UCA_Version} >= 9 && $self->{UCA_Version} <= 11; my ($str, @buf); if ($wLen) { $code and croak "Preprocess breaks character positions. " . "Don't use with index(), match(), etc."; $norm and croak "Normalization breaks character positions. " . "Don't use with index(), match(), etc."; $str = $_[0]; } else { $str = $_[0]; $str = &$code($str) if ref $code; $str = &$norm($str) if ref $norm; } # get array of Unicode code point of string. my @src = unpack_U($str); # rearrangement: # Character positions are not kept if rearranged, # then neglected if $wLen is true. if ($reH && ! $wLen) { for (my $i = 0; $i < @src; $i++) { if (exists $reH->{ $src[$i] } && $i + 1 < @src) { ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]); $i++; } } } # remove a code point marked as a completely ignorable. for (my $i = 0; $i < @src; $i++) { $src[$i] = undef if _isIllegal($src[$i]) || ($ver9 && $map->{ $src[$i] } && @{ $map->{ $src[$i] } } == 0); } for (my $i = 0; $i < @src; $i++) { my $jcps = $src[$i]; # skip removed code point if (! defined $jcps) { if ($wLen && @buf) { $buf[-1][2] = $i + 1; } next; } my $i_orig = $i; # find contraction if ($max->{$jcps}) { my $temp_jcps = $jcps; my $jcpsLen = 1; my $maxLen = $max->{$jcps}; for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) { next if ! defined $src[$p]; $temp_jcps .= CODE_SEP . $src[$p]; $jcpsLen++; if ($map->{$temp_jcps}) { $jcps = $temp_jcps; $i = $p; } } # not-contiguous contraction with Combining Char (cf. UTS#10, S2.1). # This process requires Unicode::Normalize. # If "normalization" is undef, here should be skipped *always* # (in spite of bool value of $CVgetCombinClass), # since canonical ordering cannot be expected. # Blocked combining character should not be contracted. if ($self->{normalization}) # $self->{normCode} is false in the case of "prenormalized". { my $preCC = 0; my $curCC = 0; for (my $p = $i + 1; $p < @src; $p++) { next if ! defined $src[$p]; $curCC = $CVgetCombinClass->($src[$p]); last unless $curCC; my $tail = CODE_SEP . $src[$p]; if ($preCC != $curCC && $map->{$jcps.$tail}) { $jcps .= $tail; $src[$p] = undef; } else { $preCC = $curCC; } } } } # skip completely ignorable if ($map->{$jcps} && @{ $map->{$jcps} } == 0) { if ($wLen && @buf) { $buf[-1][2] = $i + 1; } next; } push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps; } return \@buf;}#### list of VCE = getWt(JCPS)##sub getWt{ my $self = shift; my $u = shift; my $vbl = $self->{variable}; my $map = $self->{mapping}; my $der = $self->{derivCode}; return if !defined $u; return map(_varCE($vbl, $_), @{ $map->{$u} }) if $map->{$u}; # JCPS must not be a contraction, then it's a code point. if (Hangul_SIni <= $u && $u <= Hangul_SFin) { my $hang = $self->{overrideHangul}; my @hangulCE; if ($hang) { @hangulCE = map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$hang($u)); } elsif (!defined $hang) { @hangulCE = $der->($u); } else { my $max = $self->{maxlength}; my @decH = _decompHangul($u); if (@decH == 2) { my $contract = join(CODE_SEP, @decH); @decH = ($contract) if $map->{$contract}; } else { # must be <@decH == 3> if ($max->{$decH[0]}) { my $contract = join(CODE_SEP, @decH); if ($map->{$contract}) { @decH = ($contract); } else { $contract = join(CODE_SEP, @decH[0,1]); $map->{$contract} and @decH = ($contract, $decH[2]); } # even if V's ignorable, LT contraction is not supported. # If such a situatution were required, NFD should be used. } if (@decH == 3 && $max->{$decH[1]}) { my $contract = join(CODE_SEP, @decH[1,2]); $map->{$contract} and @decH = ($decH[0], $contract); } } @hangulCE = map({ $map->{$_} ? @{ $map->{$_} } : $der->($_); } @decH); } return map _varCE($vbl, $_), @hangulCE; } elsif (_isUIdeo($u, $self->{UCA_Version})) { my $cjk = $self->{overrideCJK}; return map _varCE($vbl, $_), $cjk ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$cjk($u)) : defined $cjk && $self->{UCA_Version} <= 8 && $u < 0x10000 ? _uideoCE_8($u) : $der->($u); } else { return map _varCE($vbl, $_), $der->($u); }}#### string sortkey = getSortKey(string arg)##sub getSortKey{ my $self = shift; my $lev = $self->{level}; my $rEnt = $self->splitEnt(shift); # get an arrayref of JCPS my $v2i = $self->{UCA_Version} >= 9 && $self->{variable} ne 'non-ignorable'; my @buf; # weight arrays if ($self->{hangul_terminator}) { my $preHST = ''; foreach my $jcps (@$rEnt) { # weird things like VL, TL-contraction are not considered! my $curHST = ''; foreach my $u (split /;/, $jcps) { $curHST .= getHST($u); } if ($preHST && !$curHST || # hangul before non-hangul $preHST =~ /L\z/ && $curHST =~ /^T/ || $preHST =~ /V\z/ && $curHST =~ /^L/ || $preHST =~ /T\z/ && $curHST =~ /^[LV]/) { push @buf, $self->getWtHangulTerm(); } $preHST = $curHST; push @buf, $self->getWt($jcps); } $preHST # end at hangul and push @buf, $self->getWtHangulTerm(); } else { foreach my $jcps (@$rEnt) { push @buf, $self->getWt($jcps); } } # make sort key my @ret = ([],[],[],[]); my $last_is_variable; foreach my $vwt (@buf) { my($var, @wt) = unpack(VCE_TEMPLATE, $vwt); # "Ignorable (L1, L2) after Variable" since track. v. 9 if ($v2i) { if ($var) { $last_is_variable = TRUE; } elsif (!$wt[0]) { # ignorable next if $last_is_variable; } else { $last_is_variable = FALSE; } } foreach my $v (0..$lev-1) { 0 < $wt[$v] and push @{ $ret[$v] }, $wt[$v]; } } # modification of tertiary weights if ($self->{upper_before_lower}) { foreach my $w (@{ $ret[2] }) { if (0x8 <= $w && $w <= 0xC) { $w -= 6 } # lower elsif (0x2 <= $w && $w <= 0x6) { $w += 6 } # upper elsif ($w == 0x1C) { $w += 1 } # square upper elsif ($w == 0x1D) { $w -= 1 } # square lower } } if ($self->{katakana_before_hiragana}) { foreach my $w (@{ $ret[2] }) { if (0x0F <= $w && $w <= 0x13) { $w -= 2 } # katakana elsif (0x0D <= $w && $w <= 0x0E) { $w += 5 } # hiragana } } if ($self->{backwardsFlag}) { for (my $v = MinLevel; $v <= MaxLevel; $v++) { if ($self->{backwardsFlag} & (1 << $v)) { @{ $ret[$v-1] } = reverse @{ $ret[$v-1] }; } } } join LEVEL_SEP, map pack(KEY_TEMPLATE, @$_), @ret;}#### int compare = cmp(string a, string b)##sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) }sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) }sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) }sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) }sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) }sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) }#### list[strings] sorted = sort(list[strings] arg)##sub sort { my $obj = shift; return map { $_->[1] } sort{ $a->[0] cmp $b->[0] } map [ $obj->getSortKey($_), $_ ], @_;}sub _derivCE_14 { my $u = shift; my $base = (CJK_UidIni <= $u && $u <= CJK_UidF41) ? 0xFB40 : # CJK (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin || CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) ? 0xFB80 # CJK ext. : 0xFBC0; # others my $aaaa = $base + ($u >> 15); my $bbbb = ($u & 0x7FFF) | 0x8000; return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u), pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u);}sub _derivCE_9 { my $u = shift; my $base = (CJK_UidIni <= $u && $u <= CJK_UidFin) ? 0xFB40 : # CJK (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin || CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) ? 0xFB80 # CJK ext. : 0xFBC0; # others my $aaaa = $base + ($u >> 15); my $bbbb = ($u & 0x7FFF) | 0x8000; return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u), pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u);}sub _derivCE_8 { my $code = shift; my $aaaa = 0xFF80 + ($code >> 15); my $bbbb = ($code & 0x7FFF) | 0x8000; return pack(VCE_TEMPLATE, NON_VAR, $aaaa, 2, 1, $code), pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $code);}sub _uideoCE_8 { my $u = shift; return pack(VCE_TEMPLATE, NON_VAR, $u, Min2Wt, Min3Wt, $u);}sub _isUIdeo { my ($u, $uca_vers) = @_; return( (CJK_UidIni <= $u && ($uca_vers >= 14 ? ( $u <= CJK_UidF41) : ($u <= CJK_UidFin))) || (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin) || (CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) );}sub getWtHangulTerm { my $self = shift; return _varCE($self->{variable}, pack(VCE_TEMPLATE, NON_VAR, $self->{hangul_terminator}, 0,0,0));}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -