📄 ucd.pm
字号:
package Unicode::UCD;use strict;use warnings;our $VERSION = '0.25';use Storable qw(dclone);require Exporter;our @ISA = qw(Exporter);our @EXPORT_OK = qw(charinfo charblock charscript charblocks charscripts charinrange general_categories bidi_types compexcl casefold casespec namedseq);use Carp;=head1 NAMEUnicode::UCD - Unicode character database=head1 SYNOPSIS use Unicode::UCD 'charinfo'; my $charinfo = charinfo($codepoint); use Unicode::UCD 'charblock'; my $charblock = charblock($codepoint); use Unicode::UCD 'charscript'; my $charscript = charscript($codepoint); use Unicode::UCD 'charblocks'; my $charblocks = charblocks(); use Unicode::UCD 'charscripts'; my $charscripts = charscripts(); use Unicode::UCD qw(charscript charinrange); my $range = charscript($script); print "looks like $script\n" if charinrange($range, $codepoint); use Unicode::UCD qw(general_categories bidi_types); my $categories = general_categories(); my $types = bidi_types(); use Unicode::UCD 'compexcl'; my $compexcl = compexcl($codepoint); use Unicode::UCD 'namedseq'; my $namedseq = namedseq($named_sequence_name); my $unicode_version = Unicode::UCD::UnicodeVersion();=head1 DESCRIPTIONThe Unicode::UCD module offers a simple interface to the UnicodeCharacter Database.=cutmy $UNICODEFH;my $BLOCKSFH;my $SCRIPTSFH;my $VERSIONFH;my $COMPEXCLFH;my $CASEFOLDFH;my $CASESPECFH;my $NAMEDSEQFH;sub openunicode { my ($rfh, @path) = @_; my $f; unless (defined $$rfh) { for my $d (@INC) { use File::Spec; $f = File::Spec->catfile($d, "unicore", @path); last if open($$rfh, $f); undef $f; } croak __PACKAGE__, ": failed to find ", File::Spec->catfile(@path), " in @INC" unless defined $f; } return $f;}=head2 charinfo use Unicode::UCD 'charinfo'; my $charinfo = charinfo(0x41);charinfo() returns a reference to a hash that has the following fieldsas defined by the Unicode standard: key code code point with at least four hexdigits name name of the character IN UPPER CASE category general category of the character combining classes used in the Canonical Ordering Algorithm bidi bidirectional type decomposition character decomposition mapping decimal if decimal digit this is the integer numeric value digit if digit this is the numeric value numeric if numeric is the integer or rational numeric value mirrored if mirrored in bidirectional text unicode10 Unicode 1.0 name if existed and different comment ISO 10646 comment field upper uppercase equivalent mapping lower lowercase equivalent mapping title titlecase equivalent mapping block block the character belongs to (used in \p{In...}) script script the character belongs toIf no match is found, a reference to an empty hash is returned.The C<block> property is the same as returned by charinfo(). It isnot defined in the Unicode Character Database proper (Chapter 4 of theUnicode 3.0 Standard, aka TUS3) but instead in an auxiliary database(Chapter 14 of TUS3). Similarly for the C<script> property.Note that you cannot do (de)composition and casing based solely on theabove C<decomposition> and C<lower>, C<upper>, C<title>, properties,you will need also the compexcl(), casefold(), and casespec() functions.=cut# NB: This function is duplicated in charnames.pmsub _getcode { my $arg = shift; if ($arg =~ /^[1-9]\d*$/) { return $arg; } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) { return hex($1); } return;}# Lingua::KO::Hangul::Util not part of the standard distribution# but it will be used if available.eval { require Lingua::KO::Hangul::Util };my $hasHangulUtil = ! $@;if ($hasHangulUtil) { Lingua::KO::Hangul::Util->import();}sub hangul_decomp { # internal: called from charinfo if ($hasHangulUtil) { my @tmp = decomposeHangul(shift); return sprintf("%04X %04X", @tmp) if @tmp == 2; return sprintf("%04X %04X %04X", @tmp) if @tmp == 3; } return;}sub hangul_charname { # internal: called from charinfo return sprintf("HANGUL SYLLABLE-%04X", shift);}sub han_charname { # internal: called from charinfo return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);}my @CharinfoRanges = (# block name# [ first, last, coderef to name, coderef to decompose ],# CJK Ideographs Extension A [ 0x3400, 0x4DB5, \&han_charname, undef ],# CJK Ideographs [ 0x4E00, 0x9FA5, \&han_charname, undef ],# Hangul Syllables [ 0xAC00, 0xD7A3, $hasHangulUtil ? \&getHangulName : \&hangul_charname, \&hangul_decomp ],# Non-Private Use High Surrogates [ 0xD800, 0xDB7F, undef, undef ],# Private Use High Surrogates [ 0xDB80, 0xDBFF, undef, undef ],# Low Surrogates [ 0xDC00, 0xDFFF, undef, undef ],# The Private Use Area [ 0xE000, 0xF8FF, undef, undef ],# CJK Ideographs Extension B [ 0x20000, 0x2A6D6, \&han_charname, undef ],# Plane 15 Private Use Area [ 0xF0000, 0xFFFFD, undef, undef ],# Plane 16 Private Use Area [ 0x100000, 0x10FFFD, undef, undef ],);sub charinfo { my $arg = shift; my $code = _getcode($arg); croak __PACKAGE__, "::charinfo: unknown code '$arg'" unless defined $code; my $hexk = sprintf("%06X", $code); my($rcode,$rname,$rdec); foreach my $range (@CharinfoRanges){ if ($range->[0] <= $code && $code <= $range->[1]) { $rcode = $hexk; $rcode =~ s/^0+//; $rcode = sprintf("%04X", hex($rcode)); $rname = $range->[2] ? $range->[2]->($code) : ''; $rdec = $range->[3] ? $range->[3]->($code) : ''; $hexk = sprintf("%06X", $range->[0]); # replace by the first last; } } openunicode(\$UNICODEFH, "UnicodeData.txt"); if (defined $UNICODEFH) { use Search::Dict 1.02; if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) { my $line = <$UNICODEFH>; return unless defined $line; chomp $line; my %prop; @prop{qw( code name category combining bidi decomposition decimal digit numeric mirrored unicode10 comment upper lower title )} = split(/;/, $line, -1); $hexk =~ s/^0+//; $hexk = sprintf("%04X", hex($hexk)); if ($prop{code} eq $hexk) { $prop{block} = charblock($code); $prop{script} = charscript($code); if(defined $rname){ $prop{code} = $rcode; $prop{name} = $rname; $prop{decomposition} = $rdec; } return \%prop; } } } return;}sub _search { # Binary search in a [[lo,hi,prop],[...],...] table. my ($table, $lo, $hi, $code) = @_; return if $lo > $hi; my $mid = int(($lo+$hi) / 2); if ($table->[$mid]->[0] < $code) { if ($table->[$mid]->[1] >= $code) { return $table->[$mid]->[2]; } else { _search($table, $mid + 1, $hi, $code); } } elsif ($table->[$mid]->[0] > $code) { _search($table, $lo, $mid - 1, $code); } else { return $table->[$mid]->[2]; }}sub charinrange { my ($range, $arg) = @_; my $code = _getcode($arg); croak __PACKAGE__, "::charinrange: unknown code '$arg'" unless defined $code; _search($range, 0, $#$range, $code);}=head2 charblock use Unicode::UCD 'charblock'; my $charblock = charblock(0x41); my $charblock = charblock(1234); my $charblock = charblock("0x263a"); my $charblock = charblock("U+263a"); my $range = charblock('Armenian');With a B<code point argument> charblock() returns the I<block> the characterbelongs to, e.g. C<Basic Latin>. Note that not all the characterpositions within all blocks are defined.See also L</Blocks versus Scripts>.If supplied with an argument that can't be a code point, charblock() triesto do the opposite and interpret the argument as a character block. Thereturn value is a I<range>: an anonymous list of lists that containI<start-of-range>, I<end-of-range> code point pairs. You can test whethera code point is in a range using the L</charinrange> function. If theargument is not a known character block, C<undef> is returned.=cutmy @BLOCKS;my %BLOCKS;sub _charblocks { unless (@BLOCKS) { if (openunicode(\$BLOCKSFH, "Blocks.txt")) { local $_; while (<$BLOCKSFH>) { if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) { my ($lo, $hi) = (hex($1), hex($2)); my $subrange = [ $lo, $hi, $3 ]; push @BLOCKS, $subrange; push @{$BLOCKS{$3}}, $subrange; } } close($BLOCKSFH); } }}sub charblock { my $arg = shift; _charblocks() unless @BLOCKS; my $code = _getcode($arg); if (defined $code) { _search(\@BLOCKS, 0, $#BLOCKS, $code); } else { if (exists $BLOCKS{$arg}) { return dclone $BLOCKS{$arg}; } else { return; } }}=head2 charscript use Unicode::UCD 'charscript'; my $charscript = charscript(0x41); my $charscript = charscript(1234); my $charscript = charscript("U+263a"); my $range = charscript('Thai');With a B<code point argument> charscript() returns the I<script> thecharacter belongs to, e.g. C<Latin>, C<Greek>, C<Han>.See also L</Blocks versus Scripts>.If supplied with an argument that can't be a code point, charscript() triesto do the opposite and interpret the argument as a character script. Thereturn value is a I<range>: an anonymous list of lists that containI<start-of-range>, I<end-of-range> code point pairs. You can test whether acode point is in a range using the L</charinrange> function. If theargument is not a known character script, C<undef> is returned.=cutmy @SCRIPTS;my %SCRIPTS;sub _charscripts { unless (@SCRIPTS) { if (openunicode(\$SCRIPTSFH, "Scripts.txt")) { local $_; while (<$SCRIPTSFH>) { if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) { my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1)); my $script = lc($3); $script =~ s/\b(\w)/uc($1)/ge; my $subrange = [ $lo, $hi, $script ]; push @SCRIPTS, $subrange; push @{$SCRIPTS{$script}}, $subrange; } } close($SCRIPTSFH); @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS; } }}sub charscript { my $arg = shift; _charscripts() unless @SCRIPTS; my $code = _getcode($arg); if (defined $code) { _search(\@SCRIPTS, 0, $#SCRIPTS, $code); } else { if (exists $SCRIPTS{$arg}) { return dclone $SCRIPTS{$arg}; } else { return; } }}=head2 charblocks use Unicode::UCD 'charblocks'; my $charblocks = charblocks();charblocks() returns a reference to a hash with the known block namesas the keys, and the code point ranges (see L</charblock>) as the values.See also L</Blocks versus Scripts>.=cutsub charblocks { _charblocks() unless %BLOCKS; return dclone \%BLOCKS;}=head2 charscripts use Unicode::UCD 'charscripts'; my $charscripts = charscripts();charscripts() returns a reference to a hash with the known scriptnames as the keys, and the code point ranges (see L</charscript>) asthe values.See also L</Blocks versus Scripts>.=cutsub charscripts { _charscripts() unless %SCRIPTS; return dclone \%SCRIPTS;}=head2 Blocks versus ScriptsThe difference between a block and a script is that scripts are closerto the linguistic notion of a set of characters required to presentlanguages, while block is more of an artifact of the Unicode characternumbering and separation into blocks of (mostly) 256 characters.For example the Latin B<script> is spread over several B<blocks>, suchas C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, andC<Latin Extended-B>. On the other hand, the Latin script does notcontain all the characters of the C<Basic Latin> block (also known asthe ASCII): it includes only the letters, and not, for example, the digitsor the punctuation.For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txtFor scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/=head2 Matching Scripts and BlocksScripts are matched with the regular-expression constructC<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -