📄 ucd.pm

📁 视频监控网络部分的协议ddns,的模块的实现代码,请大家大胆指正.
💻 PM
📖 第 1 页 / 共 2 页
字号:
12 下一页
package Unicode::UCD;use strict;use warnings;our $VERSION = '0.25';use Storable qw(dclone);require Exporter;our @ISA = qw(Exporter);our @EXPORT_OK = qw(charinfo		    charblock charscript		    charblocks charscripts		    charinrange		    general_categories bidi_types		    compexcl		    casefold casespec		    namedseq);use Carp;=head1 NAMEUnicode::UCD - Unicode character database=head1 SYNOPSIS    use Unicode::UCD 'charinfo';    my $charinfo   = charinfo($codepoint);    use Unicode::UCD 'charblock';    my $charblock  = charblock($codepoint);    use Unicode::UCD 'charscript';    my $charscript = charscript($codepoint);    use Unicode::UCD 'charblocks';    my $charblocks = charblocks();    use Unicode::UCD 'charscripts';    my $charscripts = charscripts();    use Unicode::UCD qw(charscript charinrange);    my $range = charscript($script);    print "looks like $script\n" if charinrange($range, $codepoint);    use Unicode::UCD qw(general_categories bidi_types);    my $categories = general_categories();    my $types = bidi_types();    use Unicode::UCD 'compexcl';    my $compexcl = compexcl($codepoint);    use Unicode::UCD 'namedseq';    my $namedseq = namedseq($named_sequence_name);    my $unicode_version = Unicode::UCD::UnicodeVersion();=head1 DESCRIPTIONThe Unicode::UCD module offers a simple interface to the UnicodeCharacter Database.=cutmy $UNICODEFH;my $BLOCKSFH;my $SCRIPTSFH;my $VERSIONFH;my $COMPEXCLFH;my $CASEFOLDFH;my $CASESPECFH;my $NAMEDSEQFH;sub openunicode {    my ($rfh, @path) = @_;    my $f;    unless (defined $$rfh) {	for my $d (@INC) {	    use File::Spec;	    $f = File::Spec->catfile($d, "unicore", @path);	    last if open($$rfh, $f);	    undef $f;	}	croak __PACKAGE__, ": failed to find ",              File::Spec->catfile(@path), " in @INC"	    unless defined $f;    }    return $f;}=head2 charinfo    use Unicode::UCD 'charinfo';    my $charinfo = charinfo(0x41);charinfo() returns a reference to a hash that has the following fieldsas defined by the Unicode standard:    key    code             code point with at least four hexdigits    name             name of the character IN UPPER CASE    category         general category of the character    combining        classes used in the Canonical Ordering Algorithm    bidi             bidirectional type    decomposition    character decomposition mapping    decimal          if decimal digit this is the integer numeric value    digit            if digit this is the numeric value    numeric          if numeric is the integer or rational numeric value    mirrored         if mirrored in bidirectional text    unicode10        Unicode 1.0 name if existed and different    comment          ISO 10646 comment field    upper            uppercase equivalent mapping    lower            lowercase equivalent mapping    title            titlecase equivalent mapping    block            block the character belongs to (used in \p{In...})    script           script the character belongs toIf no match is found, a reference to an empty hash is returned.The C<block> property is the same as returned by charinfo().  It isnot defined in the Unicode Character Database proper (Chapter 4 of theUnicode 3.0 Standard, aka TUS3) but instead in an auxiliary database(Chapter 14 of TUS3).  Similarly for the C<script> property.Note that you cannot do (de)composition and casing based solely on theabove C<decomposition> and C<lower>, C<upper>, C<title>, properties,you will need also the compexcl(), casefold(), and casespec() functions.=cut# NB: This function is duplicated in charnames.pmsub _getcode {    my $arg = shift;    if ($arg =~ /^[1-9]\d*$/) {	return $arg;    } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) {	return hex($1);    }    return;}# Lingua::KO::Hangul::Util not part of the standard distribution# but it will be used if available.eval { require Lingua::KO::Hangul::Util };my $hasHangulUtil = ! $@;if ($hasHangulUtil) {    Lingua::KO::Hangul::Util->import();}sub hangul_decomp { # internal: called from charinfo    if ($hasHangulUtil) {	my @tmp = decomposeHangul(shift);	return sprintf("%04X %04X",      @tmp) if @tmp == 2;	return sprintf("%04X %04X %04X", @tmp) if @tmp == 3;    }    return;}sub hangul_charname { # internal: called from charinfo    return sprintf("HANGUL SYLLABLE-%04X", shift);}sub han_charname { # internal: called from charinfo    return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);}my @CharinfoRanges = (# block name# [ first, last, coderef to name, coderef to decompose ],# CJK Ideographs Extension A  [ 0x3400,   0x4DB5,   \&han_charname,   undef  ],# CJK Ideographs  [ 0x4E00,   0x9FA5,   \&han_charname,   undef  ],# Hangul Syllables  [ 0xAC00,   0xD7A3,   $hasHangulUtil ? \&getHangulName : \&hangul_charname,  \&hangul_decomp ],# Non-Private Use High Surrogates  [ 0xD800,   0xDB7F,   undef,   undef  ],# Private Use High Surrogates  [ 0xDB80,   0xDBFF,   undef,   undef  ],# Low Surrogates  [ 0xDC00,   0xDFFF,   undef,   undef  ],# The Private Use Area  [ 0xE000,   0xF8FF,   undef,   undef  ],# CJK Ideographs Extension B  [ 0x20000,  0x2A6D6,  \&han_charname,   undef  ],# Plane 15 Private Use Area  [ 0xF0000,  0xFFFFD,  undef,   undef  ],# Plane 16 Private Use Area  [ 0x100000, 0x10FFFD, undef,   undef  ],);sub charinfo {    my $arg  = shift;    my $code = _getcode($arg);    croak __PACKAGE__, "::charinfo: unknown code '$arg'"	unless defined $code;    my $hexk = sprintf("%06X", $code);    my($rcode,$rname,$rdec);    foreach my $range (@CharinfoRanges){      if ($range->[0] <= $code && $code <= $range->[1]) {        $rcode = $hexk;	$rcode =~ s/^0+//;	$rcode =  sprintf("%04X", hex($rcode));        $rname = $range->[2] ? $range->[2]->($code) : '';        $rdec  = $range->[3] ? $range->[3]->($code) : '';        $hexk  = sprintf("%06X", $range->[0]); # replace by the first        last;      }    }    openunicode(\$UNICODEFH, "UnicodeData.txt");    if (defined $UNICODEFH) {	use Search::Dict 1.02;	if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {	    my $line = <$UNICODEFH>;	    return unless defined $line;	    chomp $line;	    my %prop;	    @prop{qw(		     code name category		     combining bidi decomposition		     decimal digit numeric		     mirrored unicode10 comment		     upper lower title		    )} = split(/;/, $line, -1);	    $hexk =~ s/^0+//;	    $hexk =  sprintf("%04X", hex($hexk));	    if ($prop{code} eq $hexk) {		$prop{block}  = charblock($code);		$prop{script} = charscript($code);		if(defined $rname){                    $prop{code} = $rcode;                    $prop{name} = $rname;                    $prop{decomposition} = $rdec;                }		return \%prop;	    }	}    }    return;}sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.    my ($table, $lo, $hi, $code) = @_;    return if $lo > $hi;    my $mid = int(($lo+$hi) / 2);    if ($table->[$mid]->[0] < $code) {	if ($table->[$mid]->[1] >= $code) {	    return $table->[$mid]->[2];	} else {	    _search($table, $mid + 1, $hi, $code);	}    } elsif ($table->[$mid]->[0] > $code) {	_search($table, $lo, $mid - 1, $code);    } else {	return $table->[$mid]->[2];    }}sub charinrange {    my ($range, $arg) = @_;    my $code = _getcode($arg);    croak __PACKAGE__, "::charinrange: unknown code '$arg'"	unless defined $code;    _search($range, 0, $#$range, $code);}=head2 charblock    use Unicode::UCD 'charblock';    my $charblock = charblock(0x41);    my $charblock = charblock(1234);    my $charblock = charblock("0x263a");    my $charblock = charblock("U+263a");    my $range     = charblock('Armenian');With a B<code point argument> charblock() returns the I<block> the characterbelongs to, e.g.  C<Basic Latin>.  Note that not all the characterpositions within all blocks are defined.See also L</Blocks versus Scripts>.If supplied with an argument that can't be a code point, charblock() triesto do the opposite and interpret the argument as a character block. Thereturn value is a I<range>: an anonymous list of lists that containI<start-of-range>, I<end-of-range> code point pairs. You can test whethera code point is in a range using the L</charinrange> function. If theargument is not a known character block, C<undef> is returned.=cutmy @BLOCKS;my %BLOCKS;sub _charblocks {    unless (@BLOCKS) {	if (openunicode(\$BLOCKSFH, "Blocks.txt")) {	    local $_;	    while (<$BLOCKSFH>) {		if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {		    my ($lo, $hi) = (hex($1), hex($2));		    my $subrange = [ $lo, $hi, $3 ];		    push @BLOCKS, $subrange;		    push @{$BLOCKS{$3}}, $subrange;		}	    }	    close($BLOCKSFH);	}    }}sub charblock {    my $arg = shift;    _charblocks() unless @BLOCKS;    my $code = _getcode($arg);    if (defined $code) {	_search(\@BLOCKS, 0, $#BLOCKS, $code);    } else {	if (exists $BLOCKS{$arg}) {	    return dclone $BLOCKS{$arg};	} else {	    return;	}    }}=head2 charscript    use Unicode::UCD 'charscript';    my $charscript = charscript(0x41);    my $charscript = charscript(1234);    my $charscript = charscript("U+263a");    my $range      = charscript('Thai');With a B<code point argument> charscript() returns the I<script> thecharacter belongs to, e.g.  C<Latin>, C<Greek>, C<Han>.See also L</Blocks versus Scripts>.If supplied with an argument that can't be a code point, charscript() triesto do the opposite and interpret the argument as a character script. Thereturn value is a I<range>: an anonymous list of lists that containI<start-of-range>, I<end-of-range> code point pairs. You can test whether acode point is in a range using the L</charinrange> function. If theargument is not a known character script, C<undef> is returned.=cutmy @SCRIPTS;my %SCRIPTS;sub _charscripts {    unless (@SCRIPTS) {	if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {	    local $_;	    while (<$SCRIPTSFH>) {		if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {		    my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));		    my $script = lc($3);		    $script =~ s/\b(\w)/uc($1)/ge;		    my $subrange = [ $lo, $hi, $script ];		    push @SCRIPTS, $subrange;		    push @{$SCRIPTS{$script}}, $subrange;		}	    }	    close($SCRIPTSFH);	    @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;	}    }}sub charscript {    my $arg = shift;    _charscripts() unless @SCRIPTS;    my $code = _getcode($arg);    if (defined $code) {	_search(\@SCRIPTS, 0, $#SCRIPTS, $code);    } else {	if (exists $SCRIPTS{$arg}) {	    return dclone $SCRIPTS{$arg};	} else {	    return;	}    }}=head2 charblocks    use Unicode::UCD 'charblocks';    my $charblocks = charblocks();charblocks() returns a reference to a hash with the known block namesas the keys, and the code point ranges (see L</charblock>) as the values.See also L</Blocks versus Scripts>.=cutsub charblocks {    _charblocks() unless %BLOCKS;    return dclone \%BLOCKS;}=head2 charscripts    use Unicode::UCD 'charscripts';    my $charscripts = charscripts();charscripts() returns a reference to a hash with the known scriptnames as the keys, and the code point ranges (see L</charscript>) asthe values.See also L</Blocks versus Scripts>.=cutsub charscripts {    _charscripts() unless %SCRIPTS;    return dclone \%SCRIPTS;}=head2 Blocks versus ScriptsThe difference between a block and a script is that scripts are closerto the linguistic notion of a set of characters required to presentlanguages, while block is more of an artifact of the Unicode characternumbering and separation into blocks of (mostly) 256 characters.For example the Latin B<script> is spread over several B<blocks>, suchas C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, andC<Latin Extended-B>.  On the other hand, the Latin script does notcontain all the characters of the C<Basic Latin> block (also known asthe ASCII): it includes only the letters, and not, for example, the digitsor the punctuation.For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txtFor scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/=head2 Matching Scripts and BlocksScripts are matched with the regular-expression constructC<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -