📄 encode.pm
字号:
## $Id: Encode.pm,v 2.23 2007/05/29 18:15:32 dankogai Exp dankogai $#package Encode;use strict;use warnings;our $VERSION = sprintf "%d.%02d", q$Revision: 2.23 $ =~ /(\d+)/g;sub DEBUG () { 0 }use XSLoader ();XSLoader::load( __PACKAGE__, $VERSION );require Exporter;use base qw/Exporter/;# Public, encouraged API is exported by defaultour @EXPORT = qw( decode decode_utf8 encode encode_utf8 str2bytes bytes2str encodings find_encoding clone_encoding);our @FB_FLAGS = qw( DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC PERLQQ HTMLCREF XMLCREF STOP_AT_PARTIAL);our @FB_CONSTS = qw( FB_DEFAULT FB_CROAK FB_QUIET FB_WARN FB_PERLQQ FB_HTMLCREF FB_XMLCREF);our @EXPORT_OK = ( qw( _utf8_off _utf8_on define_encoding from_to is_16bit is_8bit is_utf8 perlio_ok resolve_alias utf8_downgrade utf8_upgrade ), @FB_FLAGS, @FB_CONSTS,);our %EXPORT_TAGS = ( all => [ @EXPORT, @EXPORT_OK ], fallbacks => [@FB_CONSTS], fallback_all => [ @FB_CONSTS, @FB_FLAGS ],);# Documentation moved after __END__ for speed - NI-Sour $ON_EBCDIC = ( ord("A") == 193 );use Encode::Alias;# Make a %Encoding package variable to allow a certain amount of cheatingour %Encoding;our %ExtModule;require Encode::Config;eval { require Encode::ConfigLocal };sub encodings { my $class = shift; my %enc; if ( @_ and $_[0] eq ":all" ) { %enc = ( %Encoding, %ExtModule ); } else { %enc = %Encoding; for my $mod ( map { m/::/o ? $_ : "Encode::$_" } @_ ) { DEBUG and warn $mod; for my $enc ( keys %ExtModule ) { $ExtModule{$enc} eq $mod and $enc{$enc} = $mod; } } } return sort { lc $a cmp lc $b } grep { !/^(?:Internal|Unicode|Guess)$/o } keys %enc;}sub perlio_ok { my $obj = ref( $_[0] ) ? $_[0] : find_encoding( $_[0] ); $obj->can("perlio_ok") and return $obj->perlio_ok(); return 0; # safety net}sub define_encoding { my $obj = shift; my $name = shift; $Encoding{$name} = $obj; my $lc = lc($name); define_alias( $lc => $obj ) unless $lc eq $name; while (@_) { my $alias = shift; define_alias( $alias, $obj ); } return $obj;}sub getEncoding { my ( $class, $name, $skip_external ) = @_; ref($name) && $name->can('renew') and return $name; exists $Encoding{$name} and return $Encoding{$name}; my $lc = lc $name; exists $Encoding{$lc} and return $Encoding{$lc}; my $oc = $class->find_alias($name); defined($oc) and return $oc; $lc ne $name and $oc = $class->find_alias($lc); defined($oc) and return $oc; unless ($skip_external) { if ( my $mod = $ExtModule{$name} || $ExtModule{$lc} ) { $mod =~ s,::,/,g; $mod .= '.pm'; eval { require $mod; }; exists $Encoding{$name} and return $Encoding{$name}; } } return;}sub find_encoding($;$) { my ( $name, $skip_external ) = @_; return __PACKAGE__->getEncoding( $name, $skip_external );}sub resolve_alias($) { my $obj = find_encoding(shift); defined $obj and return $obj->name; return;}sub clone_encoding($) { my $obj = find_encoding(shift); ref $obj or return; eval { require Storable }; $@ and return; return Storable::dclone($obj);}sub encode($$;$) { my ( $name, $string, $check ) = @_; return undef unless defined $string; $string .= '' if ref $string; # stringify; $check ||= 0; my $enc = find_encoding($name); unless ( defined $enc ) { require Carp; Carp::croak("Unknown encoding '$name'"); } my $octets = $enc->encode( $string, $check ); $_[1] = $string if $check and !ref $check and !( $check & LEAVE_SRC() ); return $octets;}*str2bytes = \&encode;sub decode($$;$) { my ( $name, $octets, $check ) = @_; return undef unless defined $octets; $octets .= '' if ref $octets; $check ||= 0; my $enc = find_encoding($name); unless ( defined $enc ) { require Carp; Carp::croak("Unknown encoding '$name'"); } my $string = $enc->decode( $octets, $check ); $_[1] = $octets if $check and !ref $check and !( $check & LEAVE_SRC() ); return $string;}*bytes2str = \&decode;sub from_to($$$;$) { my ( $string, $from, $to, $check ) = @_; return undef unless defined $string; $check ||= 0; my $f = find_encoding($from); unless ( defined $f ) { require Carp; Carp::croak("Unknown encoding '$from'"); } my $t = find_encoding($to); unless ( defined $t ) { require Carp; Carp::croak("Unknown encoding '$to'"); } my $uni = $f->decode($string); $_[0] = $string = $t->encode( $uni, $check ); return undef if ( $check && length($uni) ); return defined( $_[0] ) ? length($string) : undef;}sub encode_utf8($) { my ($str) = @_; utf8::encode($str); return $str;}sub decode_utf8($;$) { my ( $str, $check ) = @_; return $str if is_utf8($str); if ($check) { return decode( "utf8", $str, $check ); } else { return decode( "utf8", $str ); return $str; }}predefine_encodings(1);## This is to restore %Encoding if really needed;#sub predefine_encodings { require Encode::Encoding; no warnings 'redefine'; my $use_xs = shift; if ($ON_EBCDIC) { # was in Encode::UTF_EBCDIC package Encode::UTF_EBCDIC; push @Encode::UTF_EBCDIC::ISA, 'Encode::Encoding'; *decode = sub { my ( $obj, $str, $chk ) = @_; my $res = ''; for ( my $i = 0 ; $i < length($str) ; $i++ ) { $res .= chr( utf8::unicode_to_native( ord( substr( $str, $i, 1 ) ) ) ); } $_[1] = '' if $chk; return $res; }; *encode = sub { my ( $obj, $str, $chk ) = @_; my $res = ''; for ( my $i = 0 ; $i < length($str) ; $i++ ) { $res .= chr( utf8::native_to_unicode( ord( substr( $str, $i, 1 ) ) ) ); } $_[1] = '' if $chk; return $res; }; $Encode::Encoding{Unicode} = bless { Name => "UTF_EBCDIC" } => "Encode::UTF_EBCDIC"; } else { package Encode::Internal; push @Encode::Internal::ISA, 'Encode::Encoding'; *decode = sub { my ( $obj, $str, $chk ) = @_; utf8::upgrade($str); $_[1] = '' if $chk; return $str; }; *encode = \&decode; $Encode::Encoding{Unicode} = bless { Name => "Internal" } => "Encode::Internal"; } { # was in Encode::utf8 package Encode::utf8; push @Encode::utf8::ISA, 'Encode::Encoding'; # if ($use_xs) { Encode::DEBUG and warn __PACKAGE__, " XS on"; *decode = \&decode_xs; *encode = \&encode_xs; } else { Encode::DEBUG and warn __PACKAGE__, " XS off"; *decode = sub { my ( $obj, $octets, $chk ) = @_; my $str = Encode::decode_utf8($octets); if ( defined $str ) { $_[1] = '' if $chk; return $str; } return undef; }; *encode = sub { my ( $obj, $string, $chk ) = @_; my $octets = Encode::encode_utf8($string); $_[1] = '' if $chk; return $octets; }; } *cat_decode = sub { # ($obj, $dst, $src, $pos, $trm, $chk) # currently ignores $chk my ( $obj, undef, undef, $pos, $trm ) = @_; my ( $rdst, $rsrc, $rpos ) = \@_[ 1, 2, 3 ]; use bytes; if ( ( my $npos = index( $$rsrc, $trm, $pos ) ) >= 0 ) { $$rdst .= substr( $$rsrc, $pos, $npos - $pos + length($trm) ); $$rpos = $npos + length($trm); return 1; } $$rdst .= substr( $$rsrc, $pos ); $$rpos = length($$rsrc); return ''; }; $Encode::Encoding{utf8} = bless { Name => "utf8" } => "Encode::utf8"; $Encode::Encoding{"utf-8-strict"} = bless { Name => "utf-8-strict", strict_utf8 => 1 } => "Encode::utf8"; }}1;__END__=head1 NAMEEncode - character encodings=head1 SYNOPSIS use Encode;=head2 Table of ContentsEncode consists of a collection of modules whose details are too bigto fit in one document. This POD itself explains the top-level APIsand general topics at a glance. For other topics and more details,see the PODs below: Name Description -------------------------------------------------------- Encode::Alias Alias definitions to encodings Encode::Encoding Encode Implementation Base Class Encode::Supported List of Supported Encodings Encode::CN Simplified Chinese Encodings Encode::JP Japanese Encodings Encode::KR Korean Encodings Encode::TW Traditional Chinese Encodings --------------------------------------------------------=head1 DESCRIPTIONThe C<Encode> module provides the interfaces between Perl's stringsand the rest of the system. Perl strings are sequences ofB<characters>.The repertoire of characters that Perl can represent is at least thatdefined by the Unicode Consortium. On most platforms the ordinalvalues of the characters (as returned by C<ord(ch)>) is the "Unicodecodepoint" for the character (the exceptions are those platforms wherethe legacy encoding is some variant of EBCDIC rather than a super-setof ASCII - see L<perlebcdic>).Traditionally, computer data has been moved around in 8-bit chunksoften called "bytes". These chunks are also known as "octets" innetworking standards. Perl is widely used to manipulate data of manytypes - not only strings of characters representing human or computerlanguages but also "binary" data being the machine's representation ofnumbers, pixels in an image - or just about anything.When Perl is processing "binary data", the programmer wants Perl toprocess "sequences of bytes". This is not a problem for Perl - as abyte has 256 possible values, it easily fits in Perl's much larger"logical character".=head2 TERMINOLOGY=over 2=item *I<character>: a character in the range 0..(2**32-1) (or more).(What Perl's strings are made of.)=item *I<byte>: a character in the range 0..255(A special case of a Perl character.)=item *I<octet>: 8 bits of data, with ordinal values 0..255(Term for bytes passed to or from a non-Perl context, e.g. a disk file.)=back=head1 PERL ENCODING API=over 2=item $octets = encode(ENCODING, $string [, CHECK])Encodes a string from Perl's internal form into I<ENCODING> and returnsa sequence of octets. ENCODING can be either a canonical name oran alias. For encoding names and aliases, see L</"Defining Aliases">.For CHECK, see L</"Handling Malformed Data">.For example, to convert a string from Perl's internal format toiso-8859-1 (also known as Latin1), $octets = encode("iso-8859-1", $string);B<CAVEAT>: When you run C<$octets = encode("utf8", $string)>, then$octets B<may not be equal to> $string. Though they both contain thesame data, the UTF8 flag for $octets is B<always> off. When youencode anything, UTF8 flag of the result is always off, even when itcontains completely valid utf8 string. See L</"The UTF8 flag"> below.If the $string is C<undef> then C<undef> is returned.=item $string = decode(ENCODING, $octets [, CHECK])Decodes a sequence of octets assumed to be in I<ENCODING> into Perl'sinternal form and returns the resulting string. As in encode(),ENCODING can be either a canonical name or an alias. For encoding namesand aliases, see L</"Defining Aliases">. For CHECK, seeL</"Handling Malformed Data">.For example, to convert ISO-8859-1 data to a string in Perl's internal format: $string = decode("iso-8859-1", $octets);B<CAVEAT>: When you run C<$string = decode("utf8", $octets)>, then $stringB<may not be equal to> $octets. Though they both contain the same data,the UTF8 flag for $string is on unless $octets entirely consists ofASCII data (or EBCDIC on EBCDIC machines). See L</"The UTF8 flag">below.If the $string is C<undef> then C<undef> is returned.=item [$obj =] find_encoding(ENCODING)Returns the I<encoding object> corresponding to ENCODING. Returnsundef if no matching ENCODING is find.This object is what actually does the actual (en|de)coding. $utf8 = decode($name, $bytes);is in fact $utf8 = do{ $obj = find_encoding($name); croak qq(encoding "$name" not found) unless ref $obj; $obj->decode($bytes) };with more error checking.Therefore you can save time by reusing this object as follows; my $enc = find_encoding("iso-8859-1"); while(<>){ my $utf8 = $enc->decode($_); # and do someting with $utf8; }Besides C<< ->decode >> and C<< ->encode >>, other methods areavailable as well. For instance, C<< -> name >> returns the canonicalname of the encoding object. find_encoding("latin1")->name; # iso-8859-1See L<Encode::Encoding> for details.=item [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])Converts B<in-place> data between two encodings. The data in $octetsmust be encoded as octets and not as characters in Perl's internal
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -