📄 encoding.pm
字号:
# $Id: encoding.pm,v 2.6 2007/04/22 14:56:12 dankogai Exp $package encoding;our $VERSION = do { my @r = ( q$Revision: 2.6 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };use Encode;use strict;use warnings;sub DEBUG () { 0 }BEGIN { if ( ord("A") == 193 ) { require Carp; Carp::croak("encoding: pragma does not support EBCDIC platforms"); }}our $HAS_PERLIO = 0;eval { require PerlIO::encoding };unless ($@) { $HAS_PERLIO = ( PerlIO::encoding->VERSION >= 0.02 );}sub _exception { my $name = shift; $] > 5.008 and return 0; # 5.8.1 or higher then no my %utfs = map { $_ => 1 } qw(utf8 UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE UTF-32 UTF-32BE UTF-32LE); $utfs{$name} or return 0; # UTFs or no require Config; Config->import(); our %Config; return $Config{perl_patchlevel} ? 0 : 1 # maintperl then no}sub in_locale { $^H & ( $locale::hint_bits || 0 ) }sub _get_locale_encoding { my $locale_encoding; # I18N::Langinfo isn't available everywhere eval { require I18N::Langinfo; I18N::Langinfo->import(qw(langinfo CODESET)); $locale_encoding = langinfo( CODESET() ); }; my $country_language; no warnings 'uninitialized'; if ( not $locale_encoding && in_locale() ) { if ( $ENV{LC_ALL} =~ /^([^.]+)\.([^.]+)$/ ) { ( $country_language, $locale_encoding ) = ( $1, $2 ); } elsif ( $ENV{LANG} =~ /^([^.]+)\.([^.]+)$/ ) { ( $country_language, $locale_encoding ) = ( $1, $2 ); } # LANGUAGE affects only LC_MESSAGES only on glibc } elsif ( not $locale_encoding ) { if ( $ENV{LC_ALL} =~ /\butf-?8\b/i || $ENV{LANG} =~ /\butf-?8\b/i ) { $locale_encoding = 'utf8'; } # Could do more heuristics based on the country and language # parts of LC_ALL and LANG (the parts before the dot (if any)), # since we have Locale::Country and Locale::Language available. # TODO: get a database of Language -> Encoding mappings # (the Estonian database at http://www.eki.ee/letter/ # would be excellent!) --jhi } if ( defined $locale_encoding && lc($locale_encoding) eq 'euc' && defined $country_language ) { if ( $country_language =~ /^ja_JP|japan(?:ese)?$/i ) { $locale_encoding = 'euc-jp'; } elsif ( $country_language =~ /^ko_KR|korean?$/i ) { $locale_encoding = 'euc-kr'; } elsif ( $country_language =~ /^zh_CN|chin(?:a|ese)$/i ) { $locale_encoding = 'euc-cn'; } elsif ( $country_language =~ /^zh_TW|taiwan(?:ese)?$/i ) { $locale_encoding = 'euc-tw'; } else { require Carp; Carp::croak( "encoding: Locale encoding '$locale_encoding' too ambiguous" ); } } return $locale_encoding;}sub import { my $class = shift; my $name = shift; if ( $name eq ':_get_locale_encoding' ) { # used by lib/open.pm my $caller = caller(); { no strict 'refs'; *{"${caller}::_get_locale_encoding"} = \&_get_locale_encoding; } return; } $name = _get_locale_encoding() if $name eq ':locale'; my %arg = @_; $name = $ENV{PERL_ENCODING} unless defined $name; my $enc = find_encoding($name); unless ( defined $enc ) { require Carp; Carp::croak("encoding: Unknown encoding '$name'"); } $name = $enc->name; # canonize unless ( $arg{Filter} ) { DEBUG and warn "_exception($name) = ", _exception($name); _exception($name) or ${^ENCODING} = $enc; $HAS_PERLIO or return 1; } else { defined( ${^ENCODING} ) and undef ${^ENCODING}; # implicitly 'use utf8' require utf8; # to fetch $utf8::hint_bits; $^H |= $utf8::hint_bits; eval { require Filter::Util::Call; Filter::Util::Call->import; filter_add( sub { my $status = filter_read(); if ( $status > 0 ) { $_ = $enc->decode( $_, 1 ); DEBUG and warn $_; } $status; } ); }; $@ eq '' and DEBUG and warn "Filter installed"; } defined ${^UNICODE} and ${^UNICODE} != 0 and return 1; for my $h (qw(STDIN STDOUT)) { if ( $arg{$h} ) { unless ( defined find_encoding( $arg{$h} ) ) { require Carp; Carp::croak( "encoding: Unknown encoding for $h, '$arg{$h}'"); } eval { binmode( $h, ":raw :encoding($arg{$h})" ) }; } else { unless ( exists $arg{$h} ) { eval { no warnings 'uninitialized'; binmode( $h, ":raw :encoding($name)" ); }; } } if ($@) { require Carp; Carp::croak($@); } } return 1; # I doubt if we need it, though}sub unimport { no warnings; undef ${^ENCODING}; if ($HAS_PERLIO) { binmode( STDIN, ":raw" ); binmode( STDOUT, ":raw" ); } else { binmode(STDIN); binmode(STDOUT); } if ( $INC{"Filter/Util/Call.pm"} ) { eval { filter_del() }; }}1;__END__=pod=head1 NAMEencoding - allows you to write your script in non-ascii or non-utf8=head1 SYNOPSIS use encoding "greek"; # Perl like Greek to you? use encoding "euc-jp"; # Jperl! # or you can even do this if your shell supports your native encoding perl -Mencoding=latin2 -e '...' # Feeling centrally European? perl -Mencoding=euc-kr -e '...' # Or Korean? # more control # A simple euc-cn => utf-8 converter use encoding "euc-cn", STDOUT => "utf8"; while(<>){print}; # "no encoding;" supported (but not scoped!) no encoding; # an alternate way, Filter use encoding "euc-jp", Filter=>1; # now you can use kanji identifiers -- in euc-jp! # switch on locale - # note that this probably means that unless you have a complete control # over the environments the application is ever going to be run, you should # NOT use the feature of encoding pragma allowing you to write your script # in any recognized encoding because changing locale settings will wreck # the script; you can of course still use the other features of the pragma. use encoding ':locale';=head1 ABSTRACTLet's start with a bit of history: Perl 5.6.0 introduced Unicodesupport. You could apply C<substr()> and regexes even to complex CJKcharacters -- so long as the script was written in UTF-8. But backthen, text editors that supported UTF-8 were still rare and many usersinstead chose to write scripts in legacy encodings, giving up a wholenew feature of Perl 5.6.Rewind to the future: starting from perl 5.8.0 with the B<encoding>pragma, you can write your script in any encoding you like (so longas the C<Encode> module supports it) and still enjoy Unicode support.This pragma achieves that by doing the following:=over=item *Internally converts all literals (C<q//,qq//,qr//,qw///, qx//>) fromthe encoding specified to utf8. In Perl 5.8.1 and later, literals inC<tr///> and C<DATA> pseudo-filehandle are also converted.=item *Changing PerlIO layers of C<STDIN> and C<STDOUT> to the encoding specified.=back=head2 Literal ConversionsYou can write code in EUC-JP as follows: my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji #<-char-><-char-> # 4 octets s/\bCamel\b/$Rakuda/;And with C<use encoding "euc-jp"> in effect, it is the same thing asthe code in UTF-8: my $Rakuda = "\x{99F1}\x{99DD}"; # two Unicode Characters s/\bCamel\b/$Rakuda/;=head2 PerlIO layers for C<STD(IN|OUT)>The B<encoding> pragma also modifies the filehandle layers ofSTDIN and STDOUT to the specified encoding. Therefore, use encoding "euc-jp"; my $message = "Camel is the symbol of perl.\n"; my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji $message =~ s/\bCamel\b/$Rakuda/; print $message;Will print "\xF1\xD1\xF1\xCC is the symbol of perl.\n",not "\x{99F1}\x{99DD} is the symbol of perl.\n".You can override this by giving extra arguments; see below.=head2 Implicit upgrading for byte stringsBy default, if strings operating under byte semantics and stringswith Unicode character data are concatenated, the new string willbe created by decoding the byte strings as I<ISO 8859-1 (Latin-1)>.The B<encoding> pragma changes this to use the specified encodinginstead. For example: use encoding 'utf8'; my $string = chr(20000); # a Unicode string utf8::encode($string); # now it's a UTF-8 encoded byte string # concatenate with another Unicode string print length($string . chr(20000));Will print C<2>, because C<$string> is upgraded as UTF-8. WithoutC<use encoding 'utf8';>, it will print C<4> instead, since C<$string>is three octets when interpreted as Latin-1.=head2 Side effectsIf the C<encoding> pragma is in scope then the lengths returned arecalculated from the length of C<$/> in Unicode characters, which is notalways the same as the length of C<$/> in the native encoding.This pragma affects utf8::upgrade, but not utf8::downgrade.=head2 Side effectsIf the C<encoding> pragma is in scope then the lengths returned arecalculated from the length of C<$/> in Unicode characters, which is notalways the same as the length of C<$/> in the native encoding.This pragma affects utf8::upgrade, but not utf8::downgrade.=head2 Side effectsIf the C<encoding> pragma is in scope then the lengths returned arecalculated from the length of C<$/> in Unicode characters, which is notalways the same as the length of C<$/> in the native encoding.This pragma affects utf8::upgrade, but not utf8::downgrade.=head1 FEATURES THAT REQUIRE 5.8.1Some of the features offered by this pragma requires perl 5.8.1. Mostof these are done by Inaba Hiroto. Any other features and changesare good for 5.8.0.=over=item "NON-EUC" doublebyte encodings
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -