enc2xs

来自「source of perl for linux application,」· 代码 · 共 1,408 行 · 第 1/3 页
TXT
1,408 行
#!./perlBEGIN {    # @INC poking  no longer needed w/ new MakeMaker and Makefile.PL's    # with $ENV{PERL_CORE} set    # In case we need it in future...    require Config; import Config;}use strict;use warnings;use Getopt::Std;use Config;my @orig_ARGV = @ARGV;our $VERSION  = do { my @r = (q$Revision: 2.5 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };# These may get re-ordered.# RAW is a do_now as inserted by &enter# AGG is an aggreagated do_now, as built up by &processuse constant {  RAW_NEXT => 0,  RAW_IN_LEN => 1,  RAW_OUT_BYTES => 2,  RAW_FALLBACK => 3,  AGG_MIN_IN => 0,  AGG_MAX_IN => 1,  AGG_OUT_BYTES => 2,  AGG_NEXT => 3,  AGG_IN_LEN => 4,  AGG_OUT_LEN => 5,  AGG_FALLBACK => 6,};# (See the algorithm in encengine.c - we're building structures for it)# There are two sorts of structures.# "do_now" (an array, two variants of what needs storing) is whatever we need# to do now we've read an input byte.# It's housed in a "do_next" (which is how we got to it), and in turn points# to a "do_next" which contains all the "do_now"s for the next input byte.# There will be a "do_next" which is the start state.# For a single byte encoding it's the only "do_next" - each "do_now" points# back to it, and each "do_now" will cause bytes. There is no state.# For a multi-byte encoding where all characters in the input are the same# length, then there will be a tree of "do_now"->"do_next"->"do_now"# branching out from the start state, one step for each input byte.# The leaf "do_now"s will all be at the same distance from the start state,# only the leaf "do_now"s cause output bytes, and they in turn point back to# the start state.# For an encoding where there are varaible length input byte sequences, you# will encounter a leaf "do_now" sooner for the shorter input sequences, but# as before the leaves will point back to the start state.# The system will cope with escape encodings (imagine them as a mostly# self-contained tree for each escape state, and cross links between trees# at the state-switching characters) but so far no input format defines these.# The system will also cope with having output "leaves" in the middle of# the bifurcating branches, not just at the extremities, but again no# input format does this yet.# There are two variants of the "do_now" structure. The first, smaller variant# is generated by &enter as the input file is read. There is one structure# for each input byte. Say we are mapping a single byte encoding to a# single byte encoding, with  "ABCD" going "abcd". There will be# 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...}# &process then walks the tree, building aggregate "do_now" structres for# adjacent bytes where possible. The aggregate is for a contiguous range of# bytes which each produce the same length of output, each move to the# same next state, and each have the same fallback flag.# So our 4 RAW "do_now"s above become replaced by a single structure# containing:# ["A", "D", "abcd", 1, ...]# ie, for an input byte $_ in "A".."D", output 1 byte, found as# substr ("abcd", (ord $_ - ord "A") * 1, 1)# which maps very nicely into pointer arithmetic in C for encengine.csub encode_U{ # UTF-8 encode long hand - only covers part of perl's range ## my $uv = shift; # chr() works in native space so convert value from table # into that space before using chr(). my $ch = chr(utf8::unicode_to_native($_[0])); # Now get core perl to encode that the way it likes. utf8::encode($ch); return $ch;}sub encode_S{ # encode single byte ## my ($ch,$page) = @_; return chr($ch); return chr $_[0];}sub encode_D{ # encode double byte MS byte first ## my ($ch,$page) = @_; return chr($page).chr($ch); return chr ($_[1]) . chr $_[0];}sub encode_M{ # encode Multi-byte - single for 0..255 otherwise double ## my ($ch,$page) = @_; ## return &encode_D if $page; ## return &encode_S; return chr ($_[1]) . chr $_[0] if $_[1]; return chr $_[0];}my %encode_types = (U => \&encode_U,                    S => \&encode_S,                    D => \&encode_D,                    M => \&encode_M,                   );# Win32 does not expand globs on command lineeval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32');my %opt;# I think these are:# -Q to disable the duplicate codepoint test# -S make mapping errors fatal# -q to remove comments written to output files# -O to enable the (brute force) substring optimiser# -o <output> to specify the output file name (else it's the first arg)# -f <inlist> to give a file with a list of input files (else use the args)# -n <name> to name the encoding (else use the basename of the input file.getopts('CM:SQqOo:f:n:',\%opt);$opt{M} and make_makefile_pl($opt{M}, @ARGV);$opt{C} and make_configlocal_pm($opt{C}, @ARGV);# This really should go first, else the die here causes empty (non-erroneous)# output files to be written.my @encfiles;if (exists $opt{'f'}) {    # -F is followed by name of file containing list of filenames    my $flist = $opt{'f'};    open(FLIST,$flist) || die "Cannot open $flist:$!";    chomp(@encfiles = <FLIST>);    close(FLIST);} else {    @encfiles = @ARGV;}my $cname = (exists $opt{'o'}) ? $opt{'o'} : shift(@ARGV);chmod(0666,$cname) if -f $cname && !-w $cname;open(C,">$cname") || die "Cannot open $cname:$!";my $dname = $cname;my $hname = $cname;my ($doC,$doEnc,$doUcm,$doPet);if ($cname =~ /\.(c|xs)$/i) # VMS may have upcased filenames with DECC$ARGV_PARSE_STYLE defined {  $doC = 1;  $dname =~ s/(\.[^\.]*)?$/.exh/;  chmod(0666,$dname) if -f $cname && !-w $dname;  open(D,">$dname") || die "Cannot open $dname:$!";  $hname =~ s/(\.[^\.]*)?$/.h/;  chmod(0666,$hname) if -f $cname && !-w $hname;  open(H,">$hname") || die "Cannot open $hname:$!";  foreach my $fh (\*C,\*D,\*H)  {   print $fh <<"END" unless $opt{'q'};/* !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!! This file was autogenerated by: $^X $0 @orig_ARGV enc2xs VERSION $VERSION*/END  }  if ($cname =~ /(\w+)\.xs$/)   {    print C "#include <EXTERN.h>\n";    print C "#include <perl.h>\n";    print C "#include <XSUB.h>\n";    print C "#define U8 U8\n";   }  print C "#include \"encode.h\"\n\n"; }elsif ($cname =~ /\.enc$/) {  $doEnc = 1; }elsif ($cname =~ /\.ucm$/) {  $doUcm = 1; }elsif ($cname =~ /\.pet$/) {  $doPet = 1; }my %encoding;my %strings;my $string_acc;my %strings_in_acc;my $saved = 0;my $subsave = 0;my $strings = 0;sub cmp_name{ if ($a =~ /^.*-(\d+)/)  {   my $an = $1;   if ($b =~ /^.*-(\d+)/)    {     my $r = $an <=> $1;     return $r if $r;    }  } return $a cmp $b;}foreach my $enc (sort cmp_name @encfiles) {  my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/;  $name = $opt{'n'} if exists $opt{'n'};  if (open(E,$enc))   {    if ($sfx eq 'enc')     {      compile_enc(\*E,lc($name));     }    else     {      compile_ucm(\*E,lc($name));     }   }  else   {    warn "Cannot open $enc for $name:$!";   } }if ($doC) {  print STDERR "Writing compiled form\n";  foreach my $name (sort cmp_name keys %encoding)   {    my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};    process($name.'_utf8',$e2u);    addstrings(\*C,$e2u);    process('utf8_'.$name,$u2e);    addstrings(\*C,$u2e);   }  outbigstring(\*C,"enctable");  foreach my $name (sort cmp_name keys %encoding)   {    my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};    outtable(\*C,$e2u, "enctable");    outtable(\*C,$u2e, "enctable");    # push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep));   }  my $cpp = ($Config{d_cplusplus} || '') eq 'define';  my $exta = $cpp ? 'extern "C" ' : "static";  my $extb = $cpp ? 'extern "C" ' : "";  foreach my $enc (sort cmp_name keys %encoding)   {    # my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}};    my ($e2u,$u2e,$rep,$min_el,$max_el) = @{$encoding{$enc}};    #my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el);    my $replen = 0;     $replen++ while($rep =~ /\G\\x[0-9A-Fa-f]/g);    my $sym = "${enc}_encoding";    $sym =~ s/\W+/_/g;    my @info = ($e2u->{Cname},$u2e->{Cname},"${sym}_rep_character",$replen,        $min_el,$max_el);    print C "${exta} const U8 ${sym}_rep_character[] = \"$rep\";\n";    print C "${exta} const char ${sym}_enc_name[] = \"$enc\";\n\n";    print C "${extb} const encode_t $sym = \n";    # This is to make null encoding work -- dankogai    for (my $i = (scalar @info) - 1;  $i >= 0; --$i){    $info[$i] ||= 1;    }    # end of null tweak -- dankogai    print C " {",join(',',@info,"{${sym}_enc_name,(const char *)0}"),"};\n\n";   }  foreach my $enc (sort cmp_name keys %encoding)   {    my $sym = "${enc}_encoding";    $sym =~ s/\W+/_/g;    print H "extern encode_t $sym;\n";    print D " Encode_XSEncoding(aTHX_ &$sym);\n";   }  if ($cname =~ /(\w+)\.xs$/)   {    my $mod = $1;    print C <<'END';static voidEncode_XSEncoding(pTHX_ encode_t *enc){ dSP; HV *stash = gv_stashpv("Encode::XS", TRUE); SV *sv    = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash); int i = 0; PUSHMARK(sp); XPUSHs(sv); while (enc->name[i])  {   const char *name = enc->name[i++];   XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));  } PUTBACK; call_pv("Encode::define_encoding",G_DISCARD); SvREFCNT_dec(sv);}END    print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n";    print C "BOOT:\n{\n";    print C "#include \"$dname\"\n";    print C "}\n";   }  # Close in void context is bad, m'kay  close(D) or warn "Error closing '$dname': $!";  close(H) or warn "Error closing '$hname': $!";  my $perc_saved    = $saved/($strings + $saved) * 100;  my $perc_subsaved = $subsave/($strings + $subsave) * 100;  printf STDERR "%d bytes in string tables\n",$strings;  printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n",    $saved, $perc_saved              if $saved;  printf STDERR "%d bytes (%.3g%%) saved using substrings\n",    $subsave, $perc_subsaved         if $subsave; }elsif ($doEnc) {  foreach my $name (sort cmp_name keys %encoding)   {    my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};    output_enc(\*C,$name,$e2u);   } }elsif ($doUcm) {  foreach my $name (sort cmp_name keys %encoding)   {    my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};    output_ucm(\*C,$name,$u2e,$erep,$min_el,$max_el);   } }# writing half meg files and then not checking to see if you just filled the# disk is bad, m'kayclose(C) or die "Error closing '$cname': $!";# End of the main program.sub compile_ucm{ my ($fh,$name) = @_; my $e2u = {}; my $u2e = {}; my $cs; my %attr; while (<$fh>)  {   s/#.*$//;   last if /^\s*CHARMAP\s*$/i;   if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) # " # Grrr    {     $attr{$1} = $2;    }  } if (!defined($cs =  $attr{'code_set_name'}))  {   warn "No <code_set_name> in $name\n";  } else  {   $name = $cs unless exists $opt{'n'};  } my $erep; my $urep; my $max_el; my $min_el; if (exists $attr{'subchar'})  {   #my @byte;   #$attr{'subchar'} =~ /^\s*/cg;   #push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg;   #$erep = join('',map(chr(hex($_)),@byte));   $erep = $attr{'subchar'};    $erep =~ s/^\s+//; $erep =~ s/\s+$//;  } print "Reading $name ($cs)\n"; my $nfb = 0; my $hfb = 0; while (<$fh>)  {   s/#.*$//;   last if /^\s*END\s+CHARMAP\s*$/i;   next if /^\s*$/;   my (@uni, @byte) = ();   my ($uni, $byte, $fb) = m/^(\S+)\s+(\S+)\s+(\S+)\s+/o       or die "Bad line: $_";   while ($uni =~  m/\G<([U0-9a-fA-F\+]+)>/g){       push @uni, map { substr($_, 1) } split(/\+/, $1);   }   while ($byte =~ m/\G\\x([0-9a-fA-F]+)/g){       push @byte, $1;   }   if (@uni)    {     my $uch =  join('', map { encode_U(hex($_)) } @uni );     my $ech = join('',map(chr(hex($_)),@byte));     my $el  = length($ech);     $max_el = $el if (!defined($max_el) || $el > $max_el);     $min_el = $el if (!defined($min_el) || $el < $min_el);     if (length($fb))      {       $fb = substr($fb,1);       $hfb++;      }     else      {       $nfb++;       $fb = '0';      }     # $fb is fallback flag     # 0 - round trip safe     # 1 - fallback for unicode -> enc     # 2 - skip sub-char mapping     # 3 - fallback enc -> unicode     enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/);     enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/);    }   else    {     warn $_;    }  } if ($nfb && $hfb)  {   die "$nfb entries without fallback, $hfb entries with\n";  } $encoding{$name} = [$e2u,$u2e,$erep,$min_el,$max_el];}sub compile_enc{ my ($fh,$name) = @_; my $e2u = {}; my $u2e = {};
enc2xs - 源码说明

本页面展示了「source of perl for linux application,」中的 enc2xs 源码文件，采用编程语言编写，共 1,408 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与application相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?