📄 enc2xs
字号:
#!./perlBEGIN { # @INC poking no longer needed w/ new MakeMaker and Makefile.PL's # with $ENV{PERL_CORE} set # In case we need it in future... require Config; import Config;}use strict;use warnings;use Getopt::Std;use Config;my @orig_ARGV = @ARGV;our $VERSION = do { my @r = (q$Revision: 2.5 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };# These may get re-ordered.# RAW is a do_now as inserted by &enter# AGG is an aggreagated do_now, as built up by &processuse constant { RAW_NEXT => 0, RAW_IN_LEN => 1, RAW_OUT_BYTES => 2, RAW_FALLBACK => 3, AGG_MIN_IN => 0, AGG_MAX_IN => 1, AGG_OUT_BYTES => 2, AGG_NEXT => 3, AGG_IN_LEN => 4, AGG_OUT_LEN => 5, AGG_FALLBACK => 6,};# (See the algorithm in encengine.c - we're building structures for it)# There are two sorts of structures.# "do_now" (an array, two variants of what needs storing) is whatever we need# to do now we've read an input byte.# It's housed in a "do_next" (which is how we got to it), and in turn points# to a "do_next" which contains all the "do_now"s for the next input byte.# There will be a "do_next" which is the start state.# For a single byte encoding it's the only "do_next" - each "do_now" points# back to it, and each "do_now" will cause bytes. There is no state.# For a multi-byte encoding where all characters in the input are the same# length, then there will be a tree of "do_now"->"do_next"->"do_now"# branching out from the start state, one step for each input byte.# The leaf "do_now"s will all be at the same distance from the start state,# only the leaf "do_now"s cause output bytes, and they in turn point back to# the start state.# For an encoding where there are varaible length input byte sequences, you# will encounter a leaf "do_now" sooner for the shorter input sequences, but# as before the leaves will point back to the start state.# The system will cope with escape encodings (imagine them as a mostly# self-contained tree for each escape state, and cross links between trees# at the state-switching characters) but so far no input format defines these.# The system will also cope with having output "leaves" in the middle of# the bifurcating branches, not just at the extremities, but again no# input format does this yet.# There are two variants of the "do_now" structure. The first, smaller variant# is generated by &enter as the input file is read. There is one structure# for each input byte. Say we are mapping a single byte encoding to a# single byte encoding, with "ABCD" going "abcd". There will be# 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...}# &process then walks the tree, building aggregate "do_now" structres for# adjacent bytes where possible. The aggregate is for a contiguous range of# bytes which each produce the same length of output, each move to the# same next state, and each have the same fallback flag.# So our 4 RAW "do_now"s above become replaced by a single structure# containing:# ["A", "D", "abcd", 1, ...]# ie, for an input byte $_ in "A".."D", output 1 byte, found as# substr ("abcd", (ord $_ - ord "A") * 1, 1)# which maps very nicely into pointer arithmetic in C for encengine.csub encode_U{ # UTF-8 encode long hand - only covers part of perl's range ## my $uv = shift; # chr() works in native space so convert value from table # into that space before using chr(). my $ch = chr(utf8::unicode_to_native($_[0])); # Now get core perl to encode that the way it likes. utf8::encode($ch); return $ch;}sub encode_S{ # encode single byte ## my ($ch,$page) = @_; return chr($ch); return chr $_[0];}sub encode_D{ # encode double byte MS byte first ## my ($ch,$page) = @_; return chr($page).chr($ch); return chr ($_[1]) . chr $_[0];}sub encode_M{ # encode Multi-byte - single for 0..255 otherwise double ## my ($ch,$page) = @_; ## return &encode_D if $page; ## return &encode_S; return chr ($_[1]) . chr $_[0] if $_[1]; return chr $_[0];}my %encode_types = (U => \&encode_U, S => \&encode_S, D => \&encode_D, M => \&encode_M, );# Win32 does not expand globs on command lineeval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32');my %opt;# I think these are:# -Q to disable the duplicate codepoint test# -S make mapping errors fatal# -q to remove comments written to output files# -O to enable the (brute force) substring optimiser# -o <output> to specify the output file name (else it's the first arg)# -f <inlist> to give a file with a list of input files (else use the args)# -n <name> to name the encoding (else use the basename of the input file.getopts('CM:SQqOo:f:n:',\%opt);$opt{M} and make_makefile_pl($opt{M}, @ARGV);$opt{C} and make_configlocal_pm($opt{C}, @ARGV);# This really should go first, else the die here causes empty (non-erroneous)# output files to be written.my @encfiles;if (exists $opt{'f'}) { # -F is followed by name of file containing list of filenames my $flist = $opt{'f'}; open(FLIST,$flist) || die "Cannot open $flist:$!"; chomp(@encfiles = <FLIST>); close(FLIST);} else { @encfiles = @ARGV;}my $cname = (exists $opt{'o'}) ? $opt{'o'} : shift(@ARGV);chmod(0666,$cname) if -f $cname && !-w $cname;open(C,">$cname") || die "Cannot open $cname:$!";my $dname = $cname;my $hname = $cname;my ($doC,$doEnc,$doUcm,$doPet);if ($cname =~ /\.(c|xs)$/i) # VMS may have upcased filenames with DECC$ARGV_PARSE_STYLE defined { $doC = 1; $dname =~ s/(\.[^\.]*)?$/.exh/; chmod(0666,$dname) if -f $cname && !-w $dname; open(D,">$dname") || die "Cannot open $dname:$!"; $hname =~ s/(\.[^\.]*)?$/.h/; chmod(0666,$hname) if -f $cname && !-w $hname; open(H,">$hname") || die "Cannot open $hname:$!"; foreach my $fh (\*C,\*D,\*H) { print $fh <<"END" unless $opt{'q'};/* !!!!!!! DO NOT EDIT THIS FILE !!!!!!! This file was autogenerated by: $^X $0 @orig_ARGV enc2xs VERSION $VERSION*/END } if ($cname =~ /(\w+)\.xs$/) { print C "#include <EXTERN.h>\n"; print C "#include <perl.h>\n"; print C "#include <XSUB.h>\n"; print C "#define U8 U8\n"; } print C "#include \"encode.h\"\n\n"; }elsif ($cname =~ /\.enc$/) { $doEnc = 1; }elsif ($cname =~ /\.ucm$/) { $doUcm = 1; }elsif ($cname =~ /\.pet$/) { $doPet = 1; }my %encoding;my %strings;my $string_acc;my %strings_in_acc;my $saved = 0;my $subsave = 0;my $strings = 0;sub cmp_name{ if ($a =~ /^.*-(\d+)/) { my $an = $1; if ($b =~ /^.*-(\d+)/) { my $r = $an <=> $1; return $r if $r; } } return $a cmp $b;}foreach my $enc (sort cmp_name @encfiles) { my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/; $name = $opt{'n'} if exists $opt{'n'}; if (open(E,$enc)) { if ($sfx eq 'enc') { compile_enc(\*E,lc($name)); } else { compile_ucm(\*E,lc($name)); } } else { warn "Cannot open $enc for $name:$!"; } }if ($doC) { print STDERR "Writing compiled form\n"; foreach my $name (sort cmp_name keys %encoding) { my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}}; process($name.'_utf8',$e2u); addstrings(\*C,$e2u); process('utf8_'.$name,$u2e); addstrings(\*C,$u2e); } outbigstring(\*C,"enctable"); foreach my $name (sort cmp_name keys %encoding) { my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}}; outtable(\*C,$e2u, "enctable"); outtable(\*C,$u2e, "enctable"); # push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep)); } my $cpp = ($Config{d_cplusplus} || '') eq 'define'; my $exta = $cpp ? 'extern "C" ' : "static"; my $extb = $cpp ? 'extern "C" ' : ""; foreach my $enc (sort cmp_name keys %encoding) { # my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}}; my ($e2u,$u2e,$rep,$min_el,$max_el) = @{$encoding{$enc}}; #my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el); my $replen = 0; $replen++ while($rep =~ /\G\\x[0-9A-Fa-f]/g); my $sym = "${enc}_encoding"; $sym =~ s/\W+/_/g; my @info = ($e2u->{Cname},$u2e->{Cname},"${sym}_rep_character",$replen, $min_el,$max_el); print C "${exta} const U8 ${sym}_rep_character[] = \"$rep\";\n"; print C "${exta} const char ${sym}_enc_name[] = \"$enc\";\n\n"; print C "${extb} const encode_t $sym = \n"; # This is to make null encoding work -- dankogai for (my $i = (scalar @info) - 1; $i >= 0; --$i){ $info[$i] ||= 1; } # end of null tweak -- dankogai print C " {",join(',',@info,"{${sym}_enc_name,(const char *)0}"),"};\n\n"; } foreach my $enc (sort cmp_name keys %encoding) { my $sym = "${enc}_encoding"; $sym =~ s/\W+/_/g; print H "extern encode_t $sym;\n"; print D " Encode_XSEncoding(aTHX_ &$sym);\n"; } if ($cname =~ /(\w+)\.xs$/) { my $mod = $1; print C <<'END';static voidEncode_XSEncoding(pTHX_ encode_t *enc){ dSP; HV *stash = gv_stashpv("Encode::XS", TRUE); SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash); int i = 0; PUSHMARK(sp); XPUSHs(sv); while (enc->name[i]) { const char *name = enc->name[i++]; XPUSHs(sv_2mortal(newSVpvn(name,strlen(name)))); } PUTBACK; call_pv("Encode::define_encoding",G_DISCARD); SvREFCNT_dec(sv);}END print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n"; print C "BOOT:\n{\n"; print C "#include \"$dname\"\n"; print C "}\n"; } # Close in void context is bad, m'kay close(D) or warn "Error closing '$dname': $!"; close(H) or warn "Error closing '$hname': $!"; my $perc_saved = $saved/($strings + $saved) * 100; my $perc_subsaved = $subsave/($strings + $subsave) * 100; printf STDERR "%d bytes in string tables\n",$strings; printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n", $saved, $perc_saved if $saved; printf STDERR "%d bytes (%.3g%%) saved using substrings\n", $subsave, $perc_subsaved if $subsave; }elsif ($doEnc) { foreach my $name (sort cmp_name keys %encoding) { my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}}; output_enc(\*C,$name,$e2u); } }elsif ($doUcm) { foreach my $name (sort cmp_name keys %encoding) { my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}}; output_ucm(\*C,$name,$u2e,$erep,$min_el,$max_el); } }# writing half meg files and then not checking to see if you just filled the# disk is bad, m'kayclose(C) or die "Error closing '$cname': $!";# End of the main program.sub compile_ucm{ my ($fh,$name) = @_; my $e2u = {}; my $u2e = {}; my $cs; my %attr; while (<$fh>) { s/#.*$//; last if /^\s*CHARMAP\s*$/i; if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) # " # Grrr { $attr{$1} = $2; } } if (!defined($cs = $attr{'code_set_name'})) { warn "No <code_set_name> in $name\n"; } else { $name = $cs unless exists $opt{'n'}; } my $erep; my $urep; my $max_el; my $min_el; if (exists $attr{'subchar'}) { #my @byte; #$attr{'subchar'} =~ /^\s*/cg; #push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg; #$erep = join('',map(chr(hex($_)),@byte)); $erep = $attr{'subchar'}; $erep =~ s/^\s+//; $erep =~ s/\s+$//; } print "Reading $name ($cs)\n"; my $nfb = 0; my $hfb = 0; while (<$fh>) { s/#.*$//; last if /^\s*END\s+CHARMAP\s*$/i; next if /^\s*$/; my (@uni, @byte) = (); my ($uni, $byte, $fb) = m/^(\S+)\s+(\S+)\s+(\S+)\s+/o or die "Bad line: $_"; while ($uni =~ m/\G<([U0-9a-fA-F\+]+)>/g){ push @uni, map { substr($_, 1) } split(/\+/, $1); } while ($byte =~ m/\G\\x([0-9a-fA-F]+)/g){ push @byte, $1; } if (@uni) { my $uch = join('', map { encode_U(hex($_)) } @uni ); my $ech = join('',map(chr(hex($_)),@byte)); my $el = length($ech); $max_el = $el if (!defined($max_el) || $el > $max_el); $min_el = $el if (!defined($min_el) || $el < $min_el); if (length($fb)) { $fb = substr($fb,1); $hfb++; } else { $nfb++; $fb = '0'; } # $fb is fallback flag # 0 - round trip safe # 1 - fallback for unicode -> enc # 2 - skip sub-char mapping # 3 - fallback enc -> unicode enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/); enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/); } else { warn $_; } } if ($nfb && $hfb) { die "$nfb entries without fallback, $hfb entries with\n"; } $encoding{$name} = [$e2u,$u2e,$erep,$min_el,$max_el];}sub compile_enc{ my ($fh,$name) = @_; my $e2u = {}; my $u2e = {};
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -