📄 enc2xs
字号:
my $type; while ($type = <$fh>) { last if $type !~ /^\s*#/; } chomp($type); return if $type eq 'E'; # Do the hash lookup once, rather than once per function call. 4% speedup. my $type_func = $encode_types{$type}; my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>)); warn "$type encoded $name\n"; my $rep = ''; # Save a defined test by setting these to defined values. my $min_el = ~0; # A very big integer my $max_el = 0; # Anything must be longer than 0 { my $v = hex($def); $rep = &$type_func($v & 0xFF, ($v >> 8) & 0xffe); } my $errors; my $seen; # use -Q to silence the seen test. Makefile.PL uses this by default. $seen = {} unless $opt{Q}; do { my $line = <$fh>; chomp($line); my $page = hex($line); my $ch = 0; my $i = 16; do { # So why is it 1% faster to leave the my here? my $line = <$fh>; $line =~ s/\r\n$/\n/; die "$.:${line}Line should be exactly 65 characters long including newline (".length($line).")" unless length ($line) == 65; # Split line into groups of 4 hex digits, convert groups to ints # This takes 65.35 # map {hex $_} $line =~ /(....)/g # This takes 63.75 (2.5% less time) # unpack "n*", pack "H*", $line # There's an implicit loop in map. Loops are bad, m'kay. Ops are bad, m'kay # Doing it as while ($line =~ /(....)/g) took 74.63 foreach my $val (unpack "n*", pack "H*", $line) { next if $val == 0xFFFD; my $ech = &$type_func($ch,$page); if ($val || (!$ch && !$page)) { my $el = length($ech); $max_el = $el if $el > $max_el; $min_el = $el if $el < $min_el; my $uch = encode_U($val); if ($seen) { # We're doing the test. # We don't need to read this quickly, so storing it as a scalar, # rather than 3 (anon array, plus the 2 scalars it holds) saves # RAM and may make us faster on low RAM systems. [see __END__] if (exists $seen->{$uch}) { warn sprintf("U%04X is %02X%02X and %04X\n", $val,$page,$ch,$seen->{$uch}); $errors++; } else { $seen->{$uch} = $page << 8 | $ch; } } # Passing 2 extra args each time is 3.6% slower! # Even with having to add $fallback ||= 0 later enter_fb0($e2u,$ech,$uch); enter_fb0($u2e,$uch,$ech); } else { # No character at this position # enter($e2u,$ech,undef,$e2u); } $ch++; } } while --$i; } while --$pages; die "\$min_el=$min_el, \$max_el=$max_el - seems we read no lines" if $min_el > $max_el; die "$errors mapping conflicts\n" if ($errors && $opt{'S'}); $encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el];}# my ($a,$s,$d,$t,$fb) = @_;sub enter { my ($current,$inbytes,$outbytes,$next,$fallback) = @_; # state we shift to after this (multibyte) input character defaults to same # as current state. $next ||= $current; # Making sure it is defined seems to be faster than {no warnings;} in # &process, or passing it in as 0 explicity. # XXX $fallback ||= 0; # Start at the beginning and work forwards through the string to zero. # effectively we are removing 1 character from the front each time # but we don't actually edit the string. [this alone seems to be 14% speedup] # Hence -$pos is the length of the remaining string. my $pos = -length $inbytes; while (1) { my $byte = substr $inbytes, $pos, 1; # RAW_NEXT => 0, # RAW_IN_LEN => 1, # RAW_OUT_BYTES => 2, # RAW_FALLBACK => 3, # to unicode an array would seem to be better, because the pages are dense. # from unicode can be very sparse, favouring a hash. # hash using the bytes (all length 1) as keys rather than ord value, # as it's easier to sort these in &process. # It's faster to always add $fallback even if it's undef, rather than # choosing between 3 and 4 element array. (hence why we set it defined # above) my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'',$fallback]; # When $pos was -1 we were at the last input character. unless (++$pos) { $do_now->[RAW_OUT_BYTES] = $outbytes; $do_now->[RAW_NEXT] = $next; return; } # Tail recursion. The intermdiate state may not have a name yet. $current = $do_now->[RAW_NEXT]; }}# This is purely for optimistation. It's just &enter hard coded for $fallback# of 0, using only a 3 entry array ref to save memory for every entry.sub enter_fb0 { my ($current,$inbytes,$outbytes,$next) = @_; $next ||= $current; my $pos = -length $inbytes; while (1) { my $byte = substr $inbytes, $pos, 1; my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'']; unless (++$pos) { $do_now->[RAW_OUT_BYTES] = $outbytes; $do_now->[RAW_NEXT] = $next; return; } $current = $do_now->[RAW_NEXT]; }}sub process{ my ($name,$a) = @_; $name =~ s/\W+/_/g; $a->{Cname} = $name; my $raw = $a->{Raw}; my ($l, $agg_max_in, $agg_next, $agg_in_len, $agg_out_len, $agg_fallback); my @ent; $agg_max_in = 0; foreach my $key (sort keys %$raw) { # RAW_NEXT => 0, # RAW_IN_LEN => 1, # RAW_OUT_BYTES => 2, # RAW_FALLBACK => 3, my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}}; # Now we are converting from raw to aggregate, switch from 1 byte strings # to numbers my $b = ord $key; $fallback ||= 0; if ($l && # If this == fails, we're going to reset $agg_max_in below anyway. $b == ++$agg_max_in && # References in numeric context give the pointer as an int. $agg_next == $next && $agg_in_len == $in_len && $agg_out_len == length $out_bytes && $agg_fallback == $fallback # && length($l->[AGG_OUT_BYTES]) < 16 ) { # my $i = ord($b)-ord($l->[AGG_MIN_IN]); # we can aggregate this byte onto the end. $l->[AGG_MAX_IN] = $b; $l->[AGG_OUT_BYTES] .= $out_bytes; } else { # AGG_MIN_IN => 0, # AGG_MAX_IN => 1, # AGG_OUT_BYTES => 2, # AGG_NEXT => 3, # AGG_IN_LEN => 4, # AGG_OUT_LEN => 5, # AGG_FALLBACK => 6, # Reset the last thing we saw, plus set 5 lexicals to save some derefs. # (only gains .6% on euc-jp -- is it worth it?) push @ent, $l = [$b, $agg_max_in = $b, $out_bytes, $agg_next = $next, $agg_in_len = $in_len, $agg_out_len = length $out_bytes, $agg_fallback = $fallback]; } if (exists $next->{Cname}) { $next->{'Forward'} = 1 if $next != $a; } else { process(sprintf("%s_%02x",$name,$b),$next); } } # encengine.c rules say that last entry must be for 255 if ($agg_max_in < 255) { push @ent, [1+$agg_max_in, 255,undef,$a,0,0]; } $a->{'Entries'} = \@ent;}sub addstrings{ my ($fh,$a) = @_; my $name = $a->{'Cname'}; # String tables foreach my $b (@{$a->{'Entries'}}) { next unless $b->[AGG_OUT_LEN]; $strings{$b->[AGG_OUT_BYTES]} = undef; } if ($a->{'Forward'}) { my $cpp = ($Config{d_cplusplus} || '') eq 'define'; my $var = $^O eq 'MacOS' || $cpp ? 'extern' : 'static'; my $const = $cpp ? '' : 'const'; print $fh "$var $const encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n"; } $a->{'DoneStrings'} = 1; foreach my $b (@{$a->{'Entries'}}) { my ($s,$e,$out,$t,$end,$l) = @$b; addstrings($fh,$t) unless $t->{'DoneStrings'}; }}sub outbigstring{ my ($fh,$name) = @_; $string_acc = ''; # Make the big string in the string accumulator. Longest first, on the hope # that this makes it more likely that we find the short strings later on. # Not sure if it helps sorting strings of the same length lexcically. foreach my $s (sort {length $b <=> length $a || $a cmp $b} keys %strings) { my $index = index $string_acc, $s; if ($index >= 0) { $saved += length($s); $strings_in_acc{$s} = $index; } else { OPTIMISER: { if ($opt{'O'}) { my $sublength = length $s; while (--$sublength > 0) { # progressively lop characters off the end, to see if the start of # the new string overlaps the end of the accumulator. if (substr ($string_acc, -$sublength) eq substr ($s, 0, $sublength)) { $subsave += $sublength; $strings_in_acc{$s} = length ($string_acc) - $sublength; # append the last bit on the end. $string_acc .= substr ($s, $sublength); last OPTIMISER; } # or if the end of the new string overlaps the start of the # accumulator next unless substr ($string_acc, 0, $sublength) eq substr ($s, -$sublength); # well, the last $sublength characters of the accumulator match. # so as we're prepending to the accumulator, need to shift all our # existing offsets forwards $_ += $sublength foreach values %strings_in_acc; $subsave += $sublength; $strings_in_acc{$s} = 0; # append the first bit on the start. $string_acc = substr ($s, 0, -$sublength) . $string_acc; last OPTIMISER; } } # Optimiser (if it ran) found nothing, so just going have to tack the # whole thing on the end. $strings_in_acc{$s} = length $string_acc; $string_acc .= $s; }; } } $strings = length $string_acc; my $cpp = ($Config{d_cplusplus} || '') eq 'define'; my $var = $cpp ? '' : 'static'; my $definition = "\n$var const U8 $name\[$strings] = { " . join(',',unpack "C*",$string_acc); # We have a single long line. Split it at convenient commas. print $fh $1, "\n" while $definition =~ /\G(.{74,77},)/gcs; print $fh substr ($definition, pos $definition), " };\n";}sub findstring { my ($name,$s) = @_; my $offset = $strings_in_acc{$s}; die "Can't find string " . join (',',unpack "C*",$s) . " in accumulator" unless defined $offset; "$name + $offset";}sub outtable{ my ($fh,$a,$bigname) = @_; my $name = $a->{'Cname'}; $a->{'Done'} = 1; foreach my $b (@{$a->{'Entries'}}) { my ($s,$e,$out,$t,$end,$l) = @$b; outtable($fh,$t,$bigname) unless $t->{'Done'}; } my $cpp = ($Config{d_cplusplus} || '') eq 'define'; my $var = $cpp ? '' : 'static'; my $const = $cpp ? '' : 'const'; print $fh "\n$var $const encpage_t $name\[", scalar(@{$a->{'Entries'}}), "] = {\n"; foreach my $b (@{$a->{'Entries'}}) { my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b; # $end |= 0x80 if $fb; # what the heck was on your mind, Nick? -- Dan print $fh "{"; if ($l) { printf $fh findstring($bigname,$out); } else { print $fh "0"; } print $fh ",",$t->{Cname}; printf $fh ",0x%02x,0x%02x,$l,$end},\n",$sc,$ec; } print $fh "};\n";}sub output_enc{ my ($fh,$name,$a) = @_; die "Changed - fix me for new structure"; foreach my $b (sort keys %$a) { my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}}; }}sub decode_U{ my $s = shift;}my @uname;sub char_names{ my $s = do "unicore/Name.pl"; die "char_names: unicore/Name.pl: $!\n" unless defined $s; pos($s) = 0; while ($s =~ /\G([0-9a-f]+)\t([0-9a-f]*)\t(.*?)\s*\n/igc) { my $name = $3; my $s = hex($1); last if $s >= 0x10000; my $e = length($2) ? hex($2) : $s; for (my $i = $s; $i <= $e; $i++) { $uname[$i] = $name;# print sprintf("U%04X $name\n",$i); } }}sub output_ucm_page{ my ($cmap,$a,$t,$pre) = @_; # warn sprintf("Page %x\n",$pre); my $raw = $t->{Raw}; foreach my $key (sort keys %$raw) { # RAW_NEXT => 0, # RAW_IN_LEN => 1, # RAW_OUT_BYTES => 2, # RAW_FALLBACK => 3, my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}}; my $u = ord $key; $fallback ||= 0; if ($next != $a && $next != $t) { output_ucm_page($cmap,$a,$next,(($pre|($u &0x3F)) << 6)&0xFFFF); } elsif (length $out_bytes) { if ($pre) { $u = $pre|($u &0x3f); } my $s = sprintf "<U%04X> ",$u; #foreach my $c (split(//,$out_bytes)) { # $s .= sprintf "\\x%02X",ord($c); #} # 9.5% faster changing that loop to this: $s .= sprintf +("\\x%02X" x length $out_bytes), unpack "C*", $out_bytes; $s .= sprintf " |%d # %s\n",($fallback ? 1 : 0),$uname[$u]; push(@$cmap,$s); } else { warn join(',',$u, @{$raw->{$key}},$a,$t); } }}sub output_ucm{ my ($fh,$name,$h,$rep,$min_el,$max_el) = @_; print $fh "# $0 @orig_ARGV\n" unless $opt{'q'}; print $fh "<code_set_name> \"$name\"\n"; char_names(); if (defined $min_el) { print $fh "<mb_cur_min> $min_el\n"; } if (defined $max_el) { print $fh "<mb_cur_max> $max_el\n"; } if (defined $rep) { print $fh "<subchar> "; foreach my $c (split(//,$rep)) { printf $fh "\\x%02X",ord($c); } print $fh "\n"; } my @cmap; output_ucm_page(\@cmap,$h,$h,0); print $fh "#\nCHARMAP\n"; foreach my $line (sort { substr($a,8) cmp substr($b,8) } @cmap) { print $fh $line; } print $fh "END CHARMAP\n";}use vars qw( $_Enc2xs $_Version $_Inc $_E2X $_Name $_TableFiles $_Now);sub find_e2x{ eval { require File::Find; }; my (@inc, %e2x_dir); for my $inc (@INC){ push @inc, $inc unless $inc eq '.'; #skip current dir } File::Find::find( sub { my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size, $atime,$mtime,$ctime,$blksize,$blocks) = lstat($_) or return; -f _ or return; if (/^.*\.e2x$/o){ no warnings 'once'; $e2x_dir{$File::Find::dir} ||= $mtime; } return;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -