📄 mktables
字号:
## !!!!!!!!!!!!!! IF YOU MODIFY THIS FILE !!!!!!!!!!!!!!!!!!!!!!!!!## Any files created or read by this program should be listed in 'mktables.lst'#!/usr/bin/perl -wrequire 5.008; # Needs pack "U". Probably safest to run on 5.8.xuse strict;use Carp;use File::Spec;#### mktables -- create the runtime Perl Unicode files (lib/unicore/**/*.pl)## from the Unicode database files (lib/unicore/*.txt).#### "Fuzzy" means this section in Unicode TR18:#### The recommended names for UCD properties and property values are in## PropertyAliases.txt [Prop] and PropertyValueAliases.txt## [PropValue]. There are both abbreviated names and longer, more## descriptive names. It is strongly recommended that both names be## recognized, and that loose matching of property names be used,## whereby the case distinctions, whitespace, hyphens, and underbar## are ignored.## Base names already used in lib/gc_sc (for avoiding 8.3 conflicts)my %BaseNames;#### Process any args.##my $Verbose = 0;my $MakeTestScript = 0;my $AlwaysWrite = 0;my $UseDir = "";my $FileList = "$0.lst";my $MakeList = 0;while (@ARGV){ my $arg = shift @ARGV; if ($arg eq '-v') { $Verbose = 1; } elsif ($arg eq '-q') { $Verbose = 0; } elsif ($arg eq '-w') { $AlwaysWrite = 1; # update the files even if they havent changed $FileList = ""; } elsif ($arg eq '-check') { my $this = shift @ARGV; my $ok = shift @ARGV; if ($this ne $ok) { print "Skipping as check params are not the same.\n"; exit(0); } } elsif ($arg eq '-maketest') { $MakeTestScript = 1; } elsif ($arg eq '-makelist') { $MakeList = 1; } elsif ($arg eq '-C' && defined ($UseDir = shift)) { -d $UseDir or die "Unknown directory '$UseDir'"; } elsif ($arg eq '-L' && defined ($FileList = shift)) { -e $FileList or die "Filelist '$FileList' doesn't appear to exist!"; } else { die "usage: $0 [-v|-q|-w|-C dir|-L filelist] [-maketest] [-makelist]\n", " -v : Verbose Mode\n", " -q : Quiet Mode\n", " -w : Write files regardless\n", " -maketest : Make test script\n", " -makelist : Rewrite the file list based on current setup\n", " -L filelist : Use this file list, (defaults to $0)\n", " -C dir : Change to this directory before proceeding\n", " -check A B : Executes only if A and B are the same\n"; }}if ($FileList) { print "Reading file list '$FileList'\n" if $Verbose; open my $fh,"<",$FileList or die "Failed to read '$FileList':$!"; my @input; my @output; for my $list ( \@input, \@output ) { while (<$fh>) { s/^ \s+ | \s+ $//xg; next if /^ \s* (?: \# .* )? $/x; last if /^ =+ $/x; my ( $file ) = split /\t/, $_; push @$list, $file; } my %dupe; @$list = grep !$dupe{ $_ }++, @$list; } close $fh; die "No input or output files in '$FileList'!" if !@input or !@output; if ( $MakeList ) { foreach my $file (@output) { unlink $file; } } if ( $Verbose ) { print "Expecting ".scalar( @input )." input files. ", "Checking ".scalar( @output )." output files.\n"; } # we set maxtime to be the youngest input file, including $0 itself. my $maxtime = -M $0; # do this before the chdir! if ($UseDir) { chdir $UseDir or die "Failed to chdir to '$UseDir':$!"; } foreach my $in (@input) { my $time = -M $in; die "Missing input file '$in'" unless defined $time; $maxtime = $time if $maxtime < $time; } # now we check to see if any output files are older than maxtime, if # they are we need to continue on, otherwise we can presumably bail. my $ok = 1; foreach my $out (@output) { if ( ! -e $out ) { print "'$out' is missing.\n" if $Verbose; $ok = 0; last; } if ( -M $out > $maxtime ) { print "'$out' is too old.\n" if $Verbose; $ok = 0; last; } } if ($ok) { print "Files seem to be ok, not bothering to rebuild.\n"; exit(0); } print "Must rebuild tables.\n" if $Verbose;} else { if ($Verbose) { print "Not checking filelist.\n"; } if ($UseDir) { chdir $UseDir or die "Failed to chdir to '$UseDir':$!"; }}foreach my $lib ('To', 'lib', map {File::Spec->catdir("lib",$_)} qw(gc_sc dt bc hst ea jt lb nt ccc)) { next if -d $lib; mkdir $lib, 0755 or die "mkdir '$lib': $!";}my $LastUnicodeCodepoint = 0x10FFFF; # As of Unicode 3.1.1.my $HEADER=<<"EOF";# !!!!!!! DO NOT EDIT THIS FILE !!!!!!! # This file is built by $0 from e.g. UnicodeData.txt.# Any changes made here will be lost!EOFsub force_unlink { my $filename = shift; return unless -e $filename; return if CORE::unlink($filename); # We might need write permission chmod 0777, $filename; CORE::unlink($filename) or die "Couldn't unlink $filename: $!\n";}#### Given a filename and a reference to an array of lines,## write the lines to the file only if the contents have not changed.## Filename can be given as an arrayref of directory names##sub WriteIfChanged($\@){ my $file = shift; my $lines = shift; $file = File::Spec->catfile(@$file) if ref $file; my $TextToWrite = join '', @$lines; if (open IN, $file) { local($/) = undef; my $PreviousText = <IN>; close IN; if ($PreviousText eq $TextToWrite) { print "$file unchanged.\n" if $Verbose; return unless $AlwaysWrite; } } force_unlink ($file); if (not open OUT, ">$file") { die "$0: can't open $file for output: $!\n"; } print "$file written.\n" if $Verbose; print OUT $TextToWrite; close OUT;}#### The main datastructure (a "Table") represents a set of code points that## are part of a particular quality (that are part of \pL, \p{InGreek},## etc.). They are kept as ranges of code points (starting and ending of## each range).#### For example, a range ASCII LETTERS would be represented as:## [ [ 0x41 => 0x5A, 'UPPER' ],## [ 0x61 => 0x7A, 'LOWER, ] ]##sub RANGE_START() { 0 } ## index into range elementsub RANGE_END() { 1 } ## index into range elementsub RANGE_NAME() { 2 } ## index into range element## Conceptually, these should really be folded into the 'Table' objectsmy %TableInfo;my %TableDesc;my %FuzzyNames;my %AliasInfo;my %CanonicalToOrig;#### Turn something like## OLD-ITALIC## into## OldItalic##sub CanonicalName($){ my $orig = shift; my $name = lc $orig; $name =~ s/(?<![a-z])(\w)/\u$1/g; $name =~ s/[-_\s]+//g; $CanonicalToOrig{$name} = $orig if not $CanonicalToOrig{$name}; return $name;}#### Store the alias definitions for later use.##my %PropertyAlias;my %PropValueAlias;my %PA_reverse;my %PVA_reverse;sub Build_Aliases(){ ## ## Most of the work with aliases doesn't occur here, ## but rather in utf8_heavy.pl, which uses PVA.pl, # Placate the warnings about used only once. (They are used again, but # via a typeglob lookup) %utf8::PropertyAlias = (); %utf8::PA_reverse = (); %utf8::PropValueAlias = (); %utf8::PVA_reverse = (); %utf8::PVA_abbr_map = (); open PA, "< PropertyAliases.txt" or confess "Can't open PropertyAliases.txt: $!"; while (<PA>) { s/#.*//; s/\s+$//; next if /^$/; my ($abbrev, $name) = split /\s*;\s*/; next if $abbrev eq "n/a"; $PropertyAlias{$abbrev} = $name; $PA_reverse{$name} = $abbrev; # The %utf8::... versions use japhy's code originally from utf8_pva.pl # However, it's moved here so that we build the tables at runtime. tr/ _-//d for $abbrev, $name; $utf8::PropertyAlias{lc $abbrev} = $name; $utf8::PA_reverse{lc $name} = $abbrev; } close PA; open PVA, "< PropValueAliases.txt" or confess "Can't open PropValueAliases.txt: $!"; while (<PVA>) { s/#.*//; s/\s+$//; next if /^$/; my ($prop, @data) = split /\s*;\s*/; if ($prop eq 'ccc') { $PropValueAlias{$prop}{$data[1]} = [ @data[0,2] ]; $PVA_reverse{$prop}{$data[2]} = [ @data[0,1] ]; } else { next if $data[0] eq "n/a"; $PropValueAlias{$prop}{$data[0]} = $data[1]; $PVA_reverse{$prop}{$data[1]} = $data[0]; } shift @data if $prop eq 'ccc'; next if $data[0] eq "n/a"; $data[1] =~ tr/ _-//d; $utf8::PropValueAlias{$prop}{lc $data[0]} = $data[1]; $utf8::PVA_reverse{$prop}{lc $data[1]} = $data[0]; my $abbr_class = ($prop eq 'gc' or $prop eq 'sc') ? 'gc_sc' : $prop; $utf8::PVA_abbr_map{$abbr_class}{lc $data[0]} = $data[0]; } close PVA; # backwards compatibility for L& -> LC $utf8::PropValueAlias{gc}{'l&'} = $utf8::PropValueAlias{gc}{lc}; $utf8::PVA_abbr_map{gc_sc}{'l&'} = $utf8::PVA_abbr_map{gc_sc}{lc};}#### Associates a property ("Greek", "Lu", "Assigned",...) with a Table.#### Called like:## New_Prop(In => 'Greek', $Table, Desc => 'Greek Block', Fuzzy => 1);#### Normally, these parameters are set when the Table is created (when the## Table->New constructor is called), but there are times when it needs to## be done after-the-fact...)##sub New_Prop($$$@){ my $Type = shift; ## "Is" or "In"; my $Name = shift; my $Table = shift; ## remaining args are optional key/val my %Args = @_; my $Fuzzy = delete $Args{Fuzzy}; my $Desc = delete $Args{Desc}; # description $Name = CanonicalName($Name) if $Fuzzy; ## sanity check a few args if (%Args or ($Type ne 'Is' and $Type ne 'In') or not ref $Table) { confess "$0: bad args to New_Prop" } if (not $TableInfo{$Type}->{$Name}) { $TableInfo{$Type}->{$Name} = $Table; $TableDesc{$Type}->{$Name} = $Desc; if ($Fuzzy) { $FuzzyNames{$Type}->{$Name} = $Name; } }}#### Creates a new Table object.#### Args are key/value pairs:## In => Name -- Name of "In" property to be associated with## Is => Name -- Name of "Is" property to be associated with## Fuzzy => Boolean -- True if name can be accessed "fuzzily"## Desc => String -- Description of the property#### No args are required.##sub Table::New{ my $class = shift; my %Args = @_; my $Table = bless [], $class; my $Fuzzy = delete $Args{Fuzzy}; my $Desc = delete $Args{Desc}; for my $Type ('Is', 'In') { if (my $Name = delete $Args{$Type}) { New_Prop($Type => $Name, $Table, Desc => $Desc, Fuzzy => $Fuzzy); } } ## shouldn't have any left over if (%Args) { confess "$0: bad args to Table->New" } return $Table;}#### Returns the maximum code point currently in the table.##sub Table::Max{ my $last = $_[0]->[-1]; ## last code point confess "oops" unless $last; ## must have code points to have a max return $last->[RANGE_END];}#### Replaces the codepoints in the Table with those in the Table given## as an arg. (NOTE: this is not a "deep copy").##sub Table::Replace($$){ my $Table = shift; #self my $New = shift; @$Table = @$New;}#### Given a new code point, make the last range of the Table extend to## include the new (and all intervening) code points.#### Takes the time to make sure that the extension is valid.##sub Table::Extend{ my $Table = shift; #self my $codepoint = shift; my $PrevMax = $Table->Max; confess "oops ($codepoint <= $PrevMax)" if $codepoint <= $PrevMax; $Table->ExtendNoCheck($codepoint);}#### Given a new code point, make the last range of the Table extend to## include the new (and all intervening) code points.#### Does NOT check that the extension is valid. Assumes that the caller## has already made this check.
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -