📄 generate_normalize_data.pl
字号:
#! /usr/local/bin/perl -w
# $Id: generate_normalize_data.pl,v 1.1.1.1 2003/06/04 00:27:55 marka Exp $
#
# Copyright (c) 2000,2001 Japan Network Information Center.
# All rights reserved.
#
# By using this file, you agree to the terms and conditions set forth bellow.
#
#                       LICENSE TERMS AND CONDITIONS
#
# The following License Terms and Conditions apply, unless a different
# license is obtained from Japan Network Information Center ("JPNIC"),
# a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda,
# Chiyoda-ku, Tokyo 101-0047, Japan.
#
# 1. Use, Modification and Redistribution (including distribution of any
#    modified or derived work) in source and/or binary forms is permitted
#    under this License Terms and Conditions.
#
# 2. Redistribution of source code must retain the copyright notices as they
#    appear in each source code file, this License Terms and Conditions.
#
# 3. Redistribution in binary form must reproduce the Copyright Notice,
#    this License Terms and Conditions, in the documentation and/or other
#    materials provided with the distribution.  For the purposes of binary
#    distribution the "Copyright Notice" refers to the following language:
#    "Copyright (c) 2000-2002 Japan Network Information Center.  All rights reserved."
#
# 4. The name of JPNIC may not be used to endorse or promote products
#    derived from this Software without specific prior written approval of
#    JPNIC.
#
# 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC
#    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
#    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
#    PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL JPNIC BE LIABLE
#    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
#    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
#    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
#    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
#    WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
#    OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
#    ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
#

#
# Generate lib/unicodedata.c from UnicodeData.txt,
# CompositionExclusions-1.txt, SpecialCasing.txt and CaseFolding.txt,
# all of them available from ftp://ftp.unicode.org/Public/UNIDATA/.
#

use strict;
use lib qw(.);

use Getopt::Long;
use UCD;
use SparseMap;

# Upper bound passed as MAX to every SparseMap::Int table.
use constant UCS_MAX => 0x110000;
# OR-ed into the last element of every sequence stored in a *_seq array.
use constant END_BIT => 0x80000000;

# OR-ed into a decomposition offset when the decomposition carries a tag
# (i.e. it is a compatibility decomposition).  Emitted as DECOMP_COMPAT.
my $DECOMP_COMPAT_BIT = 0x8000;

# Flag word values for case-mapping entries (emitted as CMF_*).
my $CASEMAP_FINAL_BIT = 0x1;
my $CASEMAP_NONFINAL_BIT = 0x2;
my $CASEMAP_LAST_BIT = 0x10;

# Values stored in the casemap context table (emitted as CTX_CASED/CTX_NSM).
my $LETTER_BIT = 1;
my $NSPMARK_BIT = 2;

# Rewrite "$Id: ... $" into "$-Id: ... -$" so that the copy embedded in the
# generated file no longer matches the keyword pattern.
(my $myid = '$Id: generate_normalize_data.pl,v 1.1.1.1 2003/06/04 00:27:55 marka Exp $') =~ s/\$([^\$]+)\$/\$-$1-\$/;

# Bit widths handed to SparseMap::Int (BITS => [...]) for each table.
my @default_bits = (9, 7, 5);
#my @default_bits = (7, 7, 7);
my @canon_class_bits = @default_bits;
my @decomp_bits = @default_bits;
my @comp_bits = @default_bits;
my @folding_bits = @default_bits;
my @casemap_bits = @default_bits;
my @casemap_ctx_bits = @default_bits;

my $prefix = '';
my $dir = '.';
my $unicodedatafile = 'UnicodeData.txt';
my $exclusionfile = 'CompositionExclusions.txt';
my $specialcasefile = 'SpecialCasing.txt';
my $casefoldingfile = 'CaseFolding.txt';
my $verbose;

GetOptions('dir|d=s' => \$dir,
           'unicodedata|u=s' => \$unicodedatafile,
           'exclude|e=s' => \$exclusionfile,
           'specialcase|s=s' => \$specialcasefile,
           'casefold|c=s' => \$casefoldingfile,
           'prefix|p=s' => \$prefix,
           'verbose|v' => \$verbose,
) or usage();

# Relative data file names are resolved against $dir; absolute ones are
# left alone.
foreach my $r (\$unicodedatafile, \$exclusionfile,
               \$specialcasefile, \$casefoldingfile) {
    $$r = "$dir/$$r" unless $$r =~ m|^/|;
}

my %exclusions;         # code point => 1 for composition exclusions
my %lower_special;      # code point => [mapping array ref, condition]
my %upper_special;      # code point => [mapping array ref, condition]

my @decomp_data;        # flattened decomposition sequences
my @comp_data;          # composition entries, each [code, first, second]
my @toupper_data;       # flattened uppercase mapping sequences
my @tolower_data;       # flattened lowercase mapping sequences
my @folding_data;       # flattened case folding sequences

#
# Create Mapping/Bitmap objects.
#

# canonical class
my $canon_class = SparseMap::Int->new(BITS => [@canon_class_bits],
                                      MAX => UCS_MAX,
                                      MAPALL => 1,
                                      DEFAULT => 0);

# canonical/compatibility decomposition
my $decomp = SparseMap::Int->new(BITS => [@decomp_bits],
                                 MAX => UCS_MAX,
                                 MAPALL => 1,
                                 DEFAULT => 0);

# canonical composition
my $comp = SparseMap::Int->new(BITS => [@comp_bits],
                               MAX => UCS_MAX,
                               MAPALL => 1,
                               DEFAULT => 0);

# uppercase/lowercase
my $upper = SparseMap::Int->new(BITS => [@casemap_bits],
                                MAX => UCS_MAX,
                                MAPALL => 1,
                                DEFAULT => 0);
my $lower = SparseMap::Int->new(BITS => [@casemap_bits],
                                MAX => UCS_MAX,
                                MAPALL => 1,
                                DEFAULT => 0);

# final/nonfinal context
my $casemap_ctx = SparseMap::Int->new(BITS => [@casemap_ctx_bits],
                                      MAX => UCS_MAX,
                                      MAPALL => 1,
                                      DEFAULT => 0);

# casefolding
my $folding = SparseMap::Int->new(BITS => [@folding_bits],
                                  MAX => UCS_MAX,
                                  MAPALL => 1,
                                  DEFAULT => 0);

#
# Read datafiles.
#
# The exclusion and special casing data must be loaded first because
# read_unicodedata_file() consults %exclusions, %lower_special and
# %upper_special.
#
read_exclusion_file();
read_specialcasing_file();
read_unicodedata_file();
read_casefolding_file();

print_header();
print_canon_class();
print_composition();
print_decomposition();
print_casemap();
print_casemap_context();
print_casefolding();

exit;

#
# usage -- print a usage message to stderr and exit with status 1.
#
sub usage {
    print STDERR <<"END";
Usage: $0 [options..]
  options:
    -d DIR    directory where Unicode Character Data files resides [./]
    -u FILE   name of the UnicodeData file [UnicodeData.txt]
    -e FILE   name of the CompositionExclusion file [CompositionExclusions.txt]
    -s FILE   name of the SpecialCasing file [SpecialCasing.txt]
    -c FILE   name of the CaseFolding file [CaseFolding.txt]
    -p PREFIX prefix for the names of the generated C tables []
    -v        print statistics of each table to stderr
END
    exit 1;
}

#
# _open_datafile -- open PATH for reading and return a lexical filehandle.
#
# Dies with "cannot open PATH: reason" on failure.  The three-argument
# form of open is used so that characters in PATH are never interpreted
# as an open mode.
#
sub _open_datafile {
    my ($path) = @_;
    open(my $fh, '<', $path) or die "cannot open $path: $!\n";
    return $fh;
}

#
# read_exclusion_file -- read CompositionExclusions-1.txt.
#
# Records the CODE field of every parsed line in %exclusions.
#
sub read_exclusion_file {
    my $fh = _open_datafile($exclusionfile);
    while ($_ = UCD::CompositionExclusions::getline($fh)) {
        my %data = UCD::CompositionExclusions::parseline($_);
        $exclusions{$data{CODE}} = 1;
    }
    close $fh;
}

#
# read_specialcasing_file -- read SpecialCasing.txt
#
# Fills %lower_special and %upper_special with [mapping, condition] pairs.
# Only entries without a condition, or whose condition starts with FINAL
# or NON_FINAL, are considered.  An entry is stored only when the mapping
# is not the identity (more than one element, or a single element that
# differs from the code point itself).
#
sub read_specialcasing_file {
    my $fh = _open_datafile($specialcasefile);
    while ($_ = UCD::SpecialCasing::getline($fh)) {
        my %data = UCD::SpecialCasing::parseline($_);
        my $code = $data{CODE};
        my $to_lower = $data{LOWER};
        my $to_upper = $data{UPPER};
        my $cond = $data{CONDITION} || '';

        next unless $cond eq '' or $cond =~ /^(NON_)?FINAL/;

        if (@$to_lower > 1 or $to_lower->[0] != $code) {
            $lower_special{$code} = [$to_lower, $cond];
        }
        if (@$to_upper > 1 or $to_upper->[0] != $code) {
            $upper_special{$code} = [$to_upper, $cond];
        }
    }
    close $fh;
}

#
# _register_casemap -- record the case mapping of one code point.
#
#   MAP      SparseMap::Int object that receives CODE => offset
#   SEQ      reference to the flattened sequence array (@toupper_data or
#            @tolower_data); the encoded entries are appended to it
#   CODE     code point being processed
#   SPECIAL  [mapping, condition] pair from SpecialCasing.txt, or undef
#   SIMPLE   simple mapping from UnicodeData.txt, or undef
#
# Does nothing when neither SPECIAL nor SIMPLE is defined.  The special
# mapping, if any, is placed before the simple one.
#
sub _register_casemap {
    my ($map, $seq, $code, $special, $simple) = @_;

    return unless defined $special or defined $simple;

    # The offset is the index at which this code point's entries start.
    $map->add($code, scalar(@$seq));

    my @casedata;
    push @casedata, $special if defined $special;
    push @casedata, $simple if defined $simple;
    push @$seq, casemap_data(@casedata);
}

#
# read_unicodedata_file -- read UnicodeData.txt
#
# Populates the canonical class, case mapping, decomposition, casemap
# context and composition tables.  Index 0 of every sequence array is a
# dummy element, so a table value of 0 (the DEFAULT given to the maps)
# never refers to real data.
#
sub read_unicodedata_file {
    my $fh = _open_datafile($unicodedatafile);

    @decomp_data = (0);
    @toupper_data = (0);
    @tolower_data = (0);

    my @comp_cand;      # canonical composition candidates
    my %nonstarter;     # code points with a combining class > 0

    while ($_ = UCD::UnicodeData::getline($fh)) {
        my %data = UCD::UnicodeData::parseline($_);
        my $code = $data{CODE};

        # combining class
        if ($data{CLASS} > 0) {
            $nonstarter{$code} = 1;
            $canon_class->add($code, $data{CLASS});
        }

        # uppercasing
        _register_casemap($upper, \@toupper_data, $code,
                          exists $upper_special{$code}
                              ? $upper_special{$code} : undef,
                          $data{UPPER});

        # lowercasing
        _register_casemap($lower, \@tolower_data, $code,
                          exists $lower_special{$code}
                              ? $lower_special{$code} : undef,
                          $data{LOWER});

        # composition/decomposition
        if ($data{DECOMP}) {
            my ($tag, @seq) = @{$data{DECOMP}};
            my $offset = @decomp_data;

            # The compatibility flag shares the value with the offset, so
            # an offset that already has the flag bit set cannot be
            # represented without being mistaken for a flagged entry.
            die sprintf("decomposition offset 0x%x for U+%04X collides " .
                        "with DECOMP_COMPAT bit\n", $offset, $code)
                if $offset & $DECOMP_COMPAT_BIT;

            # composition
            if ($tag eq '' and @seq > 1 and not exists $exclusions{$code}) {
                # canonical composition candidate
                push @comp_cand, [$code, @seq];
            }

            # decomposition
            if ($tag ne '') {
                # compatibility decomposition
                $offset |= $DECOMP_COMPAT_BIT;
            }
            $decomp->add($code, $offset);
            push @decomp_data, @seq;
            $decomp_data[-1] |= END_BIT;
        }

        # final/nonfinal context
        if ($data{CATEGORY} =~ /L[ult]/) {
            $casemap_ctx->add($code, $LETTER_BIT);
        } elsif ($data{CATEGORY} eq 'Mn') {
            $casemap_ctx->add($code, $NSPMARK_BIT);
        }
    }
    close $fh;

    # Eliminate composition candidates whose decomposition starts with
    # a non-starter.
    @comp_cand = grep {not exists $nonstarter{$_->[1]}} @comp_cand;

    # Group the candidates by their first character.  For each group the
    # composition table stores (count << 16) | offset, where offset is the
    # index of the group's first entry in @comp_data.
    @comp_data = ([0, 0, 0]);   # dummy entry at index 0
    my $last_code = -1;
    my $last_offset = @comp_data;

    my $flush_group = sub {
        # Nothing to record before the first group has been seen (this
        # also covers the case of no candidates at all).
        return if $last_code == -1;
        $comp->add($last_code,
                   ($last_offset | ((@comp_data - $last_offset) << 16)));
    };

    for my $r (sort {$a->[1] <=> $b->[1] || $a->[2] <=> $b->[2]} @comp_cand) {
        if ($r->[1] != $last_code) {
            $flush_group->();
            $last_code = $r->[1];
            $last_offset = @comp_data;
        }
        push @comp_data, $r;
    }
    $flush_group->();
}

#
# casemap_data -- encode case mapping entries into a flat list.
#
# Each argument is either a plain code point (simple mapping) or a
# reference to [mapping array ref, condition] (special mapping).  For
# every argument the result holds one flag word followed by the mapped
# code points, the last of which has END_BIT set.  The flag word carries
# CASEMAP_FINAL_BIT/CASEMAP_NONFINAL_BIT according to the condition and
# CASEMAP_LAST_BIT on the final entry.  Dies on any condition other than
# '', 'FINAL' or 'NON_FINAL'.
#
sub casemap_data {
    my @data = @_;
    my @result = ();
    while (@data > 0) {
        my $r = shift @data;
        my $flag = 0;
        if (ref $r) {
            if ($r->[1] eq 'FINAL') {
                $flag |= $CASEMAP_FINAL_BIT;
            } elsif ($r->[1] eq 'NON_FINAL') {
                $flag |= $CASEMAP_NONFINAL_BIT;
            } elsif ($r->[1] ne '') {
                die "unknown condition \"", $r->[1], "\"\n";
            }
        }
        $flag |= $CASEMAP_LAST_BIT if @data == 0;
        push @result, $flag;
        push @result, (ref $r) ? @{$r->[0]} : $r;
        $result[-1] |= END_BIT;
    }
    @result;
}

#
# read_casefolding_file -- read CaseFolding.txt
#
# Maps each CODE to the index in @folding_data where its MAP sequence
# starts; the last element of each sequence has END_BIT set.
#
sub read_casefolding_file {
    my $fh = _open_datafile($casefoldingfile);

    # dummy.
    @folding_data = (0);

    while ($_ = UCD::CaseFolding::getline($fh)) {
        my %data = UCD::CaseFolding::parseline($_);

        $folding->add($data{CODE}, scalar(@folding_data));
        push @folding_data, @{$data{MAP}};
        $folding_data[-1] |= END_BIT;
    }
    close $fh;
}

#
# print_header -- print the leading comment of the generated file.
#
sub print_header {
    print <<"END";
/* \$Id\$ */
/* $myid */
/*
 * Do not edit this file!
 * This file is generated from UnicodeData.txt, CompositionExclusions-1.txt,
 * SpecialCasing.txt and CaseFolding.txt.
 */

END
}

#
# print_canon_class -- generate data for canonical class
#
sub print_canon_class {
    $canon_class->fix();
    print STDERR "** canon_class\n", $canon_class->stat() if $verbose;

    print <<"END";

/*
 * Canonical Class
 */

END
    print_bits("CANON_CLASS", @canon_class_bits);
    print "\n";
    print $canon_class->cprog(NAME => "${prefix}canon_class");
}

#
# print_composition -- generate data for canonical composition
#
# Besides the lookup table, prints ${prefix}compose_seq, an array of
# { second character, composed character } pairs, two pairs per line.
#
sub print_composition {
    $comp->fix();
    print STDERR "** composition\n", $comp->stat() if $verbose;

    print <<"END";

/*
 * Canonical Composition
 */

END
    print_bits("CANON_COMPOSE", @comp_bits);
    print "\n";
    print $comp->cprog(NAME => "${prefix}compose");
    print <<"END";
static const struct composition ${prefix}compose_seq[] = {
END
    my $i = 0;
    foreach my $r (@comp_data) {
        if ($i % 2 == 0) {
            print "\n" if $i != 0;
            print "\t";
        }
        printf "{ 0x%08x, 0x%08x }, ", $r->[2], $r->[0];
        $i++;
    }
    print "\n};\n\n";
}

#
# print_decomposition -- generate data for canonical/compatibility
#                        decomposition
#
sub print_decomposition {
    $decomp->fix();
    print STDERR "** decomposition\n", $decomp->stat() if $verbose;

    print <<"END";

/*
 * Canonical/Compatibility Decomposition
 */

END
    print_bits("DECOMP", @decomp_bits);
    print "#define DECOMP_COMPAT\t$DECOMP_COMPAT_BIT\n\n";

    print $decomp->cprog(NAME => "${prefix}decompose");

    print "static const unsigned long ${prefix}decompose_seq[] = {\n";
    print_ulseq(@decomp_data);
    print "};\n\n";
}

#
# print_casemap -- generate data for case mapping
#
sub print_casemap {
    $upper->fix();
    $lower->fix();
    print STDERR "** upper mapping\n", $upper->stat() if $verbose;
    print STDERR "** lower mapping\n", $lower->stat() if $verbose;

    print <<"END";

/*
 * Lowercase <-> Uppercase mapping
 */

/*
 * Flags for special case mapping.
 */
#define CMF_FINAL $CASEMAP_FINAL_BIT
#define CMF_NONFINAL $CASEMAP_NONFINAL_BIT
#define CMF_LAST $CASEMAP_LAST_BIT
#define CMF_CTXDEP (CMF_FINAL|CMF_NONFINAL)

END
    print_bits("CASEMAP", @casemap_bits);
    print "\n";
    print $upper->cprog(NAME => "${prefix}toupper");
    print $lower->cprog(NAME => "${prefix}tolower");

    print "static const unsigned long ${prefix}toupper_seq[] = {\n";
    print_ulseq(@toupper_data);
    print "};\n\n";

    print "static const unsigned long ${prefix}tolower_seq[] = {\n";
    print_ulseq(@tolower_data);
    print "};\n\n";
}

#
# print_casefolding -- generate data for case folding
#
sub print_casefolding {
    $folding->fix();
    print STDERR "** case folding\n", $folding->stat() if $verbose;

    print <<"END";

/*
 * Case Folding
 */

END
    print_bits("CASE_FOLDING", @folding_bits);
    print "\n";
    print $folding->cprog(NAME => "${prefix}case_folding");

    print "static const unsigned long ${prefix}case_folding_seq[] = {\n";
    print_ulseq(@folding_data);
    print "};\n\n";
}

#
# print_casemap_context -- generate data for determining context
#                          (final/non-final)
#
sub print_casemap_context {
    $casemap_ctx->fix();
    print STDERR "** casemap context\n", $casemap_ctx->stat() if $verbose;

    print <<"END";

/*
 * Cased characters and non-spacing marks (for casemap context)
 */

END

    print_bits("CASEMAP_CTX", @casemap_ctx_bits);
    print <<"END";
#define CTX_CASED $LETTER_BIT
#define CTX_NSM $NSPMARK_BIT

END
    print $casemap_ctx->cprog(NAME => "${prefix}casemap_ctx");
}

#
# sprint_composition_hash -- format a list of three-element array refs
# as "{0x...., 0x...., 0x....}, " initializers, two per line, and return
# the resulting string.  Not called from within this file.
#
sub sprint_composition_hash {
    my $i = 0;
    my $s = '';
    foreach my $r (@_) {
        if ($i % 2 == 0) {
            $s .= "\n" if $i != 0;
            $s .= "\t";
        }
        $s .= sprintf "{0x%04x, 0x%04x, 0x%04x}, ", @{$r};
        $i++;
    }
    $s;
}

#
# print_bits -- print "#define NAME_BITS_<i> <bit>" for each bit width.
#
# The first argument is the macro name stem; the remaining arguments are
# the bit widths, numbered from 0 in the order given.
#
sub print_bits {
    my $name = shift;
    my $i = 0;
    foreach my $bit (@_) {
        print "#define ${name}_BITS_$i\t$bit\n";
        $i++;
    }
}

#
# print_ulseq -- print the arguments as 0x%08x initializers, four per
# line, each line indented with a tab.
#
sub print_ulseq {
    my $i = 0;
    foreach my $v (@_) {
        if ($i % 4 == 0) {
            print "\n" if $i != 0;
            print "\t";
        }
        printf "0x%08x, ", $v;
        $i++;
    }
    print "\n";
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -