⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 generate_normalize_data.pl

📁 package of develop dns
💻 PL
字号:
#! /usr/local/bin/perl -w
# $Id: generate_normalize_data.pl,v 1.1.1.1 2003/06/04 00:27:55 marka Exp $
#
# Copyright (c) 2000,2001 Japan Network Information Center.
# All rights reserved.
#
# By using this file, you agree to the terms and conditions set forth bellow.
#
# 			LICENSE TERMS AND CONDITIONS
#
# The following License Terms and Conditions apply, unless a different
# license is obtained from Japan Network Information Center ("JPNIC"),
# a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda,
# Chiyoda-ku, Tokyo 101-0047, Japan.
#
# 1. Use, Modification and Redistribution (including distribution of any
#    modified or derived work) in source and/or binary forms is permitted
#    under this License Terms and Conditions.
#
# 2. Redistribution of source code must retain the copyright notices as they
#    appear in each source code file, this License Terms and Conditions.
#
# 3. Redistribution in binary form must reproduce the Copyright Notice,
#    this License Terms and Conditions, in the documentation and/or other
#    materials provided with the distribution.  For the purposes of binary
#    distribution the "Copyright Notice" refers to the following language:
#    "Copyright (c) 2000-2002 Japan Network Information Center.  All rights reserved."
#
# 4. The name of JPNIC may not be used to endorse or promote products
#    derived from this Software without specific prior written approval of
#    JPNIC.
#
# 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC
#    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
#    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
#    PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL JPNIC BE LIABLE
#    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
#    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
#    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
#    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
#    WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
#    OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
#    ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
#

#
# Generate lib/unicodedata.c from UnicodeData.txt,
# CompositionExclusions-1.txt, SpecialCasing.txt and CaseFolding.txt,
# all of them available from ftp://ftp.unicode.org/Public/UNIDATA/.
#
# The generated C source is written to standard output; statistics
# (with -v) and error messages go to standard error.
#

use strict;
use warnings;
use lib qw(.);

use Getopt::Long;
use UCD;
use SparseMap;

use constant UCS_MAX => 0x110000;
# OR-ed into the last element of every variable-length sequence.
use constant END_BIT => 0x80000000;

# OR-ed into a decomposition offset when the mapping carries a tag
# (i.e. it is a compatibility rather than canonical decomposition).
my $DECOMP_COMPAT_BIT = 0x8000;

# Flag word values emitted in front of each case mapping alternative.
my $CASEMAP_FINAL_BIT    = 0x1;
my $CASEMAP_NONFINAL_BIT = 0x2;
my $CASEMAP_LAST_BIT     = 0x10;

# Values stored in the casemap context table.
my $LETTER_BIT  = 1;
my $NSPMARK_BIT = 2;

# Rewrite the "$...$" keyword delimiters as "$-...-$" before the id is
# embedded in the generated file.
# NOTE(review): presumably done so that the revision control system does
# not re-expand the keyword in the generated output -- confirm.
(my $myid = '$Id: generate_normalize_data.pl,v 1.1.1.1 2003/06/04 00:27:55 marka Exp $') =~ s/\$([^\$]+)\$/\$-$1-\$/;

# Bit widths handed to SparseMap for each level of the lookup tables.
my @default_bits = (9, 7, 5);
#my @default_bits = (7, 7, 7);
my @canon_class_bits = @default_bits;
my @decomp_bits      = @default_bits;
my @comp_bits        = @default_bits;
my @folding_bits     = @default_bits;
my @casemap_bits     = @default_bits;
my @casemap_ctx_bits = @default_bits;

my $prefix          = '';
my $dir             = '.';
my $unicodedatafile = 'UnicodeData.txt';
my $exclusionfile   = 'CompositionExclusions.txt';
my $specialcasefile = 'SpecialCasing.txt';
my $casefoldingfile = 'CaseFolding.txt';
my $verbose;

GetOptions(
    'dir|d=s'         => \$dir,
    'unicodedata|u=s' => \$unicodedatafile,
    'exclude|e=s'     => \$exclusionfile,
    'specialcase|s=s' => \$specialcasefile,
    'casefold|c=s'    => \$casefoldingfile,
    'prefix|p=s'      => \$prefix,
    'verbose|v'       => \$verbose,
) or usage();

# File names that are not absolute are taken relative to $dir.
foreach my $r (\$unicodedatafile, \$exclusionfile,
               \$specialcasefile, \$casefoldingfile) {
    $$r = "$dir/$$r" unless $$r =~ m|^/|;
}

my %exclusions;       # code point => 1 for composition exclusions
my %lower_special;    # code point => [mapping arrayref, condition]
my %upper_special;    # code point => [mapping arrayref, condition]

# Flat sequence tables; the SparseMap objects below store offsets into them.
my @decomp_data;
my @comp_data;
my @toupper_data;
my @tolower_data;
my @folding_data;

#
# Create Mapping/Bitmap objects.
#

# canonical class
my $canon_class = SparseMap::Int->new(BITS    => [@canon_class_bits],
                                      MAX     => UCS_MAX,
                                      MAPALL  => 1,
                                      DEFAULT => 0);

# canonical/compatibility decomposition
my $decomp = SparseMap::Int->new(BITS    => [@decomp_bits],
                                 MAX     => UCS_MAX,
                                 MAPALL  => 1,
                                 DEFAULT => 0);

# canonical composition
my $comp = SparseMap::Int->new(BITS    => [@comp_bits],
                               MAX     => UCS_MAX,
                               MAPALL  => 1,
                               DEFAULT => 0);

# uppercase/lowercase
my $upper = SparseMap::Int->new(BITS    => [@casemap_bits],
                                MAX     => UCS_MAX,
                                MAPALL  => 1,
                                DEFAULT => 0);
my $lower = SparseMap::Int->new(BITS    => [@casemap_bits],
                                MAX     => UCS_MAX,
                                MAPALL  => 1,
                                DEFAULT => 0);

# final/nonfinal context
my $casemap_ctx = SparseMap::Int->new(BITS    => [@casemap_ctx_bits],
                                      MAX     => UCS_MAX,
                                      MAPALL  => 1,
                                      DEFAULT => 0);

# casefolding
my $folding = SparseMap::Int->new(BITS    => [@folding_bits],
                                  MAX     => UCS_MAX,
                                  MAPALL  => 1,
                                  DEFAULT => 0);

#
# Read datafiles.
#
# The exclusion and special casing tables are consulted while
# UnicodeData.txt is processed, so they are read first.
#
read_exclusion_file();
read_specialcasing_file();
read_unicodedata_file();
read_casefolding_file();

print_header();
print_canon_class();
print_composition();
print_decomposition();
print_casemap();
print_casemap_context();
print_casefolding();

exit;

#
# usage -- print a usage message to stderr and exit with status 1.
#
sub usage {
    print STDERR <<"END";
Usage: $0 [options..]
  options:
    -d DIR  directory where Unicode Character Data files resides [./]
    -u FILE name of the UnicodeData file [UnicodeData.txt]
    -e FILE name of the CompositionExclusion file [CompositionExclusions.txt]
    -s FILE name of the SpecialCasing file [SpecialCasing.txt]
    -c FILE name of the CaseFolding file [CaseFolding.txt]
    -p STR  prefix for the names of the generated C tables []
    -v      print statistics of each table to stderr
END
    exit 1;
}

#
# read_exclusion_file -- read CompositionExclusions-1.txt.
#
# Fills %exclusions with every code point listed in the file.
#
sub read_exclusion_file {
    open my $fh, '<', $exclusionfile
        or die "cannot open $exclusionfile: $!\n";
    while (my $line = UCD::CompositionExclusions::getline($fh)) {
        my %data = UCD::CompositionExclusions::parseline($line);
        $exclusions{$data{CODE}} = 1;
    }
    close $fh;
}

#
# read_specialcasing_file -- read SpecialCasing.txt
#
# Fills %lower_special and %upper_special.  Only unconditional entries
# and entries whose condition starts with FINAL or NON_FINAL are used;
# a mapping is recorded only if it differs from the code point itself.
#
sub read_specialcasing_file {
    open my $fh, '<', $specialcasefile
        or die "cannot open $specialcasefile: $!\n";
    while (my $line = UCD::SpecialCasing::getline($fh)) {
        my %data = UCD::SpecialCasing::parseline($line);
        my $code = $data{CODE};
        my $cond = $data{CONDITION} || '';

        next unless $cond eq '' or $cond =~ /^(NON_)?FINAL/;

        my $lower = $data{LOWER};
        if (@$lower > 1 or $lower->[0] != $code) {
            $lower_special{$code} = [$lower, $cond];
        }

        my $upper = $data{UPPER};
        if (@$upper > 1 or $upper->[0] != $code) {
            $upper_special{$code} = [$upper, $cond];
        }
    }
    close $fh;
}

#
# read_unicodedata_file -- read UnicodeData.txt
#
# Populates the canonical class, case mapping, decomposition, casemap
# context and composition tables.  %exclusions, %upper_special and
# %lower_special must have been filled beforehand.
#
sub read_unicodedata_file {
    open my $fh, '<', $unicodedatafile
        or die "cannot open $unicodedatafile: $!\n";

    # Slot 0 of each sequence table is a dummy entry, so real offsets
    # start at 1 (the maps are created with DEFAULT => 0).
    @decomp_data  = (0);
    @toupper_data = (0);
    @tolower_data = (0);

    my @comp_cand;      # canonical composition candidates
    my %nonstarter;     # code points with a non-zero combining class

    while (my $line = UCD::UnicodeData::getline($fh)) {
        my %data = UCD::UnicodeData::parseline($line);
        my $code = $data{CODE};

        # combining class
        if ($data{CLASS} > 0) {
            $nonstarter{$code} = 1;
            $canon_class->add($code, $data{CLASS});
        }

        # uppercasing
        _add_casemap($upper, \@toupper_data, \%upper_special,
                     $code, $data{UPPER});

        # lowercasing
        _add_casemap($lower, \@tolower_data, \%lower_special,
                     $code, $data{LOWER});

        # composition/decomposition
        if ($data{DECOMP}) {
            my ($tag, @decomp) = @{$data{DECOMP}};
            my $offset = @decomp_data;

            # An untagged mapping of two or more code points that is not
            # excluded is a canonical composition candidate.
            if ($tag eq '' and @decomp > 1 and not exists $exclusions{$code}) {
                push @comp_cand, [$code, @decomp];
            }

            # A tagged mapping is a compatibility decomposition.
            $offset |= $DECOMP_COMPAT_BIT if $tag ne '';

            $decomp->add($code, $offset);
            push @decomp_data, @decomp;
            $decomp_data[-1] |= END_BIT;
        }

        # final/nonfinal context
        if ($data{CATEGORY} =~ /L[ult]/) {
            $casemap_ctx->add($code, $LETTER_BIT);
        }
        elsif ($data{CATEGORY} eq 'Mn') {
            $casemap_ctx->add($code, $NSPMARK_BIT);
        }
    }
    close $fh;

    # Eliminate composition candidates whose decomposition starts with
    # a non-starter.
    @comp_cand = grep { not exists $nonstarter{$_->[1]} } @comp_cand;

    # Slot 0 is a dummy entry, as in the other sequence tables.
    @comp_data = ([0, 0, 0]);

    # Candidates are grouped by their first decomposition character.
    # For each such character the map stores the offset of its group in
    # @comp_data in the low 16 bits and the group size above them.
    my $last_code   = -1;
    my $last_offset = @comp_data;
    for my $r (sort { $a->[1] <=> $b->[1] || $a->[2] <=> $b->[2] } @comp_cand) {
        if ($r->[1] != $last_code) {
            $comp->add($last_code,
                       ($last_offset | ((@comp_data - $last_offset) << 16)))
                unless $last_code == -1;
            $last_code   = $r->[1];
            $last_offset = @comp_data;
        }
        push @comp_data, $r;
    }

    # Flush the final group.  Without the guard an input that yields no
    # candidate at all would register the bogus code point -1.
    $comp->add($last_code,
               ($last_offset | ((@comp_data - $last_offset) << 16)))
        unless $last_code == -1;
}

#
# _add_casemap -- register the case mapping of one code point.
#
#   $map     SparseMap object that receives the offset
#   $seq     reference to the sequence table to append to
#   $special reference to the special casing hash for this direction
#   $code    code point being processed
#   $simple  simple mapping from UnicodeData.txt, or undef
#
# Does nothing if the code point has neither a special nor a simple
# mapping.  A special mapping, if any, precedes the simple one.
#
sub _add_casemap {
    my ($map, $seq, $special, $code, $simple) = @_;

    my $has_special = exists $special->{$code};
    return unless $has_special or defined $simple;

    $map->add($code, scalar @$seq);

    my @casedata;
    push @casedata, $special->{$code} if $has_special;
    push @casedata, $simple           if defined $simple;
    push @$seq, casemap_data(@casedata);
    return;
}

#
# casemap_data -- encode case mapping alternatives as a flat list.
#
# Each argument is either a plain code point (simple mapping) or a
# reference to [mapping arrayref, condition] (special mapping).  For
# every alternative the result contains a flag word followed by the
# mapped code points, the last element of the alternative being marked
# with END_BIT.  The flag word of the final alternative has
# $CASEMAP_LAST_BIT set.  Dies on a condition other than '', 'FINAL'
# or 'NON_FINAL'.
#
sub casemap_data {
    my @data = @_;
    my @result = ();

    while (@data > 0) {
        my $r = shift @data;
        my $flag = 0;

        if (ref $r) {
            my $cond = $r->[1];
            if ($cond eq 'FINAL') {
                $flag |= $CASEMAP_FINAL_BIT;
            }
            elsif ($cond eq 'NON_FINAL') {
                $flag |= $CASEMAP_NONFINAL_BIT;
            }
            elsif ($cond ne '') {
                die "unknown condition \"", $cond, "\"\n";
            }
        }
        $flag |= $CASEMAP_LAST_BIT if @data == 0;

        push @result, $flag;
        push @result, (ref $r) ? @{$r->[0]} : $r;
        $result[-1] |= END_BIT;
    }
    return @result;
}

#
# read_casefolding_file -- read CaseFolding.txt
#
# Appends every folding sequence to @folding_data and records its
# offset in the folding map.
#
sub read_casefolding_file {
    open my $fh, '<', $casefoldingfile
        or die "cannot open $casefoldingfile: $!\n";

    # dummy.
    @folding_data = (0);

    while (my $line = UCD::CaseFolding::getline($fh)) {
        my %data = UCD::CaseFolding::parseline($line);

        $folding->add($data{CODE}, scalar(@folding_data));
        push @folding_data, @{$data{MAP}};
        $folding_data[-1] |= END_BIT;
    }
    close $fh;
}

#
# print_header -- print the leading comment of the generated file
#
sub print_header {
    print <<"END";
/* \$Id\$ */
/* $myid */
/*
 * Do not edit this file!
 * This file is generated from UnicodeData.txt, CompositionExclusions-1.txt,
 * SpecialCasing.txt and CaseFolding.txt.
 */

END
}

#
# print_canon_class -- generate data for canonical class
#
sub print_canon_class {
    $canon_class->fix();

    print STDERR "** canon_class\n", $canon_class->stat() if $verbose;

    print <<"END";

/*
 * Canonical Class
 */

END
    print_bits("CANON_CLASS", @canon_class_bits);
    print "\n";
    print $canon_class->cprog(NAME => "${prefix}canon_class");
}

#
# print_composition -- generate data for canonical composition
#
# Each entry of the sequence table is printed as the second character of
# the decomposition followed by the composed character; the first
# character is the key of the lookup table.
#
sub print_composition {
    $comp->fix();

    print STDERR "** composition\n", $comp->stat() if $verbose;

    print <<"END";

/*
 * Canonical Composition
 */

END
    print_bits("CANON_COMPOSE", @comp_bits);
    print "\n";
    print $comp->cprog(NAME => "${prefix}compose");
    print <<"END";
static const struct composition ${prefix}compose_seq[] = {
END
    # Two entries per output line.
    my $i = 0;
    foreach my $r (@comp_data) {
        if ($i % 2 == 0) {
            print "\n" if $i != 0;
            print "\t";
        }
        printf "{ 0x%08x, 0x%08x }, ", $r->[2], $r->[0];
        $i++;
    }
    print "\n};\n\n";
}

#
# print_decomposition -- generate data for canonical/compatibility
# decomposition
#
sub print_decomposition {
    $decomp->fix();

    print STDERR "** decomposition\n", $decomp->stat() if $verbose;

    print <<"END";

/*
 * Canonical/Compatibility Decomposition
 */

END
    print_bits("DECOMP", @decomp_bits);
    print "#define DECOMP_COMPAT\t$DECOMP_COMPAT_BIT\n\n";

    print $decomp->cprog(NAME => "${prefix}decompose");

    print "static const unsigned long ${prefix}decompose_seq[] = {\n";
    print_ulseq(@decomp_data);
    print "};\n\n";
}

#
# print_casemap -- generate data for case mapping
#
sub print_casemap {
    $upper->fix();
    $lower->fix();

    print STDERR "** upper mapping\n", $upper->stat() if $verbose;
    print STDERR "** lower mapping\n", $lower->stat() if $verbose;

    print <<"END";

/*
 * Lowercase <-> Uppercase mapping
 */

/*
 * Flags for special case mapping.
 */
#define CMF_FINAL\t$CASEMAP_FINAL_BIT
#define CMF_NONFINAL\t$CASEMAP_NONFINAL_BIT
#define CMF_LAST\t$CASEMAP_LAST_BIT
#define CMF_CTXDEP\t(CMF_FINAL|CMF_NONFINAL)

END
    print_bits("CASEMAP", @casemap_bits);
    print "\n";
    print $upper->cprog(NAME => "${prefix}toupper");
    print $lower->cprog(NAME => "${prefix}tolower");

    print "static const unsigned long ${prefix}toupper_seq[] = {\n";
    print_ulseq(@toupper_data);
    print "};\n\n";

    print "static const unsigned long ${prefix}tolower_seq[] = {\n";
    print_ulseq(@tolower_data);
    print "};\n\n";
}

#
# print_casefolding -- generate data for case folding
#
sub print_casefolding {
    $folding->fix();

    print STDERR "** case folding\n", $folding->stat() if $verbose;

    print <<"END";

/*
 * Case Folding
 */

END
    print_bits("CASE_FOLDING", @folding_bits);
    print "\n";
    print $folding->cprog(NAME => "${prefix}case_folding");

    print "static const unsigned long ${prefix}case_folding_seq[] = {\n";
    print_ulseq(@folding_data);
    print "};\n\n";
}

#
# print_casemap_context -- generate data for determining context
# (final/non-final)
#
sub print_casemap_context {
    $casemap_ctx->fix();

    print STDERR "** casemap context\n", $casemap_ctx->stat() if $verbose;

    print <<"END";

/*
 * Cased characters and non-spacing marks (for casemap context)
 */

END
    print_bits("CASEMAP_CTX", @casemap_ctx_bits);
    print <<"END";

#define CTX_CASED\t$LETTER_BIT
#define CTX_NSM\t\t$NSPMARK_BIT

END
    print $casemap_ctx->cprog(NAME => "${prefix}casemap_ctx");
}

#
# sprint_composition_hash -- format a list of three-element array
# references as C initializers, two per line, and return the string.
#
# Not called from within this file.
#
sub sprint_composition_hash {
    my $i = 0;
    my $s = '';
    foreach my $r (@_) {
        if ($i % 2 == 0) {
            $s .= "\n" if $i != 0;
            $s .= "\t";
        }
        $s .= sprintf "{0x%04x, 0x%04x, 0x%04x}, ", @{$r};
        $i++;
    }
    return $s;
}

#
# print_bits -- print "#define NAME_BITS_n" lines, one for each of the
# given bit widths.  The first argument is the macro name stem.
#
sub print_bits {
    my $name = shift;
    my $i = 0;
    foreach my $bit (@_) {
        print "#define ${name}_BITS_$i\t$bit\n";
        $i++;
    }
}

#
# print_ulseq -- print the arguments as hexadecimal constants, four per
# line, each followed by a comma.
#
sub print_ulseq {
    my $i = 0;
    foreach my $v (@_) {
        if ($i % 4 == 0) {
            print "\n" if $i != 0;
            print "\t";
        }
        printf "0x%08x, ", $v;
        $i++;
    }
    print "\n";
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -