⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 uni_parse.tcl

📁 ejabberd-0.7.5 分布式Jabber服务器
💻 TCL
字号:
# uni_parse.tcl --##	This program parses the UnicodeData file and generates the#	corresponding uni_data.c file with compressed character#	data tables.  The input to this program should be rfc3454.txt## Copyright (c) 1998-1999 by Scriptics Corporation.# All rights reserved.## Modified for ejabberd by Alexey Shchepin# # RCS: @(#) $Id: uni_parse.tcl,v 1.3 2003/10/08 18:27:29 aleksey Exp $namespace eval uni {    set shift 5;		# number of bits of data within a page				# This value can be adjusted to find the				# best split to minimize table size    variable pMap;		# map from page to page index, each entry is				# an index into the pages table, indexed by				# page number    variable pages;		# map from page index to page info, each				# entry is a list of indices into the groups				# table, the list is indexed by the offset    variable groups;		# list of character info values, indexed by				# group number, initialized with the				# unassigned character group}proc uni::getValue {i} {    variable casemap    variable casemap2    variable tablemap    set tables $tablemap($i)    if {[info exists casemap2($i)]} {	set multicase 1	set delta $casemap2($i)    } else {	set multicase 0	set delta $casemap($i)    }    set ac 0    set c11 0    set c21 0    set b1 0    set d1 0    set d2 0    set xnp 0    foreach tab $tables {	switch -glob -- $tab {	    C.1.1 {set c11 1}	    C.2.1 {set c21 1}	    C.*   {set ac 1}	    A.1   {set ac 1}	    B.1   {set b1 1}	    D.1   {set d1 1}	    D.2   {set d2 1}	    XNP   {set xnp 1}	}    }    set val [expr {($ac  << 0) |		   ($c11 << 1) |		   ($c21 << 2) |		   ($b1  << 3) |		   ($d1  << 4) |		   ($d2  << 5) |		   ($xnp << 6) |		   ($multicase << 7) |		   ($delta << 16)}]    return $val}proc uni::getGroup {value} {    variable groups    set gIndex [lsearch -exact $groups $value]    if {$gIndex == -1} {	set gIndex [llength $groups]	lappend groups $value    }    return $gIndex}proc uni::addPage {info} {    variable pMap    variable pages        set pIndex [lsearch -exact $pages $info]    if {$pIndex == -1} {	set pIndex [llength $pages]	lappend pages $info    }    lappend pMap $pIndex    return}proc uni::load_tables {data} {    variable casemap    variable casemap2    variable multicasemap    variable tablemap    for {set i 0} {$i <= 0xffff} {incr i} {	set casemap($i) 0	set tablemap($i) {}    }    set multicasemap {}    set table ""    foreach line [split $data \n] {	if {$table == ""} {	    if {[regexp {   ----- Start Table (.*) -----} $line temp table]} {		#puts "Start table '$table'"	    }	} else {	    if {[regexp {   ----- End Table (.*) -----} $line temp table1]} {		set table ""	    } else {		if {$table == "B.1"} {		    if {[regexp {^   ([[:xdigit:]]+); ;} $line \			     temp val]} {			scan $val %x val			if {$val <= 0xffff} {			    lappend tablemap($val) $table			}		    }		} elseif {$table == "B.2"} {		    if {[regexp {^   ([[:xdigit:]]+); ([[:xdigit:]]+);} $line \			     temp from to]} {			scan $from %x from			scan $to %x to			if {$from <= 0xffff && $to <= 0xffff} {			    set casemap($from) [expr {$to - $from}]			}		    } elseif {[regexp {^   ([[:xdigit:]]+); ([[:xdigit:]]+) ([[:xdigit:]]+);} $line \			     temp from to1 to2]} {			scan $from %x from			scan $to1 %x to1			scan $to2 %x to2			if {$from <= 0xffff && \				$to1 <= 0xffff && $to2 <= 0xffff} {			    set casemap2($from) [llength $multicasemap]			    lappend multicasemap [list $to1 $to2]			}		    } elseif {[regexp {^   ([[:xdigit:]]+); ([[:xdigit:]]+) ([[:xdigit:]]+) ([[:xdigit:]]+);} $line \			     temp from to1 to2 to3]} {			scan $from %x from			scan $to1 %x to1			scan $to2 %x to2			scan $to3 %x to3			if {$from <= 0xffff && \				$to1 <= 0xffff && $to2 <= 0xffff && \				$to3 <= 0xffff} {			    set casemap2($from) [llength $multicasemap]			    lappend multicasemap [list $to1 $to2 $to3]			}		    } else {			#puts "missed: $line"		    }		    		} elseif {$table != "B.3"} {		    if {[regexp {^   ([[:xdigit:]]+)-([[:xdigit:]]+)} $line \			     temp from to]} {			scan $from %x from			scan $to %x to			for {set i $from} {$i <= $to && $i <= 0xffff} {incr i} {			    lappend tablemap($i) $table			}		    } elseif {[regexp {^   ([[:xdigit:]]+)} $line \			     temp val]} {			scan $val %x val			if {$val <= 0xffff} {			    lappend tablemap($val) $table			}		    }		}	    }	}    }    # XMPP nodeprep prohibited    foreach val {22 26 27 2f 3a 3c 3e 40} {	scan $val %x val	lappend tablemap($val) XNP    }}proc uni::buildTables {} {    variable shift    variable casemap    variable tablemap    variable pMap {}    variable pages {}    variable groups {}    set info {}			;# temporary page info        set mask [expr {(1 << $shift) - 1}]    set next 0    for {set i 0} {$i <= 0xffff} {incr i} {	set gIndex [getGroup [getValue $i]]	# Split character index into offset and page number	set offset [expr {$i & $mask}]	set page [expr {($i >> $shift)}]	# Add the group index to the info for the current page	lappend info $gIndex	# If this is the last entry in the page, add the page	if {$offset == $mask} {	    addPage $info	    set info {}	}    }    return}proc uni::main {} {    global argc argv0 argv    variable pMap    variable pages    variable groups    variable shift    variable multicasemap    if {$argc != 2} {	puts stderr "\nusage: $argv0 <datafile> <outdir>\n"	exit 1    }    set f [open [lindex $argv 0] r]    set data [read $f]    close $f    load_tables $data    buildTables    puts "X = [llength $pMap]  Y= [llength $pages]  A= [llength $groups]"    set size [expr {[llength $pMap] + [llength $pages]*(1<<$shift)}]    puts "shift = 6, space = $size"    set f [open [file join [lindex $argv 1] uni_data.c] w]    fconfigure $f -translation lf    puts $f "/* * uni_data.c -- * *	Declarations of Unicode character information tables.  This file is *	automatically generated by the uni_parse.tcl script.  Do not *	modify this file by hand. * * Copyright (c) 1998 by Scriptics Corporation. * All rights reserved. * * Modified for ejabberd by Alexey Shchepin * * RCS: @(#) \$Id\$ *//* * A 16-bit Unicode character is split into two parts in order to index * into the following tables.  The lower OFFSET_BITS comprise an offset * into a page of characters.  The upper bits comprise the page number. */#define OFFSET_BITS $shift/* * The pageMap is indexed by page number and returns an alternate page number * that identifies a unique page of characters.  Many Unicode characters map * to the same alternate page number. */static unsigned char pageMap\[\] = {"    set line "    "    set last [expr {[llength $pMap] - 1}]    for {set i 0} {$i <= $last} {incr i} {	append line [lindex $pMap $i]	if {$i != $last} {	    append line ", "	}	if {[string length $line] > 70} {	    puts $f $line	    set line "    "	}    }    puts $f $line    puts $f "};/* * The groupMap is indexed by combining the alternate page number with * the page offset and returns a group number that identifies a unique * set of character attributes. */static unsigned char groupMap\[\] = {"    set line "    "    set lasti [expr {[llength $pages] - 1}]    for {set i 0} {$i <= $lasti} {incr i} {	set page [lindex $pages $i]	set lastj [expr {[llength $page] - 1}]	for {set j 0} {$j <= $lastj} {incr j} {	    append line [lindex $page $j]	    if {$j != $lastj || $i != $lasti} {		append line ", "	    }	    if {[string length $line] > 70} {		puts $f $line		set line "    "	    }	}    }    puts $f $line    puts $f "};/* * Each group represents a unique set of character attributes.  The attributes * are encoded into a 32-bit value as follows: * * Bit  0	A.1 | C.1.2 | C.2.2 | C.3 -- C.9 * * Bit  1	C.1.1 * * Bit  2	C.2.1 * * Bit  3	B.1 * * Bit  4	B.1 * * Bit  5	D.1 * * Bit  6	D.2 * * Bit  7	Case maps to several characters * * Bits 8-15	Reserved for future use. * * Bits 16-31	Case delta: delta for case conversions.  This should be the *			    highest field so we can easily sign extend. */static int groups\[\] = {"    set line "    "    set last [expr {[llength $groups] - 1}]    for {set i 0} {$i <= $last} {incr i} {	set val [lindex $groups $i]	append line [format "%d" $val]	if {$i != $last} {	    append line ", "	}	if {[string length $line] > 65} {	    puts $f $line	    set line "    "	}    }    puts $f $line    puts $f "};/* * Table for characters that lowercased to multiple ones */static int multiCaseTable\[\]\[4\] = {"    set last [expr {[llength $multicasemap] - 1}]    for {set i 0} {$i <= $last} {incr i} {	set val [lindex $multicasemap $i]	set line "    "	append line [format "{%d, %s}" [llength $val] [join $val ", "]]	if {$i != $last} {	    append line ", "	}	puts $f $line    }    puts $f "};/* * The following constants are used to determine the category of a * Unicode character. */#define ACMask  (1 << 0)#define C11Mask (1 << 1)#define C21Mask (1 << 2)#define B1Mask  (1 << 3)#define D1Mask  (1 << 4)#define D2Mask  (1 << 5)#define XNPMask (1 << 6)#define MCMask  (1 << 7)/* * The following macros extract the fields of the character info.  The * GetDelta() macro is complicated because we can't rely on the C compiler * to do sign extension on right shifts. */#define GetCaseType(info) (((info) & 0xE0) >> 5)#define GetCategory(info) ((info) & 0x1F)#define GetDelta(info) (((info) > 0) ? ((info) >> 16) : (~(~((info)) >> 16)))#define GetMC(info) (multiCaseTable\[GetDelta(info)\])/* * This macro extracts the information about a character from the * Unicode character tables. */#define GetUniCharInfo(ch) (groups\[groupMap\[(pageMap\[(((int)(ch)) & 0xffff) >> OFFSET_BITS\] << OFFSET_BITS) | ((ch) & ((1 << OFFSET_BITS)-1))\]\])"    close $f}uni::mainreturn

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -