首页 › 资源下载 › 其他 › tcl是工具命令语言 › 源码查看
uniparse.tcl

来自「tcl是工具命令语言」· TCL 代码 · 共 387 行
TCL
387 行
# uniParse.tcl --##	This program parses the UnicodeData file and generates the#	corresponding tclUniData.c file with compressed character#	data tables.  The input to this program should be the latest#	UnicodeData file from:#	    ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData-Latest.txt## Copyright (c) 1998-1999 by Scriptics Corporation.# All rights reserved.# # RCS: @(#) $Id: uniParse.tcl,v 1.4 2001/05/28 04:37:57 hobbs Exp $namespace eval uni {    set shift 5;		# number of bits of data within a page				# This value can be adjusted to find the				# best split to minimize table size    variable pMap;		# map from page to page index, each entry is				# an index into the pages table, indexed by				# page number    variable pages;		# map from page index to page info, each				# entry is a list of indices into the groups				# table, the list is indexed by the offset    variable groups;		# list of character info values, indexed by				# group number, initialized with the				# unassigned character group    variable categories {	Cn Lu Ll Lt Lm Lo Mn Me Mc Nd Nl No Zs Zl Zp	Cc Cf Co Cs Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So    };				# Ordered list of character categories, must				# match the enumeration in the header file.    variable titleCount 0;	# Count of the number of title case				# characters.  This value is used in the				# regular expression code to allocate enough				# space for the title case variants.}proc uni::getValue {items index} {    variable categories    variable titleCount    # Extract character info    set category [lindex $items 2]    if {[scan [lindex $items 12] %4x toupper] == 1} {	set toupper [expr {$index - $toupper}]    } else {	set toupper {}    }    if {[scan [lindex $items 13] %4x tolower] == 1} {	set tolower [expr {$tolower - $index}]    } else {	set tolower {}    }    if {[scan [lindex $items 14] %4x totitle] == 1} {	set totitle [expr {$index - $totitle}]    } else {	set totitle {}    }    set categoryIndex [lsearch -exact $categories $category]    if {$categoryIndex < 0} {	puts "Unexpected character category: $index($category)"	set categoryIndex 0    } elseif {$category == "Lt"} {	incr titleCount    }    return "$categoryIndex,$toupper,$tolower,$totitle"}proc uni::getGroup {value} {    variable groups    set gIndex [lsearch -exact $groups $value]    if {$gIndex == -1} {	set gIndex [llength $groups]	lappend groups $value    }    return $gIndex}proc uni::addPage {info} {    variable pMap    variable pages        set pIndex [lsearch -exact $pages $info]    if {$pIndex == -1} {	set pIndex [llength $pages]	lappend pages $info    }    lappend pMap $pIndex    return}    proc uni::buildTables {data} {    variable shift    variable pMap {}    variable pages {}    variable groups {{0,,,}}    set info {}			;# temporary page info        set mask [expr {(1 << $shift) - 1}]    set next 0    foreach line [split $data \n] {	if {$line == ""} {	    set line "FFFF;;Cn;0;ON;;;;;N;;;;;\n"	}	set items [split $line \;]	scan [lindex $items 0] %4x index	set index [format 0x%0.4x $index]		set gIndex [getGroup [getValue $items $index]]	# Since the input table omits unassigned characters, these will	# show up as gaps in the index sequence.  There are a few special cases	# where the gaps correspond to a uniform block of assigned characters.	# These are indicated as such in the character name.	# Enter all unassigned characters up to the current character.	if {($index > $next) \		&& ![regexp "Last>$" [lindex $items 1]]} {	    for {} {$next < $index} {incr next} {		lappend info 0		if {($next & $mask) == $mask} {		    addPage $info		    set info {}		}	    }	}	# Enter all assigned characters up to the current character	for {set i $next} {$i <= $index} {incr i} {	    # Split character index into offset and page number	    set offset [expr {$i & $mask}]	    set page [expr {($i >> $shift)}]	    # Add the group index to the info for the current page	    lappend info $gIndex	    # If this is the last entry in the page, add the page	    if {$offset == $mask} {		addPage $info		set info {}	    }	}	set next [expr {$index + 1}]    }    return}proc uni::main {} {    global argc argv0 argv    variable pMap    variable pages    variable groups    variable shift    variable titleCount    if {$argc != 2} {	puts stderr "\nusage: $argv0 <datafile> <outdir>\n"	exit 1    }    set f [open [lindex $argv 0] r]    set data [read $f]    close $f    buildTables $data    puts "X = [llength $pMap]  Y= [llength $pages]  A= [llength $groups]"    set size [expr {[llength $pMap] + [llength $pages]*(1<<$shift)}]    puts "shift = 6, space = $size"    puts "title case count = $titleCount"    set f [open [file join [lindex $argv 1] tclUniData.c] w]    fconfigure $f -translation lf    puts $f "/* * tclUniData.c -- * *	Declarations of Unicode character information tables.  This file is *	automatically generated by the tools/uniParse.tcl script.  Do not *	modify this file by hand. * * Copyright (c) 1998 by Scriptics Corporation. * All rights reserved. * * RCS: @(#) \$Id\$ *//* * A 16-bit Unicode character is split into two parts in order to index * into the following tables.  The lower OFFSET_BITS comprise an offset * into a page of characters.  The upper bits comprise the page number. */#define OFFSET_BITS $shift/* * The pageMap is indexed by page number and returns an alternate page number * that identifies a unique page of characters.  Many Unicode characters map * to the same alternate page number. */static unsigned char pageMap\[\] = {"    set line "    "    set last [expr {[llength $pMap] - 1}]    for {set i 0} {$i <= $last} {incr i} {	append line [lindex $pMap $i]	if {$i != $last} {	    append line ", "	}	if {[string length $line] > 70} {	    puts $f $line	    set line "    "	}    }    puts $f $line    puts $f "};/* * The groupMap is indexed by combining the alternate page number with * the page offset and returns a group number that identifies a unique * set of character attributes. */static unsigned char groupMap\[\] = {"    set line "    "    set lasti [expr {[llength $pages] - 1}]    for {set i 0} {$i <= $lasti} {incr i} {	set page [lindex $pages $i]	set lastj [expr {[llength $page] - 1}]	for {set j 0} {$j <= $lastj} {incr j} {	    append line [lindex $page $j]	    if {$j != $lastj || $i != $lasti} {		append line ", "	    }	    if {[string length $line] > 70} {		puts $f $line		set line "    "	    }	}    }    puts $f $line    puts $f "};/* * Each group represents a unique set of character attributes.  The attributes * are encoded into a 32-bit value as follows: * * Bits 0-4	Character category: see the constants listed below. * * Bits 5-7	Case delta type: 000 = identity *				 010 = add delta for lower *				 011 = add delta for lower, add 1 for title *				 100 = sutract delta for title/upper *				 101 = sub delta for upper, sub 1 for title *				 110 = sub delta for upper, add delta for lower * * Bits 8-21	Reserved for future use. * * Bits 22-31	Case delta: delta for case conversions.  This should be the *			    highest field so we can easily sign extend. */static int groups\[\] = {"    set line "    "    set last [expr {[llength $groups] - 1}]    for {set i 0} {$i <= $last} {incr i} {	foreach {type toupper tolower totitle} [split [lindex $groups $i] ,] {}		# Compute the case conversion type and delta	if {$totitle != ""} {	    if {$totitle == $toupper} {		# subtract delta for title or upper		set case 4		set delta $toupper	    } elseif {$toupper != ""} {		# subtract delta for upper, subtract 1 for title		set case 5		set delta $toupper	    } else {		# add delta for lower, add 1 for title		set case 3		set delta $tolower	    }	} elseif {$toupper != ""} {	    # subtract delta for upper, add delta for lower	    set case 6	    set delta $toupper	} elseif {$tolower != ""} {	    # add delta for lower	    set case 2	    set delta $tolower	} else {	    # noop	    set case 0	    set delta 0	}	set val [expr {($delta << 22) | ($case << 5) | $type}]	append line [format "%d" $val]	if {$i != $last} {	    append line ", "	}	if {[string length $line] > 65} {	    puts $f $line	    set line "    "	}    }    puts $f $line    puts $f "};/* * The following constants are used to determine the category of a * Unicode character. */#define UNICODE_CATEGORY_MASK 0X1Fenum {    UNASSIGNED,    UPPERCASE_LETTER,    LOWERCASE_LETTER,    TITLECASE_LETTER,    MODIFIER_LETTER,    OTHER_LETTER,    NON_SPACING_MARK,    ENCLOSING_MARK,    COMBINING_SPACING_MARK,    DECIMAL_DIGIT_NUMBER,    LETTER_NUMBER,    OTHER_NUMBER,    SPACE_SEPARATOR,    LINE_SEPARATOR,    PARAGRAPH_SEPARATOR,    CONTROL,    FORMAT,    PRIVATE_USE,    SURROGATE,    CONNECTOR_PUNCTUATION,    DASH_PUNCTUATION,    OPEN_PUNCTUATION,    CLOSE_PUNCTUATION,    INITIAL_QUOTE_PUNCTUATION,    FINAL_QUOTE_PUNCTUATION,    OTHER_PUNCTUATION,    MATH_SYMBOL,    CURRENCY_SYMBOL,    MODIFIER_SYMBOL,    OTHER_SYMBOL};/* * The following macros extract the fields of the character info.  The * GetDelta() macro is complicated because we can't rely on the C compiler * to do sign extension on right shifts. */#define GetCaseType(info) (((info) & 0xE0) >> 5)#define GetCategory(info) ((info) & 0x1F)#define GetDelta(info) (((info) > 0) ? ((info) >> 22) : (~(~((info)) >> 22)))/* * This macro extracts the information about a character from the * Unicode character tables. */#define GetUniCharInfo(ch) (groups\[groupMap\[(pageMap\[(((int)(ch)) & 0xffff) >> OFFSET_BITS\] << OFFSET_BITS) | ((ch) & ((1 << OFFSET_BITS)-1))\]\])"    close $f}uni::mainreturn
uniparse.tcl - 源码说明

本页面展示了「tcl是工具命令语言」中的 uniparse.tcl 源码文件，采用 TCL 编程语言编写，共 387 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫开发者社区收录了大量与TCL语言相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?