genunicode.py

来自「xml开源解析代码.版本为libxml2-2.6.29,可支持GB2312.网络」· Python 代码 · 共 479 行
479 行
#!/usr/bin/python -u
#
# Original script modified in November 2003 to take advantage of
# the character-validation range routines, and updated to the
# current Unicode information (Version 4.0.1)
#
# NOTE: there is an 'alias' facility for blocks which are not present in
#       the current release, but are needed for ABI compatibility.  This
#       must be accomplished MANUALLY!  Please see the comments below under
#       'blockAliases'
#
# What the script does:
#   1. reads the Unicode "blocks" file into {block name: [(start, end), ...]}
#   2. reads the character data file into {category: [code point, ...]},
#      keyed both by the full category name and by its first letter
#   3. collapses each category's code points into inclusive ranges
#   4. writes include/libxml/xmlunicode.h and xmlunicode.c
#
# The code is written so that it runs unchanged on Python 2.6+ and
# Python 3: no print statements, no string-module functions, no has_key().
#

import sys
import string  # noqa: F401 -- kept from the original import list
import time

webpage = "http://www.unicode.org/Public/4.0-Update1/UCD-4.0.1.html"
sources = "Blocks-4.0.1.txt UnicodeData-4.0.1.txt"

#
# blockAliases is a small hack - it is used for mapping block names which
# were used in the 3.1 release, but are missing or changed in the current
# release.  The format is "OldBlockName:NewBlockName1[,NewBlockName2[,...]]"
blockAliases = []
blockAliases.append("CombiningMarksforSymbols:CombiningDiacriticalMarksforSymbols")
blockAliases.append("Greek:GreekandCoptic")
blockAliases.append("PrivateUse:PrivateUseArea,SupplementaryPrivateUseArea-A," +
                    "SupplementaryPrivateUseArea-B")

# minTableSize gives the minimum number of ranges which must be present
# before a range table is produced.  If there are less than this
# number, inline comparisons are generated
minTableSize = 8

(blockfile, catfile) = sources.split()

# Output locations, relative to the current working directory.
_HEADER_PATH = "include/libxml/xmlunicode.h"
_OUTPUT_PATH = "xmlunicode.c"

#
# Templates for the generated C text.  Tabs are spelled "\t" so that the
# emitted bytes do not depend on how an editor treats this file.
#
_HEADER_PROLOGUE = """/*
 * Summary: Unicode character APIs
 * Description: API for the Unicode character APIs
 *
 * This file is automatically generated from the
 * UCS description files of the Unicode Character Database
 * %s
 * using the genUnicode.py Python script.
 *
 * Generation date: %s
 * Sources: %s
 * Author: Daniel Veillard
 */

#ifndef __XML_UNICODE_H__
#define __XML_UNICODE_H__

#include <libxml/xmlversion.h>

#ifdef LIBXML_UNICODE_ENABLED

#ifdef __cplusplus
extern "C" {
#endif

"""

_HEADER_EPILOGUE = """
#ifdef __cplusplus
}
#endif

#endif /* LIBXML_UNICODE_ENABLED */

#endif /* __XML_UNICODE_H__ */
"""

_C_PROLOGUE = """/*
 * xmlunicode.c: this module implements the Unicode character APIs
 *
 * This file is automatically generated from the
 * UCS description files of the Unicode Character Database
 * %s
 * using the genUnicode.py Python script.
 *
 * Generation date: %s
 * Sources: %s
 * Daniel Veillard <veillard@redhat.com>
 */

#define IN_LIBXML
#include "libxml.h"

#ifdef LIBXML_UNICODE_ENABLED

#include <string.h>
#include <libxml/xmlversion.h>
#include <libxml/xmlunicode.h>
#include <libxml/chvalid.h>

typedef int (xmlIntFunc)(int);\t/* just to keep one's mind untwisted */

typedef struct {
    const char *rangename;
    xmlIntFunc *func;
} xmlUnicodeRange;

typedef struct {
    xmlUnicodeRange *table;
    int\t\t    numentries;
} xmlUnicodeNameTable;


static xmlIntFunc *xmlUnicodeLookup(xmlUnicodeNameTable *tptr, const char *tname);

static xmlUnicodeRange xmlUnicodeBlocks[] = {
"""

_C_LOOKUP = """static xmlUnicodeNameTable xmlUnicodeBlockTbl = {xmlUnicodeBlocks, %s};
static xmlUnicodeNameTable xmlUnicodeCatTbl = {xmlUnicodeCats, %s};

/**
 * xmlUnicodeLookup:
 * @tptr: pointer to the name table
 * @name: name to be found
 *
 * binary table lookup for user-supplied name
 *
 * Returns pointer to range function if found, otherwise NULL
 */
static xmlIntFunc
*xmlUnicodeLookup(xmlUnicodeNameTable *tptr, const char *tname) {
    int low, high, mid, cmp;
    xmlUnicodeRange *sptr;

    if ((tptr == NULL) || (tname == NULL)) return(NULL);

    low = 0;
    high = tptr->numentries - 1;
    sptr = tptr->table;
    while (low <= high) {
\tmid = (low + high) / 2;
\tif ((cmp=strcmp(tname, sptr[mid].rangename)) == 0)
\t    return (sptr[mid].func);
\tif (cmp < 0)
\t    high = mid - 1;
\telse
\t    low = mid + 1;
    }
    return (NULL);
}

"""

_C_IS_BLOCK = """/**
 * xmlUCSIsBlock:
 * @code: UCS code point
 * @block: UCS block name
 *
 * Check whether the character is part of the UCS Block
 *
 * Returns 1 if true, 0 if false and -1 on unknown block
 */
int
xmlUCSIsBlock(int code, const char *block) {
    xmlIntFunc *func;

    func = xmlUnicodeLookup(&xmlUnicodeBlockTbl, block);
    if (func == NULL)
\treturn (-1);
    return (func(code));
}

"""

_C_IS_CAT = """/**
 * xmlUCSIsCat:
 * @code: UCS code point
 * @cat: UCS Category name
 *
 * Check whether the character is part of the UCS Category
 *
 * Returns 1 if true, 0 if false and -1 on unknown category
 */
int
xmlUCSIsCat(int code, const char *cat) {
    xmlIntFunc *func;

    func = xmlUnicodeLookup(&xmlUnicodeCatTbl, cat);
    if (func == NULL)
\treturn (-1);
    return (func(code));
}

#define bottom_xmlunicode
#include "elfgcchack.h"
#endif /* LIBXML_UNICODE_ENABLED */
"""

# Separator placed between the OR-ed comparisons of a generated C "return(".
_OR_JOIN = " ||\n           "


def _open_or_exit(path, mode, message):
    """Open *path* in *mode*; on failure print *message* and exit with 1."""
    try:
        return open(path, mode)
    except IOError:
        print(message)
        sys.exit(1)


def _read_blocks(path):
    """Parse the blocks file into {name: [(start, end), ...]}.

    Lines starting with '#' and blank lines are skipped.  Each remaining
    line is expected to look like "XXXX..YYYY; Block Name".  Spaces are
    removed from the name and both bounds are stored as "0x"-prefixed
    strings, ready to be pasted into C source.  Lines that do not fit the
    pattern are reported and ignored.
    """
    block_names = {}
    with _open_or_exit(path, "r", "Missing %s, aborting ..." % path) as blocks:
        for raw in blocks:
            if raw.startswith('#'):
                continue
            line = raw.strip()
            if not line:
                continue
            try:
                fields = line.split(';')
                (start, end) = fields[0].strip().split("..")
                name = fields[1].strip().replace(' ', '')
            except (IndexError, ValueError):
                print("Failed to process line: %s" % (line))
                continue
            block_names.setdefault(name, []).append(("0x" + start, "0x" + end))
    return block_names


def _apply_block_aliases(block_names, aliases):
    """Add the old block names listed in *aliases* to *block_names* in place.

    Each alias is "Old:New1[,New2...]"; the old name receives the ranges of
    every new name that exists.  Missing new names are reported.
    """
    for entry in aliases:
        alias = entry.split(':')
        for comp in alias[1].split(','):
            if comp in block_names:
                block_names.setdefault(alias[0], []).extend(block_names[comp])
            else:
                print("Alias %s: %s not in Blocks" % (alias[0], comp))


def _read_categories(path):
    """Parse the character data file.

    The file is in code sequence, so it is inverted here: the result maps a
    category name to the list of code points carrying it.  Every character
    is entered twice - under its full category name (third ';'-separated
    field) and under that name's first character (the "general category").

    Returns (number of characters accepted, {category: [code point, ...]}).
    """
    # NOTE(review): UnicodeData.txt is understood to describe some large
    # ranges as a "<..., First>" / "<..., Last>" pair of lines; like the
    # original, this records only the two end points - confirm that is
    # what the consumers of the generated tables expect.
    nbchar = 0
    categories = {}
    with _open_or_exit(path, "r", "Missing %s, aborting ..." % path) as data:
        for raw in data:
            if raw.startswith('#'):
                continue
            line = raw.strip()
            if not line:
                continue
            try:
                fields = line.split(';')
                value = int(fields[0].strip(), 16)
                name = fields[2]
                if not name:
                    raise ValueError("empty category field")
            except (IndexError, ValueError):
                print("Failed to process line: %s" % (line))
                continue
            nbchar += 1
            categories.setdefault(name, []).append(value)
            categories.setdefault(name[0], []).append(value)
    return (nbchar, categories)


def _collapse_ranges(values):
    """Collapse code points into inclusive (first, last) runs.

    Consecutive values (each one more than the previous) are merged; a lone
    value becomes (v, v).  The input order is kept, so the caller must pass
    values in ascending order to get sorted ranges.
    """
    ranges = []
    start = prev = None
    for val in values:
        if start is None:
            start = prev = val
        elif val == prev + 1:
            prev = val
        else:
            ranges.append((start, prev))
            start = prev = val
    if start is not None:
        ranges.append((start, prev))
    return ranges


def _write_name_tables(output, bkeys, ckeys):
    """Emit the name -> function tables searched by xmlUnicodeLookup."""
    # The opening "xmlUnicodeBlocks[] = {" line is the tail of _C_PROLOGUE.
    output.write(',\n'.join('  {"%s", xmlUCSIs%s}' % (block, block.replace('-', ''))
                            for block in bkeys))
    output.write('};\n\n')
    output.write('static xmlUnicodeRange xmlUnicodeCats[] = {\n')
    output.write(',\n'.join('  {"%s", xmlUCSIsCat%s}' % (name, name)
                            for name in ckeys))
    output.write('};\n\n')


def _write_range_tables(output, categories, ckeys):
    """Emit an xmlChRangeGroup for every category above minTableSize ranges.

    Ranges whose upper bound is below 0x10000 go to the "short" table, the
    others to the "long" table.  The short table is closed when the first
    long range is met, so short ranges must all come first in the list.
    """
    for name in ckeys:
        ranges = categories[name]
        if len(ranges) <= minTableSize:
            continue
        numshort = 0
        numlong = 0
        sptr = "NULL"
        lptr = "NULL"
        pline = ""
        for (low, high) in ranges:
            if high < 0x10000:
                if numshort == 0:
                    pline = "static const xmlChSRange xml%sS[] = {" % name
                    sptr = "xml%sS" % name
                else:
                    pline += ", "
                numshort += 1
            else:
                if numlong == 0:
                    if numshort > 0:
                        output.write(pline + " };\n")
                    pline = "static const xmlChLRange xml%sL[] = {" % name
                    lptr = "xml%sL" % name
                else:
                    pline += ", "
                numlong += 1
            # wrap the generated initializer once a line grows past 60 chars
            if len(pline) > 60:
                output.write(pline + "\n")
                pline = "    "
            pline += "{%s, %s}" % (hex(low), hex(high))
        output.write(pline + " };\nstatic xmlChRangeGroup xml%sG = {%s,%s,%s,%s};\n\n"
                     % (name, numshort, numlong, sptr, lptr))


def _write_block_functions(header, output, block_names, bkeys):
    """Emit one xmlUCSIs<Block>() prototype and definition per block."""
    for block in bkeys:
        name = block.replace('-', '')
        header.write("XMLPUBFUN int XMLCALL xmlUCSIs%s\t(int code);\n" % name)
        output.write("/**\n * xmlUCSIs%s:\n * @code: UCS code point\n" % (name))
        output.write(" *\n * Check whether the character is part of %s UCS Block\n" %
                     (block))
        output.write(" *\n * Returns 1 if true 0 otherwise\n */\n")
        output.write("int\nxmlUCSIs%s(int code) {\n    return(" % name)
        output.write(_OR_JOIN.join("((code >= %s) && (code <= %s))" % (start, end)
                                   for (start, end) in block_names[block]))
        output.write(");\n}\n\n")


def _write_category_functions(header, output, categories, ckeys):
    """Emit one xmlUCSIsCat<Name>() prototype and definition per category.

    Categories with more than minTableSize ranges call xmlCharInRange on
    their range group; the others get inline comparisons.
    """
    for name in ckeys:
        ranges = categories[name]
        header.write("XMLPUBFUN int XMLCALL xmlUCSIsCat%s\t(int code);\n" % name)
        output.write("/**\n * xmlUCSIsCat%s:\n * @code: UCS code point\n" % (name))
        output.write(" *\n * Check whether the character is part of %s UCS Category\n" %
                     (name))
        output.write(" *\n * Returns 1 if true 0 otherwise\n */\n")
        output.write("int\nxmlUCSIsCat%s(int code) {\n" % name)
        if len(ranges) > minTableSize:
            output.write("    return(xmlCharInRange((unsigned int)code, &xml%sG)"
                         % name)
        else:
            tests = []
            for (begin, end) in ranges:
                if begin == end:
                    tests.append("(code == %s)" % (hex(begin)))
                else:
                    tests.append("((code >= %s) && (code <= %s))" % (
                        hex(begin), hex(end)))
            output.write("    return(" + _OR_JOIN.join(tests))
        output.write(");\n}\n\n")


def _generate(header, output, block_names, categories):
    """Write the complete header and C source to the two open files."""
    # Assure all data is in alphabetic order, since the generated C code
    # does binary searches on the tables.
    bkeys = sorted(block_names)
    ckeys = sorted(categories)

    date = time.asctime(time.localtime(time.time()))
    header.write(_HEADER_PROLOGUE % (webpage, date, sources))
    output.write(_C_PROLOGUE % (webpage, date, sources))

    _write_name_tables(output, bkeys, ckeys)
    _write_range_tables(output, categories, ckeys)
    output.write(_C_LOOKUP % (len(block_names), len(categories)))

    _write_block_functions(header, output, block_names, bkeys)
    header.write("\nXMLPUBFUN int XMLCALL xmlUCSIsBlock\t(int code, const char *block);\n\n")
    output.write(_C_IS_BLOCK)

    _write_category_functions(header, output, categories, ckeys)
    header.write("\nXMLPUBFUN int XMLCALL xmlUCSIsCat\t(int code, const char *cat);\n")
    output.write(_C_IS_CAT)

    header.write(_HEADER_EPILOGUE)


def main():
    """Read the two Unicode data files and regenerate the libxml2 sources.

    Input files (named in 'sources') are read from, and the outputs written
    relative to, the current working directory.  Exits with status 1 when
    an input is missing or an output cannot be opened.
    """
    block_names = _read_blocks(blockfile)
    print("Parsed %d blocks descriptions" % (len(block_names)))
    _apply_block_aliases(block_names, blockAliases)

    (nbchar, points) = _read_categories(catfile)
    print("Parsed %d char generating %d categories" % (nbchar, len(points)))

    # The data is now all read.  Reduce the number lists into ranges.
    categories = dict((cat, _collapse_ranges(values))
                      for (cat, values) in points.items())

    with _open_or_exit(_HEADER_PATH, "w",
                       "Failed to open include/libxml/xmlunicode.h") as header:
        with _open_or_exit(_OUTPUT_PATH, "w",
                           "Failed to open xmlunicode.c") as output:
            _generate(header, output, block_names, categories)


if __name__ == "__main__":
    main()
genunicode.py - 源码说明

本页面展示了「xml开源解析代码.版本为libxml2-2.6.29,可支持GB2312.网络消息发送XML时很有用.」中的 genunicode.py 源码文件，采用 Python 编程语言编写，共 479 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫开发者社区收录了大量与libxml2相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?