📄 old_str_util.py
字号:
#!/usr/bin/python
#
# Urwid unicode character processing tables
#    Copyright (C) 2004-2006 Ian Ward
#
#    This library is free software; you can redistribute it and/or
#    modify it under the terms of the GNU Lesser General Public
#    License as published by the Free Software Foundation; either
#    version 2.1 of the License, or (at your option) any later version.
#
#    This library is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
#    Lesser General Public License for more details.
#
#    You should have received a copy of the GNU Lesser General Public
#    License along with this library; if not, write to the Free Software
#    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# Urwid web site: http://excess.org/urwid/

import re

# Matches text consisting only of printable ASCII (space through tilde).
# calc_width() uses it to skip the per-character scan for such text.
SAFE_ASCII_RE = re.compile("^[ -~]*$")
# The same pattern compiled from a byte string: Python 3 refuses to apply
# a text pattern to bytes.  On Python 2 this is the very same pattern.
_SAFE_ASCII_BYTES_RE = re.compile("^[ -~]*$".encode("ascii"))

# Byte-string and text types, spelled so that they resolve correctly on
# both Python 2 (str / unicode) and Python 3 (bytes / str) without
# relying on the b"" literal syntax.
_BYTES_TYPE = type("".encode("ascii"))
_TEXT_TYPE = type(u"")

# Current byte encoding: None until set_byte_encoding() is called, then
# one of "utf8", "narrow" or "wide".
_byte_encoding = None

# GENERATED DATA
# generated from
# http://www.unicode.org/Public/4.0-Update/EastAsianWidth-4.0.0.txt

# Each entry is (highest ordinal of the run, column width of the run);
# entries are sorted by ordinal, see get_width().
widths = [
    (126, 1),
    (159, 0),
    (687, 1),
    (710, 0),
    (711, 1),
    (727, 0),
    (733, 1),
    (879, 0),
    (1154, 1),
    (1161, 0),
    (4347, 1),
    (4447, 2),
    (7467, 1),
    (7521, 0),
    (8369, 1),
    (8426, 0),
    (9000, 1),
    (9002, 2),
    (11021, 1),
    (12350, 2),
    (12351, 1),
    (12438, 2),
    (12442, 0),
    (19893, 2),
    (19967, 1),
    (55203, 2),
    (63743, 1),
    (64106, 2),
    (65039, 1),
    (65059, 0),
    (65131, 2),
    (65279, 1),
    (65376, 2),
    (65500, 1),
    (65510, 2),
    (120831, 1),
    (262141, 2),
    (1114109, 1),
]


# ACCESSOR FUNCTIONS

def _byte_ord(b):
    """Return the integer value of one element taken from a byte string.

    Indexing a byte string yields a length-1 string on Python 2 but an
    int on Python 3; this hides the difference.
    """
    if isinstance(b, int):
        return b
    return ord(b)


def get_width(o):
    """Return the screen column width for unicode ordinal o."""
    # 0x0e and 0x0f are special-cased to zero width even though the
    # table's first run (up to 126) would report 1 for them.
    if o == 0xe or o == 0xf:
        return 0
    # The first run whose upper bound is >= o decides the width.
    for num, wid in widths:
        if o <= num:
            return wid
    # Beyond the last table entry.
    return 1


def decode_one(text, pos):
    """
    Return (ordinal at pos, next position) for UTF-8 encoded text.

    text -- byte string holding UTF-8 data
    pos -- offset of the first byte of the sequence to decode

    A malformed, truncated or overlong sequence yields
    (ord("?"), pos+1), i.e. a single byte is consumed.
    """
    b1 = _byte_ord(text[pos])
    if not b1 & 0x80:
        # plain 7-bit ASCII
        return b1, pos + 1
    error = ord("?"), pos + 1
    remaining = len(text) - pos
    if remaining < 2:
        return error
    if b1 & 0xe0 == 0xc0:
        # two-byte sequence
        b2 = _byte_ord(text[pos + 1])
        if b2 & 0xc0 != 0x80:
            return error
        o = ((b1 & 0x1f) << 6) | (b2 & 0x3f)
        if o < 0x80:
            # overlong encoding
            return error
        return o, pos + 2
    if remaining < 3:
        return error
    if b1 & 0xf0 == 0xe0:
        # three-byte sequence
        b2 = _byte_ord(text[pos + 1])
        if b2 & 0xc0 != 0x80:
            return error
        b3 = _byte_ord(text[pos + 2])
        if b3 & 0xc0 != 0x80:
            return error
        o = ((b1 & 0x0f) << 12) | ((b2 & 0x3f) << 6) | (b3 & 0x3f)
        if o < 0x800:
            # overlong encoding
            return error
        return o, pos + 3
    if remaining < 4:
        return error
    if b1 & 0xf8 == 0xf0:
        # four-byte sequence
        b2 = _byte_ord(text[pos + 1])
        if b2 & 0xc0 != 0x80:
            return error
        b3 = _byte_ord(text[pos + 2])
        if b3 & 0xc0 != 0x80:
            return error
        # The fourth byte lives at pos+3 (this used to re-read pos+2,
        # which produced wrong ordinals for every four-byte sequence).
        b4 = _byte_ord(text[pos + 3])
        if b4 & 0xc0 != 0x80:
            return error
        o = (((b1 & 0x07) << 18) | ((b2 & 0x3f) << 12)
             | ((b3 & 0x3f) << 6) | (b4 & 0x3f))
        if o < 0x10000:
            # overlong encoding
            return error
        return o, pos + 4
    return error


def decode_one_right(text, pos):
    """
    Return (ordinal at pos, next position) for UTF-8 encoded text.
    pos is assumed to be on the trailing byte of a utf-8 sequence.

    The "next position" is the offset just before the decoded sequence.
    If no leading byte is found within four bytes of pos, or before the
    start of text, (ord("?"), pos-1) is returned.
    """
    error = ord("?"), pos - 1
    p = pos
    while p >= 0:
        if _byte_ord(text[p]) & 0xc0 != 0x80:
            # found a byte that is not a continuation byte
            o, _next = decode_one(text, p)
            return o, p - 1
        p -= 1
        # A UTF-8 sequence is at most four bytes long; give up once four
        # continuation bytes have been seen.  (The original compared p
        # with p-4, which can never be true.)
        if p == pos - 4:
            return error
    # Ran off the start of text without finding a leading byte.
    return error


def set_byte_encoding(enc):
    """Set the byte encoding assumed for byte strings.

    enc -- one of 'utf8', 'narrow' or 'wide'
    """
    assert enc in ('utf8', 'narrow', 'wide')
    global _byte_encoding
    _byte_encoding = enc


def get_byte_encoding():
    """Return the byte encoding last set with set_byte_encoding()."""
    return _byte_encoding


def calc_text_pos(text, start_offs, end_offs, pref_col):
    """
    Calculate the closest position to the screen column pref_col in text
    where start_offs is the offset into text assumed to be screen column 0
    and end_offs is the end of the range to search.

    Returns (position, actual_col).
    """
    assert start_offs <= end_offs, repr((start_offs, end_offs))
    utfs = isinstance(text, _BYTES_TYPE) and _byte_encoding == "utf8"
    if isinstance(text, _TEXT_TYPE) or utfs:
        i = start_offs
        sc = 0
        while i < end_offs:
            if utfs:
                o, n = decode_one(text, i)
            else:
                o = ord(text[i])
                n = i + 1
            w = get_width(o)
            if w + sc > pref_col:
                # this character would go past the preferred column
                return i, sc
            i = n
            sc += w
        return i, sc
    assert isinstance(text, _BYTES_TYPE), repr(text)
    # "wide" and "narrow": one column per byte
    i = start_offs + pref_col
    if i >= end_offs:
        return end_offs, end_offs - start_offs
    if _byte_encoding == "wide":
        # never land on the second half of a double-byte character
        if within_double_byte(text, start_offs, i) == 2:
            i -= 1
    return i, i - start_offs


def calc_width(text, start_offs, end_offs):
    """
    Return the screen column width of text between start_offs and end_offs.
    """
    assert start_offs <= end_offs, repr((start_offs, end_offs))
    utfs = isinstance(text, _BYTES_TYPE) and _byte_encoding == "utf8"
    unis = isinstance(text, _TEXT_TYPE)
    if unis or utfs:
        if unis:
            safe_re = SAFE_ASCII_RE
        else:
            safe_re = _SAFE_ASCII_BYTES_RE
        # Text that is entirely printable ASCII is one column per
        # character, so the scan is only needed for anything else.
        if not safe_re.match(text):
            i = start_offs
            sc = 0
            while i < end_offs:
                if utfs:
                    o, n = decode_one(text, i)
                else:
                    o = ord(text[i])
                    n = i + 1
                sc += get_width(o)
                i = n
            return sc
    # "wide" and "narrow" (and all-ASCII text): one column per element
    return end_offs - start_offs


def is_wide_char(text, offs):
    """
    Test if the character at offs within text is wide.
    """
    if isinstance(text, _TEXT_TYPE):
        return get_width(ord(text[offs])) == 2
    assert isinstance(text, _BYTES_TYPE)
    if _byte_encoding == "utf8":
        o, _next = decode_one(text, offs)
        return get_width(o) == 2
    if _byte_encoding == "wide":
        return within_double_byte(text, offs, offs) == 1
    return False


def move_prev_char(text, start_offs, end_offs):
    """
    Return the position of the character before end_offs.
    """
    assert start_offs < end_offs
    if isinstance(text, _TEXT_TYPE):
        return end_offs - 1
    assert isinstance(text, _BYTES_TYPE)
    if _byte_encoding == "utf8":
        o = end_offs - 1
        # Skip back over continuation bytes, but never past start_offs:
        # an unbounded scan over malformed input would wrap around to
        # negative indexes.  Mirrors the bound in move_next_char().
        while o > start_offs and _byte_ord(text[o]) & 0xc0 == 0x80:
            o -= 1
        return o
    if (_byte_encoding == "wide"
            and within_double_byte(text, start_offs, end_offs - 1) == 2):
        return end_offs - 2
    return end_offs - 1


def move_next_char(text, start_offs, end_offs):
    """
    Return the position of the character after start_offs.
    """
    assert start_offs < end_offs
    if isinstance(text, _TEXT_TYPE):
        return start_offs + 1
    assert isinstance(text, _BYTES_TYPE)
    if _byte_encoding == "utf8":
        o = start_offs + 1
        # skip forward over continuation bytes, stopping at end_offs
        while o < end_offs and _byte_ord(text[o]) & 0xc0 == 0x80:
            o += 1
        return o
    if (_byte_encoding == "wide"
            and within_double_byte(text, start_offs, start_offs) == 1):
        return start_offs + 2
    return start_offs + 1


def within_double_byte(str, line_start, pos):
    """Return whether pos is within a double-byte encoded character.

    str -- string in question
    line_start -- offset of beginning of line (< pos)
    pos -- offset in question

    Return values:
    0 -- not within dbe char
    1 -- pos is on the 1st half of a dbe char
    2 -- pos is on the 2nd half of a dbe char
    """
    # NOTE: the parameter name shadows the builtin "str"; it is kept
    # because renaming it would break callers passing it by keyword.
    v = _byte_ord(str[pos])

    if v >= 0x40 and v < 0x7f:
        # might be second half of big5, uhc or gbk encoding
        if pos == line_start:
            return 0

        if _byte_ord(str[pos - 1]) >= 0x81:
            if within_double_byte(str, line_start, pos - 1) == 1:
                return 2
        return 0

    if v < 0x80:
        return 0

    # Walk back to the nearest byte below 0x80 (or the line start); the
    # parity of the distance tells which half of a pair pos is on.
    i = pos - 1
    while i >= line_start:
        if _byte_ord(str[i]) < 0x80:
            break
        i -= 1

    if (pos - i) & 1:
        return 1
    return 2


# TABLE GENERATION CODE

def _build_width_table(lines):
    """Parse EastAsianWidth-style lines into a list of (ordinal, width) runs.

    Consecutive code points with the same width are merged into one run
    identified by its highest ordinal.  The first run is dropped so that
    the leading control characters are treated the same as ASCII.
    """
    out = []
    last = None
    for line in lines:
        line = line.strip()
        # skip comments, and blank lines which would otherwise fail to split
        if not line or line[:1] == "#":
            continue
        code, rest = line.split(";", 1)
        wid, rest = rest.split(" # ", 1)
        word1 = rest.split(" ", 1)[0]

        if "." in code:
            # a range "XXXX..YYYY": keep the upper end
            code = code.split("..")[1]
        num = int(code, 16)

        if word1 in ("COMBINING", "MODIFIER", "<control>"):
            width = 0
        elif wid in ("W", "F"):
            width = 2
        else:
            width = 1

        if last is None:
            out.append((0, width))
            last = width

        if last == width:
            # extend the current run
            out[-1] = (num, width)
        else:
            out.append((num, width))
            last = width
    return out[1:]  # treat control characters same as ascii


def process_east_asian_width():
    """Read EastAsianWidth data on stdin and print the widths table.

    The output is the Python source for the "widths" list above.
    """
    import sys

    table = _build_width_table(sys.stdin)
    write = sys.stdout.write
    write("widths = [\n")
    for entry in table:
        write("\t" + repr(entry) + ",\n")
    write("]\n")


if __name__ == "__main__":
    process_east_asian_width()
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -