charset.py
来自「mallet是自然语言处理、机器学习领域的一个开源项目。」· Python 代码 · 共 394 行 · 第 1/2 页
PY
394 行
# Copyright (C) 2001,2002 Python Software Foundation# Author: che@debian.org (Ben Gertzfield), barry@zope.com (Barry Warsaw)from types import UnicodeTypefrom email.Encoders import encode_7or8bitimport email.base64MIMEimport email.quopriMIMEdef _isunicode(s): return isinstance(s, UnicodeType)# Python 2.2.1 and beyond has these symbolstry: True, Falseexcept NameError: True = 1 False = 0# Flags for types of header encodingsQP = 1 # Quoted-PrintableBASE64 = 2 # Base64SHORTEST = 3 # the shorter of QP and base64, but only for headers# In "=?charset?q?hello_world?=", the =?, ?q?, and ?= add up to 7MISC_LEN = 7DEFAULT_CHARSET = 'us-ascii'# DefaultsCHARSETS = { # input header enc body enc output conv 'iso-8859-1': (QP, QP, None), 'iso-8859-2': (QP, QP, None), 'iso-8859-3': (QP, QP, None), 'iso-8859-4': (QP, QP, None), # iso-8859-5 is Cyrillic, and not especially used # iso-8859-6 is Arabic, also not particularly used # iso-8859-7 is Greek, QP will not make it readable # iso-8859-8 is Hebrew, QP will not make it readable 'iso-8859-9': (QP, QP, None), 'iso-8859-10': (QP, QP, None), # iso-8859-11 is Thai, QP will not make it readable 'iso-8859-13': (QP, QP, None), 'iso-8859-14': (QP, QP, None), 'iso-8859-15': (QP, QP, None), 'windows-1252':(QP, QP, None), 'viscii': (QP, QP, None), 'us-ascii': (None, None, None), 'big5': (BASE64, BASE64, None), 'gb2312': (BASE64, BASE64, None), 'euc-jp': (BASE64, None, 'iso-2022-jp'), 'shift_jis': (BASE64, None, 'iso-2022-jp'), 'iso-2022-jp': (BASE64, None, None), 'koi8-r': (BASE64, BASE64, None), 'utf-8': (SHORTEST, BASE64, 'utf-8'), # We're making this one up to represent raw unencoded 8-bit '8bit': (None, BASE64, 'utf-8'), }# Aliases for other commonly-used names for character sets. Map# them to the real ones used in email.ALIASES = { 'latin_1': 'iso-8859-1', 'latin-1': 'iso-8859-1', 'latin_2': 'iso-8859-2', 'latin-2': 'iso-8859-2', 'latin_3': 'iso-8859-3', 'latin-3': 'iso-8859-3', 'latin_4': 'iso-8859-4', 'latin-4': 'iso-8859-4', 'latin_5': 'iso-8859-9', 'latin-5': 'iso-8859-9', 'latin_6': 'iso-8859-10', 'latin-6': 'iso-8859-10', 'latin_7': 'iso-8859-13', 'latin-7': 'iso-8859-13', 'latin_8': 'iso-8859-14', 'latin-8': 'iso-8859-14', 'latin_9': 'iso-8859-15', 'latin-9': 'iso-8859-15', 'cp949': 'ks_c_5601-1987', 'euc_jp': 'euc-jp', 'euc_kr': 'euc-kr', 'ascii': 'us-ascii', }# Map charsets to their Unicode codec strings. Note that Python doesn't come# with any Asian codecs by default. Here's where to get them:## Japanese -- http://www.asahi-net.or.jp/~rd6t-kjym/python# Korean -- http://sf.net/projects/koco# Chinese -- http://sf.net/projects/python-codecs## Note that these codecs have their own lifecycle and may be in varying states# of stability and useability.CODEC_MAP = { 'euc-jp': 'japanese.euc-jp', 'iso-2022-jp': 'japanese.iso-2022-jp', 'shift_jis': 'japanese.shift_jis', 'euc-kr': 'korean.euc-kr', 'ks_c_5601-1987': 'korean.cp949', 'iso-2022-kr': 'korean.iso-2022-kr', 'johab': 'korean.johab', 'gb2132': 'eucgb2312_cn', 'big5': 'big5_tw', 'utf-8': 'utf-8', # Hack: We don't want *any* conversion for stuff marked us-ascii, as all # sorts of garbage might be sent to us in the guise of 7-bit us-ascii. # Let that stuff pass through without conversion to/from Unicode. 'us-ascii': None, }# Convenience functions for extending the above mappingsdef add_charset(charset, header_enc=None, body_enc=None, output_charset=None): """Add character set properties to the global registry. charset is the input character set, and must be the canonical name of a character set. Optional header_enc and body_enc is either Charset.QP for quoted-printable, Charset.BASE64 for base64 encoding, Charset.SHORTEST for the shortest of qp or base64 encoding, or None for no encoding. SHORTEST is only valid for header_enc. It describes how message headers and message bodies in the input charset are to be encoded. Default is no encoding. Optional output_charset is the character set that the output should be in. Conversions will proceed from input charset, to Unicode, to the output charset when the method Charset.convert() is called. The default is to output in the same character set as the input. Both input_charset and output_charset must have Unicode codec entries in the module's charset-to-codec mapping; use add_codec(charset, codecname) to add codecs the module does not know about. See the codecs module's documentation for more information. """ if body_enc == SHORTEST: raise ValueError, 'SHORTEST not allowed for body_enc' CHARSETS[charset] = (header_enc, body_enc, output_charset)def add_alias(alias, canonical): """Add a character set alias. alias is the alias name, e.g. latin-1 canonical is the character set's canonical name, e.g. iso-8859-1 """ ALIASES[alias] = canonicaldef add_codec(charset, codecname): """Add a codec that map characters in the given charset to/from Unicode. charset is the canonical name of a character set. codecname is the name of a Python codec, as appropriate for the second argument to the unicode() built-in, or to the encode() method of a Unicode string. """ CODEC_MAP[charset] = codecnameclass Charset: """Map character sets to their email properties. This class provides information about the requirements imposed on email for a specific character set. It also provides convenience routines for converting between character sets, given the availability of the applicable codecs. Given a character set, it will do its best to provide information on how to use that character set in an email in an RFC-compliant way. Certain character sets must be encoded with quoted-printable or base64 when used in email headers or bodies. Certain character sets must be converted outright, and are not allowed in email. Instances of this module expose the following information about a character set: input_charset: The initial character set specified. Common aliases are converted to their `official' email names (e.g. latin_1 is converted to iso-8859-1). Defaults to 7-bit us-ascii. header_encoding: If the character set must be encoded before it can be used in an email header, this attribute will be set to Charset.QP (for quoted-printable), Charset.BASE64 (for base64 encoding), or Charset.SHORTEST for the shortest of QP or BASE64 encoding. Otherwise, it will be None. body_encoding: Same as header_encoding, but describes the encoding for the mail message's body, which indeed may be different than the header encoding. Charset.SHORTEST is not allowed for body_encoding.
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?