charset.py

来自「mallet是自然语言处理、机器学习领域的一个开源项目。」· Python 代码 · 共 394 行 · 第 1/2 页

PY
394
字号
# Copyright (C) 2001,2002 Python Software Foundation# Author: che@debian.org (Ben Gertzfield), barry@zope.com (Barry Warsaw)from types import UnicodeTypefrom email.Encoders import encode_7or8bitimport email.base64MIMEimport email.quopriMIMEdef _isunicode(s):    return isinstance(s, UnicodeType)# Python 2.2.1 and beyond has these symbolstry:    True, Falseexcept NameError:    True = 1    False = 0# Flags for types of header encodingsQP     = 1   # Quoted-PrintableBASE64 = 2   # Base64SHORTEST = 3 # the shorter of QP and base64, but only for headers# In "=?charset?q?hello_world?=", the =?, ?q?, and ?= add up to 7MISC_LEN = 7DEFAULT_CHARSET = 'us-ascii'# DefaultsCHARSETS = {    # input        header enc  body enc output conv    'iso-8859-1':  (QP,        QP,      None),    'iso-8859-2':  (QP,        QP,      None),    'iso-8859-3':  (QP,        QP,      None),    'iso-8859-4':  (QP,        QP,      None),    # iso-8859-5 is Cyrillic, and not especially used    # iso-8859-6 is Arabic, also not particularly used    # iso-8859-7 is Greek, QP will not make it readable    # iso-8859-8 is Hebrew, QP will not make it readable    'iso-8859-9':  (QP,        QP,      None),    'iso-8859-10': (QP,        QP,      None),    # iso-8859-11 is Thai, QP will not make it readable    'iso-8859-13': (QP,        QP,      None),    'iso-8859-14': (QP,        QP,      None),    'iso-8859-15': (QP,        QP,      None),    'windows-1252':(QP,        QP,      None),    'viscii':      (QP,        QP,      None),    'us-ascii':    (None,      None,    None),    'big5':        (BASE64,    BASE64,  None),    'gb2312':      (BASE64,    BASE64,  None),    'euc-jp':      (BASE64,    None,    'iso-2022-jp'),    'shift_jis':   (BASE64,    None,    'iso-2022-jp'),    'iso-2022-jp': (BASE64,    None,    None),    'koi8-r':      (BASE64,    BASE64,  None),    'utf-8':       (SHORTEST,  BASE64, 'utf-8'),    # We're making this one up to represent raw unencoded 8-bit    '8bit':        (None,      BASE64, 'utf-8'),    }# Aliases for other commonly-used names for character sets.  Map# them to the real ones used in email.ALIASES = {    'latin_1': 'iso-8859-1',    'latin-1': 'iso-8859-1',    'latin_2': 'iso-8859-2',    'latin-2': 'iso-8859-2',    'latin_3': 'iso-8859-3',    'latin-3': 'iso-8859-3',    'latin_4': 'iso-8859-4',    'latin-4': 'iso-8859-4',    'latin_5': 'iso-8859-9',    'latin-5': 'iso-8859-9',    'latin_6': 'iso-8859-10',    'latin-6': 'iso-8859-10',    'latin_7': 'iso-8859-13',    'latin-7': 'iso-8859-13',    'latin_8': 'iso-8859-14',    'latin-8': 'iso-8859-14',    'latin_9': 'iso-8859-15',    'latin-9': 'iso-8859-15',    'cp949':   'ks_c_5601-1987',    'euc_jp':  'euc-jp',    'euc_kr':  'euc-kr',    'ascii':   'us-ascii',    }# Map charsets to their Unicode codec strings.  Note that Python doesn't come# with any Asian codecs by default.  Here's where to get them:## Japanese -- http://www.asahi-net.or.jp/~rd6t-kjym/python# Korean   -- http://sf.net/projects/koco# Chinese  -- http://sf.net/projects/python-codecs## Note that these codecs have their own lifecycle and may be in varying states# of stability and useability.CODEC_MAP = {    'euc-jp':      'japanese.euc-jp',    'iso-2022-jp': 'japanese.iso-2022-jp',    'shift_jis':   'japanese.shift_jis',    'euc-kr':      'korean.euc-kr',    'ks_c_5601-1987': 'korean.cp949',    'iso-2022-kr': 'korean.iso-2022-kr',    'johab':       'korean.johab',    'gb2132':      'eucgb2312_cn',    'big5':        'big5_tw',    'utf-8':       'utf-8',    # Hack: We don't want *any* conversion for stuff marked us-ascii, as all    # sorts of garbage might be sent to us in the guise of 7-bit us-ascii.    # Let that stuff pass through without conversion to/from Unicode.    'us-ascii':    None,    }# Convenience functions for extending the above mappingsdef add_charset(charset, header_enc=None, body_enc=None, output_charset=None):    """Add character set properties to the global registry.    charset is the input character set, and must be the canonical name of a    character set.    Optional header_enc and body_enc is either Charset.QP for    quoted-printable, Charset.BASE64 for base64 encoding, Charset.SHORTEST for    the shortest of qp or base64 encoding, or None for no encoding.  SHORTEST    is only valid for header_enc.  It describes how message headers and    message bodies in the input charset are to be encoded.  Default is no    encoding.    Optional output_charset is the character set that the output should be    in.  Conversions will proceed from input charset, to Unicode, to the    output charset when the method Charset.convert() is called.  The default    is to output in the same character set as the input.    Both input_charset and output_charset must have Unicode codec entries in    the module's charset-to-codec mapping; use add_codec(charset, codecname)    to add codecs the module does not know about.  See the codecs module's    documentation for more information.    """    if body_enc == SHORTEST:        raise ValueError, 'SHORTEST not allowed for body_enc'    CHARSETS[charset] = (header_enc, body_enc, output_charset)def add_alias(alias, canonical):    """Add a character set alias.    alias is the alias name, e.g. latin-1    canonical is the character set's canonical name, e.g. iso-8859-1    """    ALIASES[alias] = canonicaldef add_codec(charset, codecname):    """Add a codec that map characters in the given charset to/from Unicode.    charset is the canonical name of a character set.  codecname is the name    of a Python codec, as appropriate for the second argument to the unicode()    built-in, or to the encode() method of a Unicode string.    """    CODEC_MAP[charset] = codecnameclass Charset:    """Map character sets to their email properties.    This class provides information about the requirements imposed on email    for a specific character set.  It also provides convenience routines for    converting between character sets, given the availability of the    applicable codecs.  Given a character set, it will do its best to provide    information on how to use that character set in an email in an    RFC-compliant way.    Certain character sets must be encoded with quoted-printable or base64    when used in email headers or bodies.  Certain character sets must be    converted outright, and are not allowed in email.  Instances of this    module expose the following information about a character set:    input_charset: The initial character set specified.  Common aliases                   are converted to their `official' email names (e.g. latin_1                   is converted to iso-8859-1).  Defaults to 7-bit us-ascii.    header_encoding: If the character set must be encoded before it can be                     used in an email header, this attribute will be set to                     Charset.QP (for quoted-printable), Charset.BASE64 (for                     base64 encoding), or Charset.SHORTEST for the shortest of                     QP or BASE64 encoding.  Otherwise, it will be None.    body_encoding: Same as header_encoding, but describes the encoding for the                   mail message's body, which indeed may be different than the                   header encoding.  Charset.SHORTEST is not allowed for                   body_encoding.

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?