charset.py

来自「mallet是自然语言处理、机器学习领域的一个开源项目。」· Python 代码 · 共 394 行 · 第 1/2 页
394 行
    output_charset: Some character sets must be converted before the can be                    used in email headers or bodies.  If the input_charset is                    one of them, this attribute will contain the name of the                    charset output will be converted to.  Otherwise, it will                    be None.    input_codec: The name of the Python codec used to convert the                 input_charset to Unicode.  If no conversion codec is                 necessary, this attribute will be None.    output_codec: The name of the Python codec used to convert Unicode                  to the output_charset.  If no conversion codec is necessary,                  this attribute will have the same value as the input_codec.    """    def __init__(self, input_charset=DEFAULT_CHARSET):        # RFC 2046, $4.1.2 says charsets are not case sensitive        input_charset = input_charset.lower()        # Set the input charset after filtering through the aliases        self.input_charset = ALIASES.get(input_charset, input_charset)        # We can try to guess which encoding and conversion to use by the        # charset_map dictionary.  Try that first, but let the user override        # it.        henc, benc, conv = CHARSETS.get(self.input_charset,                                        (SHORTEST, BASE64, None))        # Set the attributes, allowing the arguments to override the default.        self.header_encoding = henc        self.body_encoding = benc        self.output_charset = ALIASES.get(conv, conv)        # Now set the codecs.  If one isn't defined for input_charset,        # guess and try a Unicode codec with the same name as input_codec.        self.input_codec = CODEC_MAP.get(self.input_charset,                                         self.input_charset)        self.output_codec = CODEC_MAP.get(self.output_charset,                                            self.input_codec)    def __str__(self):        return self.input_charset.lower()    __repr__ = __str__    def __eq__(self, other):        return str(self) == str(other).lower()    def __ne__(self, other):        return not self.__eq__(other)    def get_body_encoding(self):        """Return the content-transfer-encoding used for body encoding.        This is either the string `quoted-printable' or `base64' depending on        the encoding used, or it is a function in which case you should call        the function with a single argument, the Message object being        encoded.  The function should then set the Content-Transfer-Encoding        header itself to whatever is appropriate.        Returns "quoted-printable" if self.body_encoding is QP.        Returns "base64" if self.body_encoding is BASE64.        Returns "7bit" otherwise.        """        assert self.body_encoding <> SHORTEST        if self.body_encoding == QP:            return 'quoted-printable'        elif self.body_encoding == BASE64:            return 'base64'        else:            return encode_7or8bit    def convert(self, s):        """Convert a string from the input_codec to the output_codec."""        if self.input_codec <> self.output_codec:            return unicode(s, self.input_codec).encode(self.output_codec)        else:            return s    def to_splittable(self, s):        """Convert a possibly multibyte string to a safely splittable format.        Uses the input_codec to try and convert the string to Unicode, so it        can be safely split on character boundaries (even for multibyte        characters).        Returns the string as-is if it isn't known how to convert it to        Unicode with the input_charset.        Characters that could not be converted to Unicode will be replaced        with the Unicode replacement character U+FFFD.        """        if _isunicode(s) or self.input_codec is None:            return s        try:            return unicode(s, self.input_codec, 'replace')        except LookupError:            # Input codec not installed on system, so return the original            # string unchanged.            return s    def from_splittable(self, ustr, to_output=True):        """Convert a splittable string back into an encoded string.        Uses the proper codec to try and convert the string from Unicode back        into an encoded format.  Return the string as-is if it is not Unicode,        or if it could not be converted from Unicode.        Characters that could not be converted from Unicode will be replaced        with an appropriate character (usually '?').        If to_output is True (the default), uses output_codec to convert to an        encoded format.  If to_output is False, uses input_codec.        """        if to_output:            codec = self.output_codec        else:            codec = self.input_codec        if not _isunicode(ustr) or codec is None:            return ustr        try:            return ustr.encode(codec, 'replace')        except LookupError:            # Output codec not installed            return ustr    def get_output_charset(self):        """Return the output character set.        This is self.output_charset if that is not None, otherwise it is        self.input_charset.        """        return self.output_charset or self.input_charset    def encoded_header_len(self, s):        """Return the length of the encoded header string."""        cset = self.get_output_charset()        # The len(s) of a 7bit encoding is len(s)        if self.header_encoding == BASE64:            return email.base64MIME.base64_len(s) + len(cset) + MISC_LEN        elif self.header_encoding == QP:            return email.quopriMIME.header_quopri_len(s) + len(cset) + MISC_LEN        elif self.header_encoding == SHORTEST:            lenb64 = email.base64MIME.base64_len(s)            lenqp = email.quopriMIME.header_quopri_len(s)            return min(lenb64, lenqp) + len(cset) + MISC_LEN        else:            return len(s)    def header_encode(self, s, convert=False):        """Header-encode a string, optionally converting it to output_charset.        If convert is True, the string will be converted from the input        charset to the output charset automatically.  This is not useful for        multibyte character sets, which have line length issues (multibyte        characters must be split on a character, not a byte boundary); use the        high-level Header class to deal with these issues.  convert defaults        to False.        The type of encoding (base64 or quoted-printable) will be based on        self.header_encoding.        """        cset = self.get_output_charset()        if convert:            s = self.convert(s)        # 7bit/8bit encodings return the string unchanged (modulo conversions)        if self.header_encoding == BASE64:            return email.base64MIME.header_encode(s, cset)        elif self.header_encoding == QP:            return email.quopriMIME.header_encode(s, cset, maxlinelen=None)        elif self.header_encoding == SHORTEST:            lenb64 = email.base64MIME.base64_len(s)            lenqp = email.quopriMIME.header_quopri_len(s)            if lenb64 < lenqp:                return email.base64MIME.header_encode(s, cset)            else:                return email.quopriMIME.header_encode(s, cset, maxlinelen=None)        else:            return s    def body_encode(self, s, convert=True):        """Body-encode a string and convert it to output_charset.        If convert is True (the default), the string will be converted from        the input charset to output charset automatically.  Unlike        header_encode(), there are no issues with byte boundaries and        multibyte charsets in email bodies, so this is usually pretty safe.        The type of encoding (base64 or quoted-printable) will be based on        self.body_encoding.        """        if convert:            s = self.convert(s)        # 7bit/8bit encodings return the string unchanged (module conversions)        if self.body_encoding is BASE64:            return email.base64MIME.body_encode(s)        elif self.body_encoding is QP:            return email.quopriMIME.body_encode(s)        else:            return s
charset.py - 源码说明

本页面展示了「mallet是自然语言处理、机器学习领域的一个开源项目。」中的 charset.py 源码文件，采用 Python 编程语言编写，共 394 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与mallet相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?