📄 codecs.py

📁 mallet是自然语言处理、机器学习领域的一个开源项目。
💻 PY
📖 第 1 页 / 共 2 页
字号:
上一页 12
        self.reader = Reader(stream, errors)        self.writer = Writer(stream, errors)        self.errors = errors    def read(self, size=-1):        return self.reader.read(size)    def readline(self, size=None):        return self.reader.readline(size)    def readlines(self, sizehint=None):        return self.reader.readlines(sizehint)    def write(self, data):        return self.writer.write(data)    def writelines(self, list):        return self.writer.writelines(list)    def reset(self):        self.reader.reset()        self.writer.reset()    def __getattr__(self, name,                    getattr=getattr):        """ Inherit all other methods from the underlying stream.        """        return getattr(self.stream, name)###class StreamRecoder:    """ StreamRecoder instances provide a frontend - backend        view of encoding data.        They use the complete set of APIs returned by the        codecs.lookup() function to implement their task.        Data written to the stream is first decoded into an        intermediate format (which is dependent on the given codec        combination) and then written to the stream using an instance        of the provided Writer class.        In the other direction, data is read from the stream using a        Reader instance and then return encoded data to the caller.    """    # Optional attributes set by the file wrappers below    data_encoding = 'unknown'    file_encoding = 'unknown'    def __init__(self, stream, encode, decode, Reader, Writer,                 errors='strict'):        """ Creates a StreamRecoder instance which implements a two-way            conversion: encode and decode work on the frontend (the            input to .read() and output of .write()) while            Reader and Writer work on the backend (reading and            writing to the stream).            You can use these objects to do transparent direct            recodings from e.g. latin-1 to utf-8 and back.            stream must be a file-like object.            encode, decode must adhere to the Codec interface, Reader,            Writer must be factory functions or classes providing the            StreamReader, StreamWriter interface resp.            encode and decode are needed for the frontend translation,            Reader and Writer for the backend translation. Unicode is            used as intermediate encoding.            Error handling is done in the same way as defined for the            StreamWriter/Readers.        """        self.stream = stream        self.encode = encode        self.decode = decode        self.reader = Reader(stream, errors)        self.writer = Writer(stream, errors)        self.errors = errors    def read(self, size=-1):        data = self.reader.read(size)        data, bytesencoded = self.encode(data, self.errors)        return data    def readline(self, size=None):        if size is None:            data = self.reader.readline()        else:            data = self.reader.readline(size)        data, bytesencoded = self.encode(data, self.errors)        return data    def readlines(self, sizehint=None):        if sizehint is None:            data = self.reader.read()        else:            data = self.reader.read(sizehint)        data, bytesencoded = self.encode(data, self.errors)        return data.splitlines(1)    def write(self, data):        data, bytesdecoded = self.decode(data, self.errors)        return self.writer.write(data)    def writelines(self, list):        data = ''.join(list)        data, bytesdecoded = self.decode(data, self.errors)        return self.writer.write(data)    def reset(self):        self.reader.reset()        self.writer.reset()    def __getattr__(self, name,                    getattr=getattr):        """ Inherit all other methods from the underlying stream.        """        return getattr(self.stream, name)### Shortcutsdef open(filename, mode='rb', encoding=None, errors='strict', buffering=1):    """ Open an encoded file using the given mode and return        a wrapped version providing transparent encoding/decoding.        Note: The wrapped version will only accept the object format        defined by the codecs, i.e. Unicode objects for most builtin        codecs. Output is also codec dependent and will usually by        Unicode as well.        Files are always opened in binary mode, even if no binary mode        was specified. Thisis done to avoid data loss due to encodings        using 8-bit values. The default file mode is 'rb' meaning to        open the file in binary read mode.        encoding specifies the encoding which is to be used for the        the file.        errors may be given to define the error handling. It defaults        to 'strict' which causes ValueErrors to be raised in case an        encoding error occurs.        buffering has the same meaning as for the builtin open() API.        It defaults to line buffered.        The returned wrapped file object provides an extra attribute        .encoding which allows querying the used encoding. This        attribute is only available if an encoding was specified as        parameter.    """    if encoding is not None and \       'b' not in mode:        # Force opening of the file in binary mode        mode = mode + 'b'    file = __builtin__.open(filename, mode, buffering)    if encoding is None:        return file    (e, d, sr, sw) = lookup(encoding)    srw = StreamReaderWriter(file, sr, sw, errors)    # Add attributes to simplify introspection    srw.encoding = encoding    return srwdef EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):    """ Return a wrapped version of file which provides transparent        encoding translation.        Strings written to the wrapped file are interpreted according        to the given data_encoding and then written to the original        file as string using file_encoding. The intermediate encoding        will usually be Unicode but depends on the specified codecs.        Strings are read from the file using file_encoding and then        passed back to the caller as string using data_encoding.        If file_encoding is not given, it defaults to data_encoding.        errors may be given to define the error handling. It defaults        to 'strict' which causes ValueErrors to be raised in case an        encoding error occurs.        The returned wrapped file object provides two extra attributes        .data_encoding and .file_encoding which reflect the given        parameters of the same name. The attributes can be used for        introspection by Python programs.    """    if file_encoding is None:        file_encoding = data_encoding    encode, decode = lookup(data_encoding)[:2]    Reader, Writer = lookup(file_encoding)[2:]    sr = StreamRecoder(file,                       encode, decode, Reader, Writer,                       errors)    # Add attributes to simplify introspection    sr.data_encoding = data_encoding    sr.file_encoding = file_encoding    return sr### Helpers for codec lookupdef getencoder(encoding):    """ Lookup up the codec for the given encoding and return        its encoder function.        Raises a LookupError in case the encoding cannot be found.    """    return lookup(encoding)[0]def getdecoder(encoding):    """ Lookup up the codec for the given encoding and return        its decoder function.        Raises a LookupError in case the encoding cannot be found.    """    return lookup(encoding)[1]def getreader(encoding):    """ Lookup up the codec for the given encoding and return        its StreamReader class or factory function.        Raises a LookupError in case the encoding cannot be found.    """    return lookup(encoding)[2]def getwriter(encoding):    """ Lookup up the codec for the given encoding and return        its StreamWriter class or factory function.        Raises a LookupError in case the encoding cannot be found.    """    return lookup(encoding)[3]### Helpers for charmap-based codecsdef make_identity_dict(rng):    """ make_identity_dict(rng) -> dict        Return a dictionary where elements of the rng sequence are        mapped to themselves.    """    res = {}    for i in rng:        res[i]=i    return resdef make_encoding_map(decoding_map):    """ Creates an encoding map from a decoding map.        If a target mapping in the decoding map occurrs multiple        times, then that target is mapped to None (undefined mapping),        causing an exception when encountered by the charmap codec        during translation.        One example where this happens is cp875.py which decodes        multiple character to \u001a.    """    m = {}    for k,v in decoding_map.items():        if not m.has_key(v):            m[v] = k        else:            m[v] = None    return m# Tell modulefinder that using codecs probably needs the encodings# package_false = 0if _false:    import encodings### Testsif __name__ == '__main__':    import sys    # Make stdout translate Latin-1 output into UTF-8 output    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')    # Have stdin translate Latin-1 input into UTF-8 input    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -