utf8reader.java

来自「JAVA的一些源码 JAVA2 STANDARD EDITION DEVELO」· Java 代码 · 共 733 行 · 第 1/2 页
JAVA
733 行
/* * The Apache Software License, Version 1.1 * * * Copyright (c) 2000-2004 The Apache Software Foundation.  All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright *    notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright *    notice, this list of conditions and the following disclaimer in *    the documentation and/or other materials provided with the *    distribution. * * 3. The end-user documentation included with the redistribution, *    if any, must include the following acknowledgment: *       "This product includes software developed by the *        Apache Software Foundation (http://www.apache.org/)." *    Alternately, this acknowledgment may appear in the software itself, *    if and wherever such third-party acknowledgments normally appear. * * 4. The names "Xerces" and "Apache Software Foundation" must *    not be used to endorse or promote products derived from this *    software without prior written permission. For written *    permission, please contact apache@apache.org. * * 5. Products derived from this software may not be called "Apache", *    nor may "Apache" appear in their name, without prior written *    permission of the Apache Software Foundation. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * ==================================================================== * * This software consists of voluntary contributions made by many * individuals on behalf of the Apache Software Foundation and was * originally based on software copyright (c) 1999, International * Business Machines, Inc., http://www.apache.org.  For more * information on the Apache Software Foundation, please see * <http://www.apache.org/>. */package com.sun.org.apache.xerces.internal.impl.io;import java.io.InputStream;import java.io.IOException;import java.io.Reader;import java.util.Locale;import com.sun.org.apache.xerces.internal.util.MessageFormatter;import com.sun.org.apache.xerces.internal.impl.msg.XMLMessageFormatter;/** * <p>A UTF-8 reader.</p> *  * @author Andy Clark, IBM * * @version $Id: UTF8Reader.java,v 1.10 2004/03/04 19:27:13 mrglavas Exp $ */public class UTF8Reader    extends Reader {    //    // Constants    //    /** Default byte buffer size (2048). */    public static final int DEFAULT_BUFFER_SIZE = 2048;    // debugging    /** Debug read. */    private static final boolean DEBUG_READ = false;    //    // Data    //    /** Input stream. */    protected InputStream fInputStream;    /** Byte buffer. */    protected byte[] fBuffer;    /** Offset into buffer. */    protected int fOffset;    /** Surrogate character. */    private int fSurrogate = -1;    // message formatter; used to produce localized    // exception messages    private MessageFormatter fFormatter = null;    //Locale to use for messages    private Locale fLocale = null;    //    // Constructors    //    /**     * Constructs a UTF-8 reader from the specified input stream     * using the default buffer size.  Primarily for testing.     *     * @param inputStream The input stream.     */    public UTF8Reader(InputStream inputStream) {        this(inputStream, DEFAULT_BUFFER_SIZE, new XMLMessageFormatter(), Locale.getDefault());    } // <init>(InputStream, MessageFormatter)    /**     * Constructs a UTF-8 reader from the specified input stream     * using the default buffer size and the given MessageFormatter.     *     * @param inputStream The input stream.     * @param messageFormatter  given MessageFormatter     * @param locale    Locale to use for messages     */    public UTF8Reader(InputStream inputStream, MessageFormatter messageFormatter,            Locale locale) {        this(inputStream, DEFAULT_BUFFER_SIZE, messageFormatter, locale);    } // <init>(InputStream, MessageFormatter, Locale)    /**     * Constructs a UTF-8 reader from the specified input stream,     * buffer size and MessageFormatter.     *     * @param inputStream The input stream.     * @param size        The initial buffer size.     * @param messageFormatter  the formatter for localizing/formatting errors.     * @param locale    the Locale to use for messages     */    public UTF8Reader(InputStream inputStream, int size,            MessageFormatter messageFormatter, Locale locale) {        fInputStream = inputStream;        fBuffer = new byte[size];        fFormatter = messageFormatter;        fLocale = locale;    } // <init>(InputStream, int, MessageFormatter, Locale)    //    // Reader methods    //    /**     * Read a single character.  This method will block until a character is     * available, an I/O error occurs, or the end of the stream is reached.     *     * <p> Subclasses that intend to support efficient single-character input     * should override this method.     *     * @return     The character read, as an integer in the range 0 to 16383     *             (<tt>0x00-0xffff</tt>), or -1 if the end of the stream has     *             been reached     *     * @exception  IOException  If an I/O error occurs     */    public int read() throws IOException {        // decode character        int c = fSurrogate;        if (fSurrogate == -1) {            // NOTE: We use the index into the buffer if there are remaining            //       bytes from the last block read. -Ac            int index = 0;            // get first byte            int b0 = index == fOffset                   ? fInputStream.read() : fBuffer[index++] & 0x00FF;            if (b0 == -1) {                return -1;            }            // UTF-8:   [0xxx xxxx]            // Unicode: [0000 0000] [0xxx xxxx]            if (b0 < 0x80) {                c = (char)b0;            }            // UTF-8:   [110y yyyy] [10xx xxxx]            // Unicode: [0000 0yyy] [yyxx xxxx]            else if ((b0 & 0xE0) == 0xC0 && (b0 & 0x1E) != 0) {                int b1 = index == fOffset                       ? fInputStream.read() : fBuffer[index++] & 0x00FF;                if (b1 == -1) {                    expectedByte(2, 2);                }                if ((b1 & 0xC0) != 0x80) {                    invalidByte(2, 2, b1);                }                c = ((b0 << 6) & 0x07C0) | (b1 & 0x003F);            }            // UTF-8:   [1110 zzzz] [10yy yyyy] [10xx xxxx]            // Unicode: [zzzz yyyy] [yyxx xxxx]            else if ((b0 & 0xF0) == 0xE0) {                int b1 = index == fOffset                       ? fInputStream.read() : fBuffer[index++] & 0x00FF;                if (b1 == -1) {                    expectedByte(2, 3);                }                if ((b1 & 0xC0) != 0x80                     || (b0 == 0xED && b1 >= 0xA0)                    || ((b0 & 0x0F) == 0 && (b1 & 0x20) == 0)) {                    invalidByte(2, 3, b1);                }                int b2 = index == fOffset                       ? fInputStream.read() : fBuffer[index++] & 0x00FF;                if (b2 == -1) {                    expectedByte(3, 3);                }                if ((b2 & 0xC0) != 0x80) {                    invalidByte(3, 3, b2);                }                c = ((b0 << 12) & 0xF000) | ((b1 << 6) & 0x0FC0) |                    (b2 & 0x003F);            }            // UTF-8:   [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*            // Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)            //          [1101 11yy] [yyxx xxxx] (low surrogate)            //          * uuuuu = wwww + 1            else if ((b0 & 0xF8) == 0xF0) {                int b1 = index == fOffset                       ? fInputStream.read() : fBuffer[index++] & 0x00FF;                if (b1 == -1) {                    expectedByte(2, 4);                }                if ((b1 & 0xC0) != 0x80                    || ((b1 & 0x30) == 0 && (b0 & 0x07) == 0)) {                    invalidByte(2, 3, b1);                }                int b2 = index == fOffset                       ? fInputStream.read() : fBuffer[index++] & 0x00FF;                if (b2 == -1) {                    expectedByte(3, 4);                }                if ((b2 & 0xC0) != 0x80) {                    invalidByte(3, 3, b2);                }                int b3 = index == fOffset                       ? fInputStream.read() : fBuffer[index++] & 0x00FF;                if (b3 == -1) {                    expectedByte(4, 4);                }                if ((b3 & 0xC0) != 0x80) {                    invalidByte(4, 4, b3);                }                int uuuuu = ((b0 << 2) & 0x001C) | ((b1 >> 4) & 0x0003);                if (uuuuu > 0x10) {                    invalidSurrogate(uuuuu);                }                int wwww = uuuuu - 1;                int hs = 0xD800 |                         ((wwww << 6) & 0x03C0) | ((b1 << 2) & 0x003C) |                         ((b2 >> 4) & 0x0003);                int ls = 0xDC00 | ((b2 << 6) & 0x03C0) | (b3 & 0x003F);                c = hs;                fSurrogate = ls;            }            // error            else {                invalidByte(1, 1, b0);            }        }        // use surrogate        else {            fSurrogate = -1;        }        // return character        if (DEBUG_READ) {            System.out.println("read(): 0x"+Integer.toHexString(c));        }        return c;    } // read():int    /**     * Read characters into a portion of an array.  This method will block     * until some input is available, an I/O error occurs, or the end of the     * stream is reached.     *     * @param      ch     Destination buffer     * @param      offset Offset at which to start storing characters     * @param      length Maximum number of characters to read     *     * @return     The number of characters read, or -1 if the end of the     *             stream has been reached     *     * @exception  IOException  If an I/O error occurs     */    public int read(char ch[], int offset, int length) throws IOException {        // handle surrogate        int out = offset;        if (fSurrogate != -1) {            ch[offset + 1] = (char)fSurrogate;            fSurrogate = -1;            length--;            out++;        }        // read bytes        int count = 0;        if (fOffset == 0) {            // adjust length to read            if (length > fBuffer.length) {                length = fBuffer.length;            }            // perform read operation            count = fInputStream.read(fBuffer, 0, length);            if (count == -1) {                return -1;            }            count += out - offset;        }        // skip read; last character was in error        // NOTE: Having an offset value other than zero means that there was        //       an error in the last character read. In this case, we have        //       skipped the read so we don't consume any bytes past the        //       error. By signalling the error on the next block read we        //       allow the method to return the most valid characters that        //       it can on the previous block read. -Ac        else {            count = fOffset;            fOffset = 0;        }        // convert bytes to characters        final int total = count;        int in;        byte byte1;        final byte byte0 = 0;        for (in = 0; in < total; in++) {            byte1 = fBuffer[in];            if (byte1 >= byte0) {                ch[out++] = (char)byte1;            }            else   {                break;            }        }        for ( ; in < total; in++) {            byte1 = fBuffer[in];            // UTF-8:   [0xxx xxxx]
utf8reader.java - 源码说明

本页面展示了「JAVA的一些源码 JAVA2 STANDARD EDITION DEVELOPMENT KIT 5.0」中的 utf8reader.java 源码文件，采用 Java 编程语言编写，共 733 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与Java相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?