utf8reader.java
来自「JAVA的一些源码 JAVA2 STANDARD EDITION DEVELO」· Java 代码 · 共 733 行 · 第 1/2 页
JAVA
733 行
/* * The Apache Software License, Version 1.1 * * * Copyright (c) 2000-2004 The Apache Software Foundation. All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * 3. The end-user documentation included with the redistribution, * if any, must include the following acknowledgment: * "This product includes software developed by the * Apache Software Foundation (http://www.apache.org/)." * Alternately, this acknowledgment may appear in the software itself, * if and wherever such third-party acknowledgments normally appear. * * 4. The names "Xerces" and "Apache Software Foundation" must * not be used to endorse or promote products derived from this * software without prior written permission. For written * permission, please contact apache@apache.org. * * 5. Products derived from this software may not be called "Apache", * nor may "Apache" appear in their name, without prior written * permission of the Apache Software Foundation. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * ==================================================================== * * This software consists of voluntary contributions made by many * individuals on behalf of the Apache Software Foundation and was * originally based on software copyright (c) 1999, International * Business Machines, Inc., http://www.apache.org. For more * information on the Apache Software Foundation, please see * <http://www.apache.org/>. */package com.sun.org.apache.xerces.internal.impl.io;import java.io.InputStream;import java.io.IOException;import java.io.Reader;import java.util.Locale;import com.sun.org.apache.xerces.internal.util.MessageFormatter;import com.sun.org.apache.xerces.internal.impl.msg.XMLMessageFormatter;/** * <p>A UTF-8 reader.</p> * * @author Andy Clark, IBM * * @version $Id: UTF8Reader.java,v 1.10 2004/03/04 19:27:13 mrglavas Exp $ */public class UTF8Reader extends Reader { // // Constants // /** Default byte buffer size (2048). */ public static final int DEFAULT_BUFFER_SIZE = 2048; // debugging /** Debug read. */ private static final boolean DEBUG_READ = false; // // Data // /** Input stream. */ protected InputStream fInputStream; /** Byte buffer. */ protected byte[] fBuffer; /** Offset into buffer. */ protected int fOffset; /** Surrogate character. */ private int fSurrogate = -1; // message formatter; used to produce localized // exception messages private MessageFormatter fFormatter = null; //Locale to use for messages private Locale fLocale = null; // // Constructors // /** * Constructs a UTF-8 reader from the specified input stream * using the default buffer size. Primarily for testing. * * @param inputStream The input stream. */ public UTF8Reader(InputStream inputStream) { this(inputStream, DEFAULT_BUFFER_SIZE, new XMLMessageFormatter(), Locale.getDefault()); } // <init>(InputStream, MessageFormatter) /** * Constructs a UTF-8 reader from the specified input stream * using the default buffer size and the given MessageFormatter. * * @param inputStream The input stream. * @param messageFormatter given MessageFormatter * @param locale Locale to use for messages */ public UTF8Reader(InputStream inputStream, MessageFormatter messageFormatter, Locale locale) { this(inputStream, DEFAULT_BUFFER_SIZE, messageFormatter, locale); } // <init>(InputStream, MessageFormatter, Locale) /** * Constructs a UTF-8 reader from the specified input stream, * buffer size and MessageFormatter. * * @param inputStream The input stream. * @param size The initial buffer size. * @param messageFormatter the formatter for localizing/formatting errors. * @param locale the Locale to use for messages */ public UTF8Reader(InputStream inputStream, int size, MessageFormatter messageFormatter, Locale locale) { fInputStream = inputStream; fBuffer = new byte[size]; fFormatter = messageFormatter; fLocale = locale; } // <init>(InputStream, int, MessageFormatter, Locale) // // Reader methods // /** * Read a single character. This method will block until a character is * available, an I/O error occurs, or the end of the stream is reached. * * <p> Subclasses that intend to support efficient single-character input * should override this method. * * @return The character read, as an integer in the range 0 to 16383 * (<tt>0x00-0xffff</tt>), or -1 if the end of the stream has * been reached * * @exception IOException If an I/O error occurs */ public int read() throws IOException { // decode character int c = fSurrogate; if (fSurrogate == -1) { // NOTE: We use the index into the buffer if there are remaining // bytes from the last block read. -Ac int index = 0; // get first byte int b0 = index == fOffset ? fInputStream.read() : fBuffer[index++] & 0x00FF; if (b0 == -1) { return -1; } // UTF-8: [0xxx xxxx] // Unicode: [0000 0000] [0xxx xxxx] if (b0 < 0x80) { c = (char)b0; } // UTF-8: [110y yyyy] [10xx xxxx] // Unicode: [0000 0yyy] [yyxx xxxx] else if ((b0 & 0xE0) == 0xC0 && (b0 & 0x1E) != 0) { int b1 = index == fOffset ? fInputStream.read() : fBuffer[index++] & 0x00FF; if (b1 == -1) { expectedByte(2, 2); } if ((b1 & 0xC0) != 0x80) { invalidByte(2, 2, b1); } c = ((b0 << 6) & 0x07C0) | (b1 & 0x003F); } // UTF-8: [1110 zzzz] [10yy yyyy] [10xx xxxx] // Unicode: [zzzz yyyy] [yyxx xxxx] else if ((b0 & 0xF0) == 0xE0) { int b1 = index == fOffset ? fInputStream.read() : fBuffer[index++] & 0x00FF; if (b1 == -1) { expectedByte(2, 3); } if ((b1 & 0xC0) != 0x80 || (b0 == 0xED && b1 >= 0xA0) || ((b0 & 0x0F) == 0 && (b1 & 0x20) == 0)) { invalidByte(2, 3, b1); } int b2 = index == fOffset ? fInputStream.read() : fBuffer[index++] & 0x00FF; if (b2 == -1) { expectedByte(3, 3); } if ((b2 & 0xC0) != 0x80) { invalidByte(3, 3, b2); } c = ((b0 << 12) & 0xF000) | ((b1 << 6) & 0x0FC0) | (b2 & 0x003F); } // UTF-8: [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]* // Unicode: [1101 10ww] [wwzz zzyy] (high surrogate) // [1101 11yy] [yyxx xxxx] (low surrogate) // * uuuuu = wwww + 1 else if ((b0 & 0xF8) == 0xF0) { int b1 = index == fOffset ? fInputStream.read() : fBuffer[index++] & 0x00FF; if (b1 == -1) { expectedByte(2, 4); } if ((b1 & 0xC0) != 0x80 || ((b1 & 0x30) == 0 && (b0 & 0x07) == 0)) { invalidByte(2, 3, b1); } int b2 = index == fOffset ? fInputStream.read() : fBuffer[index++] & 0x00FF; if (b2 == -1) { expectedByte(3, 4); } if ((b2 & 0xC0) != 0x80) { invalidByte(3, 3, b2); } int b3 = index == fOffset ? fInputStream.read() : fBuffer[index++] & 0x00FF; if (b3 == -1) { expectedByte(4, 4); } if ((b3 & 0xC0) != 0x80) { invalidByte(4, 4, b3); } int uuuuu = ((b0 << 2) & 0x001C) | ((b1 >> 4) & 0x0003); if (uuuuu > 0x10) { invalidSurrogate(uuuuu); } int wwww = uuuuu - 1; int hs = 0xD800 | ((wwww << 6) & 0x03C0) | ((b1 << 2) & 0x003C) | ((b2 >> 4) & 0x0003); int ls = 0xDC00 | ((b2 << 6) & 0x03C0) | (b3 & 0x003F); c = hs; fSurrogate = ls; } // error else { invalidByte(1, 1, b0); } } // use surrogate else { fSurrogate = -1; } // return character if (DEBUG_READ) { System.out.println("read(): 0x"+Integer.toHexString(c)); } return c; } // read():int /** * Read characters into a portion of an array. This method will block * until some input is available, an I/O error occurs, or the end of the * stream is reached. * * @param ch Destination buffer * @param offset Offset at which to start storing characters * @param length Maximum number of characters to read * * @return The number of characters read, or -1 if the end of the * stream has been reached * * @exception IOException If an I/O error occurs */ public int read(char ch[], int offset, int length) throws IOException { // handle surrogate int out = offset; if (fSurrogate != -1) { ch[offset + 1] = (char)fSurrogate; fSurrogate = -1; length--; out++; } // read bytes int count = 0; if (fOffset == 0) { // adjust length to read if (length > fBuffer.length) { length = fBuffer.length; } // perform read operation count = fInputStream.read(fBuffer, 0, length); if (count == -1) { return -1; } count += out - offset; } // skip read; last character was in error // NOTE: Having an offset value other than zero means that there was // an error in the last character read. In this case, we have // skipped the read so we don't consume any bytes past the // error. By signalling the error on the next block read we // allow the method to return the most valid characters that // it can on the previous block read. -Ac else { count = fOffset; fOffset = 0; } // convert bytes to characters final int total = count; int in; byte byte1; final byte byte0 = 0; for (in = 0; in < total; in++) { byte1 = fBuffer[in]; if (byte1 >= byte0) { ch[out++] = (char)byte1; } else { break; } } for ( ; in < total; in++) { byte1 = fBuffer[in]; // UTF-8: [0xxx xxxx]
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?