📄 ucsreader.java
字号:
/*
* The Apache Software License, Version 1.1
*
*
* Copyright (c) 2000-2002 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Xerces" and "Apache Software Foundation" must
* not be used to endorse or promote products derived from this
* software without prior written permission. For written
* permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* nor may "Apache" appear in their name, without prior written
* permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation and was
* originally based on software copyright (c) 1999, International
* Business Machines, Inc., http://www.apache.org. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
//package org.apache.xerces.impl.io;
package org.geoserver.ows.util;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
/**
* Reader for UCS-2 and UCS-4 encodings.
* (more precisely ISO-10646-UCS-(2|4) encodings).
*
* This variant is modified to handle supplementary Unicode code points
* correctly, though this required a lot of new code and definitely
* reduced the performance compared to the original version. I tried my best
* to preserve existing code and comments whenever it was possible.
* I performed some basic tests, but not too thorough ones, so
* some bugs may still lurk in the code. -AK
*
* @author Neil Graham, IBM
*
* @version $Id: UCSReader.java 6177 2007-02-19 10:11:27Z aaime $
*/
public class UCSReader extends Reader {
//
// Constants
//
/**
* Default byte buffer size (8192, larger than that of ASCIIReader
* since it's reasonable to surmise that the average UCS-4-encoded
* file should be 4 times as large as the average ASCII-encoded file).
*/
public static final int DEFAULT_BUFFER_SIZE = 8192;
/**
* Initial size of the internal char buffer. The internal char buffer is
* maintained to hold excess chars that may be left over from a previous
* read operation when working with UCS-4 data (never used for UCS-2).
*/
public static final int CHAR_BUFFER_INITIAL_SIZE = 1024;
/**
* Encoding identifier: UCS-2, little-endian (second byte of each unit is
* the most significant one).
*/
public static final short UCS2LE = 1;
/**
* Encoding identifier: UCS-2, big-endian (first byte of each unit is
* the most significant one).
*/
public static final short UCS2BE = 2;
/**
* Encoding identifier: UCS-4, little-endian. Note that the reader treats
* any encoding value <code>&gt;= 4</code> as UCS-4.
*/
public static final short UCS4LE = 4;
/**
* Encoding identifier: UCS-4, big-endian. Note that the reader treats
* any encoding value <code>&gt;= 4</code> as UCS-4.
*/
public static final short UCS4BE = 8;
/**
* The minimum value of a supplementary code point.
*/
public static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x010000;
/**
* The minimum value of a Unicode code point.
*/
public static final int MIN_CODE_POINT = 0x000000;
/**
* The maximum value of a Unicode code point.
*/
public static final int MAX_CODE_POINT = 0x10ffff;
//
// Data
//
/** Input stream the encoded bytes are read from. */
protected InputStream fInputStream;
/** Byte buffer, allocated by the constructor with the requested size. */
protected byte[] fBuffer;
/**
* What kind of data we're dealing with: one of UCS2LE, UCS2BE, UCS4LE
* or UCS4BE.
*/
protected short fEncoding;
/**
* Stores previously read or "excess" characters that may appear during
* <code>read</code> method invocations due to the fact that one input
* UCS-4 supplementary character results in two output Java
* <code>char</code>s - high surrogate and low surrogate code units.
* Because of that, if the <code>read()</code> method encounters a
* supplementary code point in the input stream, it returns the
* UTF-16-encoded high surrogate code unit and stores the low surrogate in
* this buffer. When called next time, <code>read()</code> will return this
* low surrogate instead of reading more bytes from the
* <code>InputStream</code>. Similarly, if
* <code>read(char[], int, int)</code> is invoked to read, for example,
* 10 chars into the specified buffer, and 4 of them turn out to
* be supplementary Unicode characters, each written as two chars, then we
* end up having 4 excess chars that we cannot immediately return or
* push back to the input stream. So we need to store them in the buffer
* awaiting further <code>read</code> invocations.
* Note that the char buffer functions like a stack, i.e. chars and
* surrogate pairs are stored in reverse order.
*/
protected char[] fCharBuf;
/**
* Count of Java chars currently being stored in the
* <code>fCharBuf</code> array.
*/
protected int fCharCount;
//
// Constructors
//
/**
* Constructs an <code>ISO-10646-UCS-(2|4)</code> reader from the specified
* input stream using default buffer size. The Endianness and exact input
* encoding (<code>UCS-2</code> or <code>UCS-4</code>) also should be known
* in advance.
*
* @param inputStream input stream with UCS-2|4 encoded data
* @param encoding One of UCS2LE, UCS2BE, UCS4LE or UCS4BE.
*/
public UCSReader(InputStream inputStream, short encoding) {
// Delegate to the three-argument constructor with the default byte
// buffer size (DEFAULT_BUFFER_SIZE).
this(inputStream, DEFAULT_BUFFER_SIZE, encoding);
} // <init>(InputStream, short)
/**
* Constructs an <code>ISO-10646-UCS-(2|4)</code> reader from the source
* input stream using explicitly specified initial buffer size. Endianness
* and exact input encoding (<code>UCS-2</code> or <code>UCS-4</code>) also
* should be known in advance.
*
* @param inputStream input stream with UCS-2|4 encoded data
* @param size The initial buffer size. Make sure
* this number is divisible by 4 if you plan
* to read UCS-4 with this class.
* @param encoding One of UCS2LE, UCS2BE, UCS4LE or UCS4BE
*/
public UCSReader(InputStream inputStream, int size, short encoding) {
    // Where the bytes come from and how they are to be interpreted.
    this.fInputStream = inputStream;
    this.fEncoding = encoding;

    // Byte-side buffer, sized as requested by the caller.
    this.fBuffer = new byte[size];

    // Char-side buffer for pending UTF-16 code units; it starts out empty.
    this.fCharBuf = new char[CHAR_BUFFER_INITIAL_SIZE];
    this.fCharCount = 0;
} // <init>(InputStream, int, short)
//
// Reader methods
//
/**
* Read a single character. This method will block until a character is
* available, an I/O error occurs, or the end of the stream is reached.
*
* If supplementary Unicode character is encountered in <code>UCS-4</code>
* input, it will be encoded into <code>UTF-16</code> surrogate pair
* according to RFC 2781. High surrogate code unit will be returned
* immediately, and low surrogate saved in the internal buffer to be read
* during next <code>read()</code> or <code>read(char[], int, int)</code>
* invocation. -AK
*
* @return Java 16-bit <code>char</code> value containing UTF-16 code
* unit which may be either code point from Basic Multilingual
* Plane or one of the surrogate code units (high or low)
* of the pair representing supplementary Unicode character
* (one in <code>0x10000 - 0x10FFFF</code> range) -AK
*
* @exception IOException when I/O error occurs
*/
public int read() throws IOException {
    // A char left over from an earlier call (typically the low surrogate
    // of a pair) is returned before any more bytes are consumed.
    if (fCharCount != 0) {
        fCharCount--;
        return ((int) fCharBuf[fCharCount]) & 0xFFFF;
    }

    // InputStream.read() reports end of stream with a negative value and
    // returns data bytes as 0..255. The EOF test therefore has to be made
    // on the raw return value: masking with 0xff first would make a
    // legitimate 0xFF data byte indistinguishable from end of stream.
    int b0 = fInputStream.read(); // 1st byte
    if (b0 < 0) {
        return -1;
    }
    int b1 = fInputStream.read(); // 2nd byte
    if (b1 < 0) {
        return -1;
    }
    b0 &= 0xff;
    b1 &= 0xff;

    if (fEncoding >= 4) { // UCS-4
        int b2 = fInputStream.read(); // 3rd byte
        if (b2 < 0) {
            return -1;
        }
        int b3 = fInputStream.read(); // 4th byte
        if (b3 < 0) {
            return -1;
        }
        b2 &= 0xff;
        b3 &= 0xff;

        int codepoint;
        if (UCS4BE == fEncoding) {
            codepoint = ((b0 << 24) + (b1 << 16) + (b2 << 8) + b3);
        } else {
            codepoint = ((b3 << 24) + (b2 << 16) + (b1 << 8) + b0);
        }

        /*
         * Encoding from UCS-4 to UTF-16 as described in RFC 2781.
         * No validity check is performed on the code point: a value that
         * is not reported as supplementary is returned unchanged.
         */
        if (!isSupplementaryCodePoint(codepoint)) {
            return codepoint;
        }

        int cp1 = (codepoint - 0x10000) & 0xFFFFF;
        int highSurrogate = 0xD800 + (cp1 >>> 10);

        // Push the low surrogate onto the char buffer AND bump the count,
        // otherwise the next read() would not see it and the second half
        // of the surrogate pair would be silently dropped.
        fCharBuf[fCharCount] = (char) (0xDC00 + (cp1 & 0x3FF));
        fCharCount++;

        // The low surrogate code unit is returned by the next call.
        return highSurrogate;
    }

    // UCS-2: combine the two bytes according to byte order.
    if (fEncoding == UCS2BE) {
        return (b0 << 8) + b1;
    }
    return (b1 << 8) + b0;
} // read():int
/**
* Read characters into a portion of an array. This method will block
* until some input is available, an I/O error occurs, or the end of the
* stream is reached.
*
* I suspect that the whole stuff works awfully slow, so if you know
* for sure that your <code>UCS-4</code> input does not contain any
* supplementary code points you probably should use original
* <code>UCSReader</code> class from Xerces team
* (<code>org.apache.xerces.impl.io.UCSReader</code>). -AK
*
* @param ch Destination buffer
* @param offset Offset at which to start storing characters
* @param length Maximum number of characters to read
*
* @return The number of characters read, or <code>-1</code> if the
* end of the stream has been reached. Note that this is not
* a number of <code>UCS-4</code> characters read, but
* instead number of <code>UTF-16</code> code units. These
* two are equal only if there were no supplementary Unicode
* code points among read chars.
*
* @exception IOException If an I/O error occurs
*/
public int read(char[] ch, int offset, int length)
throws IOException {
/*
* The behavior of this method is _intended_ to be like this:
*
* 1. In case if we are working with UCS-2 data, `readUCS2` method
* handles the stuff.
*
* 2. For UCS-4 data method first looks if there is some data stored in
* the internal character buffer (fCharBuf). Usually this data is
* left from previous reading operation if there were any
* supplementary Unicode (ISO-10646) characters.
*
* 3. If buffer holds something, these chars are put directly in passed
* `ch` buffer (maximum `length` of them).
*
* 4. If char buffer ends and more data can be put into `ch`,
* then they are read from the underlying byte stream.
*
* 5. Method tries to read maximum possible number of bytes from
* InputStream, as if all read code points were from BMP (Basic
* Multilingual Plane).
*
* 6. Read UCS-4 characters are encoded to UTF-16 (which is native Java
* encoding) and put into the `ch` array.
*
* 7. It is possible that we end up with more chars than we can
* currently put into passed buffer due to the fact that
* supplementary Unicode characters are encoded into _two_ Java
* char's each. In this situation excess chars are stored in the
* internal char buffer (in reverse order, i.e. those read last
* are at the beginning of the `fCharBuf`). They are usually picked
* up during next call(s) to one of the `read` methods.
*/
if ((0 > offset) || (offset > ch.length) || (0 > length) || ((offset + length) > ch.length)
|| (0 > (offset + length))) {
throw new IndexOutOfBoundsException();
} else if (0 == length) {
return 0;
}
/*
* Well, it is clear that the code should be separated for
* UCS-2 and UCS-4 now with all that char buffer stuff around.
* Things are already getting nasty.
*/
if (fEncoding < 4) {
return readUCS2(ch, offset, length);
}
// First using chars from internal char buffer (if any)
int charsRead = 0;
while (charsRead <= length) {
if (0 != fCharCount) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -