📄 ucsreader.java

📁 电子地图服务器,搭建自己的地图服务
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页
/*
 * The Apache Software License, Version 1.1
 *
 *
 * Copyright (c) 2000-2002 The Apache Software Foundation.  All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. The end-user documentation included with the redistribution,
 *    if any, must include the following acknowledgment:
 *       "This product includes software developed by the
 *        Apache Software Foundation (http://www.apache.org/)."
 *    Alternately, this acknowledgment may appear in the software itself,
 *    if and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Xerces" and "Apache Software Foundation" must
 *    not be used to endorse or promote products derived from this
 *    software without prior written permission. For written
 *    permission, please contact apache@apache.org.
 *
 * 5. Products derived from this software may not be called "Apache",
 *    nor may "Apache" appear in their name, without prior written
 *    permission of the Apache Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 * ====================================================================
 *
 * This software consists of voluntary contributions made by many
 * individuals on behalf of the Apache Software Foundation and was
 * originally based on software copyright (c) 1999, International
 * Business Machines, Inc., http://www.apache.org.  For more
 * information on the Apache Software Foundation, please see
 * <http://www.apache.org/>.
 */

//package org.apache.xerces.impl.io;
package org.geoserver.ows.util;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;


/**
 * Reader for UCS-2 and UCS-4 encodings.
 * (more precisely ISO-10646-UCS-(2|4) encodings).
 *
 * This variant is modified to handle supplementary Unicode code points
 * correctly. This required a lot of new code and definitely reduced
 * the performance compared to the original version. I tried my best
 * to preserve existing code and comments whenever it was possible.
 * I performed some basic tests, but not too thorough ones, so
 * some bugs may still nest in the code. -AK
 *
 * @author Neil Graham, IBM
 *
 * @version $Id: UCSReader.java 6177 2007-02-19 10:11:27Z aaime $
 */
public class UCSReader extends Reader {
    //
    // Constants
    //

    /**
     * Default byte buffer size (8192, larger than that of ASCIIReader
     * since it's reasonable to surmise that the average UCS-4-encoded
     * file should be 4 times as large as the average ASCII-encoded file).
     */
    public static final int DEFAULT_BUFFER_SIZE = 8192;

    /**
     * Starting size of the internal char buffer. The internal char buffer
     * is maintained to hold excess chars that may be left over from a
     * previous read operation when working with UCS-4 data (never used for
     * UCS-2).
     */
    public static final int CHAR_BUFFER_INITIAL_SIZE = 1024;

    //
    // Encoding identifiers accepted by the constructors. The read methods
    // tell UCS-4 from UCS-2 with a numeric comparison against 4
    // (<code>fEncoding >= 4</code> means UCS-4), so the UCS-2 values must
    // stay below 4 and the UCS-4 values at or above it.
    //

    /** UCS-2, little-endian: the first byte of each pair is the low-order byte. */
    public static final short UCS2LE = 1;

    /** UCS-2, big-endian: the first byte of each pair is the high-order byte. */
    public static final short UCS2BE = 2;

    /** UCS-4, little-endian: the first of each four bytes is the lowest-order byte. */
    public static final short UCS4LE = 4;

    /** UCS-4, big-endian: the first of each four bytes is the highest-order byte. */
    public static final short UCS4BE = 8;

    /**
     * The minimum value of a supplementary code point.
     */
    public static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x010000;

    /**
     * The minimum value of a Unicode code point.
     */
    public static final int MIN_CODE_POINT = 0x000000;

    /**
     * The maximum value of a Unicode code point.
     */
    public static final int MAX_CODE_POINT = 0x10ffff;

    //
    // Data
    //

    /** Input stream the UCS-2/UCS-4 encoded bytes are read from. */
    protected InputStream fInputStream;

    /** Byte buffer, allocated by the constructor with the requested size. */
    protected byte[] fBuffer;

    /**
     * What kind of data we're dealing with: one of <code>UCS2LE</code>,
     * <code>UCS2BE</code>, <code>UCS4LE</code> or <code>UCS4BE</code>.
     */
    protected short fEncoding;

    /**
     * Stores pre-read or "excess" characters that may appear during
     * <code>read</code> method invocations, due to the fact that one input
     * UCS-4 supplementary character results in two output Java
     * <code>char</code>s - high surrogate and low surrogate code units.
     * The intended behaviour is that, if <code>read()</code> encounters a
     * supplementary code point in the input stream, it returns the
     * UTF-16-encoded high surrogate code unit and stores the low surrogate
     * in this buffer; the next <code>read()</code> then returns that low
     * surrogate instead of reading more bytes from the
     * <code>InputStream</code>. Similarly, if
     * <code>read(char[], int, int)</code> is invoked to read, for example,
     * 10 chars into the specified buffer, and 4 of them turn out to
     * be supplementary Unicode characters, each written as two chars, then we
     * end up having 4 excess chars that we cannot immediately return or
     * push back to the input stream. So we need to store them in the buffer
     * awaiting further <code>read</code> invocations.
     * Note that the char buffer functions like a stack, i.e. chars and
     * surrogate pairs are stored in reverse order and consumed from the
     * top (index <code>fCharCount - 1</code>).
     */
    protected char[] fCharBuf;

    /**
     * Count of Java chars currently being stored in the
     * <code>fCharBuf</code> array.
     */
    protected int fCharCount;

    //
    // Constructors
    //

    /**
     * Constructs an <code>ISO-10646-UCS-(2|4)</code> reader from the specified
     * input stream using default buffer size. The Endianness and exact input
     * encoding (<code>UCS-2</code> or <code>UCS-4</code>) also should be known
     * in advance.
     *
     * @param inputStream input stream with UCS-2|4 encoded data
     * @param encoding    One of UCS2LE, UCS2BE, UCS4LE or UCS4BE.
     */
    public UCSReader(InputStream inputStream, short encoding) {
        // Delegate to the three-argument constructor, supplying the default
        // byte buffer size.
        this(inputStream, DEFAULT_BUFFER_SIZE, encoding);
    } // <init>(InputStream, short)

    /**
     * Constructs an <code>ISO-10646-UCS-(2|4)</code> reader from the source
     * input stream using explicitly specified initial buffer size. Endianness
     * and exact input encoding (<code>UCS-2</code> or <code>UCS-4</code>) also
     * should be known in advance.
     *
     * @param inputStream input stream with UCS-2|4 encoded data
     * @param size        The initial buffer size. Make sure this number
     *                    is divisible by 4 if you plan to read UCS-4
     *                    with this class.
     * @param encoding    One of UCS2LE, UCS2BE, UCS4LE or UCS4BE
     */
    public UCSReader(InputStream inputStream, int size, short encoding) {
        // Where the bytes come from and how they are to be interpreted.
        this.fEncoding = encoding;
        this.fInputStream = inputStream;

        // Byte-side buffer of the caller-requested capacity.
        this.fBuffer = new byte[size];

        // Char-side overflow buffer; nothing is pending yet.
        this.fCharCount = 0;
        this.fCharBuf = new char[CHAR_BUFFER_INITIAL_SIZE];
    } // <init>(InputStream, int, short)

    //
    // Reader methods
    //

    /**
     * Read a single character.  This method will block until a character is
     * available, an I/O error occurs, or the end of the stream is reached.
     *
     * If supplementary Unicode character is encountered in <code>UCS-4</code>
     * input, it will be encoded into <code>UTF-16</code> surrogate pair
     * according to RFC 2781. High surrogate code unit will be returned
     * immediately, and low surrogate saved in the internal buffer to be read
     * during next <code>read()</code> or <code>read(char[], int, int)</code>
     * invocation. -AK
     *
     * @return     Java 16-bit <code>char</code> value containing UTF-16 code
     *             unit which may be either code point from Basic Multilingual
     *             Plane or one of the surrogate code units (high or low)
     *             of the pair representing supplementary Unicode character
     *             (one in <code>0x10000 - 0x10FFFF</code> range), or
     *             <code>-1</code> if the end of the stream has been
     *             reached -AK
     *
     * @exception  IOException  when I/O error occurs
     */
    public int read() throws IOException {
        // A char left over from an earlier call (the low surrogate of a
        // supplementary character) takes priority over the byte stream.
        // The char buffer is used as a stack, so the pending char is on top.
        if (0 != fCharCount) {
            fCharCount--;

            return ((int) fCharBuf[fCharCount]) & 0xFFFF;
        }

        // InputStream.read() returns 0..255 for a data byte and -1 only at
        // end of stream. The end-of-stream test must be made on the raw
        // value: once masked with 0xff, -1 and a genuine 0xFF byte are
        // indistinguishable and valid input (e.g. U+00FF) would be reported
        // as end of stream.
        int b0 = fInputStream.read(); // 1st byte

        if (b0 == -1) {
            return -1;
        }

        int b1 = fInputStream.read(); // 2nd byte

        if (b1 == -1) {
            return -1; // truncated code unit: reported as end of stream
        }

        if (fEncoding >= 4) { // UCS-4 (UCS4LE or UCS4BE)

            int b2 = fInputStream.read(); // 3rd byte

            if (b2 == -1) {
                return -1; // truncated code unit
            }

            int b3 = fInputStream.read(); // 4th byte

            if (b3 == -1) {
                return -1; // truncated code unit
            }

            int codepoint;

            if (UCS4BE == fEncoding) {
                codepoint = ((b0 << 24) + (b1 << 16) + (b2 << 8) + b3);
            } else {
                codepoint = ((b3 << 24) + (b2 << 16) + (b1 << 8) + b0);
            }

            /*
             * Encoding from UCS-4 to UTF-16 as described in RFC 2781.
             * NOTE: as before, no validity check is made on the code point;
             * a value that is not a supplementary code point is returned
             * unchanged.
             */
            if (!isSupplementaryCodePoint(codepoint)) {
                return codepoint;
            } else {
                int cp1 = (codepoint - 0x10000) & 0xFFFFF;
                int highSurrogate = 0xD800 + (cp1 >>> 10);

                // Push the low surrogate and bump the count, so that the
                // next read()/read(char[], int, int) call actually sees it.
                // Without the increment the buffer would still look empty
                // and the low surrogate would be silently lost.
                fCharBuf[fCharCount++] = (char) (0xDC00 + (cp1 & 0x3FF));

                // low surrogate code unit will be returned during next call
                return highSurrogate;
            }
        } else { // UCS-2

            if (fEncoding == UCS2BE) {
                return (b0 << 8) + b1;
            } else {
                return (b1 << 8) + b0;
            }
        }
    } // read():int

    /**
     * Read characters into a portion of an array.  This method will block
     * until some input is available, an I/O error occurs, or the end of the
     * stream is reached.
     *
     * I suspect that the whole stuff works awfully slow, so if you know
     * for sure that your <code>UCS-4</code> input does not contain any
     * supplementary code points you probably should use original
     * <code>UCSReader</code> class from Xerces team
     * (<code>org.apache.xerces.impl.io.UCSReader</code>). -AK
     *
     * @param      ch     Destination buffer
     * @param      offset Offset at which to start storing characters
     * @param      length Maximum number of characters to read
     *
     * @return     The number of characters read, or <code>-1</code> if the
     *             end of the stream has been reached. Note that this is not
     *             a number of <code>UCS-4</code> characters read, but
     *             instead number of <code>UTF-16</code> code units. These
     *             two are equal only if there were no supplementary Unicode
     *             code points among read chars.
     *
     * @exception  IOException  If an I/O error occurs
     */
    public int read(char[] ch, int offset, int length)
        throws IOException {
        /*
         * The behavior of this method is _intended_ to be like this:
         *
         * 1. In case if we are working with UCS-2 data, `readUCS2` method
         *    handles the stuff.
         *
         * 2. For UCS-4 data method first looks if there is some data stored in
         *    the internal character buffer (fCharBuf). Usually this data is
         *    left from previous reading operation if there were any
         *    supplementary Unicode (ISO-10646) characters.
         *
         * 3. If buffer holds something, these chars are put directly in passed
         *    `ch` buffer (maximum `length` of them).
         *
         * 4. If char buffer ends and more data can be put into `ch`,
         *    then they are read from the underlying byte stream.
         *
         * 5. Method tries to read maximum possible number of bytes from
         *    InputStream, as if all read code points were from BMP (Basic
         *    Multilingual Plane).
         *
         * 6. Read UCS-4 characters are encoded to UTF-16 (which is native Java
         *     encoding) ant put into `ch` array.
         *
         * 7. It is possible that we end up with more chars than we can
         *    currently put into passed buffer due to the fact that
         *    supplementary Unicode characters are encoded into _two_ Java
         *    char's each. In this situation excess chars are stored in the
         *    internal char buffer (in reverse order, i.e. those read last
         *    are at the beginning of the `fCharBuf`). They are usually picked
         *    up during next call(s) to one of the `read` methods.
         */
        if ((0 > offset) || (offset > ch.length) || (0 > length) || ((offset + length) > ch.length)
                || (0 > (offset + length))) {
            throw new IndexOutOfBoundsException();
        } else if (0 == length) {
            return 0;
        }

        /*
         * Well, it is clear that the code should be separated for
         * UCS-2 and UCS-4 now with all that char buffer stuff around.
         * Things are already getting nasty.
         */
        if (fEncoding < 4) {
            return readUCS2(ch, offset, length);
        }

        // First using chars from internal char buffer (if any)
        int charsRead = 0;

        while (charsRead <= length) {
            if (0 != fCharCount) {
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -