⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 xmlcharsetdetector.java

📁 电子地图服务器,搭建自己的地图服务
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
/* Copyright (c) 2001 - 2007 TOPP - www.openplans.org. All rights reserved.
 * This code is licensed under the GPL 2.0 license, available at the root
 * application directory.
 */
package org.geoserver.ows.util;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringWriter;
import java.io.UnsupportedEncodingException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


/**
 * Provides methods that can be used to detect the charset of an
 * XML document and (optionally) return a reader that is aware of
 * this charset and can correctly decode the document's data.
 */
public class XmlCharsetDetector {
    /**
     * Logger used by this class for charset-detection diagnostics. It is
     * registered under the "org.vfny.geoserver.requests" name rather than
     * under this class' own package.
     */
    protected static Logger LOGGER = org.geotools.util.logging.Logging.getLogger("org.vfny.geoserver.requests");

    /**
     * The <code>&gt;</code> character. In the current context naming this
     * "GT", "GREATER_THAN" or the like would be misleading.
     */
    private static final char RIGHT_ANGLE_BRACKET = '\u003E';

    // Matches an `encoding = "name"` pseudo-attribute (optional whitespace
    // around the equals sign) and captures the name as group 1. Only
    // double-quoted values are matched.
    // NOTE(review): XML also permits single-quoted values here; such
    // declarations will not match this pattern -- confirm whether intended.
    private static final Pattern ENCODING_PATTERN = Pattern.compile(
            "encoding\\s*\\=\\s*\"([^\"]+)\"");

    /**
     * Maximum number of characters we expect in an XML declaration.
     * There will probably be fewer than 100, but just in case...
     * (Presumably used as the read limit when scanning for the declaration;
     * the code that uses it is not in this part of the file.)
     */
    private static final int MAX_XMLDECL_SIZE = 100;

    /**
     * Based on Xerces-J code, this method will try its best to return a
     * reader which is able to decode content of incoming XML document
     * properly. To achieve this goal, it first infers general
     * encoding scheme of the above document and then uses this
     * information to extract actual charset from XML declaration. In
     * any recoverable error situation default UTF-8 reader will be
     * created.
     *
     * @param istream Byte stream (most probably obtained with
     *                <code>HttpServletRequest.getInputStream</code>
     *                that gives access to XML document in question).
     *
     * @param encInfo Instance of EncodingInfo where information about
     *                detected charset will be stored. You can then
     *                use it, for example, to form a response encoded
     *                with this charset.
     *
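     * @throws IOException in case of any unrecoverable I/O errors. This
     *             includes the <code>UnsupportedEncodingException</code>
     *             that <code>InputStreamReader</code>'s constructor throws
     *             if the inferred or declared charset of the XML document
     *             is not supported by the current JVM.
     * @throws UnsupportedCharsetException if the document is detected as
     *             ISO-10646-UCS-4 but its byte order could not be
     *             determined.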
     */
    public static Reader getCharsetAwareReader(InputStream istream, EncodingInfo encInfo)
        throws IOException, UnsupportedCharsetException {
        // Wrap the input so that it can be rewound after each detection
        // phase. Chunked mode stays off (second argument) until detection
        // is finished; see setChunkedMode(true) below.
        RewindableInputStream stream;
        stream = new RewindableInputStream(istream, false);

        //
        // Phase 1. Reading first four bytes and determining encoding scheme.
        final byte[] b4 = new byte[4];

        // Number of bytes actually read; may be less than 4 for tiny inputs,
        // in which case the remaining slots of b4 stay zero.
        int count = 0;

        for (; count < 4; count++) {
            int b = stream.read();

            if (-1 != b) {
                b4[count] = (byte) b;
            } else {
                break;
            }
        }

        if (LOGGER.isLoggable(Level.FINER)) {
            // Such number of concatenating strings makes me sick.
            // But using StringBuffer will make this uglier, not?
            LOGGER.finer("First 4 bytes of XML doc are : "
                + Integer.toHexString((int) b4[0] & 0xff).toUpperCase() + " ('" + (char) b4[0]
                + "') " + Integer.toHexString((int) b4[1] & 0xff).toUpperCase() + " ('"
                + (char) b4[1] + "') " + Integer.toHexString((int) b4[2] & 0xff).toUpperCase()
                + " ('" + (char) b4[2] + "') "
                + Integer.toHexString((int) b4[3] & 0xff).toUpperCase() + " ('" + (char) b4[3]
                + "')");
        }

        /*
         * `getEncodingName()` is capable of detecting following encoding
         * schemes:
         * "UTF-8", "UTF-16LE", "UTF-16BE", "ISO-10646-UCS-4",
         * or "CP037". It cannot distinguish between UTF-16 (without BOM)
         * and "ISO-10646-UCS-2", so latter will be interpreted as UTF-16
         * for the purpose of reading XML declaration. There shouldn't be
         * much trouble though as (I believe) these formats are identical for
         * the Basic Multilingual Plane, except that UTF-16-encoded text
         * can contain values from surrogate range and valid UCS-2 input
         * cannot (imho).
         * This ugly form of copying charset data is required to maintain
         * "reference integrity" of encInfo variable. As it can be possibly
         * used after this method call, it should point to the same memory
         * structure, and assignment or cloning doesn't work for me there.
         */
        encInfo.copyFrom(getEncodingName(b4, count));

        if (LOGGER.isLoggable(Level.FINE)) {
            LOGGER.fine("Charset detection phase 1. Inferred encoding: " + encInfo.toString());
        }

        // Rewinding to beginning of data
        stream.reset();

        // Inferred charset name, normalized to upper case. Every comparison
        // against the declared name below must therefore ignore case.
        String ENCODING = encInfo.getEncoding().toUpperCase(Locale.ENGLISH);
        // May be null when the byte order is unknown (checked for UCS-4 below).
        Boolean isBigEndian = encInfo.isBigEndian();
        boolean hasBOM = encInfo.hasBOM();

        /*
         * Special case UTF-8 files with BOM created by Microsoft
         * tools. It's more efficient to consume the BOM than make
         * the reader perform extra checks. -Ac
         */
        if (hasBOM && ENCODING.equals("UTF-8")) {
            // ignore first three bytes...
            stream.skip(3);
        }

        /*
         * The specifics of `getEncodingName` work is that it always returns
         * UTF-16 with BOM as either UTF-16LE or UTF-16BE, and
         * InputStreamReader doesn't expect BOM coming with UTF-16LE|BE
         * encoded data. So this BOM should also be removed, if present.
         */
        if ((count > 1) && (ENCODING.equals("UTF-16LE") || ENCODING.equals("UTF-16BE"))) {
            int b0 = b4[0] & 0xFF;
            int b1 = b4[1] & 0xFF;

            if (((b0 == 0xFF) && (b1 == 0xFE)) || ((b0 == 0xFE) && (b1 == 0xFF))) {
                // ignore first two bytes...
                stream.skip(2);
            }
        }

        Reader reader = null;

        /*
         * We must use custom class to read UCS-4 data, my JVM doesn't support
         * this encoding scheme by default and I doubt other JVMs are.
         *
         * There was another specific reader for UTF-8 encoding in Xerces
         * (org.apache.xerces.impl.io.UTF8Reader), which they say is
         * optimized one. May be it is really better than JVM's default
         * decoding algorithm but I doubt the necessity of porting just
         * another (not so small) class in order to "efficiently" extract
         * a couple of chars from XML declaration. Still I may be mistaking
         * there. Moreover, Xerces' UTF8Reader has some internal dependencies
         * and it will take much more effort to extract it from there.
         *
         * Also, at this stage it is quite impossible to have "ISO-10646-UCS-2"
         * as a value for ENCODING.
         *
         * You can avoid possible bugs in UCSReader by commenting out this
         * block of code together with following `if`. Then you will get an
         * UnsupportedEncodingException for UCS-4 encoded data.
         */
        if ("ISO-10646-UCS-4".equals(ENCODING)) {
            if (null != isBigEndian) {
                boolean isBE = isBigEndian.booleanValue();

                if (isBE) {
                    reader = new UCSReader(stream, UCSReader.UCS4BE);
                } else {
                    reader = new UCSReader(stream, UCSReader.UCS4LE);
                }
            } else {
                // Fatal error, UCSReader will fail to decode this properly
                String s = "Unsupported byte order for ISO-10646-UCS-4 encoding.";
                throw new UnsupportedCharsetException(s);
            }
        }

        if (null == reader) {
            // Throws UnsupportedEncodingException (an IOException) when the
            // JVM does not know the inferred charset.
            reader = new InputStreamReader(stream, ENCODING);
        }

        //
        // Phase 2. Reading XML declaration and extracting charset info from it.
        String declEncoding = getXmlEncoding(reader);

        if (LOGGER.isLoggable(Level.FINE)) {
            LOGGER.fine("Charset detection phase 2. Charset in XML declaration " + "is `"
                + declEncoding + "`.");
        }

        stream.reset();

        /*
         * Now RewindableInputStream is allowed to return more than one byte
         * per read operation. It also will not buffer bytes read using
         * `read(byte[], int, int)` method.
         */
        stream.setChunkedMode(true);

        /*
         * Reusing existing reader if possible, creating new one only if
         * declared charset name differs from guessed one.
         *
         * Charset names are case-insensitive and ENCODING has been
         * upper-cased above, while the declared name is compared as it was
         * written in the document. A case-sensitive comparison would
         * treat e.g. `utf-8` or `iso-10646-ucs-4` as a different charset,
         * throwing away a perfectly good reader (for UCS-4 even the custom
         * UCSReader, in favour of an InputStreamReader for a charset the
         * JVM is not expected to support).
         */
        if ((null != declEncoding) && !declEncoding.equalsIgnoreCase(ENCODING)) {
            /*
             * I believe that for UCS-2 encoding default UTF-16 reader
             * (which is already created at this time) should suffice
             * in most cases. Though, we can always construct a new
             * UCSReader instance, if I am wrong here.
             */
            if (!declEncoding.equalsIgnoreCase("ISO-10646-UCS-2")) {
                if (LOGGER.isLoggable(Level.FINE)) {
                    LOGGER.fine("Declared charset differs from inferred one. "
                        + "Trying to construct InputStreamReader for `" + declEncoding + "`.");
                }

                reader = new InputStreamReader(stream, declEncoding);
                encInfo.setEncoding(declEncoding);
            }
        }

        return reader;
    } // END getCharsetAwareReader(InputStream) : Reader

    /**
     * Use this variant when you aren't interested in encoding data, and just
     * want to get a suitable reader for incoming request.
     *
     * @param istream See <code>getCharsetAwareReader(InputStream,
     *                              EncodingInfo)</code>.
     *
     */
    public static Reader getCharsetAwareReader(InputStream istream)
        throws IOException, UnsupportedCharsetException {
        // The caller does not care about the detected charset details, so
        // they are collected into a throwaway holder.
        final EncodingInfo discardedInfo = new EncodingInfo();

        return getCharsetAwareReader(istream, discardedInfo);
    }

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -