📄 xmlcharsetdetector.java

📁 电子地图服务器,搭建自己的地图服务
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
上一页 12

    /**
     * Creates a new reader on top of the given <code>InputStream</code> using
     * existing (external) encoding information. Unlike
     * <code>getCharsetAwareReader</code>, this method never tries to detect
     * the charset or encoding scheme of the stream's data; it relies solely
     * on the supplied <code>EncodingInfo</code>, which therefore <em>must</em>
     * carry a charset name (such an instance may be obtained, for example,
     * from a previous
     * <code>getCharsetAwareReader(InputStream, EncodingInfo)</code> call).
     *
     * <p>The <code>ISO-10646-UCS-4</code> and <code>ISO-10646-UCS-2</code>
     * charsets are decoded by <code>UCSReader</code> and require the byte
     * order to be known; every other charset name is handed to
     * <code>InputStreamReader</code> as is.</p>
     *
     * @param istream byte-stream containing textual (presumably XML) data
     * @param encInfo correctly initialized object which holds information of
     *                the above byte-stream's contents charset.
     *
     * @return a reader decoding <code>istream</code> with the charset named
     *         by <code>encInfo</code>.
     *
     * @throws IllegalArgumentException      if charset name is not specified
     * @throws UnsupportedEncodingException  in cases when specified charset is
     *             not supported by platform or when byte order is unknown for
     *             <code>ISO-10646-UCS-2|4</code> charsets.
     *
     */
    public static Reader createReader(InputStream istream, EncodingInfo encInfo)
        throws IllegalArgumentException, UnsupportedEncodingException {
        String charset = encInfo.getEncoding();
        Boolean isBigEndian = encInfo.isBigEndian();

        // The charset name is mandatory; `getEncoding` may legitimately
        // return null for an EncodingInfo that was never filled in.
        if (null == charset) {
            throw new IllegalArgumentException("Name of the charset must not be NULL!");
        }

        if (LOGGER.isLoggable(Level.FINE)) {
            LOGGER.fine("Trying to create reader basing on existing charset information: `"
                + encInfo + "`.");
        }

        // UCS-4: handled by the custom reader, byte order is required.
        if ("ISO-10646-UCS-4".equals(charset)) {
            if (null == isBigEndian) {
                // Fatal error, UCSReader will fail to decode this properly
                throw new UnsupportedEncodingException(
                    "Unsupported byte order for ISO-10646-UCS-4 encoding.");
            }

            if (isBigEndian.booleanValue()) {
                return new UCSReader(istream, UCSReader.UCS4BE);
            }

            return new UCSReader(istream, UCSReader.UCS4LE);
        }

        // UCS-2: same custom reader, but it must be told to consume 2-byte
        // units (the UCS2* modes), not the 4-byte UCS4* ones.
        if ("ISO-10646-UCS-2".equals(charset)) {
            if (null == isBigEndian) {
                // Cannot construct UCSReader without byte order info
                throw new UnsupportedEncodingException(
                    "Byte order must be specified for ISO-10646-UCS-2.");
            }

            if (isBigEndian.booleanValue()) {
                return new UCSReader(istream, UCSReader.UCS2BE);
            }

            return new UCSReader(istream, UCSReader.UCS2LE);
        }

        // Everything else is delegated to the platform's charset support;
        // an unknown name surfaces as UnsupportedEncodingException.
        return new InputStreamReader(istream, charset);
    } // END createReader(InputStream, EncodingInfo) : Reader

    /**
     * Returns the IANA encoding name that is auto-detected from
     * the bytes specified, with the endian-ness of that encoding where
     * appropriate. Note, that encoding obtained this way is only an
     * <em>encoding scheme</em> of the request, i.e. step 1 of detection
     * process. To learn the exact <em>charset</em> of the request data,
     * you should also perform step 2 - read XML declaration and get the
     * value of its <code>encoding</code> pseudoattribute.
     *
     * <p>Detection looks at a byte order mark first (UTF-16 BE/LE, UTF-8)
     * and then at the byte patterns that the characters <code>&lt;?</code>
     * or <code>&lt;</code> produce in UCS-4, UTF-16 and EBCDIC. Whenever
     * there are too few bytes to decide, or nothing matches,
     * <code>UTF-8</code> with unknown byte order is reported.</p>
     *
     * @param b4    The first four bytes of the input.
     * @param count The number of bytes actually read. Values larger than
     *              <code>b4.length</code> are treated as
     *              <code>b4.length</code>; a <code>null</code> array is
     *              treated as empty.
     * @return Instance of EncodingInfo encapsulating all encoding-related data.
     */
    public static EncodingInfo getEncodingName(byte[] b4, int count) {
        // Never index past what the array can actually supply.
        final int available = (null == b4) ? 0 : Math.min(count, b4.length);

        if (available < 2) {
            return new EncodingInfo("UTF-8", null);
        }

        // UTF-16, with BOM
        final int b0 = b4[0] & 0xFF;
        final int b1 = b4[1] & 0xFF;

        // NOTE(review): the three-argument constructor is used only on the
        // BOM paths; presumably the flag records that a BOM was seen -
        // confirm against EncodingInfo.
        if ((b0 == 0xFE) && (b1 == 0xFF)) {
            // UTF-16, big-endian
            return new EncodingInfo("UTF-16BE", Boolean.TRUE, true);
        }

        if ((b0 == 0xFF) && (b1 == 0xFE)) {
            // UTF-16, little-endian
            return new EncodingInfo("UTF-16LE", Boolean.FALSE, true);
        }

        // default to UTF-8 if we don't have enough bytes to make a
        // good determination of the encoding
        if (available < 3) {
            return new EncodingInfo("UTF-8", null);
        }

        // UTF-8 with a BOM
        final int b2 = b4[2] & 0xFF;

        if ((b0 == 0xEF) && (b1 == 0xBB) && (b2 == 0xBF)) {
            return new EncodingInfo("UTF-8", null, true);
        }

        // default to UTF-8 if we don't have enough bytes to make a
        // good determination of the encoding
        if (available < 4) {
            return new EncodingInfo("UTF-8", null);
        }

        // other encodings
        final int b3 = b4[3] & 0xFF;

        if ((b0 == 0x00) && (b1 == 0x00) && (b2 == 0x00) && (b3 == 0x3C)) {
            // UCS-4, big endian (1234)
            return new EncodingInfo("ISO-10646-UCS-4", Boolean.TRUE);
        }

        if ((b0 == 0x3C) && (b1 == 0x00) && (b2 == 0x00) && (b3 == 0x00)) {
            // UCS-4, little endian (4321)
            return new EncodingInfo("ISO-10646-UCS-4", Boolean.FALSE);
        }

        if ((b0 == 0x00) && (b1 == 0x00) && (b2 == 0x3C) && (b3 == 0x00)) {
            // UCS-4, unusual octet order (2143)
            // REVISIT: What should this be? Byte order is reported as
            // unknown (null).
            return new EncodingInfo("ISO-10646-UCS-4", null);
        }

        if ((b0 == 0x00) && (b1 == 0x3C) && (b2 == 0x00) && (b3 == 0x00)) {
            // UCS-4, unusual octet order (3412)
            // REVISIT: What should this be?
            return new EncodingInfo("ISO-10646-UCS-4", null);
        }

        if ((b0 == 0x00) && (b1 == 0x3C) && (b2 == 0x00) && (b3 == 0x3F)) {
            // UTF-16, big-endian, no BOM
            // (or could turn out to be UCS-2...)
            // REVISIT: What should this be?
            return new EncodingInfo("UTF-16BE", Boolean.TRUE);
        }

        if ((b0 == 0x3C) && (b1 == 0x00) && (b2 == 0x3F) && (b3 == 0x00)) {
            // UTF-16, little-endian, no BOM
            // (or could turn out to be UCS-2...)
            return new EncodingInfo("UTF-16LE", Boolean.FALSE);
        }

        if ((b0 == 0x4C) && (b1 == 0x6F) && (b2 == 0xA7) && (b3 == 0x94)) {
            // EBCDIC
            // a la xerces1, return CP037 instead of EBCDIC here
            return new EncodingInfo("CP037", null);
        }

        // default encoding
        return new EncodingInfo("UTF-8", null);
    } // END getEncodingName(byte[], int) : EncodingInfo

    /**
     * Extracts the value of the <code>encoding</code> pseudoattribute from
     * the XML declaration found at the very beginning of the supplied
     * character stream.
     *
     * <p>The method reads characters from <code>reader</code> up to the
     * first <code>RIGHT_ANGLE_BRACKET</code>, the end of the stream, or
     * <code>MAX_XMLDECL_SIZE</code> characters, whichever comes first, and
     * then applies <code>ENCODING_PATTERN</code> to the text collected.
     * Characters consumed this way are not pushed back; the reader is
     * neither reset nor closed here.</p>
     *
     * @param reader This character stream is supposed to contain XML data
     *               (i.e. it should start with valid XML declaration).
     *
     * @return The encoding specified in the XML declaration read from the
     *         supplied character stream, or <code>null</code> if the stream
     *         does not start with an XML declaration, the declaration
     *         carries no matching encoding, or an <code>IOException</code>
     *         occurs while reading.
     */
    protected static String getXmlEncoding(Reader reader) {
        try {
            StringBuilder decl = new StringBuilder(MAX_XMLDECL_SIZE);

            int c;
            int count = 0;

            // Grab the first six characters: "<?xml" plus the mandatory
            // whitespace character that must follow it.
            for (; (6 > count) && (-1 != (c = reader.read())); count++) {
                decl.append((char) c);
            }

            /*
             * Rejects streams that are too short or do not start with
             * "<?xml" followed by whitespace. The whitespace test is what
             * tells a real XML declaration from a processing instruction
             * whose target merely starts with "xml" ("<?xmlfoo"). Any of
             * the XML whitespace characters (space, tab, CR, LF) is legal
             * after "<?xml", not only a plain space.
             */
            String head = decl.toString();
            boolean looksLikeDecl = (6 == count) && head.startsWith("<?xml")
                && (" \t\r\n".indexOf(head.charAt(5)) >= 0);

            if (!looksLikeDecl) {
                if (LOGGER.isLoggable(Level.FINER)) {
                    LOGGER.finer("Invalid(?) XML declaration: " + head + ".");
                }

                return null;
            }

            /*
             * Continue reading the declaration until the first '>' is
             * encountered. Narrowing `int` to `char` is fine here since
             * no supplementary (0x10000+) characters are expected inside
             * an XML declaration. The total number of characters read is
             * capped so that malformed input (no '>') cannot force us to
             * read megabytes of useless data.
             */
            for (;
                    (MAX_XMLDECL_SIZE > count) && (-1 != (c = reader.read()))
                    && (RIGHT_ANGLE_BRACKET != (char) c); count++) {
                decl.append((char) c);
            }

            Matcher m = ENCODING_PATTERN.matcher(decl.toString());

            return m.find() ? m.group(1) : null;
        } catch (IOException e) {
            if (LOGGER.isLoggable(Level.WARNING)) {
                // Same message as before, but keep the exception attached
                // so the stack trace is not lost.
                LOGGER.log(Level.WARNING, "Failed to extract charset info from XML "
                    + "declaration due to IOException: " + e.getMessage(), e);
            }

            return null;
        }
    } // END getXmlEncoding(Reader) : String
} // END class XmlCharsetDetector
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -