📄 xmlcharsetdetector.java
字号:
/* Copyright (c) 2001 - 2007 TOPP - www.openplans.org. All rights reserved.
* This code is licensed under the GPL 2.0 license, available at the root
* application directory.
*/
package org.geoserver.ows.util;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringWriter;
import java.io.UnsupportedEncodingException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Provides methods that can be used to detect the charset of an
* XML document and (optionally) return a reader that is aware of
* this charset and can correctly decode the document's data.
*/
public class XmlCharsetDetector {
/** Logger used by this class; obtained under the name "org.vfny.geoserver.requests". */
protected static Logger LOGGER = org.geotools.util.logging.Logging.getLogger("org.vfny.geoserver.requests");
/**
 * The '>' character. In the current context naming this "GT",
 * "GREATER_THAN" or the like would be misleading.
 */
private static final char RIGHT_ANGLE_BRACKET = '\u003E';
/**
 * Matches the <code>encoding</code> pseudo-attribute of an XML declaration
 * and captures its value in group 1.
 *
 * The XML grammar allows the value to be delimited by either double or
 * single quotes, so both forms are accepted. The look-behind/look-ahead
 * pair inside the group ties the closing quote to the opening one while
 * keeping the value in group 1; for double-quoted input both the whole
 * match and group 1 are the same as with a double-quote-only pattern.
 */
private static final Pattern ENCODING_PATTERN = Pattern.compile(
        "encoding\\s*=\\s*[\"']((?<=\")[^\"]+(?=\")|(?<=')[^']+(?='))[\"']");
/**
 * Maximum number of characters we expect in an XML declaration.
 * There will probably be fewer than 100, but just in case...
 */
private static final int MAX_XMLDECL_SIZE = 100;
/**
 * Based on Xerces-J code, this method tries its best to return a reader
 * which is able to decode the content of the incoming XML document
 * properly. To achieve this, it first infers the general encoding scheme
 * of the document from its first (up to four) bytes and then uses a
 * reader for that scheme to extract the actual charset name from the XML
 * declaration. If the declaration names a different charset, a new
 * reader for the declared charset is returned instead.
 *
 * @param istream Byte stream (most probably obtained with
 *        <code>HttpServletRequest.getInputStream</code>) that gives
 *        access to the XML document in question.
 *
 * @param encInfo Instance of EncodingInfo where information about the
 *        detected charset will be stored. The same instance is updated
 *        in place (never replaced), so the caller can use it afterwards,
 *        for example, to form a response encoded with this charset.
 *
 * @return a reader over the document; either the reader built for the
 *         inferred encoding scheme or a new one built for the charset
 *         named in the XML declaration.
 *
 * @throws IOException in case of any unrecoverable I/O error. This
 *         includes <code>UnsupportedEncodingException</code>, which
 *         <code>InputStreamReader</code>'s constructor throws when the
 *         inferred or declared charset name is not supported by the
 *         current JVM.
 * @throws UnsupportedCharsetException if the document was inferred to be
 *         ISO-10646-UCS-4 encoded but its byte order is unknown.
 */
public static Reader getCharsetAwareReader(InputStream istream, EncodingInfo encInfo)
        throws IOException, UnsupportedCharsetException {
    final RewindableInputStream stream = new RewindableInputStream(istream, false);

    //
    // Phase 1. Reading first four bytes and determining encoding scheme.
    final byte[] b4 = new byte[4];
    int count = 0;
    for (; count < 4; count++) {
        int b = stream.read();
        if (-1 == b) {
            break;
        }
        b4[count] = (byte) b;
    }

    if (LOGGER.isLoggable(Level.FINER)) {
        // All four slots are printed; slots past `count` still hold 0.
        StringBuffer msg = new StringBuffer("First 4 bytes of XML doc are : ");
        for (int i = 0; i < b4.length; i++) {
            if (i > 0) {
                msg.append(' ');
            }
            msg.append(Integer.toHexString(b4[i] & 0xff).toUpperCase(Locale.ENGLISH));
            msg.append(" ('").append((char) b4[i]).append("')");
        }
        LOGGER.finer(msg.toString());
    }

    /*
     * Per the original author's notes, `getEncodingName()` is capable of
     * detecting the following encoding schemes: "UTF-8", "UTF-16LE",
     * "UTF-16BE", "ISO-10646-UCS-4" or "CP037". It cannot distinguish
     * between UTF-16 (without BOM) and "ISO-10646-UCS-2", so the latter
     * is interpreted as UTF-16 for the purpose of reading the XML
     * declaration. That should be harmless, as the two are believed to be
     * identical for the Basic Multilingual Plane, except that UTF-16 text
     * may contain surrogates and valid UCS-2 input may not.
     *
     * The result is copied INTO encInfo rather than assigned, because the
     * caller keeps using the very same encInfo instance after this call.
     */
    encInfo.copyFrom(getEncodingName(b4, count));
    if (LOGGER.isLoggable(Level.FINE)) {
        LOGGER.fine("Charset detection phase 1. Inferred encoding: " + encInfo.toString());
    }

    // Rewinding to beginning of data
    stream.reset();

    final String inferredEncoding = encInfo.getEncoding().toUpperCase(Locale.ENGLISH);
    final Boolean isBigEndian = encInfo.isBigEndian();
    final boolean hasBOM = encInfo.hasBOM();

    /*
     * Special case UTF-8 files with BOM created by Microsoft tools. It's
     * more efficient to consume the BOM than make the reader perform
     * extra checks. -Ac
     */
    if (hasBOM && inferredEncoding.equals("UTF-8")) {
        // ignore first three bytes...
        discardBytes(stream, 3);
    }

    /*
     * `getEncodingName` reports UTF-16 with BOM as either UTF-16LE or
     * UTF-16BE, and InputStreamReader doesn't expect a BOM in front of
     * UTF-16LE|BE encoded data. So this BOM is removed too, if present.
     */
    if ((count > 1)
            && (inferredEncoding.equals("UTF-16LE") || inferredEncoding.equals("UTF-16BE"))) {
        int b0 = b4[0] & 0xFF;
        int b1 = b4[1] & 0xFF;
        boolean littleEndianBom = (b0 == 0xFF) && (b1 == 0xFE);
        boolean bigEndianBom = (b0 == 0xFE) && (b1 == 0xFF);
        if (littleEndianBom || bigEndianBom) {
            // ignore first two bytes...
            discardBytes(stream, 2);
        }
    }

    Reader reader = null;

    /*
     * A custom class is used to read UCS-4 data, since the JVM is not
     * expected to support this encoding scheme by default.
     *
     * Xerces also ships a dedicated, reportedly optimized UTF-8 reader
     * (org.apache.xerces.impl.io.UTF8Reader). It was deliberately not
     * ported: it has internal dependencies, and the JVM's decoder is good
     * enough for extracting a few characters of the XML declaration.
     *
     * At this stage "ISO-10646-UCS-2" cannot be the inferred encoding.
     *
     * Possible bugs in UCSReader can be avoided by removing this block;
     * UCS-4 encoded data will then fail with an
     * UnsupportedEncodingException from InputStreamReader below.
     */
    if ("ISO-10646-UCS-4".equals(inferredEncoding)) {
        if (null == isBigEndian) {
            // Fatal error, UCSReader will fail to decode this properly
            String s = "Unsupported byte order for ISO-10646-UCS-4 encoding.";
            throw new UnsupportedCharsetException(s);
        }
        if (isBigEndian.booleanValue()) {
            reader = new UCSReader(stream, UCSReader.UCS4BE);
        } else {
            reader = new UCSReader(stream, UCSReader.UCS4LE);
        }
    }

    if (null == reader) {
        reader = new InputStreamReader(stream, inferredEncoding);
    }

    //
    // Phase 2. Reading XML declaration and extracting charset info from it.
    String declEncoding = getXmlEncoding(reader);
    if (LOGGER.isLoggable(Level.FINE)) {
        LOGGER.fine("Charset detection phase 2. Charset in XML declaration " + "is `"
                + declEncoding + "`.");
    }

    // NOTE(review): this rewinds the stream a second time. Whether a BOM
    // skipped above becomes visible again to a reused reader depends on
    // RewindableInputStream.reset(), which is not visible here - confirm.
    stream.reset();

    /*
     * Now RewindableInputStream is allowed to return more than one byte
     * per read operation. It also will not buffer bytes read using
     * `read(byte[], int, int)` method.
     */
    stream.setChunkedMode(true);

    /*
     * Reusing the existing reader if possible, creating a new one only if
     * the declared charset name differs from the guessed one. This first
     * comparison is case-sensitive: a declared name that differs only in
     * letter case still gets a new reader and is recorded in encInfo
     * exactly as declared.
     */
    if ((null != declEncoding) && !declEncoding.equals(inferredEncoding)) {
        /*
         * For a declared UCS-2 encoding the default UTF-16 reader (which
         * is already created at this time) should suffice in most cases.
         * XML encoding names are matched case-insensitively, so any
         * spelling of the name keeps the existing reader.
         */
        if (!"ISO-10646-UCS-2".equalsIgnoreCase(declEncoding)) {
            if (LOGGER.isLoggable(Level.FINE)) {
                LOGGER.fine("Declared charset differs from inferred one. "
                        + "Trying to construct InputStreamReader for `" + declEncoding + "`.");
            }
            reader = new InputStreamReader(stream, declEncoding);
            encInfo.setEncoding(declEncoding);
        }
    }

    return reader;
} // END getCharsetAwareReader(InputStream) : Reader

/**
 * Consumes exactly <code>n</code> bytes from the stream unless its end is
 * reached first. <code>InputStream.skip</code> is allowed to skip fewer
 * bytes than requested, so its result is checked and single-byte reads
 * are used whenever it makes no progress.
 */
private static void discardBytes(InputStream in, int n) throws IOException {
    long remaining = n;
    while (remaining > 0) {
        long skipped = in.skip(remaining);
        if (skipped > 0) {
            remaining -= skipped;
        } else if (in.read() == -1) {
            // premature end of stream, nothing left to discard
            return;
        } else {
            remaining--;
        }
    }
}
/**
 * Convenience variant for callers that only want a suitable reader for
 * the incoming request and have no interest in the detected encoding
 * data. It delegates to <code>getCharsetAwareReader(InputStream,
 * EncodingInfo)</code> with a fresh <code>EncodingInfo</code> that is
 * not handed back to the caller.
 *
 * @param istream See <code>getCharsetAwareReader(InputStream,
 *        EncodingInfo)</code>.
 *
 * @return the reader produced by the two-argument variant.
 *
 * @throws IOException see the two-argument variant.
 * @throws UnsupportedCharsetException see the two-argument variant.
 */
public static Reader getCharsetAwareReader(InputStream istream)
        throws IOException, UnsupportedCharsetException {
    final EncodingInfo throwawayInfo = new EncodingInfo();
    return getCharsetAwareReader(istream, throwawayInfo);
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -