📄 xmlcharsetdetector.java
字号:
/**
 * Creates a new reader on top of the given <code>InputStream</code> using
 * existing (external) encoding information. Unlike
 * <code>getCharsetAwareReader</code>, this method never tries to detect
 * charset or encoding scheme of <code>InputStream</code>'s data. This also
 * means that it <em>must</em> be provided with a valid
 * <code>EncodingInfo</code> instance, which may be obtained, for example,
 * from a previous <code>getCharsetAwareReader(InputStream, EncodingInfo)</code>
 * call.
 *
 * <p><code>ISO-10646-UCS-4</code> and <code>ISO-10646-UCS-2</code> are decoded
 * by a <code>UCSReader</code> configured with the matching width and byte
 * order; every other charset name is handed to
 * <code>InputStreamReader</code>.
 *
 * @param istream byte-stream containing textual (presumably XML) data
 * @param encInfo correctly initialized object which holds information of
 *        the above byte-stream's contents charset.
 *
 * @return a reader decoding <code>istream</code> with the charset named by
 *         <code>encInfo</code>
 *
 * @throws IllegalArgumentException if charset name is not specified
 * @throws UnsupportedEncodingException in cases when specified charset is
 *         not supported by platform or when the byte order is unknown for
 *         <code>ISO-10646-UCS-2|4</code> charsets.
 */
public static Reader createReader(InputStream istream, EncodingInfo encInfo)
        throws IllegalArgumentException, UnsupportedEncodingException {
    String charset = encInfo.getEncoding();
    Boolean isBigEndian = encInfo.isBigEndian();

    // We MUST know encoding (in fact, charset) name, and as EncodingInfo
    // has a no-arg constructor, its `getEncoding` can return null.
    if (null == charset) {
        String s = "Name of the charset must not be NULL!";
        throw new IllegalArgumentException(s);
    }

    if (LOGGER.isLoggable(Level.FINE)) {
        LOGGER.fine("Trying to create reader basing on existing charset " + "information: `"
                + encInfo + "`.");
    }

    Reader reader = null;

    // UCS-2|4 charsets are handled with custom reader
    if ("ISO-10646-UCS-4".equals(charset)) {
        if (null == isBigEndian) {
            // Fatal error, UCSReader will fail to decode this properly
            String s = "Unsupported byte order for ISO-10646-UCS-4 encoding.";
            throw new UnsupportedEncodingException(s);
        }
        if (isBigEndian.booleanValue()) {
            reader = new UCSReader(istream, UCSReader.UCS4BE);
        } else {
            reader = new UCSReader(istream, UCSReader.UCS4LE);
        }
    } else if ("ISO-10646-UCS-2".equals(charset)) {
        if (null == isBigEndian) {
            // Cannot construct UCSReader without byte order info
            String s = "Byte order must be specified for ISO-10646-UCS-2.";
            throw new UnsupportedEncodingException(s);
        }
        // UCS-2 is a two-byte encoding: it must use the UCS2* modes, not the
        // four-byte UCS4* modes used by the branch above.
        if (isBigEndian.booleanValue()) {
            reader = new UCSReader(istream, UCSReader.UCS2BE);
        } else {
            reader = new UCSReader(istream, UCSReader.UCS2LE);
        }
    } else {
        // May throw UnsupportedEncodingException for unknown charset names.
        reader = new InputStreamReader(istream, charset);
    }

    return reader;
} // END createReader(InputStream, EncodingInfo) : Reader
/**
 * Returns the IANA encoding name that is auto-detected from
 * the bytes specified, with the endian-ness of that encoding where
 * appropriate. Note that the encoding obtained this way is only an
 * <em>encoding scheme</em> of the request, i.e. step 1 of the detection
 * process. To learn the exact <em>charset</em> of the request data,
 * you should also perform step 2 - read the XML declaration and get the
 * value of its <code>encoding</code> pseudoattribute.
 *
 * <p>Byte order marks are checked first (UTF-16 BE/LE, then UTF-8), then
 * the four-byte patterns of a leading <code>0x3C</code> / <code>0x3F</code>
 * in UCS-4 and UTF-16, then an EBCDIC signature. Whenever fewer bytes are
 * available than the next check needs, or nothing matches, the result is
 * <code>"UTF-8"</code> with no byte order.
 *
 * @param b4 The first four bytes of the input.
 * @param count The number of bytes actually read.
 * @return Instance of EncodingInfo encapsulating all encoding-related data.
 */
public static EncodingInfo getEncodingName(byte[] b4, int count) {
    if (count < 2) {
        return new EncodingInfo("UTF-8", null);
    }

    // UTF-16, with BOM
    int b0 = b4[0] & 0xFF;
    int b1 = b4[1] & 0xFF;

    if ((b0 == 0xFE) && (b1 == 0xFF)) {
        // UTF-16, big-endian
        return new EncodingInfo("UTF-16BE", Boolean.TRUE, true);
    }

    if ((b0 == 0xFF) && (b1 == 0xFE)) {
        // UTF-16, little-endian
        return new EncodingInfo("UTF-16LE", Boolean.FALSE, true);
    }

    // default to UTF-8 if we don't have enough bytes to make a
    // good determination of the encoding
    if (count < 3) {
        return new EncodingInfo("UTF-8", null);
    }

    // UTF-8 with a BOM
    int b2 = b4[2] & 0xFF;

    if ((b0 == 0xEF) && (b1 == 0xBB) && (b2 == 0xBF)) {
        return new EncodingInfo("UTF-8", null, true);
    }

    // default to UTF-8 if we don't have enough bytes to make a
    // good determination of the encoding
    if (count < 4) {
        return new EncodingInfo("UTF-8", null);
    }

    // other encodings
    int b3 = b4[3] & 0xFF;

    if ((b0 == 0x00) && (b1 == 0x00) && (b2 == 0x00) && (b3 == 0x3C)) {
        // UCS-4, big endian (1234)
        return new EncodingInfo("ISO-10646-UCS-4", Boolean.TRUE);
    }

    if ((b0 == 0x3C) && (b1 == 0x00) && (b2 == 0x00) && (b3 == 0x00)) {
        // UCS-4, little endian (4321)
        return new EncodingInfo("ISO-10646-UCS-4", Boolean.FALSE);
    }

    if ((b0 == 0x00) && (b1 == 0x00) && (b2 == 0x3C) && (b3 == 0x00)) {
        // UCS-4, unusual octet order (2143)
        // REVISIT: What should this be? (Currently this would be
        // an exception :)
        return new EncodingInfo("ISO-10646-UCS-4", null);
    }

    if ((b0 == 0x00) && (b1 == 0x3C) && (b2 == 0x00) && (b3 == 0x00)) {
        // UCS-4, unusual octet order (3412)
        // REVISIT: What should this be?
        return new EncodingInfo("ISO-10646-UCS-4", null);
    }

    if ((b0 == 0x00) && (b1 == 0x3C) && (b2 == 0x00) && (b3 == 0x3F)) {
        // UTF-16, big-endian, no BOM
        // (or could turn out to be UCS-2...)
        // REVISIT: What should this be?
        return new EncodingInfo("UTF-16BE", Boolean.TRUE);
    }

    if ((b0 == 0x3C) && (b1 == 0x00) && (b2 == 0x3F) && (b3 == 0x00)) {
        // UTF-16, little-endian, no BOM
        // (or could turn out to be UCS-2...)
        return new EncodingInfo("UTF-16LE", Boolean.FALSE);
    }

    if ((b0 == 0x4C) && (b1 == 0x6F) && (b2 == 0xA7) && (b3 == 0x94)) {
        // EBCDIC
        // a la xerces1, return CP037 instead of EBCDIC here
        return new EncodingInfo("CP037", null);
    }

    // default encoding
    return new EncodingInfo("UTF-8", null);
} // END getEncodingName(byte[], int) : EncodingInfo
/**
 * Extracts the value of the <code>encoding</code> pseudoattribute from the
 * XML declaration at the start of the given character stream.
 *
 * <p>The stream must begin with the six characters <code>"&lt;?xml "</code>;
 * otherwise no declaration is assumed. The declaration is then read up to
 * (not including) the first <code>RIGHT_ANGLE_BRACKET</code>, the end of the
 * stream, or until <code>MAX_XMLDECL_SIZE</code> characters have been
 * consumed in total, and <code>ENCODING_PATTERN</code> is applied to the
 * collected text.
 *
 * @param reader This character stream is supposed to contain XML data
 *        (i.e. it should start with valid XML declaration).
 *
 * @return The first capturing group of <code>ENCODING_PATTERN</code> matched
 *         against the declaration text, or <code>null</code> if the stream
 *         does not start with <code>"&lt;?xml "</code>, the pattern does not
 *         match, or an <code>IOException</code> occurs while reading.
 */
protected static String getXmlEncoding(Reader reader) {
    final String declarationStart = "<?xml ";

    try {
        StringWriter collected = new StringWriter(MAX_XMLDECL_SIZE);
        int charsRead = 0;
        int ch;

        // Pull in just enough characters to recognise the declaration opener.
        while (charsRead < declarationStart.length()) {
            ch = reader.read();
            if (ch == -1) {
                break;
            }
            collected.write(ch);
            charsRead++;
        }

        /*
         * The trailing space in the expected prefix matters: it rules out
         * documents that have no XML declaration but start with a
         * processing instruction whose target merely begins with "xml"
         * (e.g. "<?xmlfoo").
         */
        String prefix = collected.toString();
        boolean looksLikeDeclaration =
                (charsRead >= declarationStart.length()) && declarationStart.equals(prefix);

        if (!looksLikeDeclaration) {
            if (LOGGER.isLoggable(Level.FINER)) {
                LOGGER.finer("Invalid(?) XML declaration: " + prefix + ".");
            }
            return null;
        }

        /*
         * Keep reading until the closing '>' shows up. Narrowing `int` to
         * `char` is acceptable here since no supplementary (0x10000+)
         * characters are expected inside an XML declaration. The overall
         * character budget protects against malformed input (no '>') that
         * would otherwise make us read an arbitrarily large amount of data.
         */
        while (charsRead < MAX_XMLDECL_SIZE) {
            ch = reader.read();
            if ((ch == -1) || (RIGHT_ANGLE_BRACKET == (char) ch)) {
                break;
            }
            collected.write(ch);
            charsRead++;
        }

        Matcher encodingMatcher = ENCODING_PATTERN.matcher(collected.toString());
        return encodingMatcher.find() ? encodingMatcher.group(1) : null;
    } catch (IOException e) {
        if (LOGGER.isLoggable(Level.WARNING)) {
            LOGGER.warning("Failed to extract charset info from XML "
                    + "declaration due to IOException: " + e.getMessage());
        }
        return null;
    }
} // END getXmlEncoding(Reader) : String
} // END class XmlCharsetDetector
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -