📄 xmlencodingdetector.java
字号:
/*
* Copyright 1999,2004 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation and was
* originally based on software copyright (c) 1999, International
* Business Machines, Inc., http://www.apache.org. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
package org.apache.jasper.xmlparser;
import java.io.EOFException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.IOException;
import java.io.Reader;
import java.util.Locale;
import java.util.jar.JarFile;
import org.apache.jasper.JasperException;
import org.apache.jasper.JspCompilationContext;
import org.apache.jasper.compiler.ErrorDispatcher;
import org.apache.jasper.compiler.JspUtil;
public class XMLEncodingDetector {
private InputStream stream;
private String encoding;
private boolean isEncodingSetInProlog;
private Boolean isBigEndian;
private Reader reader;
// org.apache.xerces.impl.XMLEntityManager fields
public static final int DEFAULT_BUFFER_SIZE = 2048;
public static final int DEFAULT_XMLDECL_BUFFER_SIZE = 64;
private boolean fAllowJavaEncodings;
private SymbolTable fSymbolTable;
private XMLEncodingDetector fCurrentEntity;
private int fBufferSize = DEFAULT_BUFFER_SIZE;
// org.apache.xerces.impl.XMLEntityManager.ScannedEntity fields
private int lineNumber = 1;
private int columnNumber = 1;
private boolean literal;
private char[] ch = new char[DEFAULT_BUFFER_SIZE];
private int position;
private int count;
private boolean mayReadChunks = false;
// org.apache.xerces.impl.XMLScanner fields
private XMLString fString = new XMLString();
private XMLStringBuffer fStringBuffer = new XMLStringBuffer();
private XMLStringBuffer fStringBuffer2 = new XMLStringBuffer();
private final static String fVersionSymbol = "version";
private final static String fEncodingSymbol = "encoding";
private final static String fStandaloneSymbol = "standalone";
// org.apache.xerces.impl.XMLDocumentFragmentScannerImpl fields
private int fMarkupDepth = 0;
private String[] fStrings = new String[3];
private ErrorDispatcher err;
/**
* Constructor
*/
public XMLEncodingDetector() {
fSymbolTable = new SymbolTable();
fCurrentEntity = this;
}
/**
* Autodetects the encoding of the XML document supplied by the given
* input stream.
*
* Encoding autodetection is done according to the XML 1.0 specification,
* Appendix F.1: Detection Without External Encoding Information.
*
* @return Two-element array, where the first element (of type
* java.lang.String) contains the name of the (auto)detected encoding, and
* the second element (of type java.lang.Boolean) specifies whether the
* encoding was specified using the 'encoding' attribute of an XML prolog
* (TRUE) or autodetected (FALSE).
*/
public static Object[] getEncoding(String fname, JarFile jarFile,
JspCompilationContext ctxt,
ErrorDispatcher err)
throws IOException, JasperException
{
InputStream inStream = JspUtil.getInputStream(fname, jarFile, ctxt,
err);
XMLEncodingDetector detector = new XMLEncodingDetector();
Object[] ret = detector.getEncoding(inStream, err);
inStream.close();
return ret;
}
private Object[] getEncoding(InputStream in, ErrorDispatcher err)
throws IOException, JasperException
{
this.stream = in;
this.err=err;
createInitialReader();
scanXMLDecl();
return new Object[] { this.encoding,
new Boolean(this.isEncodingSetInProlog) };
}
// stub method
void endEntity() {
}
// Adapted from:
// org.apache.xerces.impl.XMLEntityManager.startEntity()
private void createInitialReader() throws IOException, JasperException {
// wrap this stream in RewindableInputStream
stream = new RewindableInputStream(stream);
// perform auto-detect of encoding if necessary
if (encoding == null) {
// read first four bytes and determine encoding
final byte[] b4 = new byte[4];
int count = 0;
for (; count<4; count++ ) {
b4[count] = (byte)stream.read();
}
if (count == 4) {
Object [] encodingDesc = getEncodingName(b4, count);
encoding = (String)(encodingDesc[0]);
isBigEndian = (Boolean)(encodingDesc[1]);
stream.reset();
// Special case UTF-8 files with BOM created by Microsoft
// tools. It's more efficient to consume the BOM than make
// the reader perform extra checks. -Ac
if (count > 2 && encoding.equals("UTF-8")) {
int b0 = b4[0] & 0xFF;
int b1 = b4[1] & 0xFF;
int b2 = b4[2] & 0xFF;
if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
// ignore first three bytes...
stream.skip(3);
}
}
reader = createReader(stream, encoding, isBigEndian);
} else {
reader = createReader(stream, encoding, isBigEndian);
}
}
}
// Adapted from:
// org.apache.xerces.impl.XMLEntityManager.createReader
/**
* Creates a reader capable of reading the given input stream in
* the specified encoding.
*
* @param inputStream The input stream.
* @param encoding The encoding name that the input stream is
* encoded using. If the user has specified that
* Java encoding names are allowed, then the
* encoding name may be a Java encoding name;
* otherwise, it is an ianaEncoding name.
* @param isBigEndian For encodings (like uCS-4), whose names cannot
* specify a byte order, this tells whether the order
* is bigEndian. null means unknown or not relevant.
*
* @return Returns a reader.
*/
private Reader createReader(InputStream inputStream, String encoding,
Boolean isBigEndian)
throws IOException, JasperException {
// normalize encoding name
if (encoding == null) {
encoding = "UTF-8";
}
// try to use an optimized reader
String ENCODING = encoding.toUpperCase(Locale.ENGLISH);
if (ENCODING.equals("UTF-8")) {
return new UTF8Reader(inputStream, fBufferSize);
}
if (ENCODING.equals("US-ASCII")) {
return new ASCIIReader(inputStream, fBufferSize);
}
if (ENCODING.equals("ISO-10646-UCS-4")) {
if (isBigEndian != null) {
boolean isBE = isBigEndian.booleanValue();
if (isBE) {
return new UCSReader(inputStream, UCSReader.UCS4BE);
} else {
return new UCSReader(inputStream, UCSReader.UCS4LE);
}
} else {
err.jspError("jsp.error.xml.encodingByteOrderUnsupported",
encoding);
}
}
if (ENCODING.equals("ISO-10646-UCS-2")) {
if (isBigEndian != null) { // sould never happen with this encoding...
boolean isBE = isBigEndian.booleanValue();
if (isBE) {
return new UCSReader(inputStream, UCSReader.UCS2BE);
} else {
return new UCSReader(inputStream, UCSReader.UCS2LE);
}
} else {
err.jspError("jsp.error.xml.encodingByteOrderUnsupported",
encoding);
}
}
// check for valid name
boolean validIANA = XMLChar.isValidIANAEncoding(encoding);
boolean validJava = XMLChar.isValidJavaEncoding(encoding);
if (!validIANA || (fAllowJavaEncodings && !validJava)) {
err.jspError("jsp.error.xml.encodingDeclInvalid", encoding);
// NOTE: AndyH suggested that, on failure, we use ISO Latin 1
// because every byte is a valid ISO Latin 1 character.
// It may not translate correctly but if we failed on
// the encoding anyway, then we're expecting the content
// of the document to be bad. This will just prevent an
// invalid UTF-8 sequence to be detected. This is only
// important when continue-after-fatal-error is turned
// on. -Ac
encoding = "ISO-8859-1";
}
// try to use a Java reader
String javaEncoding = EncodingMap.getIANA2JavaMapping(ENCODING);
if (javaEncoding == null) {
if (fAllowJavaEncodings) {
javaEncoding = encoding;
} else {
err.jspError("jsp.error.xml.encodingDeclInvalid", encoding);
// see comment above.
javaEncoding = "ISO8859_1";
}
}
return new InputStreamReader(inputStream, javaEncoding);
} // createReader(InputStream,String, Boolean): Reader
// Adapted from:
// org.apache.xerces.impl.XMLEntityManager.getEncodingName
/**
* Returns the IANA encoding name that is auto-detected from
* the bytes specified, with the endian-ness of that encoding where
* appropriate.
*
* @param b4 The first four bytes of the input.
* @param count The number of bytes actually read.
* @return a 2-element array: the first element, an IANA-encoding string,
* the second element a Boolean which is true iff the document is big
* endian, false if it's little-endian, and null if the distinction isn't
* relevant.
*/
private Object[] getEncodingName(byte[] b4, int count) {
if (count < 2) {
return new Object[]{"UTF-8", null};
}
// UTF-16, with BOM
int b0 = b4[0] & 0xFF;
int b1 = b4[1] & 0xFF;
if (b0 == 0xFE && b1 == 0xFF) {
// UTF-16, big-endian
return new Object [] {"UTF-16BE", new Boolean(true)};
}
if (b0 == 0xFF && b1 == 0xFE) {
// UTF-16, little-endian
return new Object [] {"UTF-16LE", new Boolean(false)};
}
// default to UTF-8 if we don't have enough bytes to make a
// good determination of the encoding
if (count < 3) {
return new Object [] {"UTF-8", null};
}
// UTF-8 with a BOM
int b2 = b4[2] & 0xFF;
if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
return new Object [] {"UTF-8", null};
}
// default to UTF-8 if we don't have enough bytes to make a
// good determination of the encoding
if (count < 4) {
return new Object [] {"UTF-8", null};
}
// other encodings
int b3 = b4[3] & 0xFF;
if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) {
// UCS-4, big endian (1234)
return new Object [] {"ISO-10646-UCS-4", new Boolean(true)};
}
if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) {
// UCS-4, little endian (4321)
return new Object [] {"ISO-10646-UCS-4", new Boolean(false)};
}
if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) {
// UCS-4, unusual octet order (2143)
// REVISIT: What should this be?
return new Object [] {"ISO-10646-UCS-4", null};
}
if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) {
// UCS-4, unusual octect order (3412)
// REVISIT: What should this be?
return new Object [] {"ISO-10646-UCS-4", null};
}
if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) {
// UTF-16, big-endian, no BOM
// (or could turn out to be UCS-2...
// REVISIT: What should this be?
return new Object [] {"UTF-16BE", new Boolean(true)};
}
if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) {
// UTF-16, little-endian, no BOM
// (or could turn out to be UCS-2...
return new Object [] {"UTF-16LE", new Boolean(false)};
}
if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) {
// EBCDIC
// a la xerces1, return CP037 instead of EBCDIC here
return new Object [] {"CP037", null};
}
// default encoding
return new Object [] {"UTF-8", null};
}
// Adapted from:
// org.apache.xerces.impl.XMLEntityManager.EntityScanner.isExternal
/** Returns true if the current entity being scanned is external. */
public boolean isExternal() {
return true;
}
// Adapted from:
// org.apache.xerces.impl.XMLEntityManager.EntityScanner.peekChar
/**
* Returns the next character on the input.
* <p>
* <strong>Note:</strong> The character is <em>not</em> consumed.
*
* @throws IOException Thrown if i/o error occurs.
* @throws EOFException Thrown on end of file.
*/
public int peekChar() throws IOException {
// load more characters, if needed
if (fCurrentEntity.position == fCurrentEntity.count) {
load(0, true);
}
// peek at character
int c = fCurrentEntity.ch[fCurrentEntity.position];
// return peeked character
if (fCurrentEntity.isExternal()) {
return c != '\r' ? c : '\n';
}
else {
return c;
}
} // peekChar():int
// Adapted from:
// org.apache.xerces.impl.XMLEntityManager.EntityScanner.scanChar
/**
* Returns the next character on the input.
* <p>
* <strong>Note:</strong> The character is consumed.
*
* @throws IOException Thrown if i/o error occurs.
* @throws EOFException Thrown on end of file.
*/
public int scanChar() throws IOException {
// load more characters, if needed
if (fCurrentEntity.position == fCurrentEntity.count) {
load(0, true);
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -