📄 xmlencodingdetector.java

📁 精通tomcat书籍原代码,希望大家共同学习
💻 JAVA
📖 第 1 页 / 共 4 页
字号:
12 3 4 下一页
/*
 * Copyright 1999,2004 The Apache Software Foundation.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * ====================================================================
 *
 * This software consists of voluntary contributions made by many
 * individuals on behalf of the Apache Software Foundation and was
 * originally based on software copyright (c) 1999, International
 * Business Machines, Inc., http://www.apache.org.  For more
 * information on the Apache Software Foundation, please see
 * <http://www.apache.org/>.
 */

package org.apache.jasper.xmlparser;

import java.io.EOFException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.IOException;
import java.io.Reader;
import java.util.Locale;
import java.util.jar.JarFile;

import org.apache.jasper.JasperException;
import org.apache.jasper.JspCompilationContext;
import org.apache.jasper.compiler.ErrorDispatcher;
import org.apache.jasper.compiler.JspUtil;

public class XMLEncodingDetector {
    
    // Byte stream under examination; assigned by getEncoding(InputStream,
    // ErrorDispatcher) and then wrapped in a RewindableInputStream by
    // createInitialReader().
    private InputStream stream;
    // Name of the detected encoding; null until createInitialReader() stores
    // the result of getEncodingName().
    private String encoding;
    // Returned (boxed) as the second element of getEncoding()'s result.
    // NOTE(review): no assignment is visible in this part of the file.
    private boolean isEncodingSetInProlog;
    // Byte order reported by getEncodingName(); null means unknown or not
    // relevant.
    private Boolean isBigEndian;
    // Character reader produced by createReader() on top of 'stream'.
    private Reader reader;
    
    // org.apache.xerces.impl.XMLEntityManager fields
    public static final int DEFAULT_BUFFER_SIZE = 2048;
    public static final int DEFAULT_XMLDECL_BUFFER_SIZE = 64;
    // Consulted by createReader() when validating / mapping encoding names.
    // NOTE(review): never assigned in the visible code, so it keeps the
    // default value false unless set elsewhere in the file.
    private boolean fAllowJavaEncodings;
    // Created in the constructor.
    private SymbolTable fSymbolTable;
    // Set to 'this' in the constructor; peekChar() reads the scan buffer
    // through this reference.
    private XMLEncodingDetector fCurrentEntity;
    // Buffer size handed to UTF8Reader / ASCIIReader in createReader().
    private int fBufferSize = DEFAULT_BUFFER_SIZE;
    
    // org.apache.xerces.impl.XMLEntityManager.ScannedEntity fields
    private int lineNumber = 1;
    private int columnNumber = 1;
    private boolean literal;
    // Scan buffer: peekChar() returns ch[position] and calls load() when
    // position == count.
    private char[] ch = new char[DEFAULT_BUFFER_SIZE];
    private int position;
    private int count;
    private boolean mayReadChunks = false;
    
    // org.apache.xerces.impl.XMLScanner fields
    // NOTE(review): the scratch buffers and symbol constants below are not
    // used in the part of the file reviewed here; presumably they serve the
    // XML declaration scanning code further down — confirm.
    private XMLString fString = new XMLString();    
    private XMLStringBuffer fStringBuffer = new XMLStringBuffer();
    private XMLStringBuffer fStringBuffer2 = new XMLStringBuffer();
    private final static String fVersionSymbol = "version";
    private final static String fEncodingSymbol = "encoding";
    private final static String fStandaloneSymbol = "standalone";
    
    // org.apache.xerces.impl.XMLDocumentFragmentScannerImpl fields
    private int fMarkupDepth = 0;
    private String[] fStrings = new String[3];

    // Error reporter; assigned by getEncoding(InputStream, ErrorDispatcher)
    // and used by createReader() to report unsupported or invalid encodings.
    private ErrorDispatcher err;

    /**
     * Creates a detector that acts as its own "current entity" and owns a
     * fresh symbol table.
     */
    public XMLEncodingDetector() {
        this.fCurrentEntity = this;
        this.fSymbolTable = new SymbolTable();
    }

    /**
     * Autodetects the encoding of the XML document supplied by the given
     * input stream.
     *
     * Encoding autodetection is done according to the XML 1.0 specification,
     * Appendix F.1: Detection Without External Encoding Information.
     *
     * The stream obtained from
     * {@link JspUtil#getInputStream(String, JarFile, JspCompilationContext, ErrorDispatcher)}
     * is always closed before this method returns, including when detection
     * fails with an exception.
     *
     * @param fname   passed through to JspUtil.getInputStream to locate the
     *                document
     * @param jarFile passed through to JspUtil.getInputStream
     * @param ctxt    passed through to JspUtil.getInputStream
     * @param err     dispatcher used to open the stream and to report
     *                encoding errors
     *
     * @return Two-element array, where the first element (of type
     * java.lang.String) contains the name of the (auto)detected encoding, and
     * the second element (of type java.lang.Boolean) specifies whether the 
     * encoding was specified using the 'encoding' attribute of an XML prolog
     * (TRUE) or autodetected (FALSE).
     *
     * @throws IOException     if reading or closing the stream fails
     * @throws JasperException if an encoding error is reported through err
     */
    public static Object[] getEncoding(String fname, JarFile jarFile,
                                       JspCompilationContext ctxt,
                                       ErrorDispatcher err)
        throws IOException, JasperException
    {
        InputStream inStream = JspUtil.getInputStream(fname, jarFile, ctxt,
                                                      err);
        try {
            XMLEncodingDetector detector = new XMLEncodingDetector();
            return detector.getEncoding(inStream, err);
        } finally {
            // release the stream even when detection throws
            if (inStream != null) {
                inStream.close();
            }
        }
    }

    /**
     * Runs detection on an already opened stream: remembers the stream and
     * error dispatcher, creates the initial reader (which autodetects the
     * encoding from the leading bytes) and then scans the XML declaration.
     *
     * @param in  the stream to examine; it is not closed here
     * @param err dispatcher used to report encoding errors
     *
     * @return two-element array: the encoding name (String) and a Boolean
     *         holding the value of isEncodingSetInProlog
     */
    private Object[] getEncoding(InputStream in, ErrorDispatcher err)
        throws IOException, JasperException
    {
        this.stream = in;
        this.err = err;
        createInitialReader();
        scanXMLDecl();

        // Boolean.valueOf reuses the canonical TRUE/FALSE instances instead
        // of allocating through the deprecated Boolean constructor.
        return new Object[] { this.encoding,
                              Boolean.valueOf(this.isEncodingSetInProlog) };
    }
    
    // Stub method: intentionally empty, does nothing when called.
    // NOTE(review): presumably kept so that code adapted from the Xerces
    // entity manager can still call it — confirm against the callers.
    void endEntity() {
    }
    
    // Adapted from:
    // org.apache.xerces.impl.XMLEntityManager.startEntity()
    /**
     * Wraps the input stream in a RewindableInputStream and, when no
     * encoding is known yet, autodetects it from the first (up to) four
     * bytes and creates the matching reader.
     *
     * Reading stops at end of stream, so inputs shorter than four bytes are
     * classified from the bytes that really exist instead of from the -1
     * end-of-stream marker narrowed to 0xFF (which, for example, made a
     * lone 0xFE byte look like a UTF-16BE byte order mark).
     *
     * @throws IOException     if reading, resetting or skipping fails
     * @throws JasperException if createReader reports an encoding error
     */
    private void createInitialReader() throws IOException, JasperException {

        // wrap this stream in RewindableInputStream
        stream = new RewindableInputStream(stream);

        // perform auto-detect of encoding if necessary
        if (encoding == null) {
            // read up to four bytes, stopping early at end of stream
            final byte[] b4 = new byte[4];
            int bytesRead = 0;
            while (bytesRead < b4.length) {
                int b = stream.read();
                if (b == -1) {
                    break;
                }
                b4[bytesRead++] = (byte) b;
            }

            // getEncodingName falls back to UTF-8 when too few bytes exist
            Object[] encodingDesc = getEncodingName(b4, bytesRead);
            encoding = (String) encodingDesc[0];
            isBigEndian = (Boolean) encodingDesc[1];

            // rewind so the reader sees the document from its first byte
            stream.reset();

            // Special case UTF-8 files with BOM created by Microsoft
            // tools. It's more efficient to consume the BOM than make
            // the reader perform extra checks. -Ac
            if (bytesRead > 2 && encoding.equals("UTF-8")) {
                int b0 = b4[0] & 0xFF;
                int b1 = b4[1] & 0xFF;
                int b2 = b4[2] & 0xFF;
                if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
                    // ignore first three bytes...
                    stream.skip(3);
                }
            }
            reader = createReader(stream, encoding, isBigEndian);
        }
    }

    // Adapted from:
    // org.apache.xerces.impl.XMLEntityManager.createReader
    /**
     * Builds a character reader that decodes the supplied byte stream with
     * the named encoding.
     *
     * UTF-8, US-ASCII, ISO-10646-UCS-4 and ISO-10646-UCS-2 are served by
     * dedicated reader classes; every other name is validated, mapped to a
     * Java encoding name and handed to an InputStreamReader.
     *
     * @param inputStream  The input stream.
     * @param encoding     Name of the encoding used by the stream; null is
     *                     treated as UTF-8. When fAllowJavaEncodings is set
     *                     the name may be a Java encoding name, otherwise it
     *                     is expected to be an IANA name.
     * @param isBigEndian  For encodings (like UCS-4) whose names cannot
     *                     specify a byte order, tells whether the order is
     *                     big-endian. null means unknown or not relevant.
     *
     * @return Returns a reader.
     */
    private Reader createReader(InputStream inputStream, String encoding,
                                Boolean isBigEndian)
                throws IOException, JasperException {

        // a missing name means UTF-8
        if (encoding == null) {
            encoding = "UTF-8";
        }

        // prefer the hand-written readers for the common encodings
        final String upperName = encoding.toUpperCase(Locale.ENGLISH);
        if ("UTF-8".equals(upperName)) {
            return new UTF8Reader(inputStream, fBufferSize);
        }
        if ("US-ASCII".equals(upperName)) {
            return new ASCIIReader(inputStream, fBufferSize);
        }
        if ("ISO-10646-UCS-4".equals(upperName)) {
            if (isBigEndian == null) {
                // byte order unknown: report it; if the dispatcher returns,
                // processing continues with the generic path below
                err.jspError("jsp.error.xml.encodingByteOrderUnsupported",
                             encoding);
            } else {
                return new UCSReader(inputStream,
                                     isBigEndian.booleanValue()
                                         ? UCSReader.UCS4BE
                                         : UCSReader.UCS4LE);
            }
        }
        if ("ISO-10646-UCS-2".equals(upperName)) {
            if (isBigEndian == null) {
                err.jspError("jsp.error.xml.encodingByteOrderUnsupported",
                             encoding);
            } else {
                // should never happen with this encoding...
                return new UCSReader(inputStream,
                                     isBigEndian.booleanValue()
                                         ? UCSReader.UCS2BE
                                         : UCSReader.UCS2LE);
            }
        }

        // check for valid name
        final boolean ianaOk = XMLChar.isValidIANAEncoding(encoding);
        final boolean javaOk = XMLChar.isValidJavaEncoding(encoding);
        final boolean nameAccepted =
            ianaOk && (!fAllowJavaEncodings || javaOk);
        if (!nameAccepted) {
            err.jspError("jsp.error.xml.encodingDeclInvalid", encoding);
            // NOTE: AndyH suggested that, on failure, we use ISO Latin 1
            //       because every byte is a valid ISO Latin 1 character.
            //       It may not translate correctly but if we failed on
            //       the encoding anyway, then we're expecting the content
            //       of the document to be bad. This will just prevent an
            //       invalid UTF-8 sequence to be detected. This is only
            //       important when continue-after-fatal-error is turned
            //       on. -Ac
            encoding = "ISO-8859-1";
        }

        // try to use a Java reader; the lookup key is the upper-cased form
        // of the name as it was before any ISO-8859-1 fallback above
        final String mapped = EncodingMap.getIANA2JavaMapping(upperName);
        if (mapped != null) {
            return new InputStreamReader(inputStream, mapped);
        }

        final String javaEncoding;
        if (fAllowJavaEncodings) {
            javaEncoding = encoding;
        } else {
            err.jspError("jsp.error.xml.encodingDeclInvalid", encoding);
            // see comment above.
            javaEncoding = "ISO8859_1";
        }
        return new InputStreamReader(inputStream, javaEncoding);

    } // createReader(InputStream,String, Boolean): Reader

    // Adapted from:
    // org.apache.xerces.impl.XMLEntityManager.getEncodingName
    /**
     * Returns the IANA encoding name that is auto-detected from
     * the bytes specified, with the endian-ness of that encoding where
     * appropriate.
     *
     * The checks run in this order: UTF-16 byte order marks (2 bytes),
     * the UTF-8 byte order mark (3 bytes), then the 4-byte signatures of
     * "&lt;" / "&lt;?" in UCS-4, BOM-less UTF-16 and EBCDIC. Whenever fewer
     * bytes are available than the next check needs, or nothing matches,
     * UTF-8 is returned.
     *
     * @param b4    The first four bytes of the input.
     * @param count The number of bytes actually read.
     * @return a 2-element array:  the first element, an IANA-encoding string,
     *  the second element a Boolean which is true iff the document is big
     *  endian, false if it's little-endian, and null if the distinction isn't
     *  relevant.
     */
    private Object[] getEncodingName(byte[] b4, int count) {

        if (count < 2) {
            return new Object[] {"UTF-8", null};
        }

        // UTF-16, with BOM
        int b0 = b4[0] & 0xFF;
        int b1 = b4[1] & 0xFF;
        if (b0 == 0xFE && b1 == 0xFF) {
            // UTF-16, big-endian
            return new Object[] {"UTF-16BE", Boolean.TRUE};
        }
        if (b0 == 0xFF && b1 == 0xFE) {
            // UTF-16, little-endian
            return new Object[] {"UTF-16LE", Boolean.FALSE};
        }

        // default to UTF-8 if we don't have enough bytes to make a
        // good determination of the encoding
        if (count < 3) {
            return new Object[] {"UTF-8", null};
        }

        // UTF-8 with a BOM
        int b2 = b4[2] & 0xFF;
        if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
            return new Object[] {"UTF-8", null};
        }

        // default to UTF-8 if we don't have enough bytes to make a
        // good determination of the encoding
        if (count < 4) {
            return new Object[] {"UTF-8", null};
        }

        // other encodings
        int b3 = b4[3] & 0xFF;
        if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) {
            // UCS-4, big endian (1234)
            return new Object[] {"ISO-10646-UCS-4", Boolean.TRUE};
        }
        if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) {
            // UCS-4, little endian (4321)
            return new Object[] {"ISO-10646-UCS-4", Boolean.FALSE};
        }
        if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) {
            // UCS-4, unusual octet order (2143)
            // REVISIT: What should this be?
            return new Object[] {"ISO-10646-UCS-4", null};
        }
        if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) {
            // UCS-4, unusual octet order (3412)
            // REVISIT: What should this be?
            return new Object[] {"ISO-10646-UCS-4", null};
        }
        if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) {
            // UTF-16, big-endian, no BOM
            // (or could turn out to be UCS-2...)
            // REVISIT: What should this be?
            return new Object[] {"UTF-16BE", Boolean.TRUE};
        }
        if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) {
            // UTF-16, little-endian, no BOM
            // (or could turn out to be UCS-2...)
            return new Object[] {"UTF-16LE", Boolean.FALSE};
        }
        if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) {
            // EBCDIC
            // a la xerces1, return CP037 instead of EBCDIC here
            return new Object[] {"CP037", null};
        }

        // default encoding
        return new Object[] {"UTF-8", null};

    }

    // Adapted from:
    // org.apache.xerces.impl.XMLEntityManager.EntityScanner.isExternal
    /**
     * Returns true if the current entity being scanned is external.
     * This implementation unconditionally returns true; peekChar() relies
     * on it when deciding whether to report a carriage return as a newline.
     */
    public boolean isExternal() {
	return true;
    }

    // Adapted from:
    // org.apache.xerces.impl.XMLEntityManager.EntityScanner.peekChar
    /**
     * Looks at the next character of the input without consuming it.
     * <p>
     * The scan buffer is refilled through load(0, true) when it has been
     * used up. For an external entity a carriage return is reported as a
     * line feed; every other character is returned unchanged.
     *
     * @return the next character, as an int
     *
     * @throws IOException  Thrown if i/o error occurs.
     * @throws EOFException Thrown on end of file.
     */
    public int peekChar() throws IOException {

        // refill the buffer once everything in it has been consumed
        final boolean bufferExhausted =
            fCurrentEntity.position == fCurrentEntity.count;
        if (bufferExhausted) {
            load(0, true);
        }

        // look at the character without advancing the position
        final int next = fCurrentEntity.ch[fCurrentEntity.position];

        // external entities see '\r' normalized to '\n'
        final boolean external = fCurrentEntity.isExternal();
        return (external && next == '\r') ? '\n' : next;

    } // peekChar():int
    
    // Adapted from:
    // org.apache.xerces.impl.XMLEntityManager.EntityScanner.scanChar
    /**
     * Returns the next character on the input.
     * <p>
     * <strong>Note:</strong> The character is consumed.
     *
     * @throws IOException  Thrown if i/o error occurs.
     * @throws EOFException Thrown on end of file.
     */
    public int scanChar() throws IOException {

	// load more characters, if needed
	if (fCurrentEntity.position == fCurrentEntity.count) {
	    load(0, true);
	}
12 3 4 下一页
💿 文件大小 26464 K
👤 上传用户 zhangtaoai007
📂 所属分类 Java编程
🏷️ 相关标签

#tomcat #书籍 #代码 #家
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -