📄 simplexmlparser.java

📁 iText是一个能够快速产生PDF文件的java类库。iText的java类对于那些要产生包含文本
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页
/*
 * Copyright 2003 Paulo Soares
 *
 * The contents of this file are subject to the Mozilla Public License Version 1.1
 * (the "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the License.
 *
 * The Original Code is 'iText, a free JAVA-PDF library'.
 *
 * The Initial Developer of the Original Code is Bruno Lowagie. Portions created by
 * the Initial Developer are Copyright (C) 1999, 2000, 2001, 2002 by Bruno Lowagie.
 * All Rights Reserved.
 * Co-Developer of the code is Paulo Soares. Portions created by the Co-Developer
 * are Copyright (C) 2000, 2001, 2002 by Paulo Soares. All Rights Reserved.
 *
 * Contributor(s): all the names of the contributors are added in the source code
 * where applicable.
 *
 * Alternatively, the contents of this file may be used under the terms of the
 * LGPL license (the "GNU LIBRARY GENERAL PUBLIC LICENSE"), in which case the
 * provisions of LGPL are applicable instead of those above.  If you wish to
 * allow use of your version of this file only under the terms of the LGPL
 * License and not to allow others to use your version of this file under
 * the MPL, indicate your decision by deleting the provisions above and
 * replace them with the notice and other provisions required by the LGPL.
 * If you do not delete the provisions above, a recipient may use your version
 * of this file under either the MPL or the GNU LIBRARY GENERAL PUBLIC LICENSE.
 *
 * This library is free software; you can redistribute it and/or modify it
 * under the terms of the MPL as stated above or under the terms of the GNU
 * Library General Public License as published by the Free Software Foundation;
 * either version 2 of the License, or any later version.
 *
 * This library is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Library general Public License for more
 * details.
 *
 * If you didn't download this code from the following link, you should check if
 * you aren't using an obsolete version:
 * http://www.lowagie.com/iText/
 *
 * The code to recognize the encoding in this class and in the convenience class IanaEncodings was taken from Apache Xerces published under the following license:
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * 
 * Part of this code is based on the Quick-and-Dirty XML parser by Steven Brandt.
 * The code for the Quick-and-Dirty parser was published in JavaWorld (java tip 128).
 * Steven Brandt and JavaWorld gave permission to use the code for free.
 * (Bruno Lowagie and Paulo Soares chose to use it under the MPL/LGPL in
 * conformance with the rest of the code).
 * The original code can be found on this url: <A HREF="http://www.javaworld.com/javatips/jw-javatip128_p.html">http://www.javaworld.com/javatips/jw-javatip128_p.html</A>.
 * It was substantially refactored by Bruno Lowagie.
 * 
 * The method 'private static String getEncodingName(byte[] b4)' was found
 * in org.apache.xerces.impl.XMLEntityManager, originally published by the
 * Apache Software Foundation under the Apache Software License; now being
 * used in iText under the MPL.
 */
package com.lowagie.text.xml.simpleparser;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.HashMap;
import java.util.Stack;

/**
 * A simple XML and HTML parser.  This parser is, like the SAX parser,
 * an event based parser, but with much less functionality.
 * <p>
 * The parser has the following features:
 * <p>
 * <ul>
 * <li>It recognizes the encoding used
 * <li>It recognizes all the elements' start tags and end tags
 * <li>It lists attributes, where attribute values can be enclosed in single or double quotes
 * <li>It recognizes the <code>&lt;![CDATA[ ... ]]&gt;</code> construct
 * <li>It recognizes the standard entities: &amp;amp;, &amp;lt;, &amp;gt;, &amp;quot;, and &amp;apos;, as well as numeric entities
 * <li>It maps lines ending in <code>\r\n</code> and <code>\r</code> to <code>\n</code> on input, in accordance with the XML Specification, Section 2.11
 * </ul>
 * <p>
 */
public class SimpleXMLParser {
    /*
     * Possible parser states. The current one is held in the 'state' field
     * and selects the branch taken in go(Reader) for every character read.
     */
	/** Before any actual content has been seen; the initial state when not parsing HTML. */
	private final static int UNKNOWN = 0;
	/** Between tags, where any content can be encountered; the initial state when parsing HTML. */
	private final static int TEXT = 1;
	/** A {@code <} has just been read and it is not yet known what kind of construct follows. */
	private final static int TAG_ENCOUNTERED = 2;
	/** Reading the name of a start tag; it may still turn out to be a comment, a CDATA section or a DOCTYPE. */
	private final static int EXAMIN_TAG = 3;
	/** The tag name is known; whitespace is skipped while looking for an attribute, {@code /} or {@code >}. */
	private final static int TAG_EXAMINED = 4;
	/** Inside a closing tag such as {@code </foo>}. */
	private final static int IN_CLOSETAG = 5;
	/** A {@code /} was read inside a start tag (e.g. {@code <foo a="b"/}); the next character must be {@code >}. */
	private final static int SINGLE_TAG = 6;
	/** Inside a CDATA section, collecting text until {@code ]]>} is found. */
	private final static int CDATA = 7;
	/** Inside a comment, collecting text until {@code -->} is found. */
	private final static int COMMENT = 8;
	/** Inside {@code <? ... ?>} or {@code <!DOCTYPE ... >}; characters are ignored until {@code >}. */
	private final static int PI = 9;
	/** Inside an entity reference such as {@code &lt;} or {@code &#187;}, i.e. after the {@code &}. */
	private final static int ENTITY = 10;
	/** Presumably inside a quoted attribute value; the handling is not visible in this part of the file -- TODO confirm. */
	private final static int QUOTE = 11;
	/** Reading an attribute name; entered from TAG_EXAMINED on the first character that is not whitespace, {@code /} or {@code >}. */
	private final static int ATTRIBUTE_KEY = 12;
	/** Presumably waiting for the {@code =} after an attribute name; handling not visible here -- TODO confirm. */
	private final static int ATTRIBUTE_EQUAL = 13;
	/** Presumably reading an attribute value; handling not visible here -- TODO confirm. */
	private final static int ATTRIBUTE_VALUE = 14;
    
	/**
	 * The state stack; created in the constructor. Presumably filled and
	 * emptied by saveState/restoreState, which are not visible here -- TODO confirm.
	 */
	protected Stack stack;
	/** The current character, as returned by the reader; -1 signals the end of the input. */
	protected int character = 0;
	/**
	 * The previous character. When this is not -1, the main loop re-examines
	 * it instead of reading a new character and then resets it to -1.
	 */
	protected int previousCharacter = -1;
	/** The line we are currently reading; starts at 1 and is incremented for every line break. */
	protected int lines = 1;
	/** The column where the current character occurs; reset to 0 at every line break. */
	protected int columns = 0;
	/**
	 * Was the last character equivalent to a newline? Set after a '\r' has
	 * been mapped to '\n', so that a directly following '\n' is skipped.
	 */
	protected boolean eol = false;
	/** The current state; one of the state constants of this class. */
	protected int state;
	/**
	 * Are we parsing HTML? In HTML mode the end of the input ends the document;
	 * otherwise reaching the end of the input is reported as a missing end tag.
	 */
	protected boolean html;
	/**
	 * Current text (whatever is encountered between tags). The same buffer is
	 * also used to collect tag names, comment content and CDATA content.
	 */
	protected StringBuffer text = new StringBuffer();
	/** Current entity (whatever is encountered between & and ;). */
	protected StringBuffer entity = new StringBuffer();
	/** Current tag name. */
	protected String tag = null;
	/** Current attributes. */
	protected HashMap attributes = null;
	/** The handler to which we are going to forward document content. */
	protected SimpleXMLDocHandler doc;
	/** The handler to which we are going to forward comments. */
	protected SimpleXMLDocHandlerComment comment;
	/**
	 * Keeps track of the number of tags that are open. When not parsing HTML,
	 * parsing stops once this is 0 after a closing or empty-element tag.
	 */
	int nested = 0;
	/** The quote character that was used to open the quote; defaults to the double quote. */
	protected int quoteCharacter = '"';
	/** The attribute key. */
	String attributekey = null;
	/** The attribute value. */
	String attributevalue = null;
    
	/**
	 * Builds a parser bound to the given handlers. The instance is meant to be
	 * used once: call {@code go(Reader)} right after construction.
	 *
	 * @param doc     the handler that receives the document content
	 * @param comment the handler that receives comments
	 * @param html    <code>true</code> to parse HTML; parsing then starts in the
	 *                TEXT state, otherwise it starts in the UNKNOWN state
	 */
    private SimpleXMLParser(SimpleXMLDocHandler doc, SimpleXMLDocHandlerComment comment, boolean html) {
        this.stack = new Stack();
        this.html = html;
        this.comment = comment;
        this.doc = doc;
        // HTML may begin with plain text; XML content only starts at the first '<'.
        if (html) {
            this.state = TEXT;
        } else {
            this.state = UNKNOWN;
        }
    }
    
    /**
     * Does the actual parsing. Perform this immediately
     * after creating the parser object.
     */
    private void go(Reader r) throws IOException {
        BufferedReader reader;
        if (r instanceof BufferedReader)
            reader = (BufferedReader)r;
        else
            reader = new BufferedReader(r);
        doc.startDocument();
        while(true) {
			// read a new character
			if (previousCharacter == -1) {
				character = reader.read();
			}
			// or re-examin the previous character
			else {
				character = previousCharacter;
				previousCharacter = -1;
			}
			
			// the end of the file was reached
			if (character == -1) {
				if (html) {
					if (html && state == TEXT)
						flush();
					doc.endDocument();
				} else {
					throwException("Missing end tag");
				}
				return;
			}
            
			// dealing with  \n and \r
			if (character == '\n' && eol) {
				eol = false;
				continue;
			} else if (eol) {
				eol = false;
			} else if (character == '\n') {
				lines++;
				columns = 0;
			} else if (character == '\r') {
				eol = true;
				character = '\n';
				lines++;
				columns = 0;
			} else {
				columns++;
			}
            
			switch(state) {
            // we are in an unknown state before there's actual content
			case UNKNOWN:
                if(character == '<') {
                    saveState(TEXT);
                    state = TAG_ENCOUNTERED;
                }
                break;
            // we can encounter any content
			case TEXT:
                if(character == '<') {
                    flush();
                    saveState(state);
                    state = TAG_ENCOUNTERED;
                } else if(character == '&') {
                    saveState(state);
                    entity.setLength(0);
                    state = ENTITY;
                } else
                    text.append((char)character);
                break;
            // we have just seen a < and are wondering what we are looking at
            // <foo>, </foo>, <!-- ... --->, etc.
			case TAG_ENCOUNTERED:
                initTag();
                if(character == '/') {
                    state = IN_CLOSETAG;
                } else if (character == '?') {
                    restoreState();
                    state = PI;
                } else {
                    text.append((char)character);
                    state = EXAMIN_TAG;
                }
                break;
            // we are processing something like this <foo ... >.
            // It could still be a <!-- ... --> or something.
			case EXAMIN_TAG:
                if(character == '>') {
                    doTag();
                    processTag(true);
                    initTag();
                    state = restoreState();
                } else if(character == '/') {
                    state = SINGLE_TAG;
                } else if(character == '-' && text.toString().equals("!-")) {
                    flush();
                    state = COMMENT;
                } else if(character == '[' && text.toString().equals("![CDATA")) {
                    flush();
                    state = CDATA;
                } else if(character == 'E' && text.toString().equals("!DOCTYP")) {
                    flush();
                    state = PI;
                } else if(Character.isWhitespace((char)character)) {
                    doTag();
                    state = TAG_EXAMINED;
                } else {
                    text.append((char)character);
                }
                break;
            // we know the name of the tag now.
			case TAG_EXAMINED:
                if(character == '>') {
                    processTag(true);
                    initTag();
                    state = restoreState();
                } else if(character == '/') {
                    state = SINGLE_TAG;
                } else if(Character.isWhitespace((char)character)) {
                    // empty
                } else {
                    text.append((char)character);
                    state = ATTRIBUTE_KEY;
                }
                break;
                
                // we are processing a closing tag: e.g. </foo>
			case IN_CLOSETAG:
                if(character == '>') {
                    doTag();
                    processTag(false);
                    if(!html && nested==0) return;
                    state = restoreState();
                } else {
                    if (!Character.isWhitespace((char)character))
                        text.append((char)character);
                }
                break;
                
            // we have just seen something like this: <foo a="b"/
            // and are looking for the final >.
			case SINGLE_TAG:
                if(character != '>')
                    throwException("Expected > for tag: <"+tag+"/>");
				doTag();
                processTag(true);
                processTag(false);
                initTag();
                if(!html && nested==0) {
                    doc.endDocument();
                    return;
                }
                state = restoreState();
                break;
                
            // we are processing CDATA
			case CDATA:
                if(character == '>'
                && text.toString().endsWith("]]")) {
                    text.setLength(text.length()-2);
                    flush();
                    state = restoreState();
                } else
                    text.append((char)character);
                break;
                
            // we are processing a comment.  We are inside
            // the <!-- .... --> looking for the -->.
			case COMMENT:
                if(character == '>'
                && text.toString().endsWith("--")) {
                    text.setLength(text.length() - 2);
                    flush();
                    state = restoreState();
                } else
                    text.append((char)character);
                break;
                
            // We are inside one of these <? ... ?> or one of these <!DOCTYPE ... >
			case PI:
                if(character == '>') {
                    state = restoreState();
                    if(state == TEXT) state = UNKNOWN;
                }
                break;
                
            // we are processing an entity, e.g. &lt;, &#187;, etc.
			case ENTITY:
                if(character == ';') {
                    state = restoreState();
                    String cent = entity.toString();
                    entity.setLength(0);
                    char ce = EntitiesToUnicode.decodeEntity(cent);
                    if (ce == '\0')
                    	text.append('&').append(cent).append(';');
                    else
                    	text.append(ce);
                } else if ((character != '#' && (character < '0' || character > '9') && (character < 'a' || character > 'z')
                    && (character < 'A' || character > 'Z')) || entity.length() >= 7) {
                    state = restoreState();
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -