📄 simplexmlparser.java

📁 处理PDF
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页
/* * Copyright 2003 Paulo Soares * * The contents of this file are subject to the Mozilla Public License Version 1.1 * (the "License"); you may not use this file except in compliance with the License. * You may obtain a copy of the License at http://www.mozilla.org/MPL/ * * Software distributed under the License is distributed on an "AS IS" basis, * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License * for the specific language governing rights and limitations under the License. * * The Original Code is 'iText, a free JAVA-PDF library'. * * The Initial Developer of the Original Code is Bruno Lowagie. Portions created by * the Initial Developer are Copyright (C) 1999, 2000, 2001, 2002 by Bruno Lowagie. * All Rights Reserved. * Co-Developer of the code is Paulo Soares. Portions created by the Co-Developer * are Copyright (C) 2000, 2001, 2002 by Paulo Soares. All Rights Reserved. * * Contributor(s): all the names of the contributors are added in the source code * where applicable. * * Alternatively, the contents of this file may be used under the terms of the * LGPL license (the "GNU LIBRARY GENERAL PUBLIC LICENSE"), in which case the * provisions of LGPL are applicable instead of those above.  If you wish to * allow use of your version of this file only under the terms of the LGPL * License and not to allow others to use your version of this file under * the MPL, indicate your decision by deleting the provisions above and * replace them with the notice and other provisions required by the LGPL. * If you do not delete the provisions above, a recipient may use your version * of this file under either the MPL or the GNU LIBRARY GENERAL PUBLIC LICENSE. * * This library is free software; you can redistribute it and/or modify it * under the terms of the MPL as stated above or under the terms of the GNU * Library General Public License as published by the Free Software Foundation; * either version 2 of the License, or any later version. 
* * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU Library general Public License for more * details. * * If you didn't download this code from the following link, you should check if * you aren't using an obsolete version: * http://www.lowagie.com/iText/ * * The code to recognize the encoding in this class and in the convenience class IanaEncodings was taken from Apache Xerces published under the following license: * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements.  See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License.  You may obtain a copy of the License at *  *      http://www.apache.org/licenses/LICENSE-2.0 *  * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. *  * Part of this code is based on the Quick-and-Dirty XML parser by Steven Brandt. * The code for the Quick-and-Dirty parser was published in JavaWorld (java tip 128). * Steven Brandt and JavaWorld gave permission to use the code for free. * (Bruno Lowagie and Paulo Soares chose to use it under the MPL/LGPL in * conformance with the rest of the code). * The original code can be found on this url: <A HREF="http://www.javaworld.com/javatips/jw-javatip128_p.html">http://www.javaworld.com/javatips/jw-javatip128_p.html</A>. * It was substantially refactored by Bruno Lowagie. 
*  * The method 'private static String getEncodingName(byte[] b4)' was found * in org.apache.xerces.impl.XMLEntityManager, originaly published by the * Apache Software Foundation under the Apache Software License; now being * used in iText under the MPL. */
package com.lowagie.text.xml.simpleparser;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.HashMap;
import java.util.Stack;

/**
 * A simple XML and HTML parser.  Like the SAX parser, this is an
 * event-based parser, but with much less functionality.
 * <p>
 * The parser:
 * <ul>
 * <li>recognizes the encoding used
 * <li>recognizes all the elements' start tags and end tags
 * <li>lists attributes, where attribute values can be enclosed in single or double quotes
 * <li>recognizes the <code>&lt;[CDATA[ ... ]]&gt;</code> construct
 * <li>recognizes the standard entities: &amp;amp;, &amp;lt;, &amp;gt;, &amp;quot;, and &amp;apos;, as well as numeric entities
 * <li>maps lines ending in <code>\r\n</code> and <code>\r</code> to <code>\n</code> on input, in accordance with the XML Specification, Section 2.11
 * </ul>
 */
public final class SimpleXMLParser {
    /*
     * Possible states of the parser's state machine. The descriptions below
     * are taken from how each state is handled in the parsing loop of go().
     */

    /** Start state for XML input: every character is skipped until a '&lt;' is read. */
    private final static int UNKNOWN = 0;
    /** Reading character data between tags; '&lt;' starts a tag and '&amp;' starts an entity. */
    private final static int TEXT = 1;
    /** A '&lt;' has just been read; the next character tells a close tag ('/') and a '?' construct from a tag name. */
    private final static int TAG_ENCOUNTERED = 2;
    /** Reading a tag name; the tag may still turn out to be a comment, a CDATA section or a DOCTYPE. */
    private final static int EXAMIN_TAG = 3;
    /** The tag name is known; looking for an attribute, a '/' or the closing '&gt;'. */
    private final static int TAG_EXAMINED = 4;
    /** Inside a closing tag such as {@code </foo>}. */
    private final static int IN_CLOSETAG = 5;
    /** A '/' was read inside a start tag; the only character accepted next is '&gt;'. */
    private final static int SINGLE_TAG = 6;
    /** Inside a CDATA section; ends at a '&gt;' that follows "]]". */
    private final static int CDATA = 7;
    /** Inside a comment; ends at a '&gt;' that follows "--". */
    private final static int COMMENT = 8;
    /** Inside a {@code <? ... ?>} or {@code <!DOCTYPE ... >} construct; content is skipped up to the next '&gt;'. */
    private final static int PI = 9;
    /** Inside an entity reference, i.e. between '&amp;' and ';'. */
    private final static int ENTITY = 10;
    /** NOTE(review): presumably "inside a quoted attribute value" - its handling is not visible in this part of the file; confirm. */
    private final static int QUOTE = 11;
    /** Reading an attribute name; entered from TAG_EXAMINED on the first character of the name. */
    private final static int ATTRIBUTE_KEY = 12;
    /** NOTE(review): presumably "expecting the '=' after an attribute name" - handling not visible here; confirm. */
    private final static int ATTRIBUTE_EQUAL = 13;
    /** NOTE(review): presumably "expecting/reading an attribute value" - handling not visible here; confirm. */
    private final static int ATTRIBUTE_VALUE = 14;

    /**
     * the state stack (instantiated in the constructor).
     * NOTE(review): presumably what saveState()/restoreState() push to and pop from -
     * those helpers are not visible in this part of the file; confirm.
     */
    Stack stack;
    /** The current character. 
*/	int character = 0;	/** The previous character. */	int previousCharacter = -1;	/** the line we are currently reading */	int lines = 1;	/** the column where the current character occurs */	int columns = 0;	/** was the last character equivalent to a newline? */	boolean eol = false;	/** the current state */	int state;	/** Are we parsing HTML? */	boolean html;	/** current text (whatever is encountered between tags) */	StringBuffer text = new StringBuffer();	/** current entity (whatever is encountered between & and ;) */	StringBuffer entity = new StringBuffer();	/** current tagname */	String tag = null;	/** current attributes */	HashMap attributes = null;	/** The handler to which we are going to forward document content */	SimpleXMLDocHandler doc;	/** The handler to which we are going to forward comments. */	SimpleXMLDocHandlerComment comment;	/** Keeps track of the number of tags that are open. */	int nested = 0;	/** the quote character that was used to open the quote. */	int quoteCharacter = '"';	/** the attribute key. */	String attributekey = null;	/** the attribute value. */	String attributevalue = null;    	/**	 * Creates a Simple XML parser object.	 * Call go(BufferedReader) immediately after creation.	 */    private SimpleXMLParser(SimpleXMLDocHandler doc, SimpleXMLDocHandlerComment comment, boolean html) {    	this.doc = doc;    	this.comment = comment;    	this.html = html;    	stack = new Stack();    	state = html ? TEXT : UNKNOWN;    }        /**     * Does the actual parsing. Perform this immediately     * after creating the parser object.     
*/    private void go(Reader r) throws IOException {        BufferedReader reader;        if (r instanceof BufferedReader)            reader = (BufferedReader)r;        else            reader = new BufferedReader(r);        doc.startDocument();        while(true) {			// read a new character			if (previousCharacter == -1) {				character = reader.read();			}			// or re-examine the previous character			else {				character = previousCharacter;				previousCharacter = -1;			}						// the end of the file was reached			if (character == -1) {				if (html) {					if (html && state == TEXT)						flush();					doc.endDocument();				} else {					throwException("Missing end tag");				}				return;			}            			// dealing with  \n and \r			if (character == '\n' && eol) {				eol = false;				continue;			} else if (eol) {				eol = false;			} else if (character == '\n') {				lines++;				columns = 0;			} else if (character == '\r') {				eol = true;				character = '\n';				lines++;				columns = 0;			} else {				columns++;			}            			switch(state) {            // we are in an unknown state before there's actual content			case UNKNOWN:                if(character == '<') {                    saveState(TEXT);                    state = TAG_ENCOUNTERED;                }                break;            // we can encounter any content			case TEXT:                if(character == '<') {                    flush();                    saveState(state);                    state = TAG_ENCOUNTERED;                } else if(character == '&') {                    saveState(state);                    entity.setLength(0);                    state = ENTITY;                } else                    text.append((char)character);                break;            // we have just seen a < and are wondering what we are looking at            // <foo>, </foo>, <!-- ... --->, etc.			
case TAG_ENCOUNTERED:                initTag();                if(character == '/') {                    state = IN_CLOSETAG;                } else if (character == '?') {                    restoreState();                    state = PI;                } else {                    text.append((char)character);                    state = EXAMIN_TAG;                }                break;            // we are processing something like this <foo ... >.            // It could still be a <!-- ... --> or something.			case EXAMIN_TAG:                if(character == '>') {                    doTag();                    processTag(true);                    initTag();                    state = restoreState();                } else if(character == '/') {                    state = SINGLE_TAG;                } else if(character == '-' && text.toString().equals("!-")) {                    flush();                    state = COMMENT;                } else if(character == '[' && text.toString().equals("![CDATA")) {                    flush();                    state = CDATA;                } else if(character == 'E' && text.toString().equals("!DOCTYP")) {                    flush();                    state = PI;                } else if(Character.isWhitespace((char)character)) {                    doTag();                    state = TAG_EXAMINED;                } else {                    text.append((char)character);                }                break;            // we know the name of the tag now.			
case TAG_EXAMINED:                if(character == '>') {                    processTag(true);                    initTag();                    state = restoreState();                } else if(character == '/') {                    state = SINGLE_TAG;                } else if(Character.isWhitespace((char)character)) {                    // empty                } else {                    text.append((char)character);                    state = ATTRIBUTE_KEY;                }                break;                                // we are processing a closing tag: e.g. </foo>			case IN_CLOSETAG:                if(character == '>') {                    doTag();                    processTag(false);                    if(!html && nested==0) return;                    state = restoreState();                } else {                    if (!Character.isWhitespace((char)character))                        text.append((char)character);                }                break;                            // we have just seen something like this: <foo a="b"/            // and are looking for the final >.			case SINGLE_TAG:                if(character != '>')                    throwException("Expected > for tag: <"+tag+"/>");				doTag();                processTag(true);                processTag(false);                initTag();                if(!html && nested==0) {                    doc.endDocument();                    return;                }                state = restoreState();                break;                            // we are processing CDATA			case CDATA:                if(character == '>'                && text.toString().endsWith("]]")) {                    text.setLength(text.length()-2);                    flush();                    state = restoreState();                } else                    text.append((char)character);                break;                            // we are processing a comment.  We are inside            // the <!-- .... 
--> looking for the -->.			case COMMENT:                if(character == '>'                && text.toString().endsWith("--")) {                    text.setLength(text.length() - 2);                    flush();                    state = restoreState();                } else                    text.append((char)character);                break;                            // We are inside one of these <? ... ?> or one of these <!DOCTYPE ... >			case PI:                if(character == '>') {                    state = restoreState();                    if(state == TEXT) state = UNKNOWN;                }                break;                            // we are processing an entity, e.g. &lt;, &#187;, etc.			case ENTITY:                if(character == ';') {                    state = restoreState();                    String cent = entity.toString();                    entity.setLength(0);                    char ce = EntitiesToUnicode.decodeEntity(cent);                    if (ce == '\0')                    	text.append('&').append(cent).append(';');                    else                    	text.append(ce);                } else if ((character != '#' && (character < '0' || character > '9') && (character < 'a' || character > 'z')                    && (character < 'A' || character > 'Z')) || entity.length() >= 7) {                    state = restoreState();
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -