📄 simplexmlparser.java
字号:
/*
* Copyright 2003 Paulo Soares
*
* The contents of this file are subject to the Mozilla Public License Version 1.1
* (the "License"); you may not use this file except in compliance with the License.
* You may obtain a copy of the License at http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the License.
*
* The Original Code is 'iText, a free JAVA-PDF library'.
*
* The Initial Developer of the Original Code is Bruno Lowagie. Portions created by
* the Initial Developer are Copyright (C) 1999, 2000, 2001, 2002 by Bruno Lowagie.
* All Rights Reserved.
* Co-Developer of the code is Paulo Soares. Portions created by the Co-Developer
* are Copyright (C) 2000, 2001, 2002 by Paulo Soares. All Rights Reserved.
*
* Contributor(s): all the names of the contributors are added in the source code
* where applicable.
*
* Alternatively, the contents of this file may be used under the terms of the
* LGPL license (the "GNU LIBRARY GENERAL PUBLIC LICENSE"), in which case the
* provisions of LGPL are applicable instead of those above. If you wish to
* allow use of your version of this file only under the terms of the LGPL
* License and not to allow others to use your version of this file under
* the MPL, indicate your decision by deleting the provisions above and
* replace them with the notice and other provisions required by the LGPL.
* If you do not delete the provisions above, a recipient may use your version
* of this file under either the MPL or the GNU LIBRARY GENERAL PUBLIC LICENSE.
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the MPL as stated above or under the terms of the GNU
* Library General Public License as published by the Free Software Foundation;
* either version 2 of the License, or any later version.
*
* This library is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more
* details.
*
* If you didn't download this code from the following link, you should check if
* you aren't using an obsolete version:
* http://www.lowagie.com/iText/
*
* The code to recognize the encoding in this class and in the convenience class IanaEncodings was taken from Apache Xerces published under the following license:
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Part of this code is based on the Quick-and-Dirty XML parser by Steven Brandt.
* The code for the Quick-and-Dirty parser was published in JavaWorld (java tip 128).
* Steven Brandt and JavaWorld gave permission to use the code for free.
* (Bruno Lowagie and Paulo Soares chose to use it under the MPL/LGPL in
* conformance with the rest of the code).
* The original code can be found on this url: <A HREF="http://www.javaworld.com/javatips/jw-javatip128_p.html">http://www.javaworld.com/javatips/jw-javatip128_p.html</A>.
* It was substantially refactored by Bruno Lowagie.
*
* The method 'private static String getEncodingName(byte[] b4)' was found
* in org.apache.xerces.impl.XMLEntityManager, originally published by the
* Apache Software Foundation under the Apache Software License; now being
* used in iText under the MPL.
*/
package com.lowagie.text.xml.simpleparser;
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.HashMap;
import java.util.Stack;
/**
* A simple XML and HTML parser. This parser is, like the SAX parser,
* an event based parser, but with much less functionality.
* <p>
* The parser offers the following features:
* <p>
* <ul>
* <li>It recognizes the encoding used
* <li>It recognizes all the elements' start tags and end tags
* <li>It lists attributes, where attribute values can be enclosed in single or double quotes
* <li>It recognizes the <code>&lt;![CDATA[ ... ]]&gt;</code> construct
* <li>It recognizes the standard entities: &amp;, &lt;, &gt;, &quot;, and &apos;, as well as numeric entities
* <li>It maps lines ending in <code>\r\n</code> and <code>\r</code> to <code>\n</code> on input, in accordance with the XML Specification, Section 2.11
* </ul>
* <p>
*/
public class SimpleXMLParser {
/**
* Parser states: the values taken by the {@code state} field.
* <p>
* Initial state when not parsing HTML: nothing has been seen yet and
* only a {@code <} is acted upon.
*/
private final static int UNKNOWN = 0;
/** Between tags: characters are accumulated as text. Initial state when parsing HTML. */
private final static int TEXT = 1;
/** A {@code <} was just read; the next character decides what kind of markup follows. */
private final static int TAG_ENCOUNTERED = 2;
/** Reading a tag name; it may still turn out to be a comment, a CDATA section or a DOCTYPE. */
private final static int EXAMIN_TAG = 3;
/** The tag name is known; expecting attributes, a {@code /} or the closing {@code >}. */
private final static int TAG_EXAMINED = 4;
/** Inside a closing tag such as {@code </foo>}. */
private final static int IN_CLOSETAG = 5;
/** A {@code /} was read inside a start tag; the next character must be {@code >}. */
private final static int SINGLE_TAG = 6;
/** Inside a CDATA section, looking for the terminating {@code ]]>}. */
private final static int CDATA = 7;
/** Inside a comment, looking for the terminating {@code -->}. */
private final static int COMMENT = 8;
/** Inside {@code <? ... ?>} or {@code <!DOCTYPE ... >}; input is skipped up to the next {@code >}. */
private final static int PI = 9;
/** Inside an entity reference, i.e. between {@code &} and {@code ;}. */
private final static int ENTITY = 10;
/** NOTE(review): handled outside this excerpt; presumably inside a quoted attribute value - confirm. */
private final static int QUOTE = 11;
/** Entered from {@link #TAG_EXAMINED} on the first non-whitespace character; presumably reading an attribute name. */
private final static int ATTRIBUTE_KEY = 12;
/** NOTE(review): handled outside this excerpt; presumably expecting the {@code =} after an attribute name - confirm. */
private final static int ATTRIBUTE_EQUAL = 13;
/** NOTE(review): handled outside this excerpt; presumably expecting the attribute value - confirm. */
private final static int ATTRIBUTE_VALUE = 14;
/** The state stack (presumably filled by saveState and emptied by restoreState - confirm). */
protected Stack stack;
/** The current character, as returned by {@code Reader.read()}; -1 signals the end of the input. */
protected int character = 0;
/** A character that must be examined again on the next iteration, or -1 if none is pending. */
protected int previousCharacter = -1;
/** The number of the line we are currently reading; starts at 1 and is incremented at every line break. */
protected int lines = 1;
/** The column where the current character occurs; reset to 0 at every line break. */
protected int columns = 0;
/** True when the last character was a carriage return, so that a line feed following it directly is skipped. */
protected boolean eol = false;
/** The current state; one of the state constants of this class. */
protected int state;
/**
* Are we parsing HTML? In HTML mode reaching the end of the input ends the
* document normally; otherwise it is reported as a missing end tag.
*/
protected boolean html;
/**
* Current text (whatever is encountered between tags). The same buffer also
* collects the characters of a tag name while a tag is being examined.
*/
protected StringBuffer text = new StringBuffer();
/** Current entity (whatever is encountered between & and ;). */
protected StringBuffer entity = new StringBuffer();
/** The name of the tag currently being processed. */
protected String tag = null;
/** The attributes of the tag currently being processed. */
protected HashMap attributes = null;
/** The handler to which document content is forwarded; it receives startDocument() and endDocument(). */
protected SimpleXMLDocHandler doc;
/** The handler to which comments are forwarded. */
protected SimpleXMLDocHandlerComment comment;
/**
* Keeps track of the number of tags that are open. When not parsing HTML,
* parsing stops as soon as this is 0 after a tag has been closed.
*/
int nested = 0;
/** The quote character that was used to open the quote. */
protected int quoteCharacter = '"';
/** The key of the attribute currently being read. */
String attributekey = null;
/** The value of the attribute currently being read. */
String attributevalue = null;
/**
 * Builds a parser bound to the given handlers.
 * The caller is expected to invoke go(Reader) right after construction.
 *
 * @param doc     the handler that receives the document content
 * @param comment the handler that receives comments
 * @param html    true to parse HTML, false to parse XML
 */
private SimpleXMLParser(SimpleXMLDocHandler doc, SimpleXMLDocHandlerComment comment, boolean html) {
    this.stack = new Stack();
    this.html = html;
    this.comment = comment;
    this.doc = doc;
    // HTML input may begin with plain text; XML input must begin with markup.
    if (html) {
        this.state = TEXT;
    } else {
        this.state = UNKNOWN;
    }
}
/**
* Does the actual parsing. Perform this immediately
* after creating the parser object.
*/
private void go(Reader r) throws IOException {
BufferedReader reader;
if (r instanceof BufferedReader)
reader = (BufferedReader)r;
else
reader = new BufferedReader(r);
doc.startDocument();
while(true) {
// read a new character
if (previousCharacter == -1) {
character = reader.read();
}
// or re-examin the previous character
else {
character = previousCharacter;
previousCharacter = -1;
}
// the end of the file was reached
if (character == -1) {
if (html) {
if (html && state == TEXT)
flush();
doc.endDocument();
} else {
throwException("Missing end tag");
}
return;
}
// dealing with \n and \r
if (character == '\n' && eol) {
eol = false;
continue;
} else if (eol) {
eol = false;
} else if (character == '\n') {
lines++;
columns = 0;
} else if (character == '\r') {
eol = true;
character = '\n';
lines++;
columns = 0;
} else {
columns++;
}
switch(state) {
// we are in an unknown state before there's actual content
case UNKNOWN:
if(character == '<') {
saveState(TEXT);
state = TAG_ENCOUNTERED;
}
break;
// we can encounter any content
case TEXT:
if(character == '<') {
flush();
saveState(state);
state = TAG_ENCOUNTERED;
} else if(character == '&') {
saveState(state);
entity.setLength(0);
state = ENTITY;
} else
text.append((char)character);
break;
// we have just seen a < and are wondering what we are looking at
// <foo>, </foo>, <!-- ... --->, etc.
case TAG_ENCOUNTERED:
initTag();
if(character == '/') {
state = IN_CLOSETAG;
} else if (character == '?') {
restoreState();
state = PI;
} else {
text.append((char)character);
state = EXAMIN_TAG;
}
break;
// we are processing something like this <foo ... >.
// It could still be a <!-- ... --> or something.
case EXAMIN_TAG:
if(character == '>') {
doTag();
processTag(true);
initTag();
state = restoreState();
} else if(character == '/') {
state = SINGLE_TAG;
} else if(character == '-' && text.toString().equals("!-")) {
flush();
state = COMMENT;
} else if(character == '[' && text.toString().equals("![CDATA")) {
flush();
state = CDATA;
} else if(character == 'E' && text.toString().equals("!DOCTYP")) {
flush();
state = PI;
} else if(Character.isWhitespace((char)character)) {
doTag();
state = TAG_EXAMINED;
} else {
text.append((char)character);
}
break;
// we know the name of the tag now.
case TAG_EXAMINED:
if(character == '>') {
processTag(true);
initTag();
state = restoreState();
} else if(character == '/') {
state = SINGLE_TAG;
} else if(Character.isWhitespace((char)character)) {
// empty
} else {
text.append((char)character);
state = ATTRIBUTE_KEY;
}
break;
// we are processing a closing tag: e.g. </foo>
case IN_CLOSETAG:
if(character == '>') {
doTag();
processTag(false);
if(!html && nested==0) return;
state = restoreState();
} else {
if (!Character.isWhitespace((char)character))
text.append((char)character);
}
break;
// we have just seen something like this: <foo a="b"/
// and are looking for the final >.
case SINGLE_TAG:
if(character != '>')
throwException("Expected > for tag: <"+tag+"/>");
doTag();
processTag(true);
processTag(false);
initTag();
if(!html && nested==0) {
doc.endDocument();
return;
}
state = restoreState();
break;
// we are processing CDATA
case CDATA:
if(character == '>'
&& text.toString().endsWith("]]")) {
text.setLength(text.length()-2);
flush();
state = restoreState();
} else
text.append((char)character);
break;
// we are processing a comment. We are inside
// the <!-- .... --> looking for the -->.
case COMMENT:
if(character == '>'
&& text.toString().endsWith("--")) {
text.setLength(text.length() - 2);
flush();
state = restoreState();
} else
text.append((char)character);
break;
// We are inside one of these <? ... ?> or one of these <!DOCTYPE ... >
case PI:
if(character == '>') {
state = restoreState();
if(state == TEXT) state = UNKNOWN;
}
break;
// we are processing an entity, e.g. &lt;, &raquo;, etc.
case ENTITY:
if(character == ';') {
state = restoreState();
String cent = entity.toString();
entity.setLength(0);
char ce = EntitiesToUnicode.decodeEntity(cent);
if (ce == '\0')
text.append('&').append(cent).append(';');
else
text.append(ce);
} else if ((character != '#' && (character < '0' || character > '9') && (character < 'a' || character > 'z')
&& (character < 'A' || character > 'Z')) || entity.length() >= 7) {
state = restoreState();
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -