📄 simplexmlparser.java
字号:
previousCharacter = character;
text.append('&').append(entity.toString());
entity.setLength(0);
}
else {
entity.append((char)character);
}
break;
// We are processing the quoted right-hand side of an element's attribute.
case QUOTE:
if (html && quoteCharacter == ' ' && character == '>') {
flush();
processTag(true);
initTag();
state = restoreState();
}
else if (html && quoteCharacter == ' ' && Character.isWhitespace((char)character)) {
flush();
state = TAG_EXAMINED;
}
else if (html && quoteCharacter == ' ') {
text.append((char)character);
}
else if(character == quoteCharacter) {
flush();
state = TAG_EXAMINED;
} else if(" \r\n\u0009".indexOf(character)>=0) {
text.append(' ');
} else if(character == '&') {
saveState(state);
state = ENTITY;
entity.setLength(0);
} else {
text.append((char)character);
}
break;
case ATTRIBUTE_KEY:
if(Character.isWhitespace((char)character)) {
flush();
state = ATTRIBUTE_EQUAL;
} else if(character == '=') {
flush();
state = ATTRIBUTE_VALUE;
} else if (html && character == '>') {
text.setLength(0);
processTag(true);
initTag();
state = restoreState();
} else {
text.append((char)character);
}
break;
case ATTRIBUTE_EQUAL:
if(character == '=') {
state = ATTRIBUTE_VALUE;
} else if(Character.isWhitespace((char)character)) {
// empty
} else if (html && character == '>') {
text.setLength(0);
processTag(true);
initTag();
state = restoreState();
} else if (html && character == '/') {
flush();
state = SINGLE_TAG;
} else if (html) {
flush();
text.append((char)character);
state = ATTRIBUTE_KEY;
} else {
throwException("Error in attribute processing.");
}
break;
case ATTRIBUTE_VALUE:
if(character == '"' || character == '\'') {
quoteCharacter = character;
state = QUOTE;
} else if(Character.isWhitespace((char)character)) {
// empty
} else if (html && character == '>') {
flush();
processTag(true);
initTag();
state = restoreState();
} else if (html) {
text.append((char)character);
quoteCharacter = ' ';
state = QUOTE;
} else {
throwException("Error in attribute processing");
}
break;
}
}
}
/**
 * Pops the most recently saved parser state off the state stack.
 * @return the state that was on top of the stack, or <CODE>UNKNOWN</CODE>
 *         when the stack holds no saved state
 */
private int restoreState() {
    // Nothing saved: fall back to the UNKNOWN state instead of failing on pop().
    if (stack.empty()) {
        return UNKNOWN;
    }
    Integer saved = (Integer) stack.pop();
    return saved.intValue();
}
/**
 * Pushes a state onto the state stack so it can later be retrieved
 * with {@link #restoreState()}.
 * @param s a state to add to the stack
 */
private void saveState(int s) {
    // Integer.valueOf replaces the deprecated Integer(int) constructor; only the
    // numeric value is read back (via intValue()), so object identity is irrelevant.
    stack.push(Integer.valueOf(s));
}
/**
 * Flushes the text that is currently in the buffer.
 * What happens to the text depends on the current state:
 * <ul>
 * <li>TEXT / CDATA: non-empty text is passed to the document handler as content;</li>
 * <li>COMMENT: the text is passed to the comment handler, if one is set;</li>
 * <li>ATTRIBUTE_KEY: the text becomes the pending attribute name
 *     (lower-cased when parsing html);</li>
 * <li>QUOTE / ATTRIBUTE_VALUE: the text becomes the value stored under the
 *     pending attribute name;</li>
 * <li>any other state: the text is discarded.</li>
 * </ul>
 * The buffer is always emptied afterwards.
 */
private void flush() {
    switch (state) {
        case TEXT:
        case CDATA:
            if (text.length() > 0) {
                doc.text(text.toString());
            }
            break;
        case COMMENT:
            if (comment != null) {
                comment.comment(text.toString());
            }
            break;
        case ATTRIBUTE_KEY:
            attributekey = text.toString();
            if (html) {
                // Use a fixed locale: the default-locale overload would turn
                // "TITLE" into "t\u0131tle" (dotless i) under e.g. a Turkish locale.
                attributekey = attributekey.toLowerCase(java.util.Locale.ENGLISH);
            }
            break;
        case QUOTE:
        case ATTRIBUTE_VALUE:
            attributevalue = text.toString();
            attributes.put(attributekey, attributevalue);
            break;
        default:
            // do nothing
            break;
    }
    text.setLength(0);
}
/**
 * Resets the per-tag state: installs a fresh, empty attribute map
 * and clears the tag name.
 */
private void initTag() {
    attributes = new HashMap();
    tag = null;
}
/**
 * Sets the name of the tag.
 * If no tag name is set yet, the current buffer content becomes the name.
 * When parsing html the name is lower-cased. The buffer is always emptied.
 */
private void doTag() {
    if (tag == null) {
        tag = text.toString();
    }
    if (html) {
        // Use a fixed locale: the default-locale overload would map 'I' to a
        // dotless i under e.g. a Turkish locale and break tag-name matching.
        tag = tag.toLowerCase(java.util.Locale.ENGLISH);
    }
    text.setLength(0);
}
/**
 * Reports the current tag to the document handler and keeps the nesting
 * counter in step.
 * @param start <CODE>true</CODE> when the tag has just been opened (the counter is
 *              incremented and the start of the element is reported together with
 *              its attributes); <CODE>false</CODE> when the tag is being closed
 *              (the counter is decremented and the end of the element is reported)
 */
private void processTag(boolean start) {
    if (!start) {
        nested--;
        doc.endElement(tag);
        return;
    }
    nested++;
    doc.startElement(tag, attributes);
}
/**
 * Always throws an <CODE>IOException</CODE> whose message is the given text
 * followed by the current line and column counters.
 * @param s the description of the problem
 * @throws IOException always
 */
private void throwException(String s) throws IOException {
    String position = " near line " + lines + ", column " + columns;
    throw new IOException(s + position);
}
/**
 * Parses the XML document firing the events to the handler.
 * @param doc the document handler
 * @param comment the handler that receives comments; it is handed to the parser as given
 * @param r the document. The encoding is already resolved. The reader is not closed
 * @param html the html flag handed to the parser; elsewhere in this class it
 *             switches on lower-casing of tag and attribute names
 * @throws IOException on error
 */
public static void parse(SimpleXMLDocHandler doc, SimpleXMLDocHandlerComment comment, Reader r, boolean html) throws IOException {
    new SimpleXMLParser(doc, comment, html).go(r);
}
/**
 * Parses the XML document firing the events to the handler.
 * <p>
 * The first four bytes are used to guess the encoding. For a UTF-8 or CP037
 * guess, the bytes up to and including the first '&gt;' are then read as the
 * declaration and searched for an <CODE>encoding</CODE> pseudo-attribute, which
 * overrides the guess when present. The bytes consumed by these steps are not
 * handed to the parser; parsing starts with whatever follows them in the stream.
 * @param doc the document handler
 * @param in the document. The encoding is deduced from the stream. The stream is not closed
 * @throws IOException on error, including a stream shorter than four bytes
 */
public static void parse(SimpleXMLDocHandler doc, InputStream in) throws IOException {
    byte b4[] = new byte[4];
    // A single read() may return fewer bytes than requested even though more
    // are still to come (sockets, pipes, inflating streams), so keep reading
    // until the four bytes are in or the stream really ends.
    int count = 0;
    while (count < b4.length) {
        int n = in.read(b4, count, b4.length - count);
        if (n < 0)
            break;
        count += n;
    }
    if (count != 4)
        throw new IOException("Insufficient length.");
    String encoding = getEncodingName(b4);
    String decl = null;
    if (encoding.equals("UTF-8")) {
        // Collect the rest of the declaration byte by byte, up to the closing '>'.
        StringBuilder sb = new StringBuilder();
        int c;
        while ((c = in.read()) != -1) {
            if (c == '>')
                break;
            sb.append((char)c);
        }
        decl = sb.toString();
    }
    else if (encoding.equals("CP037")) {
        ByteArrayOutputStream bi = new ByteArrayOutputStream();
        int c;
        while ((c = in.read()) != -1) {
            if (c == 0x6e) // that's '>' in ebcdic
                break;
            bi.write(c);
        }
        decl = new String(bi.toByteArray(), "CP037");
    }
    if (decl != null) {
        // An explicitly declared encoding wins over the auto-detected one.
        decl = getDeclaredEncoding(decl);
        if (decl != null)
            encoding = decl;
    }
    parse(doc, new InputStreamReader(in, IanaEncodings.getJavaEncoding(encoding)));
}
/**
 * Extracts the value of the <CODE>encoding</CODE> pseudo-attribute from
 * declaration text.
 * The value is the text between the first quote character (single or double)
 * found after the word <CODE>encoding</CODE> and the next occurrence of that
 * same quote character.
 * @param decl the declaration text, may be <CODE>null</CODE>
 * @return the quoted value, or <CODE>null</CODE> when <CODE>decl</CODE> is
 *         <CODE>null</CODE>, the word <CODE>encoding</CODE> is absent, or no
 *         complete quoted value follows it
 */
private static String getDeclaredEncoding(String decl) {
    if (decl == null) {
        return null;
    }
    final int keyword = decl.indexOf("encoding");
    if (keyword < 0) {
        return null;
    }
    final int doubleQuote = decl.indexOf('"', keyword);
    final int singleQuote = decl.indexOf('\'', keyword);
    if (doubleQuote < 0 && singleQuote < 0) {
        return null;
    }
    // Whichever quote character shows up first opens the value.
    final boolean singleOpens = doubleQuote < 0 || (singleQuote >= 0 && singleQuote < doubleQuote);
    final char quote = singleOpens ? '\'' : '"';
    final int open = singleOpens ? singleQuote : doubleQuote;
    final int close = decl.indexOf(quote, open + 1);
    return close < 0 ? null : decl.substring(open + 1, close);
}
/**
 * Parses the XML document firing the events to the handler.
 * Delegates to the four-argument overload with no comment handler and the
 * html flag switched off.
 * @param doc the document handler
 * @param r the document
 * @throws IOException on error
 */
public static void parse(SimpleXMLDocHandler doc,Reader r) throws IOException {
    final SimpleXMLDocHandlerComment noCommentHandler = null;
    parse(doc, noCommentHandler, r, false);
}
/**
* Escapes a string with the appropriated XML codes.
* @param s the string to be escaped
* @param onlyASCII codes above 127 will always be escaped with &#nn; if <CODE>true</CODE>
* @return the escaped string
*/
public static String escapeXML(String s, boolean onlyASCII) {
char cc[] = s.toCharArray();
int len = cc.length;
StringBuffer sb = new StringBuffer();
for (int k = 0; k < len; ++k) {
int c = cc[k];
switch (c) {
case '<':
sb.append("<");
break;
case '>':
sb.append(">");
break;
case '&':
sb.append("&");
break;
case '"':
sb.append(""");
break;
case '\'':
sb.append("'");
break;
default:
if (onlyASCII && c > 127)
sb.append("&#").append(c).append(';');
else
sb.append((char)c);
}
}
return sb.toString();
}
/**
 * Returns the IANA encoding name that is auto-detected from
 * the bytes specified, with the endian-ness of that encoding where appropriate.
 * (method found in org.apache.xerces.impl.XMLEntityManager, originally published
 * by the Apache Software Foundation under the Apache Software License; now being
 * used in iText under the MPL)
 * <p>
 * Byte order marks are checked first (two bytes for UTF-16, three for UTF-8);
 * after that the four bytes are compared against fixed signatures. Anything
 * unrecognised is reported as UTF-8.
 * @param b4 The first four bytes of the input.
 * @return an IANA-encoding string
 */
private static String getEncodingName(byte[] b4) {
    // Bytes are fetched only when a check needs them, in index order.
    final int first = b4[0] & 0xFF;
    final int second = b4[1] & 0xFF;
    final int lead2 = (first << 8) | second;
    if (lead2 == 0xFEFF) {
        // UTF-16 BOM, big-endian
        return "UTF-16BE";
    }
    if (lead2 == 0xFFFE) {
        // UTF-16 BOM, little-endian
        return "UTF-16LE";
    }
    final int third = b4[2] & 0xFF;
    if (((lead2 << 8) | third) == 0xEFBBBF) {
        // UTF-8 BOM
        return "UTF-8";
    }
    final int fourth = b4[3] & 0xFF;
    // All four bytes packed big-endian into one value: b0 b1 b2 b3.
    final int signature = (lead2 << 16) | (third << 8) | fourth;
    switch (signature) {
        case 0x0000003C: // UCS-4, big endian (1234)
        case 0x3C000000: // UCS-4, little endian (4321)
        case 0x00003C00: // UCS-4, unusual octet order (2143) - REVISIT: What should this be?
        case 0x003C0000: // UCS-4, unusual octet order (3412) - REVISIT: What should this be?
            return "ISO-10646-UCS-4";
        case 0x003C003F:
            // UTF-16, big-endian, no BOM (or could turn out to be UCS-2...)
            // REVISIT: What should this be?
            return "UTF-16BE";
        case 0x3C003F00:
            // UTF-16, little-endian, no BOM (or could turn out to be UCS-2...)
            return "UTF-16LE";
        case 0x4C6FA794:
            // EBCDIC; a la xerces1, return CP037 instead of EBCDIC here
            return "CP037";
        default:
            // default encoding
            return "UTF-8";
    }
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -