📄 simplexmlparser.java
字号:
previousCharacter = character; text.append('&').append(entity.toString()); entity.setLength(0); } else { entity.append((char)character); } break; // We are processing the quoted right-hand side of an element's attribute. case QUOTE: if (html && quoteCharacter == ' ' && character == '>') { flush(); processTag(true); initTag(); state = restoreState(); } else if (html && quoteCharacter == ' ' && Character.isWhitespace((char)character)) { flush(); state = TAG_EXAMINED; } else if (html && quoteCharacter == ' ') { text.append((char)character); } else if(character == quoteCharacter) { flush(); state = TAG_EXAMINED; } else if(" \r\n\u0009".indexOf(character)>=0) { text.append(' '); } else if(character == '&') { saveState(state); state = ENTITY; entity.setLength(0); } else { text.append((char)character); } break; case ATTRIBUTE_KEY: if(Character.isWhitespace((char)character)) { flush(); state = ATTRIBUTE_EQUAL; } else if(character == '=') { flush(); state = ATTRIBUTE_VALUE; } else if (html && character == '>') { text.setLength(0); processTag(true); initTag(); state = restoreState(); } else { text.append((char)character); } break; case ATTRIBUTE_EQUAL: if(character == '=') { state = ATTRIBUTE_VALUE; } else if(Character.isWhitespace((char)character)) { // empty } else if (html && character == '>') { text.setLength(0); processTag(true); initTag(); state = restoreState(); } else if (html && character == '/') { flush(); state = SINGLE_TAG; } else if (html) { flush(); text.append((char)character); state = ATTRIBUTE_KEY; } else { throwException("Error in attribute processing."); } break; case ATTRIBUTE_VALUE: if(character == '"' || character == '\'') { quoteCharacter = character; state = QUOTE; } else if(Character.isWhitespace((char)character)) { // empty } else if (html && character == '>') { flush(); processTag(true); initTag(); state = restoreState(); } else if (html) { text.append((char)character); quoteCharacter = ' '; state = QUOTE; } else { throwException("Error in 
attribute processing"); } break; } } }

/**
 * Pops the most recently saved parser state.
 * @return the state on top of the stack, or <CODE>UNKNOWN</CODE> when the stack is empty
 */
private int restoreState() {
    if (!stack.empty())
        return ((Integer)stack.pop()).intValue();
    else
        return UNKNOWN;
}

/**
 * Pushes a parser state so that it can be resumed later with {@link #restoreState()}.
 * @param s a state to add to the stack
 */
private void saveState(int s) {
    stack.push(new Integer(s));
}

/**
 * Flushes the text that is currently in the buffer.
 * What happens to the text depends on the current state:
 * <ul>
 * <li>TEXT / CDATA: non-empty text is passed to the document handler;</li>
 * <li>COMMENT: the text is passed to the comment handler, if one was supplied;</li>
 * <li>ATTRIBUTE_KEY: the text becomes the pending attribute name
 *     (lower-cased in HTML mode);</li>
 * <li>QUOTE / ATTRIBUTE_VALUE: the text becomes the value of the pending
 *     attribute and the pair is stored in the attribute map;</li>
 * <li>any other state: the text is discarded.</li>
 * </ul>
 * The buffer is always emptied afterwards.
 */
private void flush() {
    switch (state) {
    case TEXT:
    case CDATA:
        if (text.length() > 0) {
            doc.text(text.toString());
        }
        break;
    case COMMENT:
        if (comment != null) {
            comment.comment(text.toString());
        }
        break;
    case ATTRIBUTE_KEY:
        attributekey = text.toString();
        // Fixed locale: the default locale may map 'I' to a dotless i (e.g. Turkish).
        if (html)
            attributekey = attributekey.toLowerCase(java.util.Locale.ENGLISH);
        break;
    case QUOTE:
    case ATTRIBUTE_VALUE:
        attributevalue = text.toString();
        attributes.put(attributekey, attributevalue);
        break;
    default:
        // do nothing
    }
    text.setLength(0);
}

/**
 * Resets the tag name and starts a fresh, empty attribute map.
 */
private void initTag() {
    tag = null;
    attributes = new HashMap();
}

/**
 * Sets the name of the tag from the buffered text (unless a name is
 * already set), lower-cases it in HTML mode and empties the buffer.
 */
private void doTag() {
    if (tag == null)
        tag = text.toString();
    // Fixed locale for the same reason as attribute names in flush().
    if (html)
        tag = tag.toLowerCase(java.util.Locale.ENGLISH);
    text.setLength(0);
}

/**
 * Reports the current tag to the document handler and keeps the nesting
 * counter in step.
 * @param start if true we are dealing with a tag that has just been opened;
 *              if false we are closing a tag.
 */
private void processTag(boolean start) {
    if (start) {
        nested++;
        doc.startElement(tag, attributes);
    }
    else {
        nested--;
        doc.endElement(tag);
    }
}

/**
 * Throws an exception whose message is suffixed with the current line and column.
 * @param s the description of the problem
 * @throws IOException always
 */
private void throwException(String s) throws IOException {
    throw new IOException(s + " near line " + lines + ", column " + columns);
}
The reader is not closed * @throws IOException on error */ public static void parse(SimpleXMLDocHandler doc, SimpleXMLDocHandlerComment comment, Reader r, boolean html) throws IOException { SimpleXMLParser parser = new SimpleXMLParser(doc, comment, html); parser.go(r); } /** * Parses the XML document firing the events to the handler. * @param doc the document handler * @param in the document. The encoding is deduced from the stream. The stream is not closed * @throws IOException on error */ public static void parse(SimpleXMLDocHandler doc, InputStream in) throws IOException { byte b4[] = new byte[4]; int count = in.read(b4); if (count != 4) throw new IOException("Insufficient length."); String encoding = getEncodingName(b4); String decl = null; if (encoding.equals("UTF-8")) { StringBuffer sb = new StringBuffer(); int c; while ((c = in.read()) != -1) { if (c == '>') break; sb.append((char)c); } decl = sb.toString(); } else if (encoding.equals("CP037")) { ByteArrayOutputStream bi = new ByteArrayOutputStream(); int c; while ((c = in.read()) != -1) { if (c == 0x6e) // that's '>' in ebcdic break; bi.write(c); } decl = new String(bi.toByteArray(), "CP037"); } if (decl != null) { decl = getDeclaredEncoding(decl); if (decl != null) encoding = decl; } parse(doc, new InputStreamReader(in, IanaEncodings.getJavaEncoding(encoding))); } private static String getDeclaredEncoding(String decl) { if (decl == null) return null; int idx = decl.indexOf("encoding"); if (idx < 0) return null; int idx1 = decl.indexOf('"', idx); int idx2 = decl.indexOf('\'', idx); if (idx1 == idx2) return null; if ((idx1 < 0 && idx2 > 0) || (idx2 > 0 && idx2 < idx1)) { int idx3 = decl.indexOf('\'', idx2 + 1); if (idx3 < 0) return null; return decl.substring(idx2 + 1, idx3); } if ((idx2 < 0 && idx1 > 0) || (idx1 > 0 && idx1 < idx2)) { int idx3 = decl.indexOf('"', idx1 + 1); if (idx3 < 0) return null; return decl.substring(idx1 + 1, idx3); } return null; } public static void parse(SimpleXMLDocHandler 
doc,Reader r) throws IOException { parse(doc, null, r, false); } /** * Escapes a string with the appropriated XML codes. * @param s the string to be escaped * @param onlyASCII codes above 127 will always be escaped with &#nn; if <CODE>true</CODE> * @return the escaped string */ public static String escapeXML(String s, boolean onlyASCII) { char cc[] = s.toCharArray(); int len = cc.length; StringBuffer sb = new StringBuffer(); for (int k = 0; k < len; ++k) { int c = cc[k]; switch (c) { case '<': sb.append("<"); break; case '>': sb.append(">"); break; case '&': sb.append("&"); break; case '"': sb.append("""); break; case '\'': sb.append("'"); break; default: if (onlyASCII && c > 127) sb.append("&#").append(c).append(';'); else sb.append((char)c); } } return sb.toString(); } /** * Returns the IANA encoding name that is auto-detected from * the bytes specified, with the endian-ness of that encoding where appropriate. * (method found in org.apache.xerces.impl.XMLEntityManager, originally published * by the Apache Software Foundation under the Apache Software License; now being * used in iText under the MPL) * @param b4 The first four bytes of the input. * @return an IANA-encoding string */ private static String getEncodingName(byte[] b4) { // UTF-16, with BOM int b0 = b4[0] & 0xFF; int b1 = b4[1] & 0xFF; if (b0 == 0xFE && b1 == 0xFF) { // UTF-16, big-endian return "UTF-16BE"; } if (b0 == 0xFF && b1 == 0xFE) { // UTF-16, little-endian return "UTF-16LE"; } // UTF-8 with a BOM int b2 = b4[2] & 0xFF; if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) { return "UTF-8"; } // other encodings int b3 = b4[3] & 0xFF; if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) { // UCS-4, big endian (1234) return "ISO-10646-UCS-4"; } if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) { // UCS-4, little endian (4321) return "ISO-10646-UCS-4"; } if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) { // UCS-4, unusual octet order (2143) // REVISIT: What should this be? 
return "ISO-10646-UCS-4"; } if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) { // UCS-4, unusual octet order (3412) // REVISIT: What should this be? return "ISO-10646-UCS-4"; } if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) { // UTF-16, big-endian, no BOM // (or could turn out to be UCS-2... // REVISIT: What should this be? return "UTF-16BE"; } if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) { // UTF-16, little-endian, no BOM // (or could turn out to be UCS-2... return "UTF-16LE"; } if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) { // EBCDIC // a la xerces1, return CP037 instead of EBCDIC here return "CP037"; } // default encoding return "UTF-8"; }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -