⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 simplexmlparser.java

📁 处理PDF
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
                    previousCharacter = character;                    text.append('&').append(entity.toString());                    entity.setLength(0);                }                else {                    entity.append((char)character);                }                break;            // We are processing the quoted right-hand side of an element's attribute.			case QUOTE:                if (html && quoteCharacter == ' ' && character == '>') {                    flush();                    processTag(true);                    initTag();                    state = restoreState();                }                else if (html && quoteCharacter == ' ' && Character.isWhitespace((char)character)) {                	flush();                    state = TAG_EXAMINED;                }                else if (html && quoteCharacter == ' ') {                    text.append((char)character);                }                else if(character == quoteCharacter) {                	flush();                    state = TAG_EXAMINED;                } else if(" \r\n\u0009".indexOf(character)>=0) {                    text.append(' ');                } else if(character == '&') {                    saveState(state);                    state = ENTITY;                    entity.setLength(0);                } else {                    text.append((char)character);                }                break;                			case ATTRIBUTE_KEY:                if(Character.isWhitespace((char)character)) {                    flush();                    state = ATTRIBUTE_EQUAL;                } else if(character == '=') {                	flush();                    state = ATTRIBUTE_VALUE;                } else if (html && character == '>') {                    text.setLength(0);                    processTag(true);                    initTag();                    state = restoreState();                } else {                    text.append((char)character);                }                break;                			case ATTRIBUTE_EQUAL:                if(character == '=') {                    state = ATTRIBUTE_VALUE;                } else if(Character.isWhitespace((char)character)) {                    // empty                } else if (html && character == '>') {                    text.setLength(0);                    processTag(true);                    initTag();                    state = restoreState();                } else if (html && character == '/') {                    flush();                    state = SINGLE_TAG;                } else if (html) {                    flush();                    text.append((char)character);                    state = ATTRIBUTE_KEY;                } else {                    throwException("Error in attribute processing.");                }                break;                			case ATTRIBUTE_VALUE:                if(character == '"' || character == '\'') {                    quoteCharacter = character;                    state = QUOTE;                } else if(Character.isWhitespace((char)character)) {                    // empty                } else if (html && character == '>') {                    flush();                    processTag(true);                    initTag();                    state = restoreState();                } else if (html) {                    text.append((char)character);                    quoteCharacter = ' ';                    state = QUOTE;                } else {                    throwException("Error in attribute processing");                }                break;            }        }    }    /**     * Gets a state from the stack     * @return the previous state     */    private int restoreState() {        if(!stack.empty())            return ((Integer)stack.pop()).intValue();        else            return UNKNOWN;    }    /**     * Adds a state to the stack.     * @param	s	a state to add to the stack     */    private void saveState(int s) {    	stack.push(new Integer(s));    }    /**     * Flushes the text that is currently in the buffer.     * The text can be ignored, added to the document     * as content or as comment,... depending on the current state.     */    private void flush() {    	switch(state){    	case TEXT:    	case CDATA:            if(text.length() > 0) {                doc.text(text.toString());            }            break;    	case COMMENT:        	if (comment != null) {                comment.comment(text.toString());            }        	break;    	case ATTRIBUTE_KEY:            attributekey = text.toString();            if (html)                attributekey = attributekey.toLowerCase();    		break;    	case QUOTE:    	case ATTRIBUTE_VALUE:        	attributevalue = text.toString();            attributes.put(attributekey,attributevalue);            break;    	default:    		// do nothing    	}        text.setLength(0);    }    /**     * Initialized the tag name and attributes.     */    private void initTag() {        tag = null;        attributes = new HashMap();    }    /** Sets the name of the tag. */    private void doTag() {    	if(tag == null)    		tag = text.toString();    	if (html)    		tag = tag.toLowerCase();    	text.setLength(0);    }    /**     * processes the tag.     * @param start	if true we are dealing with a tag that has just been opened; if false we are closing a tag.     */    private void processTag(boolean start) {    	if (start) {    		nested++;    		doc.startElement(tag,attributes);    	}    	else {            nested--;            doc.endElement(tag);    	}    }    /** Throws an exception */    private void throwException(String s) throws IOException {        throw new IOException(s+" near line " + lines + ", column " + columns);    }        /**     * Parses the XML document firing the events to the handler.     * @param doc the document handler     * @param r the document. The encoding is already resolved. The reader is not closed     * @throws IOException on error     */    public static void parse(SimpleXMLDocHandler doc, SimpleXMLDocHandlerComment comment, Reader r, boolean html) throws IOException {    	SimpleXMLParser parser = new SimpleXMLParser(doc, comment, html);    	parser.go(r);    }        /**     * Parses the XML document firing the events to the handler.     * @param doc the document handler     * @param in the document. The encoding is deduced from the stream. The stream is not closed     * @throws IOException on error     */        public static void parse(SimpleXMLDocHandler doc, InputStream in) throws IOException {        byte b4[] = new byte[4];        int count = in.read(b4);        if (count != 4)            throw new IOException("Insufficient length.");        String encoding = getEncodingName(b4);        String decl = null;        if (encoding.equals("UTF-8")) {            StringBuffer sb = new StringBuffer();            int c;            while ((c = in.read()) != -1) {                if (c == '>')                    break;                sb.append((char)c);            }            decl = sb.toString();        }        else if (encoding.equals("CP037")) {            ByteArrayOutputStream bi = new ByteArrayOutputStream();            int c;            while ((c = in.read()) != -1) {                if (c == 0x6e) // that's '>' in ebcdic                    break;                bi.write(c);            }            decl = new String(bi.toByteArray(), "CP037");        }        if (decl != null) {            decl = getDeclaredEncoding(decl);            if (decl != null)                encoding = decl;        }        parse(doc, new InputStreamReader(in, IanaEncodings.getJavaEncoding(encoding)));    }        private static String getDeclaredEncoding(String decl) {        if (decl == null)            return null;        int idx = decl.indexOf("encoding");        if (idx < 0)            return null;        int idx1 = decl.indexOf('"', idx);        int idx2 = decl.indexOf('\'', idx);        if (idx1 == idx2)            return null;        if ((idx1 < 0 && idx2 > 0) || (idx2 > 0 && idx2 < idx1)) {            int idx3 = decl.indexOf('\'', idx2 + 1);            if (idx3 < 0)                return null;            return decl.substring(idx2 + 1, idx3);        }        if ((idx2 < 0 && idx1 > 0) || (idx1 > 0 && idx1 < idx2)) {            int idx3 = decl.indexOf('"', idx1 + 1);            if (idx3 < 0)                return null;            return decl.substring(idx1 + 1, idx3);        }        return null;    }        public static void parse(SimpleXMLDocHandler doc,Reader r) throws IOException {        parse(doc, null, r, false);    }        /**     * Escapes a string with the appropriated XML codes.     * @param s the string to be escaped     * @param onlyASCII codes above 127 will always be escaped with &amp;#nn; if <CODE>true</CODE>     * @return the escaped string     */        public static String escapeXML(String s, boolean onlyASCII) {        char cc[] = s.toCharArray();        int len = cc.length;        StringBuffer sb = new StringBuffer();        for (int k = 0; k < len; ++k) {            int c = cc[k];            switch (c) {                case '<':                    sb.append("&lt;");                    break;                case '>':                    sb.append("&gt;");                    break;                case '&':                    sb.append("&amp;");                    break;                case '"':                    sb.append("&quot;");                    break;                case '\'':                    sb.append("&apos;");                    break;                default:                    if (onlyASCII && c > 127)                        sb.append("&#").append(c).append(';');                    else                        sb.append((char)c);            }        }        return sb.toString();    }    /**     * Returns the IANA encoding name that is auto-detected from     * the bytes specified, with the endian-ness of that encoding where appropriate.     * (method found in org.apache.xerces.impl.XMLEntityManager, originally published     * by the Apache Software Foundation under the Apache Software License; now being     * used in iText under the MPL)     * @param b4    The first four bytes of the input.     * @return an IANA-encoding string     */    private static String getEncodingName(byte[] b4) {                // UTF-16, with BOM        int b0 = b4[0] & 0xFF;        int b1 = b4[1] & 0xFF;        if (b0 == 0xFE && b1 == 0xFF) {            // UTF-16, big-endian            return "UTF-16BE";        }        if (b0 == 0xFF && b1 == 0xFE) {            // UTF-16, little-endian            return "UTF-16LE";        }                // UTF-8 with a BOM        int b2 = b4[2] & 0xFF;        if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {            return "UTF-8";        }                // other encodings        int b3 = b4[3] & 0xFF;        if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) {            // UCS-4, big endian (1234)            return "ISO-10646-UCS-4";        }        if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) {            // UCS-4, little endian (4321)            return "ISO-10646-UCS-4";        }        if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) {            // UCS-4, unusual octet order (2143)            // REVISIT: What should this be?            return "ISO-10646-UCS-4";        }        if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) {            // UCS-4, unusual octet order (3412)            // REVISIT: What should this be?            return "ISO-10646-UCS-4";        }        if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) {            // UTF-16, big-endian, no BOM            // (or could turn out to be UCS-2...            // REVISIT: What should this be?            return "UTF-16BE";        }        if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) {            // UTF-16, little-endian, no BOM            // (or could turn out to be UCS-2...            return "UTF-16LE";        }        if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) {            // EBCDIC            // a la xerces1, return CP037 instead of EBCDIC here            return "CP037";        }                // default encoding        return "UTF-8";    }}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -