📄 xmlparser.java
字号:
////////////////////////////////////////////////////////////////////// /** * Parse an XML document. * <pre> * [1] document ::= prolog element Misc* * </pre> * <p>This is the top-level parsing function for a single XML * document. As a minimum, a well-formed document must have * a document element, and a valid document must have a prolog * (one with doctype) as well. */ private void parseDocument () throws Exception { char c; try { // added by MHK parseProlog (); require ('<'); parseElement (); } catch (EOFException ee) { // added by MHK error("premature end of file", "[EOF]", null); } try { parseMisc (); //skip all white, PIs, and comments c = readCh (); //if this doesn't throw an exception... error ("unexpected characters after document end", c, null); } catch (EOFException e) { return; } } /** * Skip a comment. * <pre> * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* "-->" * </pre> * <p> (The <code><!--</code> has already been read.) */ private void parseComment () throws Exception { char c; boolean saved = expandPE; expandPE = false; parseUntil ("--"); require ('>'); expandPE = saved; handler.comment (dataBuffer, 0, dataBufferPos); dataBufferPos = 0; } /** * Parse a processing instruction and do a call-back. * <pre> * [16] PI ::= '<?' PITarget * (S (Char* - (Char* '?>' Char*)))? * '?>' * [17] PITarget ::= Name - ( ('X'|'x') ('M'|m') ('L'|l') ) * </pre> * <p> (The <code><?</code> has already been read.) */ private void parsePI () throws SAXException, IOException { String name; boolean saved = expandPE; expandPE = false; name = readNmtoken (true); if ("xml".equalsIgnoreCase (name)) error ("Illegal processing instruction target", name, null); if (!tryRead ("?>")) { requireWhitespace (); parseUntil ("?>"); } expandPE = saved; handler.processingInstruction (name, dataBufferToString ()); } /** * Parse a CDATA section. * <pre> * [18] CDSect ::= CDStart CData CDEnd * [19] CDStart ::= '<![CDATA[' * [20] CData ::= (Char* - (Char* ']]>' Char*)) * [21] CDEnd ::= ']]>' * </pre> * <p> (The '<![CDATA[' has already been read.) */ private void parseCDSect () throws Exception { parseUntil ("]]>"); dataBufferFlush (); } /** * Parse the prolog of an XML document. * <pre> * [22] prolog ::= XMLDecl? Misc* (Doctypedecl Misc*)? * </pre> * <p>There are a couple of tricks here. First, it is necessary to * declare the XML default attributes after the DTD (if present) * has been read. [??] Second, it is not possible to expand general * references in attribute value literals until after the entire * DTD (if present) has been parsed. * <p>We do not look for the XML declaration here, because it was * handled by pushURL (). * @see #pushURL */ private void parseProlog () throws Exception { parseMisc (); if (tryRead ("<!DOCTYPE")) { parseDoctypedecl (); parseMisc (); } } /** * Parse the XML declaration. * <pre> * [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>' * [24] VersionInfo ::= S 'version' Eq * ("'" VersionNum "'" | '"' VersionNum '"' ) * [26] VersionNum ::= ([a-zA-Z0-9_.:] | '-')* * [32] SDDecl ::= S 'standalone' Eq * ( "'"" ('yes' | 'no') "'"" | '"' ("yes" | "no") '"' ) * [80] EncodingDecl ::= S 'encoding' Eq * ( "'" EncName "'" | "'" EncName "'" ) * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')* * </pre> * <p> (The <code><?xml</code> and whitespace have already been read.) * @return the encoding in the declaration, uppercased; or null * @see #parseTextDecl * @see #setupDecoding */ private String parseXMLDecl (boolean ignoreEncoding) throws SAXException, IOException { String version; String encodingName = null; String standalone = null; boolean white; int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF; // Read the version. require ("version"); parseEq (); version = readLiteral (flags); if (!version.equals ("1.0")) { error ("unsupported XML version", version, "1.0"); } // Try reading an encoding declaration. white = tryWhitespace (); if (tryRead ("encoding")) { if (!white) error ("whitespace required before 'encoding='"); parseEq (); encodingName = readLiteral (flags); if (!ignoreEncoding) setupDecoding (encodingName); } // Try reading a standalone declaration if (encodingName != null) white = tryWhitespace (); if (tryRead ("standalone")) { if (!white) error ("whitespace required before 'standalone='"); parseEq (); standalone = readLiteral (flags); if (! ("yes".equals (standalone) || "no".equals (standalone))) error ("standalone flag must be 'yes' or 'no'"); } skipWhitespace (); require ("?>"); return encodingName; } /** * Parse a text declaration. * <pre> * [79] TextDecl ::= '<?xml' VersionInfo? EncodingDecl S? '?>' * [80] EncodingDecl ::= S 'encoding' Eq * ( '"' EncName '"' | "'" EncName "'" ) * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')* * </pre> * <p> (The <code><?xml</code>' and whitespace have already been read.) * @return the encoding in the declaration, uppercased; or null * @see #parseXMLDecl * @see #setupDecoding */ private String parseTextDecl (boolean ignoreEncoding) throws SAXException, IOException { String encodingName = null; int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF; // Read an optional version. if (tryRead ("version")) { String version; parseEq (); version = readLiteral (flags); if (!version.equals ("1.0")) { error ("unsupported XML version", version, "1.0"); } requireWhitespace (); } // Read the encoding. require ("encoding"); parseEq (); encodingName = readLiteral (flags); if (!ignoreEncoding) setupDecoding (encodingName); skipWhitespace (); require ("?>"); return encodingName; } /** * Sets up internal state so that we can decode an entity using the * specified encoding. This is used when we start to read an entity * and we have been given knowledge of its encoding before we start to * read any data (e.g. from a SAX input source or from a MIME type). * * <p> It is also used after autodetection, at which point only very * limited adjustments to the encoding may be used (switching between * related builtin decoders). * * @param encodingName The name of the encoding specified by the user. * @exception IOException if the encoding isn't supported either * internally to this parser, or by the hosting JVM. * @see #parseXMLDecl * @see #parseTextDecl */ private void setupDecoding (String encodingName) throws SAXException, IOException { encodingName = encodingName.toUpperCase (); // ENCODING_EXTERNAL indicates an encoding that wasn't // autodetected ... we can use builtin decoders, or // ones from the JVM (InputStreamReader). // Otherwise we can only tweak what was autodetected, and // only for single byte (ASCII derived) builtin encodings. // ASCII-derived encodings if (encoding == ENCODING_UTF_8 || encoding == ENCODING_EXTERNAL) { if (encodingName.equals ("ISO-8859-1") || encodingName.equals ("8859_1") || encodingName.equals ("ISO8859_1") ) { encoding = ENCODING_ISO_8859_1; return; } else if (encodingName.equals ("US-ASCII") || encodingName.equals ("ASCII")) { encoding = ENCODING_ASCII; return; } else if (encodingName.equals ("UTF-8") || encodingName.equals ("UTF8")) { encoding = ENCODING_UTF_8; return; } else if (encoding != ENCODING_EXTERNAL) { // fatal error error ("unsupported ASCII-derived encoding", encodingName, "UTF-8, US-ASCII, or ISO-8859-1"); } // else fallthrough ... // it's ASCII-ish and something other than a builtin } // Unicode and such if (encoding == ENCODING_UCS_2_12 || encoding == ENCODING_UCS_2_21) { if (!(encodingName.equals ("ISO-10646-UCS-2") || encodingName.equals ("UTF-16") || encodingName.equals ("UTF-16BE") || encodingName.equals ("UTF-16LE"))) error ("unsupported Unicode encoding", encodingName, "UTF-16"); return; } // four byte encodings if (encoding == ENCODING_UCS_4_1234 || encoding == ENCODING_UCS_4_4321 || encoding == ENCODING_UCS_4_2143 || encoding == ENCODING_UCS_4_3412) { if (!encodingName.equals ("ISO-10646-UCS-4")) error ("unsupported 32-bit encoding", encodingName, "ISO-10646-UCS-4"); return; } // assert encoding == ENCODING_EXTERNAL // if (encoding != ENCODING_EXTERNAL) // throw new RuntimeException ("encoding = " + encoding); if (encodingName.equals ("UTF-16BE")) { encoding = ENCODING_UCS_2_12; return; } if (encodingName.equals ("UTF-16LE")) { encoding = ENCODING_UCS_2_21; return; } // We couldn't use the builtin decoders at all. But we can try to // create a reader, since we haven't messed up buffering. Tweak // the encoding name if necessary. if (encodingName.equals ("UTF-16") || encodingName.equals ("ISO-10646-UCS-2")) encodingName = "Unicode"; // Ignoring all the EBCDIC aliases here reader = new InputStreamReader (is, encodingName); sourceType = INPUT_READER; is = null; } /** * Parse miscellaneous markup outside the document element and DOCTYPE * declaration. * <pre> * [27] Misc ::= Comment | PI | S * </pre> */ private void parseMisc () throws Exception { while (true) { skipWhitespace (); if (tryRead ("<?")) { parsePI (); } else if (tryRead ("<!--")) { parseComment (); } else { return; } } } /** * Parse a document type declaration. * <pre> * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? * ('[' (markupdecl | PEReference | S)* ']' S?)? '>' * </pre> * <p> (The <code><!DOCTYPE</code> has already been read.) */ private void parseDoctypedecl () throws Exception { char c; String doctypeName, ids[]; // Read the document type name. requireWhitespace (); doctypeName = readNmtoken (true); // Read the External subset's IDs skipWhitespace (); ids = readExternalIds (false); // report (a) declaration of name, (b) lexical info (ids) handler.doctypeDecl (doctypeName, ids [0], ids [1]); // Internal subset is parsed first, if present skipWhitespace (); if (tryRead ('[')) { // loop until the subset ends while (true) { expandPE = true; skipWhitespace (); expandPE = false; if (tryRead (']')) { break; // end of subset } else { // WFC, PEs in internal subset (only between decls) peIsError = expandPE = true; parseMarkupdecl (); peIsError = expandPE = false; } } } // Read the external subset, if any if (ids [1] != null) { pushURL ("[external subset]", ids [0], ids [1], null, null, null); // Loop until we end up back at '>' while (true) { expandPE = true; skipWhitespace (); expandPE = false; if (tryRead ('>')) { break; } else { expandPE = true; parseMarkupdecl (); expandPE = false; } } } else { // No external subset. skipWhitespace (); require ('>'); } // done dtd handler.endDoctype (); expandPE = false; } /** * Parse a markup declaration in the internal or external DTD subset. * <pre> * [29] markupdecl ::= elementdecl | Attlistdecl | EntityDecl * | NotationDecl | PI | Comment * [30] extSubsetDecl ::= (markupdecl | conditionalSect * | PEReference | S) * * </pre> * <p> Reading toplevel PE references is handled as a lexical issue * by the caller, as is whitespace. */ private void parseMarkupdecl () throws Exception { if (tryRead ("<!ELEMENT")) { parseElementdecl (); } else if (tryRead ("<!ATTLIST")) { parseAttlistDecl (); } else if (tryRead ("<!ENTITY")) { parseEntityDecl (); } else if (tryRead ("<!NOTATION")) { parseNotationDecl (); } else if (tryRead ("<?")) { parsePI (); } else if (tryRead ("<!--")) { parseComment (); } else if (tryRead ("<![")) { if (inputStack.size () > 0) parseConditionalSect ();
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -