📄 xmlparser.java
字号:
expandPE = false; parseUntil ("--"); require ('>'); expandPE = saved; handler.comment (dataBuffer, 0, dataBufferPos); dataBufferPos = 0; } /** * Parse a processing instruction and do a call-back. * <pre> * [16] PI ::= '<?' PITarget * (S (Char* - (Char* '?>' Char*)))? * '?>' * [17] PITarget ::= Name - ( ('X'|'x') ('M'|m') ('L'|l') ) * </pre> * <p> (The <code><?</code> has already been read.) */ private void parsePI () throws SAXException, IOException { String name; boolean saved = expandPE; expandPE = false; name = readNmtoken (true); if ("xml".equalsIgnoreCase (name)) error ("Illegal processing instruction target", name, null); if (!tryRead ("?>")) { requireWhitespace (); parseUntil ("?>"); } expandPE = saved; handler.processingInstruction (name, dataBufferToString ()); } /** * Parse a CDATA section. * <pre> * [18] CDSect ::= CDStart CData CDEnd * [19] CDStart ::= '<![CDATA[' * [20] CData ::= (Char* - (Char* ']]>' Char*)) * [21] CDEnd ::= ']]>' * </pre> * <p> (The '<![CDATA[' has already been read.) */ private void parseCDSect () throws Exception { parseUntil ("]]>"); dataBufferFlush (); } /** * Parse the prolog of an XML document. * <pre> * [22] prolog ::= XMLDecl? Misc* (Doctypedecl Misc*)? * </pre> * <p>There are a couple of tricks here. First, it is necessary to * declare the XML default attributes after the DTD (if present) * has been read. [??] Second, it is not possible to expand general * references in attribute value literals until after the entire * DTD (if present) has been parsed. * <p>We do not look for the XML declaration here, because it was * handled by pushURL (). * @see pushURL */ private void parseProlog () throws Exception { parseMisc (); if (tryRead ("<!DOCTYPE")) { parseDoctypedecl (); parseMisc (); } } /** * Parse the XML declaration. * <pre> * [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>' * [24] VersionInfo ::= S 'version' Eq * ("'" VersionNum "'" | '"' VersionNum '"' ) * [26] VersionNum ::= ([a-zA-Z0-9_.:] | '-')* * [32] SDDecl ::= S 'standalone' Eq * ( "'"" ('yes' | 'no') "'"" | '"' ("yes" | "no") '"' ) * [80] EncodingDecl ::= S 'encoding' Eq * ( "'" EncName "'" | "'" EncName "'" ) * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')* * </pre> * <p> (The <code><?xml</code> and whitespace have already been read.) * @return the encoding in the declaration, uppercased; or null * @see #parseTextDecl * @see #setupDecoding */ private String parseXMLDecl (boolean ignoreEncoding) throws SAXException, IOException { String version; String encodingName = null; String standalone = null; boolean white; int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF; // Read the version. require ("version"); parseEq (); version = readLiteral (flags); if (!version.equals ("1.0")) { error ("unsupported XML version", version, "1.0"); } // Try reading an encoding declaration. white = tryWhitespace (); if (tryRead ("encoding")) { if (!white) error ("whitespace required before 'encoding='"); parseEq (); encodingName = readLiteral (flags); if (!ignoreEncoding) setupDecoding (encodingName); } // Try reading a standalone declaration if (encodingName != null) white = tryWhitespace (); if (tryRead ("standalone")) { if (!white) error ("whitespace required before 'standalone='"); parseEq (); standalone = readLiteral (flags); if (! ("yes".equals (standalone) || "no".equals (standalone))) error ("standalone flag must be 'yes' or 'no'"); } skipWhitespace (); require ("?>"); return encodingName; } /** * Parse a text declaration. * <pre> * [79] TextDecl ::= '<?xml' VersionInfo? EncodingDecl S? '?>' * [80] EncodingDecl ::= S 'encoding' Eq * ( '"' EncName '"' | "'" EncName "'" ) * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')* * </pre> * <p> (The <code><?xml</code>' and whitespace have already been read.) * @return the encoding in the declaration, uppercased; or null * @see #parseXMLDecl * @see #setupDecoding */ private String parseTextDecl (boolean ignoreEncoding) throws SAXException, IOException { String encodingName = null; int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF; // Read an optional version. if (tryRead ("version")) { String version; parseEq (); version = readLiteral (flags); if (!version.equals ("1.0")) { error ("unsupported XML version", version, "1.0"); } requireWhitespace (); } // Read the encoding. require ("encoding"); parseEq (); encodingName = readLiteral (flags); if (!ignoreEncoding) setupDecoding (encodingName); skipWhitespace (); require ("?>"); return encodingName; } /** * Sets up internal state so that we can decode an entity using the * specified encoding. This is used when we start to read an entity * and we have been given knowledge of its encoding before we start to * read any data (e.g. from a SAX input source or from a MIME type). * * <p> It is also used after autodetection, at which point only very * limited adjustments to the encoding may be used (switching between * related builtin decoders). * * @param encodingName The name of the encoding specified by the user. * @exception IOException if the encoding isn't supported either * internally to this parser, or by the hosting JVM. * @see #parseXMLDecl * @see #parseTextDecl */ private void setupDecoding (String encodingName) throws SAXException, IOException { encodingName = encodingName.toUpperCase (); // ENCODING_EXTERNAL indicates an encoding that wasn't // autodetected ... we can use builtin decoders, or // ones from the JVM (InputStreamReader). // Otherwise we can only tweak what was autodetected, and // only for single byte (ASCII derived) builtin encodings. // ASCII-derived encodings if (encoding == ENCODING_UTF_8 || encoding == ENCODING_EXTERNAL) { if (encodingName.equals ("ISO-8859-1") || encodingName.equals ("8859_1") || encodingName.equals ("ISO8859_1") ) { encoding = ENCODING_ISO_8859_1; return; } else if (encodingName.equals ("US-ASCII") || encodingName.equals ("ASCII")) { encoding = ENCODING_ASCII; return; } else if (encodingName.equals ("UTF-8") || encodingName.equals ("UTF8")) { encoding = ENCODING_UTF_8; return; } else if (encoding != ENCODING_EXTERNAL) { // fatal error error ("unsupported ASCII-derived encoding", encodingName, "UTF-8, US-ASCII, or ISO-8859-1"); } // else fallthrough ... // it's ASCII-ish and something other than a builtin } // Unicode and such if (encoding == ENCODING_UCS_2_12 || encoding == ENCODING_UCS_2_21) { if (!(encodingName.equals ("ISO-10646-UCS-2") || encodingName.equals ("UTF-16") || encodingName.equals ("UTF-16BE") || encodingName.equals ("UTF-16LE"))) error ("unsupported Unicode encoding", encodingName, "UTF-16"); return; } // four byte encodings if (encoding == ENCODING_UCS_4_1234 || encoding == ENCODING_UCS_4_4321 || encoding == ENCODING_UCS_4_2143 || encoding == ENCODING_UCS_4_3412) { if (!encodingName.equals ("ISO-10646-UCS-4")) error ("unsupported 32-bit encoding", encodingName, "ISO-10646-UCS-4"); return; } // assert encoding == ENCODING_EXTERNAL // if (encoding != ENCODING_EXTERNAL) // throw new RuntimeException ("encoding = " + encoding); if (encodingName.equals ("UTF-16BE")) { encoding = ENCODING_UCS_2_12; return; } if (encodingName.equals ("UTF-16LE")) { encoding = ENCODING_UCS_2_21; return; } // We couldn't use the builtin decoders at all. But we can try to // create a reader, since we haven't messed up buffering. Tweak // the encoding name if necessary. if (encodingName.equals ("UTF-16") || encodingName.equals ("ISO-10646-UCS-2")) encodingName = "Unicode"; // Ignoring all the EBCDIC aliases here reader = new InputStreamReader (is, encodingName); sourceType = INPUT_READER; is = null; } /** * Parse miscellaneous markup outside the document element and DOCTYPE * declaration. * <pre> * [27] Misc ::= Comment | PI | S * </pre> */ private void parseMisc () throws Exception { while (true) { skipWhitespace (); if (tryRead ("<?")) { parsePI (); } else if (tryRead ("<!--")) { parseComment (); } else { return; } } } /** * Parse a document type declaration. * <pre> * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? * ('[' (markupdecl | PEReference | S)* ']' S?)? '>' * </pre> * <p> (The <code><!DOCTYPE</code> has already been read.) */ private void parseDoctypedecl () throws Exception { char c; String doctypeName, ids[]; // Read the document type name. requireWhitespace (); doctypeName = readNmtoken (true); // Read the External subset's IDs skipWhitespace (); ids = readExternalIds (false); // report (a) declaration of name, (b) lexical info (ids) handler.doctypeDecl (doctypeName, ids [0], ids [1]); // Internal subset is parsed first, if present skipWhitespace (); if (tryRead ('[')) { // loop until the subset ends while (true) { expandPE = true; skipWhitespace (); expandPE = false; if (tryRead (']')) { break; // end of subset } else { // WFC, PEs in internal subset (only between decls) peIsError = expandPE = true; parseMarkupdecl (); peIsError = expandPE = false; } } } // Read the external subset, if any if (ids [1] != null) { pushURL ("[external subset]", ids [0], ids [1], null, null, null); // Loop until we end up back at '>' while (true) { expandPE = true; skipWhitespace (); expandPE = false; if (tryRead ('>')) { break; } else { expandPE = true; parseMarkupdecl (); expandPE = false; } } } else { // No external subset. skipWhitespace (); require ('>'); } // done dtd handler.endDoctype (); expandPE = false; } /** * Parse a markup declaration in the internal or external DTD subset. * <pre> * [29] markupdecl ::= elementdecl | Attlistdecl | EntityDecl * | NotationDecl | PI | Comment * [30] extSubsetDecl ::= (markupdecl | conditionalSect * | PEReference | S) * * </pre> * <p> Reading toplevel PE references is handled as a lexical issue * by the caller, as is whitespace. */ private void parseMarkupdecl () throws Exception { if (tryRead ("<!ELEMENT")) { parseElementdecl (); } else if (tryRead ("<!ATTLIST")) { parseAttlistDecl (); } else if (tryRead ("<!ENTITY")) { parseEntityDecl (); } else if (tryRead ("<!NOTATION")) { parseNotationDecl (); } else if (tryRead ("<?")) { parsePI (); } else if (tryRead ("<!--")) { parseComment (); } else if (tryRead ("<![")) { if (inputStack.size () > 0) parseConditionalSect (); else error ("conditional sections illegal in internal subset"); } else { error ("expected markup declaration"); } } /** * Parse an element, with its tags. * <pre> * [39] element ::= EmptyElementTag | STag content ETag * [40] STag ::= '<' Name (S Attribute)* S? '>' * [44] EmptyElementTag ::= '<' Name (S Attribute)* S? '/>' * </pre> * <p> (The '<' has already been read.) * <p>NOTE: this method actually chains onto parseContent (), if necessary, * and parseContent () will take care of calling parseETag (). */ private void parseElement () throws Exception { String gi; char c; int oldElementContent = currentElementContent; String oldElement = currentElement; Object element []; // This is the (global) counter for the // array of specified attributes. tagAttributePos = 0; // Read the element type name. gi = readNmtoken (true); // Determine the current content type. currentElement = gi; element = (Object []) elementInfo.get (gi); currentElementContent = getContentType (element, CONTENT_ANY); // Read the attributes, if any. // After this loop, "c" is the closing delimiter. boolean white = tryWhitespace (); c = readCh (); while (c != '/' && c != '>') { unread (c); if (!white) error ("need whitespace between attributes"); parseAttribute (gi); white = tryWhitespace (); c = readCh (); } // Supply any defaulted attributes. Enumeration atts = declaredAttributes (element); if (atts != null) { String aname;loop: while (atts.hasMoreElements ()) { aname = (String) atts.nextElement (); // See if it was specified. for (int i = 0; i < tagAttributePos; i++) { if (tagAttributes [i] == aname) { continue loop; } } // I guess not... handler.attribute (aname, getAttributeExpandedValue (gi, aname), false); } } // Figure out if this is a start tag // or an empty element, and dispatch an // event accordingly. switch (c) { case '>': handler.startElement (gi); parseContent (); break; case '/': require ('>'); handler.startElement (gi); handler.endElement (gi); break; } // Restore the previous state. currentElement = oldElement; currentElementContent = oldElementContent; } /** * Parse an attribute assignment.
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -