xmlparser.java
来自「kaffe Java 解释器语言,源码,Java的子集系统,开放源代码」· Java 代码 · 共 2,494 行 · 第 1/5 页
JAVA
2,494 行
parseUntil (endDelimCDATA); dataBufferFlush (); } /** * Parse the prolog of an XML document. * <pre> * [22] prolog ::= XMLDecl? Misc* (Doctypedecl Misc*)? * </pre> * <p>We do not look for the XML declaration here, because it was * handled by pushURL (). * @see pushURL * @return true if a DTD was read. */ private boolean parseProlog () throws Exception { parseMisc (); if (tryRead ("<!DOCTYPE")) { parseDoctypedecl (); parseMisc (); return true; } return false; } private void checkLegalVersion (String version) throws SAXException { int len = version.length (); for (int i = 0; i < len; i++) { char c = version.charAt (i); if ('0' <= c && c <= '9') continue; if (c == '_' || c == '.' || c == ':' || c == '-') continue; if ('a' <= c && c <= 'z') continue; if ('A' <= c && c <= 'Z') continue; error ("illegal character in version", version, "1.0"); } } /** * Parse the XML declaration. * <pre> * [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>' * [24] VersionInfo ::= S 'version' Eq * ("'" VersionNum "'" | '"' VersionNum '"' ) * [26] VersionNum ::= ([a-zA-Z0-9_.:] | '-')* * [32] SDDecl ::= S 'standalone' Eq * ( "'"" ('yes' | 'no') "'"" | '"' ("yes" | "no") '"' ) * [80] EncodingDecl ::= S 'encoding' Eq * ( "'" EncName "'" | "'" EncName "'" ) * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')* * </pre> * <p> (The <code><?xml</code> and whitespace have already been read.) * @return the encoding in the declaration, uppercased; or null * @see #parseTextDecl * @see #setupDecoding */ private String parseXMLDecl (boolean ignoreEncoding) throws SAXException, IOException { String version; String encodingName = null; String standalone = null; int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF; // Read the version. require ("version"); parseEq (); checkLegalVersion (version = readLiteral (flags)); if (!version.equals ("1.0")) handler.warn ("expected XML version 1.0, not: " + version); // Try reading an encoding declaration. boolean white = tryWhitespace (); if (tryRead ("encoding")) { if (!white) error ("whitespace required before 'encoding='"); parseEq (); encodingName = readLiteral (flags); if (!ignoreEncoding) setupDecoding (encodingName); } // Try reading a standalone declaration if (encodingName != null) white = tryWhitespace (); if (tryRead ("standalone")) { if (!white) error ("whitespace required before 'standalone='"); parseEq (); standalone = readLiteral (flags); if ("yes".equals (standalone)) docIsStandalone = true; else if (!"no".equals (standalone)) error ("standalone flag must be 'yes' or 'no'"); } skipWhitespace (); require ("?>"); return encodingName; } /** * Parse a text declaration. * <pre> * [79] TextDecl ::= '<?xml' VersionInfo? EncodingDecl S? '?>' * [80] EncodingDecl ::= S 'encoding' Eq * ( '"' EncName '"' | "'" EncName "'" ) * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')* * </pre> * <p> (The <code><?xml</code>' and whitespace have already been read.) * @return the encoding in the declaration, uppercased; or null * @see #parseXMLDecl * @see #setupDecoding */ private String parseTextDecl (boolean ignoreEncoding) throws SAXException, IOException { String encodingName = null; int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF; // Read an optional version. if (tryRead ("version")) { String version; parseEq (); checkLegalVersion (version = readLiteral (flags)); if (!version.equals ("1.0")) handler.warn ("expected XML version 1.0, not: " + version); requireWhitespace (); } // Read the encoding. require ("encoding"); parseEq (); encodingName = readLiteral (flags); if (!ignoreEncoding) setupDecoding (encodingName); skipWhitespace (); require ("?>"); return encodingName; } /** * Sets up internal state so that we can decode an entity using the * specified encoding. This is used when we start to read an entity * and we have been given knowledge of its encoding before we start to * read any data (e.g. from a SAX input source or from a MIME type). * * <p> It is also used after autodetection, at which point only very * limited adjustments to the encoding may be used (switching between * related builtin decoders). * * @param encodingName The name of the encoding specified by the user. * @exception IOException if the encoding isn't supported either * internally to this parser, or by the hosting JVM. * @see #parseXMLDecl * @see #parseTextDecl */ private void setupDecoding (String encodingName) throws SAXException, IOException { encodingName = encodingName.toUpperCase (); // ENCODING_EXTERNAL indicates an encoding that wasn't // autodetected ... we can use builtin decoders, or // ones from the JVM (InputStreamReader). // Otherwise we can only tweak what was autodetected, and // only for single byte (ASCII derived) builtin encodings. // ASCII-derived encodings if (encoding == ENCODING_UTF_8 || encoding == ENCODING_EXTERNAL) { if (encodingName.equals ("ISO-8859-1") || encodingName.equals ("8859_1") || encodingName.equals ("ISO8859_1") ) { encoding = ENCODING_ISO_8859_1; return; } else if (encodingName.equals ("US-ASCII") || encodingName.equals ("ASCII")) { encoding = ENCODING_ASCII; return; } else if (encodingName.equals ("UTF-8") || encodingName.equals ("UTF8")) { encoding = ENCODING_UTF_8; return; } else if (encoding != ENCODING_EXTERNAL) { // used to start with a new reader ... throw new UnsupportedEncodingException (encodingName); } // else fallthrough ... // it's ASCII-ish and something other than a builtin } // Unicode and such if (encoding == ENCODING_UCS_2_12 || encoding == ENCODING_UCS_2_21) { if (!(encodingName.equals ("ISO-10646-UCS-2") || encodingName.equals ("UTF-16") || encodingName.equals ("UTF-16BE") || encodingName.equals ("UTF-16LE"))) error ("unsupported Unicode encoding", encodingName, "UTF-16"); return; } // four byte encodings if (encoding == ENCODING_UCS_4_1234 || encoding == ENCODING_UCS_4_4321 || encoding == ENCODING_UCS_4_2143 || encoding == ENCODING_UCS_4_3412) { // Strictly: "UCS-4" == "UTF-32BE"; also, "UTF-32LE" exists if (!encodingName.equals ("ISO-10646-UCS-4")) error ("unsupported 32-bit encoding", encodingName, "ISO-10646-UCS-4"); return; } // assert encoding == ENCODING_EXTERNAL // if (encoding != ENCODING_EXTERNAL) // throw new RuntimeException ("encoding = " + encoding); if (encodingName.equals ("UTF-16BE")) { encoding = ENCODING_UCS_2_12; return; } if (encodingName.equals ("UTF-16LE")) { encoding = ENCODING_UCS_2_21; return; } // We couldn't use the builtin decoders at all. But we can try to // create a reader, since we haven't messed up buffering. Tweak // the encoding name if necessary. if (encodingName.equals ("UTF-16") || encodingName.equals ("ISO-10646-UCS-2")) encodingName = "Unicode"; // Ignoring all the EBCDIC aliases here reader = new InputStreamReader (is, encodingName); sourceType = INPUT_READER; } /** * Parse miscellaneous markup outside the document element and DOCTYPE * declaration. * <pre> * [27] Misc ::= Comment | PI | S * </pre> */ private void parseMisc () throws Exception { while (true) { skipWhitespace (); if (tryRead (startDelimPI)) { parsePI (); } else if (tryRead (startDelimComment)) { parseComment (); } else { return; } } } /** * Parse a document type declaration. * <pre> * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? * ('[' (markupdecl | PEReference | S)* ']' S?)? '>' * </pre> * <p> (The <code><!DOCTYPE</code> has already been read.) */ private void parseDoctypedecl () throws Exception { String rootName, ids[]; // Read the document type name. requireWhitespace (); rootName = readNmtoken (true); // Read the External subset's IDs skipWhitespace (); ids = readExternalIds (false, true); // report (a) declaration of name, (b) lexical info (ids) handler.doctypeDecl (rootName, ids [0], ids [1]); // Internal subset is parsed first, if present skipWhitespace (); if (tryRead ('[')) { // loop until the subset ends while (true) { doReport = expandPE = true; skipWhitespace (); doReport = expandPE = false; if (tryRead (']')) { break; // end of subset } else { // WFC, PEs in internal subset (only between decls) peIsError = expandPE = true; parseMarkupdecl (); peIsError = expandPE = false; } } } skipWhitespace (); require ('>'); // Read the external subset, if any InputSource subset; if (ids [1] == null) subset = handler.getExternalSubset (rootName, handler.getSystemId ()); else subset = null; if (ids [1] != null || subset != null) { pushString (null, ">"); // NOTE: [dtd] is so we say what SAX2 expects, // though it's misleading (subset, not entire dtd) if (ids [1] != null) pushURL (true, "[dtd]", ids, null, null, null, true); else { handler.warn ("modifying document by adding external subset"); pushURL (true, "[dtd]", new String [] { subset.getPublicId (), subset.getSystemId (), null }, subset.getCharacterStream (), subset.getByteStream (), subset.getEncoding (), false); } // Loop until we end up back at '>' while (true) { doReport = expandPE = true; skipWhitespace (); doReport = expandPE = false; if (tryRead ('>')) { break; } else { expandPE = true; parseMarkupdecl (); expandPE = false; } } // the ">" string isn't popped yet if (inputStack.size () != 1) error ("external subset has unmatched '>'"); } // done dtd handler.endDoctype (); expandPE = false; doReport = true; } /** * Parse a markup declaration in the internal or external DTD subset. * <pre> * [29] markupdecl ::= elementdecl | Attlistdecl | EntityDecl * | NotationDecl | PI | Comment * [30] extSubsetDecl ::= (markupdecl | conditionalSect * | PEReference | S) * * </pre> * <p> Reading toplevel PE references is handled as a lexical issue * by the caller, as is whitespace. */ private void parseMarkupdecl () throws Exception { char saved [] = null; boolean savedPE = expandPE; // prevent "<%foo;" and ensures saved entity is right require ('<'); unread ('<'); expandPE = false; if (tryRead ("<!ELEMENT")) { saved = readBuffer; expandPE = savedPE; parseElementDecl (); } else if (tryRead ("<!ATTLIST")) { saved = readBuffer; expandPE = savedPE; parseAttlistDecl (); } else if (tryRead ("<!ENTITY")) { saved = readBuffer; expandPE = savedPE; parseEntityDecl (); } else if (tryRead ("<!NOTATION")) { saved = readBuffer; expandPE = savedPE; parseNotationDecl (); } else if (tryRead (startDelimPI)) { saved = readBuffer; expandPE = savedPE; parsePI (); } else if (tryRead (startDelimComment)) { saved = readBuffer; expandPE = savedPE; parseComment (); } else if (tryRead ("<![")) { saved = readBuffer; expandPE = savedPE; if (inputStack.size () > 0) parseConditionalSect (saved); else error ("conditional sections illegal in internal subset"); } else { error ("expected markup declaration"); } // VC: Proper Decl/PE Nesting if (readBuffer != saved) handler.verror ("Illegal Declaration/PE nesting"); } /** * Parse an element, with its tags. * <pre> * [39] element ::= EmptyElementTag | STag content ETag * [40] STag ::= '<' Name (S Attribute)* S? '>' * [44] EmptyElementTag ::= '<' Name (S Attribute)* S? '/>' * </pre> * <p> (The '<' has already been read.) * <p>NOTE: this method actually chains onto parseContent (), if necessary, * and parseContent () will take care of calling parseETag (). */ private void parseElement (boolean maybeGetSubset) throws Exception { String gi; char c; int oldElementContent = currentElementContent; String oldElement = currentElement; Object element []; // This is the (global) counter for the // array of specified attributes. tagAttributePos = 0; // Read the element type name. gi = readNmtoken (true); // If we saw no DTD, and this is the document root element, // let the application modify the input stream by providing one. if (maybeGetSubset) { InputSource subset = handler.getExternalSubset (gi, handler.getSystemId ()); if (subset != null) { String publicId = subset.getPublicId (); String systemId = subset.getSystemId (); handler.warn ("modifying document by adding DTD"); handler.doctypeDecl (gi, publicId, systemId); pushString (null, ">"); // NOTE: [dtd] is so we say what SAX2 expects, // though it's misleading (subset, not entire dtd) pushURL (true, "[dtd]", new String [] { publicId, systemId, null }, subset.getCharacterStream (), subset.getByteStream (), subset.getEncoding (), false); // Loop until we end up back at '>' while (true) { doReport = expandPE = true; skipWhitespace (); doReport = expandPE = false;
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?