xmlparser.java

来自「kaffe Java 解释器语言,源码,Java的子集系统,开放源代码」· Java 代码 · 共 2,494 行 · 第 1/5 页

JAVA
2,494
字号
	parseUntil (endDelimCDATA);	dataBufferFlush ();    }    /**     * Parse the prolog of an XML document.     * <pre>     * [22] prolog ::= XMLDecl? Misc* (Doctypedecl Misc*)?     * </pre>     * <p>We do not look for the XML declaration here, because it was     * handled by pushURL ().     * @see pushURL     * @return true if a DTD was read.     */    private boolean parseProlog ()    throws Exception    {	parseMisc ();	if (tryRead ("<!DOCTYPE")) {	    parseDoctypedecl ();	    parseMisc ();	    return true;	}	return false;    }    private void checkLegalVersion (String version)    throws SAXException    {	int len = version.length ();	for (int i = 0; i < len; i++) {	    char c = version.charAt (i);	    if ('0' <= c && c <= '9')		continue;	    if (c == '_' || c == '.' || c == ':' || c == '-')		continue;	    if ('a' <= c && c <= 'z')		continue;	    if ('A' <= c && c <= 'Z')		continue;	    error ("illegal character in version", version, "1.0");	}    }    /**     * Parse the XML declaration.     * <pre>     * [23] XMLDecl ::= '&lt;?xml' VersionInfo EncodingDecl? SDDecl? S? '?&gt;'     * [24] VersionInfo ::= S 'version' Eq     *		("'" VersionNum "'" | '"' VersionNum '"' )     * [26] VersionNum ::= ([a-zA-Z0-9_.:] | '-')*     * [32] SDDecl ::= S 'standalone' Eq     *		( "'"" ('yes' | 'no') "'"" | '"' ("yes" | "no") '"' )     * [80] EncodingDecl ::= S 'encoding' Eq     *		( "'" EncName "'" | "'" EncName "'" )     * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*     * </pre>     * <p> (The <code>&lt;?xml</code> and whitespace have already been read.)     * @return the encoding in the declaration, uppercased; or null     * @see #parseTextDecl     * @see #setupDecoding     */    private String parseXMLDecl (boolean ignoreEncoding)    throws SAXException, IOException    {	String	version;	String	encodingName = null;	String	standalone = null;	int	flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF;	// Read the version.	
require ("version");	parseEq ();	checkLegalVersion (version = readLiteral (flags));	if (!version.equals ("1.0"))	    handler.warn ("expected XML version 1.0, not: " + version);	// Try reading an encoding declaration.	boolean white = tryWhitespace ();	if (tryRead ("encoding")) {	    if (!white)		error ("whitespace required before 'encoding='");	    parseEq ();	    encodingName = readLiteral (flags);	    if (!ignoreEncoding)		setupDecoding (encodingName);	}	// Try reading a standalone declaration	if (encodingName != null)	    white = tryWhitespace ();	if (tryRead ("standalone")) {	    if (!white)		error ("whitespace required before 'standalone='");	    parseEq ();	    standalone = readLiteral (flags);	    if ("yes".equals (standalone))		docIsStandalone = true;	    else if (!"no".equals (standalone))		error ("standalone flag must be 'yes' or 'no'");	}	skipWhitespace ();	require ("?>");	return encodingName;    }    /**     * Parse a text declaration.     * <pre>     * [79] TextDecl ::= '&lt;?xml' VersionInfo? EncodingDecl S? '?&gt;'     * [80] EncodingDecl ::= S 'encoding' Eq     *		( '"' EncName '"' | "'" EncName "'" )     * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*     * </pre>     * <p> (The <code>&lt;?xml</code>' and whitespace have already been read.)     * @return the encoding in the declaration, uppercased; or null     * @see #parseXMLDecl     * @see #setupDecoding     */    private String parseTextDecl (boolean ignoreEncoding)    throws SAXException, IOException    {	String	encodingName = null;	int	flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF;	// Read an optional version.	if (tryRead ("version")) {	    String version;	    parseEq ();	    checkLegalVersion (version = readLiteral (flags));	    if (!version.equals ("1.0"))		handler.warn ("expected XML version 1.0, not: " + version);	    requireWhitespace ();	}	// Read the encoding.	
require ("encoding");	parseEq ();	encodingName = readLiteral (flags);	if (!ignoreEncoding)	    setupDecoding (encodingName);	skipWhitespace ();	require ("?>");	return encodingName;    }    /**     * Sets up internal state so that we can decode an entity using the     * specified encoding.  This is used when we start to read an entity     * and we have been given knowledge of its encoding before we start to     * read any data (e.g. from a SAX input source or from a MIME type).     *     * <p> It is also used after autodetection, at which point only very     * limited adjustments to the encoding may be used (switching between     * related builtin decoders).     *     * @param encodingName The name of the encoding specified by the user.     * @exception IOException if the encoding isn't supported either     *	internally to this parser, or by the hosting JVM.     * @see #parseXMLDecl     * @see #parseTextDecl     */    private void setupDecoding (String encodingName)    throws SAXException, IOException    {	encodingName = encodingName.toUpperCase ();	// ENCODING_EXTERNAL indicates an encoding that wasn't	// autodetected ... we can use builtin decoders, or	// ones from the JVM (InputStreamReader).	// Otherwise we can only tweak what was autodetected, and	// only for single byte (ASCII derived) builtin encodings.	// ASCII-derived encodings	if (encoding == ENCODING_UTF_8 || encoding == ENCODING_EXTERNAL) {	    if (encodingName.equals ("ISO-8859-1")		    || encodingName.equals ("8859_1")		    || encodingName.equals ("ISO8859_1")	      ) {		encoding = ENCODING_ISO_8859_1;		return;	    } else if (encodingName.equals ("US-ASCII")			|| encodingName.equals ("ASCII")) {		encoding = ENCODING_ASCII;		return;	    } else if (encodingName.equals ("UTF-8")			|| encodingName.equals ("UTF8")) {		encoding = ENCODING_UTF_8;		return;	    } else if (encoding != ENCODING_EXTERNAL) {		// used to start with a new reader ...		
throw new UnsupportedEncodingException (encodingName);	    }	    // else fallthrough ...	    // it's ASCII-ish and something other than a builtin	}	// Unicode and such	if (encoding == ENCODING_UCS_2_12 || encoding == ENCODING_UCS_2_21) {	    if (!(encodingName.equals ("ISO-10646-UCS-2")		    || encodingName.equals ("UTF-16")		    || encodingName.equals ("UTF-16BE")		    || encodingName.equals ("UTF-16LE")))		error ("unsupported Unicode encoding",		       encodingName,		       "UTF-16");	    return;	}	// four byte encodings	if (encoding == ENCODING_UCS_4_1234		|| encoding == ENCODING_UCS_4_4321		|| encoding == ENCODING_UCS_4_2143		|| encoding == ENCODING_UCS_4_3412) {	    // Strictly:  "UCS-4" == "UTF-32BE"; also, "UTF-32LE" exists	    if (!encodingName.equals ("ISO-10646-UCS-4"))		error ("unsupported 32-bit encoding",		       encodingName,		       "ISO-10646-UCS-4");	    return;	}	// assert encoding == ENCODING_EXTERNAL	// if (encoding != ENCODING_EXTERNAL)	//     throw new RuntimeException ("encoding = " + encoding);	if (encodingName.equals ("UTF-16BE")) {	    encoding = ENCODING_UCS_2_12;	    return;	}	if (encodingName.equals ("UTF-16LE")) {	    encoding = ENCODING_UCS_2_21;	    return;	}	// We couldn't use the builtin decoders at all.  But we can try to	// create a reader, since we haven't messed up buffering.  Tweak	// the encoding name if necessary.	if (encodingName.equals ("UTF-16")		|| encodingName.equals ("ISO-10646-UCS-2"))	    encodingName = "Unicode";	// Ignoring all the EBCDIC aliases here	reader = new InputStreamReader (is, encodingName);	sourceType = INPUT_READER;    }    /**     * Parse miscellaneous markup outside the document element and DOCTYPE     * declaration.     
* <pre>
     * [27] Misc ::= Comment | PI | S
     * </pre>
     * <p> Repeatedly skips whitespace and then consumes one processing
     * instruction or one comment; returns as soon as the input starts
     * with neither delimiter.
     */
    private void parseMisc ()
    throws Exception
    {
	while (true) {
	    skipWhitespace ();
	    if (tryRead (startDelimPI)) {
		parsePI ();
	    } else if (tryRead (startDelimComment)) {
		parseComment ();
	    } else {
		// neither a PI nor a comment starts here:  done
		return;
	    }
	}
    }


    /**
     * Parse a document type declaration.
     * <pre>
     * [28] doctypedecl ::= '&lt;!DOCTYPE' S Name (S ExternalID)? S?
     *		('[' (markupdecl | PEReference | S)* ']' S?)? '&gt;'
     * </pre>
     * <p> (The <code>&lt;!DOCTYPE</code> has already been read.)
     *
     * <p> Order of work:  read the root name and external IDs, report
     * them through handler.doctypeDecl (), parse the internal subset if
     * one is present, require the closing '&gt;', then read an external
     * subset if there is one, and finally call handler.endDoctype ().
     * The external subset comes from ids [1] when that is non-null,
     * otherwise from whatever handler.getExternalSubset () returns.
     */
    private void parseDoctypedecl ()
    throws Exception
    {
	String rootName, ids[];

	// Read the document type name.
	requireWhitespace ();
	rootName = readNmtoken (true);

	// Read the External subset's IDs
	// (ids [0] and ids [1] are passed on below in the positions where
	// the handler-supplied subset passes its public and system IDs)
	skipWhitespace ();
	ids = readExternalIds (false, true);

	// report (a) declaration of name, (b) lexical info (ids)
	handler.doctypeDecl (rootName, ids [0], ids [1]);

	// Internal subset is parsed first, if present
	skipWhitespace ();
	if (tryRead ('[')) {

	    // loop until the subset ends
	    while (true) {
		// doReport and expandPE are switched on only while
		// whitespace between declarations is skipped
		doReport = expandPE = true;
		skipWhitespace ();
		doReport = expandPE = false;
		if (tryRead (']')) {
		    break; 		// end of subset
		} else {
		    // WFC, PEs in internal subset (only between decls)
		    peIsError = expandPE = true;
		    parseMarkupdecl ();
		    peIsError = expandPE = false;
		}
	    }
	}
	skipWhitespace ();
	require ('>');

	// Read the external subset, if any
	InputSource	subset;

	// The handler is only asked for a subset when the declaration
	// itself named none (ids [1] is null).
	if (ids [1] == null)
	    subset = handler.getExternalSubset (rootName,
	    		handler.getSystemId ());
	else
	    subset = null;
	if (ids [1] != null || subset != null) {
	    // Push a literal ">" first; the loop below stops when it
	    // reads that character back.
	    pushString (null, ">");

	    // NOTE:  [dtd] is so we say what SAX2 expects,
	    // though it's misleading (subset, not entire dtd)
	    if (ids [1] != null)
		pushURL (true, "[dtd]", ids, null, null, null, true);
	    else {
		handler.warn ("modifying document by adding external subset");
		pushURL (true, "[dtd]",
		    new String [] { subset.getPublicId (),
			    subset.getSystemId (), null },
		    subset.getCharacterStream (),
		    subset.getByteStream (),
		    subset.getEncoding (),
		    false);
	    }

	    // Loop until we end up back at '>'
	    while (true) {
		doReport = expandPE = true;
		skipWhitespace ();
		doReport = expandPE = false;
		if (tryRead ('>')) {
		    break;
		} else {
		    // unlike the internal subset loop, peIsError is
		    // not set here
		    expandPE = true;
		    parseMarkupdecl ();
		    expandPE = false;
		}
	    }

	    // the ">" string isn't popped yet
	    if (inputStack.size () != 1)
		error ("external subset has unmatched '>'");
	}

	// done dtd
	handler.endDoctype ();
	expandPE = false;
	doReport = true;
    }


    /**
     * Parse a markup declaration in the internal or external DTD subset.
     * <pre>
     * [29] markupdecl ::= elementdecl | Attlistdecl | EntityDecl
     *		| NotationDecl | PI | Comment
     * [30] extSubsetDecl ::= (markupdecl | conditionalSect
     *		| PEReference | S) *
     * </pre>
     * <p> Reading toplevel PE references is handled as a lexical issue
     * by the caller, as is whitespace.
*/
    private void parseMarkupdecl ()
    throws Exception
    {
	// "saved" records which readBuffer was current right after the
	// declaration's opening delimiter was matched; it is compared with
	// readBuffer again once the declaration has been parsed.
	char	saved [] = null;
	// expandPE as set by the caller; restored in each branch below
	// once the opening delimiter has been read.
	boolean	savedPE = expandPE;

	// prevent "<%foo;" and ensures saved entity is right
	require ('<');
	unread ('<');
	// opening delimiters are matched with expandPE switched off
	expandPE = false;

	if (tryRead ("<!ELEMENT")) {
	    saved = readBuffer;
	    expandPE = savedPE;
	    parseElementDecl ();
	} else if (tryRead ("<!ATTLIST")) {
	    saved = readBuffer;
	    expandPE = savedPE;
	    parseAttlistDecl ();
	} else if (tryRead ("<!ENTITY")) {
	    saved = readBuffer;
	    expandPE = savedPE;
	    parseEntityDecl ();
	} else if (tryRead ("<!NOTATION")) {
	    saved = readBuffer;
	    expandPE = savedPE;
	    parseNotationDecl ();
	} else if (tryRead (startDelimPI)) {
	    saved = readBuffer;
	    expandPE = savedPE;
	    parsePI ();
	} else if (tryRead (startDelimComment)) {
	    saved = readBuffer;
	    expandPE = savedPE;
	    parseComment ();
	} else if (tryRead ("<![")) {
	    saved = readBuffer;
	    expandPE = savedPE;
	    // a conditional section is parsed only when inputStack is
	    // non-empty; with an empty stack it is reported as an error
	    if (inputStack.size () > 0)
		parseConditionalSect (saved);
	    else
		error ("conditional sections illegal in internal subset");
	} else {
	    error ("expected markup declaration");
	}

	// VC: Proper Decl/PE Nesting
	// (reported via verror when the declaration ended in a different
	// buffer than the one it started in)
	if (readBuffer != saved)
	    handler.verror ("Illegal Declaration/PE nesting");
    }


    /**
     * Parse an element, with its tags.
     * <pre>
     * [39] element ::= EmptyElementTag | STag content ETag
     * [40] STag ::= '&lt;' Name (S Attribute)* S? '&gt;'
     * [44] EmptyElementTag ::= '&lt;' Name (S Attribute)* S? '/&gt;'
     * </pre>
     * <p> (The '&lt;' has already been read.)
     * <p>NOTE: this method actually chains onto parseContent (), if necessary,
     * and parseContent () will take care of calling parseETag ().
     */
    private void parseElement (boolean maybeGetSubset)
    throws Exception
    {
	String	gi;
	char	c;
	int	oldElementContent = currentElementContent;
	String	oldElement = currentElement;
	Object	element [];

	// This is the (global) counter for the
	// array of specified attributes.
	tagAttributePos = 0;

	// Read the element type name.
	gi = readNmtoken (true);

	// If we saw no DTD, and this is the document root element,
	// let the application modify the input stream by providing one.
	if (maybeGetSubset) {
	    InputSource	subset = handler.getExternalSubset (gi,
	    		handler.getSystemId ());
	    if (subset != null) {
		String	publicId = subset.getPublicId ();
		String	systemId = subset.getSystemId ();

		handler.warn ("modifying document by adding DTD");
		handler.doctypeDecl (gi, publicId, systemId);
		pushString (null, ">");

		// NOTE:  [dtd] is so we say what SAX2 expects,
		// though it's misleading (subset, not entire dtd)
		pushURL (true, "[dtd]",
		    new String [] { publicId, systemId, null },
		    subset.getCharacterStream (),
		    subset.getByteStream (),
		    subset.getEncoding (),
		    false);

		// Loop until we end up back at '>'
		while (true) {
		    doReport = expandPE = true;
		    skipWhitespace ();
		    doReport = expandPE = false;

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?