xmlparser.java
来自「kaffe Java 解释器语言,源码,Java的子集系统,开放源代码」· Java 代码 · 共 2,494 行 · 第 1/5 页
JAVA
2,494 行
String nname, ids[]; requireWhitespace (); nname = readNmtoken (true); requireWhitespace (); // Read the external identifiers. ids = readExternalIds (true, false); // Register the notation. setNotation (nname, ids); skipWhitespace (); require ('>'); } /** * Parse character data. * <pre> * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*) * </pre> */ private void parseCharData () throws Exception { char c; int state = 0; boolean pureWhite = false; // assert (dataBufferPos == 0); // are we expecting pure whitespace? it might be dirty... if (currentElementContent == CONTENT_ELEMENTS); pureWhite = true; // always report right out of readBuffer // to minimize (pointless) buffer copies while (true) { int lineAugment = 0; int columnAugment = 0; int i;loop: for (i = readBufferPos; i < readBufferLength; i++) { switch (c = readBuffer [i]) { case '\n': lineAugment++; columnAugment = 0; // pureWhite unmodified break; case '\r': // should not happen!! case '\t': case ' ': // pureWhite unmodified columnAugment++; break; case '&': case '<': columnAugment++; // pureWhite unmodified // CLEAN end of text sequence state = 1; break loop; case ']': // that's not a whitespace char, and // can not terminate pure whitespace either pureWhite = false; if ((i + 2) < readBufferLength) { if (readBuffer [i + 1] == ']' && readBuffer [i + 2] == '>') { // ERROR end of text sequence state = 2; break loop; } } else { // FIXME missing two end-of-buffer cases } columnAugment++; break; default: if (c < 0x0020 || c > 0xFFFD) error ("illegal XML character U+" + Integer.toHexString (c)); // that's not a whitespace char pureWhite = false; columnAugment++; } } // report text thus far if (lineAugment > 0) { line += lineAugment; column = columnAugment; } else { column += columnAugment; } // report characters/whitspace int length = i - readBufferPos; if (length != 0) { if (pureWhite) handler.ignorableWhitespace (readBuffer, readBufferPos, length); else handler.charData (readBuffer, readBufferPos, length); readBufferPos = i; } if (state != 0) break; // fill next buffer from this entity, or // pop stack and continue with previous entity unread (readCh ()); } // finish, maybe with error if (state != 1) // finish, no error error ("character data may not contain ']]>'"); } ////////////////////////////////////////////////////////////////////// // High-level reading and scanning methods. ////////////////////////////////////////////////////////////////////// /** * Require whitespace characters. */ private void requireWhitespace () throws SAXException, IOException { char c = readCh (); if (isWhitespace (c)) { skipWhitespace (); } else { error ("whitespace required", c, null); } } /** * Skip whitespace characters. * <pre> * [3] S ::= (#x20 | #x9 | #xd | #xa)+ * </pre> */ private void skipWhitespace () throws SAXException, IOException { // Start with a little cheat. Most of // the time, the white space will fall // within the current read buffer; if // not, then fall through. if (USE_CHEATS) { int lineAugment = 0; int columnAugment = 0;loop: for (int i = readBufferPos; i < readBufferLength; i++) { switch (readBuffer [i]) { case ' ': case '\t': case '\r': columnAugment++; break; case '\n': lineAugment++; columnAugment = 0; break; case '%': if (expandPE) break loop; // else fall through... default: readBufferPos = i; if (lineAugment > 0) { line += lineAugment; column = columnAugment; } else { column += columnAugment; } return; } } } // OK, do it the slow way. char c = readCh (); while (isWhitespace (c)) { c = readCh (); } unread (c); } /** * Read a name or (when parsing an enumeration) name token. * <pre> * [5] Name ::= (Letter | '_' | ':') (NameChar)* * [7] Nmtoken ::= (NameChar)+ * </pre> */ private String readNmtoken (boolean isName) throws SAXException, IOException { char c; if (USE_CHEATS) {loop: for (int i = readBufferPos; i < readBufferLength; i++) { c = readBuffer [i]; switch (c) { case '%': if (expandPE) break loop; // else fall through... // What may legitimately come AFTER a name/nmtoken? case '<': case '>': case '&': case ',': case '|': case '*': case '+': case '?': case ')': case '=': case '\'': case '"': case '[': case ' ': case '\t': case '\r': case '\n': case ';': case '/': int start = readBufferPos; if (i == start) error ("name expected", readBuffer [i], null); readBufferPos = i; return intern (readBuffer, start, i - start); default:// FIXME ... per IBM's OASIS test submission, these:// ? U+06dd // REJECT// BaseChar U+0132 U+0133 U+013F U+0140 U+0149 U+017F U+01C4 U+01CC// U+01F1 U+01F3 U+0E46 U+1011 U+1104 U+1108 U+110A U+110D// U+113B U+113F U+1141 U+114D U+114F U+1151 U+1156 U+1162// U+1164 U+1166 U+116B U+116F U+1174 U+119F U+11AC U+11B6// U+11B9 U+11BB U+11C3 U+11F1 U+212F U+0587// Combining U+309B // punt on exact tests from Appendix A; approximate // them using the Unicode ID start/part rules if (i == readBufferPos && isName) { if (!Character.isUnicodeIdentifierStart (c) && c != ':' && c != '_') error ("Not a name start character, U+" + Integer.toHexString (c)); } else if (!Character.isUnicodeIdentifierPart (c) && c != '-' && c != ':' && c != '_' && c != '.' && !isExtender (c)) error ("Not a name character, U+" + Integer.toHexString (c)); } } } nameBufferPos = 0; // Read the first character.loop: while (true) { c = readCh (); switch (c) { case '%': case '<': case '>': case '&': case ',': case '|': case '*': case '+': case '?': case ')': case '=': case '\'': case '"': case '[': case ' ': case '\t': case '\n': case '\r': case ';': case '/': unread (c); if (nameBufferPos == 0) { error ("name expected"); } // punt on exact tests from Appendix A, but approximate them if (isName && !Character.isUnicodeIdentifierStart ( nameBuffer [0]) && ":_".indexOf (nameBuffer [0]) == -1) error ("Not a name start character, U+" + Integer.toHexString (nameBuffer [0])); String s = intern (nameBuffer, 0, nameBufferPos); nameBufferPos = 0; return s; default: // punt on exact tests from Appendix A, but approximate them if ((nameBufferPos != 0 || !isName) && !Character.isUnicodeIdentifierPart (c) && ":-_.".indexOf (c) == -1 && !isExtender (c)) error ("Not a name character, U+" + Integer.toHexString (c)); if (nameBufferPos >= nameBuffer.length) nameBuffer = (char[]) extendArray (nameBuffer, nameBuffer.length, nameBufferPos); nameBuffer [nameBufferPos++] = c; } } } private static boolean isExtender (char c) { // [88] Extender ::= ... return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387 || c == 0x0640 || c == 0x0e46 || c == 0x0ec6 || c == 0x3005 || (c >= 0x3031 && c <= 0x3035) || (c >= 0x309d && c <= 0x309e) || (c >= 0x30fc && c <= 0x30fe); } /** * Read a literal. With matching single or double quotes as * delimiters (and not embedded!) this is used to parse: * <pre> * [9] EntityValue ::= ... ([^%&] | PEReference | Reference)* ... * [10] AttValue ::= ... ([^<&] | Reference)* ... * [11] SystemLiteral ::= ... (URLchar - "'")* ... * [12] PubidLiteral ::= ... (PubidChar - "'")* ... * </pre> * as well as the quoted strings in XML and text declarations * (for version, encoding, and standalone) which have their * own constraints. */ private String readLiteral (int flags) throws SAXException, IOException { char delim, c; int startLine = line; boolean saved = expandPE; boolean savedReport = doReport; // Find the first delimiter. delim = readCh (); if (delim != '"' && delim != '\'') { error ("expected '\"' or \"'\"", delim, null); return null; } inLiteral = true; if ((flags & LIT_DISABLE_PE) != 0) expandPE = false; doReport = false; // Each level of input source has its own buffer; remember // ours, so we won't read the ending delimiter from any // other input source, regardless of entity processing. char ourBuf [] = readBuffer; // Read the literal. try { c = readCh ();loop: while (! (c == delim && readBuffer == ourBuf)) { switch (c) { // attributes and public ids are normalized // in almost the same ways case '\n': case '\r': if ((flags & (LIT_ATTRIBUTE | LIT_PUBID)) != 0) c = ' '; break; case '\t': if ((flags & LIT_ATTRIBUTE) != 0) c = ' '; break; case '&': c = readCh (); // Char refs are expanded immediately, except for // all the cases where it's deferred. if (c == '#') { if ((flags & LIT_DISABLE_CREF) != 0) { dataBufferAppend ('&'); break; } parseCharRef (false /* Do not do flushDataBuffer */); // exotic WFness risk: this is an entity literal, // dataBuffer [dataBufferPos - 1] == '&', and // following chars are a _partial_ entity/char ref // It looks like an entity ref ... } else { unread (c); // Expand it? if ((flags & LIT_ENTITY_REF) > 0) { parseEntityRef (false); // Is it just data? } else if ((flags & LIT_DISABLE_EREF) != 0) { dataBufferAppend ('&'); // OK, it will be an entity ref -- expanded later. } else { String name = readNmtoken (true); require (';'); dataBufferAppend ('&'); dataBufferAppend (name); dataBufferAppend (';'); } } c = readCh (); continue loop; case '<': // and why? Perhaps so "&foo;" expands the same // inside and outside an attribute? if ((flags & LIT_ATTRIBUTE) != 0) error ("attribute values may not contain '<'"); break; // We don't worry about case '%' and PE refs, readCh does. default: break; } dataBufferAppend (c); c = readCh (); } } catch (EOFException e) { error ("end of input while looking for delimiter (started on line " + startLine + ')', null, new Character (delim).toString ()); } inLiteral = false; expandPE = saved; doReport = savedReport; // Normalise whitespace if necessary. if ((flags & LIT_NORMALIZE) > 0) { dataBufferNormalize (); } // Return the value. return dataBufferToString (); } /** * Try reading external identifiers. * A system identifier is not required for notations. * @param inNotation Are we parsing a notation decl? * @param isSubset Parsing external subset decl (may be omitted)? * @return A three-member String array containing the identifiers, * or nulls. Order: public, system, baseURI. */ private String[] readExternalIds (boolean inNotation, boolean isSubset) throws Exception { char c; String ids[] = new String [3]; int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF; if (tryRead ("PUBLIC")) { requireWhitespace (); ids [0] = readLiteral (LIT_NORMALIZE | LIT_PUBID | flags); if (inNotation) { skipWhitespace (); c = readCh (); unread (c); if (c == '"' || c == '\'') { ids [1] = readLiteral (flags); } } else { requireWhitespace (); ids [1] = readLiteral (flags); } for (int i = 0; i < ids [0].length (); i++) { c = ids [0].charAt (i); if (c >= 'a' && c <= 'z') continue; if (c >= 'A' && c <= 'Z') continue; if (" \r\n0123456789-' ()+,./:=?;!*#@$_%".indexOf (c) != -1) continue; error ("illegal PUBLIC id character U+" + Integer.toHexString (c)); } } else if (tryRead ("SYSTEM")) { requireWhitespace (); ids [1] = readLiteral (flags); } else if (!isSubset) error ("missing SYSTEM or PUBLIC keyword"); if (i
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?