📄 lexer.java
字号:
// got some characters ret = getNodeFactory ().createStringNode ( this.getPage (), start, end); else ret = null; return (ret); } /** * Generate a whitespace 'attribute', * @param attributes The list so far. * @param bookmarks The array of positions. */ private void whitespace (Vector attributes, int[] bookmarks) { if (bookmarks[1] > bookmarks[0]) attributes.addElement (new PageAttribute ( mPage, -1, -1, bookmarks[0], bookmarks[1], (char)0)); } /** * Generate a standalone attribute -- font. * @param attributes The list so far. * @param bookmarks The array of positions. */ private void standalone (Vector attributes, int[] bookmarks) { attributes.addElement (new PageAttribute ( mPage, bookmarks[1], bookmarks[2], -1, -1, (char)0)); } /** * Generate an empty attribute -- color=. * @param attributes The list so far. * @param bookmarks The array of positions. */ private void empty (Vector attributes, int[] bookmarks) { attributes.addElement (new PageAttribute ( mPage, bookmarks[1], bookmarks[2], bookmarks[2] + 1, -1, (char)0)); } /** * Generate an unquoted attribute -- size=1. * @param attributes The list so far. * @param bookmarks The array of positions. */ private void naked (Vector attributes, int[] bookmarks) { attributes.addElement (new PageAttribute ( mPage, bookmarks[1], bookmarks[2], bookmarks[3], bookmarks[4], (char)0)); } /** * Generate an single quoted attribute -- width='100%'. * @param attributes The list so far. * @param bookmarks The array of positions. */ private void single_quote (Vector attributes, int[] bookmarks) { attributes.addElement (new PageAttribute ( mPage, bookmarks[1], bookmarks[2], bookmarks[4] + 1, bookmarks[5], '\'')); } /** * Generate an double quoted attribute -- CONTENT="Test Development". * @param attributes The list so far. * @param bookmarks The array of positions. */ private void double_quote (Vector attributes, int[] bookmarks) { attributes.addElement (new PageAttribute ( mPage, bookmarks[1], bookmarks[2], bookmarks[5] + 1, bookmarks[6], '"')); } /** * Parse a tag. * Parse the name and attributes from a start tag.<p> * From the <a href="http://www.w3.org/TR/html4/intro/sgmltut.html#h-3.2.2"> * HTML 4.01 Specification, W3C Recommendation 24 December 1999</a> * http://www.w3.org/TR/html4/intro/sgmltut.html#h-3.2.2<p> * <cite> * 3.2.2 Attributes<p> * Elements may have associated properties, called attributes, which may * have values (by default, or set by authors or scripts). Attribute/value * pairs appear before the final ">" of an element's start tag. Any number * of (legal) attribute value pairs, separated by spaces, may appear in an * element's start tag. They may appear in any order.<p> * In this example, the id attribute is set for an H1 element: * <code> * <H1 id="section1"> * </code> * This is an identified heading thanks to the id attribute * <code> * </H1> * </code> * By default, SGML requires that all attribute values be delimited using * either double quotation marks (ASCII decimal 34) or single quotation * marks (ASCII decimal 39). Single quote marks can be included within the * attribute value when the value is delimited by double quote marks, and * vice versa. Authors may also use numeric character references to * represent double quotes (&#34;) and single quotes (&#39;). * For doublequotes authors can also use the character entity reference * &quot;.<p> * In certain cases, authors may specify the value of an attribute without * any quotation marks. The attribute value may only contain letters * (a-z and A-Z), digits (0-9), hyphens (ASCII decimal 45), * periods (ASCII decimal 46), underscores (ASCII decimal 95), * and colons (ASCII decimal 58). We recommend using quotation marks even * when it is possible to eliminate them.<p> * Attribute names are always case-insensitive.<p> * Attribute values are generally case-insensitive. The definition of each * attribute in the reference manual indicates whether its value is * case-insensitive.<p> * All the attributes defined by this specification are listed in the * attribute index.<p> * </cite> * <p> * This method uses a state machine with the following states: * <ol> * <li>state 0 - outside of any attribute</li> * <li>state 1 - within attributre name</li> * <li>state 2 - equals hit</li> * <li>state 3 - within naked attribute value.</li> * <li>state 4 - within single quoted attribute value</li> * <li>state 5 - within double quoted attribute value</li> * <li>state 6 - whitespaces after attribute name could lead to state 2 (=)or state 0</li> * </ol> * <p> * The starting point for the various components is stored in an array * of integers that match the initiation point for the states one-for-one, * i.e. bookmarks[0] is where state 0 began, bookmarks[1] is where state 1 * began, etc. * Attributes are stored in a <code>Vector</code> having * one slot for each whitespace or attribute/value pair. * The first slot is for attribute name (kind of like a standalone attribute). * @param start The position at which to start scanning. * @return The parsed tag. * @exception ParserException If a problem occurs reading from the source. */ protected Node parseTag (int start) throws ParserException { boolean done; char ch; int state; int[] bookmarks; Vector attributes; done = false; attributes = new Vector (); state = 0; bookmarks = new int[8]; bookmarks[0] = mCursor.getPosition (); while (!done) { bookmarks[state + 1] = mCursor.getPosition (); ch = mPage.getCharacter (mCursor); switch (state) { case 0: // outside of any attribute if ((Page.EOF == ch) || ('>' == ch) || ('<' == ch)) { if ('<' == ch) { // don't consume the opening angle mPage.ungetCharacter (mCursor); bookmarks[state + 1] = mCursor.getPosition (); } whitespace (attributes, bookmarks); done = true; } else if (!Character.isWhitespace (ch)) { whitespace (attributes, bookmarks); state = 1; } break; case 1: // within attribute name if ((Page.EOF == ch) || ('>' == ch) || ('<' == ch)) { if ('<' == ch) { // don't consume the opening angle mPage.ungetCharacter (mCursor); bookmarks[state + 1] = mCursor.getPosition (); } standalone (attributes, bookmarks); done = true; } else if (Character.isWhitespace (ch)) { // whitespaces might be followed by next attribute or an equal sign // see Bug #891058 Bug in lexer. bookmarks[6] = bookmarks[2]; // setting the bookmark[0] is done in state 6 if applicable state = 6; } else if ('=' == ch) state = 2; break; case 2: // equals hit if ((Page.EOF == ch) || ('>' == ch)) { empty (attributes, bookmarks); done = true; } else if ('\'' == ch) { state = 4; bookmarks[4] = bookmarks[3]; } else if ('"' == ch) { state = 5; bookmarks[5] = bookmarks[3]; } else if (Character.isWhitespace (ch)) { // collect white spaces after "=" into the assignment string; // do nothing // see Bug #891058 Bug in lexer. } else state = 3; break; case 3: // within naked attribute value if ((Page.EOF == ch) || ('>' == ch)) { naked (attributes, bookmarks); done = true; } else if (Character.isWhitespace (ch)) { naked (attributes, bookmarks); bookmarks[0] = bookmarks[4]; state = 0; } break; case 4: // within single quoted attribute value if (Page.EOF == ch) { single_quote (attributes, bookmarks); done = true; // complain? } else if ('\'' == ch) { single_quote (attributes, bookmarks); bookmarks[0] = bookmarks[5] + 1; state = 0; } break; case 5: // within double quoted attribute value if (Page.EOF == ch) { double_quote (attributes, bookmarks); done = true; // complain? } else if ('"' == ch) { double_quote (attributes, bookmarks); bookmarks[0] = bookmarks[6] + 1; state = 0; } break; // patch for lexer state correction by // Gernot Fricke // See Bug # 891058 Bug in lexer. case 6: // undecided for state 0 or 2 // we have read white spaces after an attributte name if (Page.EOF == ch) { // same as last else clause standalone (attributes, bookmarks); bookmarks[0]=bookmarks[6]; mPage.ungetCharacter (mCursor); state=0; } else if (Character.isWhitespace (ch)) { // proceed } else if ('=' == ch) // yepp. the white spaces belonged to the equal. { bookmarks[2] = bookmarks[6]; bookmarks[3] = bookmarks[7]; state=2; } else { // white spaces were not ended by equal // meaning the attribute was a stand alone attribute // now: create the stand alone attribute and rewind // the cursor to the end of the white spaces // and restart scanning as whitespace attribute. standalone (attributes, bookmarks); bookmarks[0]=bookmarks[6]; mPage.ungetCharacter (mCursor); state=0; } break; default: throw new IllegalStateException ("how the fuck did we get in state " + state); } } return (makeTag (start, mCursor.getPosition (), attributes)); } /** * Create a tag node based on the current cursor and the one provided. * @param start The starting point of the node. * @param end The ending point of the node. * @param attributes The attributes parsed from the tag. * @exception ParserException If the nodefactory creation of the tag node fails. * @return The new Tag node. */ protected Node makeTag (int start, int end, Vector attributes) throws ParserException { int length; Node ret; length = end - start; if (0 != length) { // return tag based on second character, '/', '%', Letter (ch), '!' if (2 > length) // this is an error return (makeString (start, end)); ret = getNodeFactory ().createTagNode (this.getPage (), start, end, attributes); } else ret = null; return (ret); } /** * Parse a comment. * Parse a remark markup.<p> * From the <a href="http://www.w3.org/TR/html4/intro/sgmltut.html#h-3.2.4"> * HTML 4.01 Specification, W3C Recommendation 24 December 1999</a> * http://www.w3.org/TR/html4/intro/sgmltut.html#h-3.2.4<p> * <cite> * 3.2.4 Comments<p> * HTML comments have the following syntax:<p> * <code> * <!-- this is a comment --><p> * <!-- and so is this one,<p> * which occupies more than one line --><p> * </code> * White space is not permitted between the markup declaration * open delimiter("<!") and the comment open delimiter ("--"), * but is permitted between the comment close delimiter ("--") and * the markup declaration close delimiter (">"). * A common error is to include a string of hyphens ("---") within a comment. * Authors should avoid putting two or more adjacent hyphens inside comments. * Information that appears between comments has no special meaning * (e.g., character references are not interpreted as such). * Note that comments are markup.<p> * </cite> * <p> * This method uses a state machine with the following states: * <ol> * <li>state 0 - prior to the first open delimiter (first dash)</li> * <li>state 1 - prior to the second open delimiter (second dash)</li> * <li>state 2 - prior to the first closing delimiter (first dash)</li> * <li>state 3 - prior to the second closing delimiter (second dash)</li> * <li>state 4 - prior to the terminating ></li> * </ol> * <p> * All comment text (everything excluding the < and >), is included * in the remark text. * We allow terminators like --!> even though this isn't part of the spec. * @param start The position at which to start scanning. * @param quotesmart If <code>true</code>, strings ignore quoted contents. * @return The parsed node. * @exception ParserException If a problem occurs reading from the source. */ protected Node parseRemark (int start, boolean quotesmart) throws ParserException { boolean done; char ch; int state; done = false; state = 0; while (!done) { ch = mPage.getCharacter (mCursor); if (Page.EOF == ch) done = true; else switch (state) { case 0: // prior to the first open delimiter if ('>' == ch) done = true; if ('-' == ch) state = 1; else return (parseString (start, quotesmart)); break; case 1: // prior to the second open delimiter if ('-' == ch) { // handle <!--> because netscape does ch = mPage.getCharacter (mCursor); if (Page.EOF == ch) done = true; else if ('>' == ch) done = true; else { mPage.ungetCharacter (mCursor);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -