📄 lexer.java
字号:
attributes = new Vector (); // <?xyz?> // 011112d while (!done) { ch = mPage.getCharacter (mCursor); switch (state) { case 0: // prior to the question mark switch (ch) { case '?': // <? code = mCursor.getPosition (); attributes.addElement (new PageAttribute (mPage, start + 1, code, -1, -1, (char)0)); state = 1; break; // case Page.EOF: // <\0 // case '>': // <> default: done = true; break; } break; case 1: // prior to the closing question mark switch (ch) { case Page.EOF: // <?x\0 case '>': // <?x> done = true; break; case '\'': case '"':// <?..." state = ch; break; case '?': // <?...? state = 2; break; default: // <?...x break; } break; case 2: switch (ch) { case Page.EOF: // <?x..?\0 done = true; break; case '>': state = 3; done = true; break; default: // <?...?x state = 1; break; } break; case '"': switch (ch) { case Page.EOF: // <?x.."\0 done = true; break; case '"': state = 1; break; default: // <?...'.x break; } break; case '\'': switch (ch) { case Page.EOF: // <?x..'\0 done = true; break; case '\'': state = 1; break; default: // <?..."..x break; } break; default: throw new IllegalStateException ("how the fuck did we get in state " + state); } } if (3 == state) // normal exit { if (0 != code) { state = mCursor.getPosition () - 2; // reuse state attributes.addElement (new PageAttribute (mPage, code, state, -1, -1, (char)0)); attributes.addElement (new PageAttribute (mPage, state, state + 1, -1, -1, (char)0)); } else throw new IllegalStateException ("processing instruction with no content"); } else return (parseString (start, true)); // hmmm, true? return (makeTag (start, mCursor.getPosition (), attributes)); } /** * Return CDATA as a text node. 
* According to appendix <a href="http://www.w3.org/TR/html4/appendix/notes.html#notes-specifying-data"> * B.3.2 Specifying non-HTML data</a> of the * <a href="http://www.w3.org/TR/html4/">HTML 4.01 Specification</a>:<br> * <quote> * <b>Element content</b><br> * When script or style data is the content of an element (SCRIPT and STYLE), * the data begins immediately after the element start tag and ends at the * first ETAGO ("</") delimiter followed by a name start character ([a-zA-Z]); * note that this may not be the element's end tag. * Authors should therefore escape "</" within the content. Escape mechanisms * are specific to each scripting or style sheet language. * </quote> * @return The <code>TextNode</code> of the CDATA or <code>null</code> if none. * @exception ParserException If a problem occurs reading from the source. */ public Node parseCDATA () throws ParserException { return (parseCDATA (false)); } /** * Return CDATA as a text node. * Slightly less rigid than {@link #parseCDATA()} this method provides for * parsing CDATA that may contain quoted strings that have embedded * ETAGO ("</") delimiters and skips single and multiline comments. * @param quotesmart If <code>true</code> the strict definition of CDATA is * extended to allow for single or double quoted ETAGO ("</") sequences. * @return The <code>TextNode</code> of the CDATA or <code>null</code> if none. * @see #parseCDATA() * @exception ParserException If a problem occurs reading from the source. 
*/ public Node parseCDATA (boolean quotesmart) throws ParserException { int start; int state; boolean done; char quote; char ch; int end; boolean comment; start = mCursor.getPosition (); state = 0; done = false; quote = 0; comment = false; while (!done) { ch = mPage.getCharacter (mCursor); switch (state) { case 0: // prior to ETAGO switch (ch) { case Page.EOF: done = true; break; case '\'': if (quotesmart && !comment) if (0 == quote) quote = '\''; // enter quoted state else if ('\'' == quote) quote = 0; // exit quoted state break; case '"': if (quotesmart && !comment) if (0 == quote) quote = '"'; // enter quoted state else if ('"' == quote) quote = 0; // exit quoted state break; case '\\': if (quotesmart) if (0 != quote) { ch = mPage.getCharacter (mCursor); // try to consume escaped character if (Page.EOF == ch) done = true; else if ( (ch != '\\') && (ch != quote)) mCursor.retreat (); // unconsume char if character was not an escapable char. } break; case '/': if (quotesmart) if (0 == quote) { // handle multiline and double slash comments (with a quote) ch = mPage.getCharacter (mCursor); if (Page.EOF == ch) done = true; else if ('/' == ch) comment = true; else if ('*' == ch) { do { do ch = mPage.getCharacter (mCursor); while ((Page.EOF != ch) && ('*' != ch)); ch = mPage.getCharacter (mCursor); if (ch == '*') mCursor.retreat (); } while ((Page.EOF != ch) && ('/' != ch)); } else mCursor.retreat (); } break; case '\n': comment = false; break; case '<': if (quotesmart) { if (0 == quote) state = 1; } else state = 1; break; default: break; } break; case 1: // < switch (ch) { case Page.EOF: done = true; break; case '/': state = 2; break; case '!': ch = mPage.getCharacter (mCursor); if (Page.EOF == ch) done = true; else if ('-' == ch) { ch = mPage.getCharacter (mCursor); if (Page.EOF == ch) done = true; else if ('-' == ch) state = 3; else state = 0; } else state = 0; break; default: state = 0; break; } break; case 2: // </ comment = false; if (Page.EOF == ch) done = true; 
else if (Character.isLetter (ch)) { done = true; // back up to the start of ETAGO mCursor.retreat (); mCursor.retreat (); mCursor.retreat (); } else state = 0; break; case 3: // <! comment = false; if (Page.EOF == ch) done = true; else if ('-' == ch) { ch = mPage.getCharacter (mCursor); if (Page.EOF == ch) done = true; else if ('-' == ch) { ch = mPage.getCharacter (mCursor); if (Page.EOF == ch) done = true; else if ('>' == ch) state = 0; else { mCursor.retreat (); mCursor.retreat (); } } else mCursor.retreat (); } break; default: throw new IllegalStateException ("how the fuck did we get in state " + state); } } end = mCursor.getPosition (); return (makeString (start, end)); } // // NodeFactory interface // /** * Create a new string node. * @param page The page the node is on. * @param start The beginning position of the string. * @param end The ending positiong of the string. * @return The created Text node. */ public Text createStringNode (Page page, int start, int end) { return (new TextNode (page, start, end)); } /** * Create a new remark node. * @param page The page the node is on. * @param start The beginning position of the remark. * @param end The ending positiong of the remark. * @return The created Remark node. */ public Remark createRemarkNode (Page page, int start, int end) { return (new RemarkNode (page, start, end)); } /** * Create a new tag node. * Note that the attributes vector contains at least one element, * which is the tag name (standalone attribute) at position zero. * This can be used to decide which type of node to create, or * gate other processing that may be appropriate. * @param page The page the node is on. * @param start The beginning position of the tag. * @param end The ending positiong of the tag. * @param attributes The attributes contained in this tag. * @return The created Tag node. 
*/
    public Tag createTagNode(Page page, int start, int end, Vector attributes) {
        Tag created = new TagNode(page, start, end, attributes);
        return created;
    }

    /**
     * Mainline for command line operation.
     * Prints a usage line when no argument is given; otherwise opens the URL
     * in <code>args[0]</code> and prints each node the lexer returns. A
     * <code>ParserException</code> raised while doing so is reported on
     * standard output rather than propagated.
     * @param args [0] The URL to parse.
     * @exception MalformedURLException If the provided URL cannot be resolved.
     * @exception ParserException If the parse fails.
     */
    public static void main(String[] args) throws MalformedURLException, ParserException {
        if (args.length <= 0) {
            System.out.println("usage: java -jar htmllexer.jar <url>");
            return;
        }

        try {
            ConnectionManager connections = Page.getConnectionManager();
            Lexer lexer = new Lexer(connections.openConnection(args[0]));
            // nextNode returns null once the input is exhausted
            for (Node current = lexer.nextNode(false); current != null; current = lexer.nextNode(false)) {
                System.out.println(current.toString());
            }
        } catch (ParserException pe) {
            System.out.println(pe.getMessage());
            // also report the wrapped cause, when there is one
            if (pe.getThrowable() != null) {
                System.out.println(pe.getThrowable().getMessage());
            }
        }
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -