📄 HTMLParser.java
Font size:
/* * WebSphinx web-crawling toolkit * * Copyright (c) 1998-2002 Carnegie Mellon University. All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * */package websphinx;import java.io.InputStream;import java.io.IOException;//#ifdef JDK1.1 import java.io.Reader;import java.io.InputStreamReader;import java.io.StringReader;//#endif JDK1.1/*#ifdef JDK1.0import java.io.StringBufferInputStream;#endif JDK1.0*/import java.io.ByteArrayOutputStream;import java.util.Hashtable;import java.util.Enumeration;import java.util.Vector;import java.util.Stack;import java.net.URL;import java.net.MalformedURLException;/** * HTML parser. Parses an input stream or String and * converts it to a sequence of Tags and a tree of Elements. 
* HTMLParser is used by Page to parse pages. */// FIX: make HTMLParser into an interface, and// split this implementation into Tokenizer and TreeBuilderpublic class HTMLParser { // parameter for HTML type detection. // If the parser doesn't encounter at least one HTML tag // in the first VALID_HTML_PREFIX bytes of the stream, then parser // concludes that the stream isn't HTML and stops parsing it. static final int VALID_HTML_PREFIX = 10000; int maxBytes = Integer.MAX_VALUE; /** * Make an HTMLParser. */ public HTMLParser () { } /** * Parse a page as HTML. * @param page Page to parse */ public void parse (Page page) throws IOException { tokenize (page); buildParseTree (page); } /* * HTML tokenizer state machine */ // state takes on one of the following values: private static final int START = 0; private static final int INWORD = 1; private static final int ENTITY = 2; private static final int LT = 4; private static final int BANG = 5; private static final int BANG_DASH = 6; private static final int CMT = 7; private static final int CMT_DASH = 8; private static final int CMT_DASHDASH = 9; private static final int DIRECTIVE = 10; private static final int STAG = 11; private static final int ETAG = 12; private static final int ATTR = 13; private static final int ATTRNAME = 14; private static final int EQ = 15; private static final int AFTEREQ = 16; private static final int ATTRVAL = 17; private static final int ATTRVAL_SQ = 18; private static final int ATTRVAL_DQ = 19; private static final int DONE = 20; private static final int ENTNUM = 21; private static final int ENTREF = 22; StringBuffer wordBuf = new StringBuffer (); StringBuffer tagName = new StringBuffer (); StringBuffer attrName = new StringBuffer (); StringBuffer attrVal = new StringBuffer (); Vector attrs = new Vector (); StringBuffer entity = new StringBuffer (); // FIX: should entities in attr names or values be expanded? 
private void tokenize (Page page) throws IOException { int state = START; String content = page.getContent (); int buflen = content.length (); int bufptr = 0; int bufbase = 0; // token list Vector tokens = new Vector(); int wordStart = 0; int nWords = 0; Tag tag = null; int tagStart = 0; int entnum = 0; StringBuffer entityTargetBuf = null; int postEntityState = 0; boolean isHTML = "text/html".equals (page.getContentType ()); while (bufptr < buflen) { if (!isHTML && bufptr >= VALID_HTML_PREFIX) // we didn't see any HTML tags in the first // VALID_HTML_PREFIX bytes, // so assume the document isn't HTML and stop parsing it. return; char c = content.charAt (bufptr); //System.err.println ("%% state == " + state + ", ptr == " + (bufbase+bufptr) + ", c == " + c); switch (state) { case START: // after whitespace or tag switch (c) { case '<': ++bufptr; state = LT; break; case ' ': case '\t': case '\n': case '\r': ++bufptr; break; default: wordBuf.setLength (0); wordStart = bufbase+bufptr; state = INWORD; break; } break; case INWORD: // Character data switch (c) { case '<': tokens.addElement (new Text (page, wordStart, bufbase+bufptr, wordBuf.toString ())); ++nWords; state = START; break; case ' ': case '\t': case '\n': case '\r': tokens.addElement (new Text (page, wordStart, bufbase+bufptr, wordBuf.toString ())); ++nWords; state = START; ++bufptr; break; case '&': ++bufptr; postEntityState = INWORD; entityTargetBuf = wordBuf; state = ENTITY; break; default: wordBuf.append ((char)c); ++bufptr; // state == INWORD; break; } break; // Entities case ENTITY: if (c == '#') { ++bufptr; entnum = 0; state = ENTNUM; } else if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) { entity.setLength (0); state = ENTREF; } else { entityTargetBuf.append ('&'); state = postEntityState; } break; case ENTREF: if (!Character.isLetterOrDigit(c)) { Character ent = lookupEntityRef (entity.toString ()); if (ent != null) { entityTargetBuf.append (ent.charValue()); if (c == ';') ++bufptr; } else { // 
unrecognized entity -- leave // as-is entityTargetBuf.append ('&'); entityTargetBuf.append (entity.toString ()); } state = postEntityState; } else { ++bufptr; entity.append (c); // state == ENTREF; } break; case ENTNUM: if (c==';' || !Character.isDigit(c)) { entityTargetBuf.append ((char) entnum); if (c == ';') ++bufptr; state = postEntityState; } else { entnum = 10*entnum + (c - '0'); ++bufptr; } break; case LT: tagStart = bufbase+bufptr-1; switch (c) { case '/': ++bufptr; tagName.setLength (0); state = ETAG; break; case '!': ++bufptr; state = BANG; break; default: if (Character.isLetter (c)) { tagName.setLength (0); state = STAG; } else { wordBuf.append ('<'); state = INWORD; } break; } break; // Comments and directives. // Implements the (broken, but easy) Netscape rule: // <!-- starts a comment, --> closes. // All other directives <!foo> are also returned as comments. case BANG: if (c == '-') { ++bufptr; state = BANG_DASH; } else { state = DIRECTIVE; } break; case BANG_DASH: if (c == '-') { ++bufptr; state = CMT; } else { state = DIRECTIVE; } break; case CMT: if (c == '-') { ++bufptr; state = CMT_DASH; } else { ++bufptr; } break; case CMT_DASH: if (c == '-') { ++bufptr; state = CMT_DASHDASH; } else { ++bufptr; state = CMT; } break; case CMT_DASHDASH: if (c == '>') { ++bufptr; tag = new Tag (page, tagStart, bufbase+bufptr, Tag.COMMENT, true); tokens.addElement (tag); state = START; } else if (c == '-') { ++bufptr; state = CMT_DASHDASH; } else { ++bufptr; state = CMT; } break; case DIRECTIVE: if (c == '>') { ++bufptr; tag = new Tag (page, tagStart, bufbase+bufptr, Tag.COMMENT, true); tokens.addElement (tag); state = START; } else { ++bufptr; } break; // Tags case STAG: if (c == '>' || isWhitespace(c)) { tag = new Tag (page, tagStart, bufbase+bufptr, // tag doesn't really end here // -- we'll fix it up when we actually see it tagName.toString (), true); tokens.addElement (tag); attrs.setSize (0); state = ATTR; isHTML = true; } else { tagName.append (c); ++bufptr; 
// state == STAG; } break; case ETAG: if (c == '>') { ++bufptr; tag = new Tag (page, tagStart, bufbase+bufptr, tagName.toString (), false); tokens.addElement (tag); state = START; }
⌨️ Keyboard shortcuts
Copy code
Ctrl + C
Search code
Ctrl + F
Full-screen mode
F11
Toggle theme
Ctrl + Shift + D
Show shortcuts
?
Increase font size
Ctrl + =
Decrease font size
Ctrl + -