📄 HTMLParser.java
Font size:
/* * WebSphinx web-crawling toolkit * * Copyright (c) 1998-2002 Carnegie Mellon University. All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * */package websphinx;import java.io.InputStream;import java.io.IOException;//#ifdef JDK1.1 import java.io.Reader;import java.io.InputStreamReader;import java.io.StringReader;//#endif JDK1.1/*#ifdef JDK1.0import java.io.StringBufferInputStream;#endif JDK1.0*/import java.io.ByteArrayOutputStream;import java.util.Hashtable;import java.util.Enumeration;import java.util.Vector;import java.util.Stack;import java.net.URL;import java.net.MalformedURLException;/** * HTML parser. Parses an input stream or String and * converts it to a sequence of Tags and a tree of Elements. 
* HTMLParser is used by Page to parse pages. */// FIX: make HTMLParser into an interface, and// split this implementation into Tokenizer and TreeBuilderpublic class HTMLParser { // parameter for HTML type detection. // If the parser doesn't encounter at least one HTML tag // in the first VALID_HTML_PREFIX bytes of the stream, then parser // concludes that the stream isn't HTML and stops parsing it. static final int VALID_HTML_PREFIX = 10000; int maxBytes = Integer.MAX_VALUE; /** * Make an HTMLParser. */ public HTMLParser () { } /** * Parse a page as HTML. * @param page Page to parse */ public void parse (Page page) throws IOException { tokenize (page); buildParseTree (page); } /* * HTML tokenizer state machine */ // state takes on one of the following values: private static final int START = 0; private static final int INWORD = 1; private static final int ENTITY = 2; private static final int LT = 4; private static final int BANG = 5; private static final int BANG_DASH = 6; private static final int CMT = 7; private static final int CMT_DASH = 8; private static final int CMT_DASHDASH = 9; private static final int DIRECTIVE = 10; private static final int STAG = 11; private static final int ETAG = 12; private static final int ATTR = 13; private static final int ATTRNAME = 14; private static final int EQ = 15; private static final int AFTEREQ = 16; private static final int ATTRVAL = 17; private static final int ATTRVAL_SQ = 18; private static final int ATTRVAL_DQ = 19; private static final int DONE = 20; private static final int ENTNUM = 21; private static final int ENTREF = 22; StringBuffer wordBuf = new StringBuffer (); StringBuffer tagName = new StringBuffer (); StringBuffer attrName = new StringBuffer (); StringBuffer attrVal = new StringBuffer (); Vector attrs = new Vector (); StringBuffer entity = new StringBuffer (); // FIX: should entities in attr names or values be expanded? 
private void tokenize (Page page) throws IOException { int state = START; String content = page.getContent (); int buflen = content.length (); int bufptr = 0; int bufbase = 0; // token list Vector tokens = new Vector(); int wordStart = 0; int nWords = 0; Tag tag = null; int tagStart = 0; int entnum = 0; StringBuffer entityTargetBuf = null; int postEntityState = 0; boolean isHTML = "text/html".equals (page.getContentType ()); while (bufptr < buflen) { if (!isHTML && bufptr >= VALID_HTML_PREFIX) // we didn't see any HTML tags in the first // VALID_HTML_PREFIX bytes, // so assume the document isn't HTML and stop parsing it. return; char c = content.charAt (bufptr); //System.err.println ("%% state == " + state + ", ptr == " + (bufbase+bufptr) + ", c == " + c); switch (state) { case START: // after whitespace or tag switch (c) { case '<': ++bufptr; state = LT; break; case ' ': case '\t': case '\n': case '\r': ++bufptr; break; default: wordBuf.setLength (0); wordStart = bufbase+bufptr; state = INWORD; break; } break; case INWORD: // Character data switch (c) { case '<': tokens.addElement (new Text (page, wordStart, bufbase+bufptr, wordBuf.toString ())); ++nWords; state = START; break; case ' ': case '\t': case '\n': case '\r': tokens.addElement (new Text (page, wordStart, bufbase+bufptr, wordBuf.toString ())); ++nWords; state = START; ++bufptr; break; case '&': ++bufptr; postEntityState = INWORD; entityTargetBuf = wordBuf; state = ENTITY; break; default: wordBuf.append ((char)c); ++bufptr; // state == INWORD; break; } break; // Entities case ENTITY: if (c == '#') { ++bufptr; entnum = 0; state = ENTNUM; } else if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) { entity.setLength (0); state = ENTREF; } else { entityTargetBuf.append ('&'); state = postEntityState; } break; case ENTREF: if (!Character.isLetterOrDigit(c)) { Character ent = lookupEntityRef (entity.toString ()); if (ent != null) { entityTargetBuf.append (ent.charValue()); if (c == ';') ++bufptr; } else { // 
unrecognized entity -- leave // as-is entityTargetBuf.append ('&'); entityTargetBuf.append (entity.toString ()); } state = postEntityState; } else { ++bufptr; entity.append (c); // state == ENTREF; } break; case ENTNUM: if (c==';' || !Character.isDigit(c)) { entityTargetBuf.append ((char) entnum); if (c == ';') ++bufptr; state = postEntityState; } else { entnum = 10*entnum + (c - '0'); ++bufptr; } break; case LT: tagStart = bufbase+bufptr-1; switch (c) { case '/': ++bufptr; tagName.setLength (0); state = ETAG; break; case '!': ++bufptr; state = BANG; break; default: if (Character.isLetter (c)) { tagName.setLength (0); state = STAG; } else { wordBuf.append ('<'); state = INWORD; } break; } break; // Comments and directives. // Implements the (broken, but easy) Netscape rule: // <!-- starts a comment, --> closes. // All other directives <!foo> are also returned as comments. case BANG: if (c == '-') { ++bufptr; state = BANG_DASH; } else { state = DIRECTIVE; } break; case BANG_DASH: if (c == '-') { ++bufptr; state = CMT; } else { state = DIRECTIVE; } break; case CMT: if (c == '-') { ++bufptr; state = CMT_DASH; } else { ++bufptr; } break; case CMT_DASH: if (c == '-') { ++bufptr; state = CMT_DASHDASH; } else { ++bufptr; state = CMT; } break; case CMT_DASHDASH: if (c == '>') { ++bufptr; tag = new Tag (page, tagStart, bufbase+bufptr, Tag.COMMENT, true); tokens.addElement (tag); state = START; } else if (c == '-') { ++bufptr; state = CMT_DASHDASH; } else { ++bufptr; state = CMT; } break; case DIRECTIVE: if (c == '>') { ++bufptr; tag = new Tag (page, tagStart, bufbase+bufptr, Tag.COMMENT, true); tokens.addElement (tag); state = START; } else { ++bufptr; } break; // Tags case STAG: if (c == '>' || isWhitespace(c)) { tag = new Tag (page, tagStart, bufbase+bufptr, // tag doesn't really end here // -- we'll fix it up when we actually see it tagName.toString (), true); tokens.addElement (tag); attrs.setSize (0); state = ATTR; isHTML = true; } else { tagName.append (c); ++bufptr; 
// state == STAG; } break; case ETAG: if (c == '>') { ++bufptr; tag = new Tag (page, tagStart, bufbase+bufptr, tagName.toString (), false); tokens.addElement (tag); state = START; }
⌨️ Keyboard shortcuts
Copy code
Ctrl + C
Search code
Ctrl + F
Full-screen mode
F11
Toggle theme
Ctrl + Shift + D
Show shortcuts
?
Increase font size
Ctrl + =
Decrease font size
Ctrl + -