📄 htmlparser.java

📁 一个Web爬虫（机器人
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
12 3 下一页
/* * WebSPHINX web crawling toolkit * Copyright (C) 1998,1999 Carnegie Mellon University  *  * This library is free software; you can redistribute it * and/or modify it under the terms of the GNU Library * General Public License as published by the Free Software  * Foundation, version 2. * * WebSPHINX homepage: http://www.cs.cmu.edu/~rcm/websphinx/ */package websphinx;import java.io.InputStream;import java.io.IOException;//#ifdef JDK1.1 import java.io.Reader;import java.io.InputStreamReader;import java.io.StringReader;//#endif JDK1.1/*#ifdef JDK1.0import java.io.StringBufferInputStream;#endif JDK1.0*/import java.util.Hashtable;import java.util.Enumeration;import java.util.Vector;import java.util.Stack;import java.net.URL;import java.net.MalformedURLException;/** * HTML parser.  Parses an input stream or String and * converts it to a sequence of Tags and a tree of Elements. * HTMLParser is used by Page to parse pages. */// FIX: make HTMLParser into an interface, and// split this implementation into Tokenizer and TreeBuilderpublic class HTMLParser {    static final int BUFFER_SIZE = 10240;    int maxBytes = Integer.MAX_VALUE;    /**     * Make an HTMLParser.     */    public HTMLParser () {    }    /**     * Make an HTMLParser which retrieves pages     * using the specified download parameters.  Pages      * larger than dp.getMaxPageSize() are rejected by parse()     * with an IOException.     * @param dp download parameters used during parsing     */    public HTMLParser (DownloadParameters dp) {        this.maxBytes = dp.getMaxPageSize () * 1024;    }    /**     * Parse an input stream.     * @param page Page to receive parsed HTML     * @param input stream containing HTML     */    public void parse (Page page, InputStream stream) throws IOException {//#ifdef JDK1.1         Reader r = new InputStreamReader (stream);        tokenize (page, r, true);//#endif JDK1.1/*#ifdef JDK1.0        tokenize (page, stream, true);#endif JDK1.0*/                buildParseTree (page);    }    /**     * Parse an input stream.     * @param page Page to receive parsed HTML     * @param input stream containing HTML     *///#ifdef JDK1.1     public void parse (Page page, Reader stream) throws IOException {        tokenize (page, stream, true);        buildParseTree (page);    }//#endif JDK1.1    /**     * Parse a string.     * @param page Page to receive parsed HTML     * @param content String containing HTML      */    public void parse (Page page, String content) throws IOException {//#ifdef JDK1.1         Reader r = new StringReader (content);        tokenize (page, r, false);        r.close ();//#endif JDK1.1/*#ifdef JDK1.0        InputStream stream = new StringBufferInputStream (content);        tokenize (page, stream, false);        stream.close ();#endif JDK1.0*/        buildParseTree (page);    }    /**     * Download an input stream without parsing it.     * @param page Page to receive the downloaded content     * @param input stream containing content     */    public void dontParse (Page page, InputStream stream) throws IOException {//#ifdef JDK1.1         Reader r = new InputStreamReader (stream);        dontParse (page, r);//#endif JDK1.1/*#ifdef JDK1.0        int n;        int total = 0;        contentBuf.setLength (0);        while ((n = stream.read (buf)) != -1) {            total += n;            if (total > maxBytes) {                throw new IOException ("Page greater than " + maxBytes + " bytes");            }            contentBuf.append (new String (buf, 0, 0, n));        }                page.content = contentBuf.toString ();        page.start = 0;        page.end = contentBuf.length();#endif JDK1.0*/    }    /**     * Download an input stream without parsing it.     * @param page Page to receive the downloaded content     * @param r stream containing content     *///#ifdef JDK1.1     public void dontParse (Page page, Reader stream) throws IOException {        int n;        int total = 0;        contentBuf.setLength (0);        while ((n = stream.read (buf)) != -1) {            total += n;            if (total > maxBytes) {                throw new IOException ("Page greater than " + maxBytes + " bytes");            }            contentBuf.append (buf, 0, n);        }                page.content = contentBuf.toString ();        page.start = 0;        page.end = contentBuf.length();    }//#endif JDK1.1    /*     *  HTML tokenizer state machine     */    // state takes on one of the following values:    private static final int START = 0;    private static final int INWORD = 1;    private static final int ENTITY = 2;    private static final int LT = 4;    private static final int BANG = 5;    private static final int BANG_DASH = 6;    private static final int CMT = 7;    private static final int CMT_DASH = 8;    private static final int CMT_DASHDASH = 9;    private static final int DIRECTIVE = 10;    private static final int STAG = 11;    private static final int ETAG = 12;    private static final int ATTR = 13;    private static final int ATTRNAME = 14;    private static final int EQ = 15;    private static final int AFTEREQ = 16;    private static final int ATTRVAL = 17;    private static final int ATTRVAL_SQ = 18;    private static final int ATTRVAL_DQ = 19;    private static final int DONE = 20;    private static final int ENTNUM = 21;    private static final int ENTREF = 22;//#ifdef JDK1.1     char[] buf = new char[BUFFER_SIZE];//#endif JDK1.1/*#ifdef JDK1.0    byte[] buf = new byte[BUFFER_SIZE];#endif JDK1.0*/    StringBuffer contentBuf = new StringBuffer ();    StringBuffer wordBuf = new StringBuffer ();    StringBuffer tagName = new StringBuffer ();    StringBuffer attrName = new StringBuffer ();    StringBuffer attrVal = new StringBuffer ();    Vector attrs = new Vector ();    StringBuffer entity = new StringBuffer ();    // FIX: should entities in attr names or values be expanded?//#ifdef JDK1.1     private void tokenize (Page page, Reader stream, boolean saveContent) throws IOException {//#endif JDK1.1/*#ifdef JDK1.0    private void tokenize (Page page, InputStream stream, boolean saveContent) throws IOException {#endif JDK1.0*/        int state = START;        int bufptr = 0;        int buflen = 0;        int bufbase = 0;        // token list        Vector tokens = new Vector();        int wordStart = 0;        int nWords = 0;        Tag tag = null;        int tagStart = 0;        int entnum = 0;                StringBuffer entityTargetBuf = null;        int postEntityState = 0;        contentBuf.setLength (0);        while (true) {            if (bufptr >= buflen) {                bufptr = 0;                bufbase += buflen;                buflen = stream.read (buf);                if (buflen == -1)                    break;                if (bufbase + buflen > maxBytes) {                    throw new IOException ("Page exceeded " + maxBytes + " bytes");                }                if (saveContent)//#ifdef JDK1.1                     contentBuf.append (buf, 0, buflen);//#endif JDK1.1/*#ifdef JDK1.0                    contentBuf.append (new String (buf, 0, 0, buflen));#endif JDK1.0*/            }            char c = (char)buf[bufptr];            //System.err.println ("%% state == " + state + ", ptr == " + (bufbase+bufptr) + ", c == " + c);            switch (state) {                case START:                    // after whitespace or tag                    switch (c) {                        case '<':                            ++bufptr;                            state = LT;                            break;                        case ' ':                        case '\t':                        case '\n':                        case '\r':                            ++bufptr;                            break;                        default:                            wordBuf.setLength (0);                            wordStart = bufbase+bufptr;                            state = INWORD;                            break;                    }                    break;                case INWORD:                    // Character data                    switch (c) {                        case '<':                            tokens.addElement (new Text (page, wordStart, bufbase+bufptr, wordBuf.toString ()));                            ++nWords;                            state = START;                            break;                        case ' ':                        case '\t':                        case '\n':                        case '\r':                            tokens.addElement (new Text (page, wordStart, bufbase+bufptr, wordBuf.toString ()));                            ++nWords;                            state = START;                            ++bufptr;                            break;                        case '&':                            ++bufptr;                            postEntityState = INWORD;                            entityTargetBuf = wordBuf;                            state = ENTITY;                            break;                        default:                            wordBuf.append ((char)c);                            ++bufptr;                            // state == INWORD;                            break;                    }                    break;                //  Entities                case ENTITY:                    if (c == '#') {                        ++bufptr;                        entnum = 0;                        state = ENTNUM;                    }                    else if ((c >= 'A' && c <= 'Z')                             || (c >= 'a' && c <= 'z')) {                        entity.setLength (0);                        state = ENTREF;                    }                    else {                        entityTargetBuf.append ('&');                        state = postEntityState;                    }                    break;                case ENTREF:                    if (!Character.isLetterOrDigit(c)) {                        Character ent = lookupEntityRef (entity.toString ());                        if (ent != null) {                            entityTargetBuf.append (ent.charValue());                            if (c == ';')                                ++bufptr;                        }                        else {                            // unrecognized entity -- leave                            // as-is                            entityTargetBuf.append ('&');                            entityTargetBuf.append (entity.toString ());                        }                                                    state = postEntityState;                    }                    else {                        ++bufptr;                        entity.append (c);                        // state == ENTREF;                    }                    break;                case ENTNUM:                    if (c==';' || !Character.isDigit(c)) {                        entityTargetBuf.append ((char) entnum);                        if (c == ';')                            ++bufptr;                        state = postEntityState;                    }                    else {                        entnum = 10*entnum + (c - '0');                        ++bufptr;                    }                    break;                case LT:                    tagStart = bufbase+bufptr-1;                    switch (c) {                        case '/':                            ++bufptr;                            tagName.setLength (0);                            state = ETAG;                            break;                        case '!':                            ++bufptr;                            state = BANG;                            break;                        default:                            if (Character.isLetter (c)) {                                tagName.setLength (0);                                state = STAG;                            }                            else {                                wordBuf.append ('<');                                state = INWORD;                            }                            break;                    }                    break;                // Comments and directives.                // Implements the (broken, but easy) Netscape rule:                // <!-- starts a comment, --> closes.                // All other directives <!foo> are also returned as comments.                case BANG:                    if (c == '-') {                        ++bufptr;                        state = BANG_DASH;                    }                    else {                        state = DIRECTIVE;                    }                    break;                case BANG_DASH:                    if (c == '-') {                        ++bufptr;                        state = CMT;                    }                    else {                        state = DIRECTIVE;                    }                    break;                case CMT:                    if (c == '-') {                        ++bufptr;                        state = CMT_DASH;                    }                    else {
12 3 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -