⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 htmlparser.java

📁 一个Web爬虫(机器人
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
          .add (Tag.H1)          .add (Tag.H2)          .add (Tag.H3)          .add (Tag.H4)          .add (Tag.H5)          .add (Tag.H6)          .add (Tag.ADDRESS)          ;    // maps elements which force closure to the elements that they close, e.g.,    // LI maps to LI, DT maps to DD,DT, and all block-level tags map to P.    private static Hashtable forcesClosed = new Hashtable2 ()          .add (Tag.DD, new Hashtable2 () .add (Tag.DD) .add (Tag.DT))          .add (Tag.DT, new Hashtable2 () .add (Tag.DD) .add (Tag.DT))          .add (Tag.LI, new Hashtable2 () .add (Tag.LI))          .add (Tag.OPTION, new Hashtable2 () .add (Tag.OPTION))          .add (Tag.TR, new Hashtable2 () .add (Tag.TR))          .add (Tag.TD, new Hashtable2 () .add (Tag.TD) .add (Tag.TH))          .add (Tag.TH, new Hashtable2 () .add (Tag.TD) .add (Tag.TH))          ;    static {        Hashtable p = new Hashtable2 () .add (Tag.P);        Enumeration enum = blocktag.keys ();        while (enum.hasMoreElements ())            union (forcesClosed, enum.nextElement(), p);    }    // union of forcesClosed plus the tag's possible containers.  For instance,    // LI maps to LI, OL, UL, MENU, DIR.  When a forcesClosed tag like LI is    // encountered, the parser looks upward for the first context tag.    // Having the tag's container element included in the search ensures that    // LI in a nested list won't close its parent LI.    
static Hashtable context = new Hashtable2 ()
          .add (Tag.DD, new Hashtable2 () .add (Tag.DL))
          .add (Tag.DT, new Hashtable2 () .add (Tag.DL))
          .add (Tag.LI, new Hashtable2 () .add (Tag.OL) .add (Tag.UL) .add (Tag.MENU) .add (Tag.DIR))
          .add (Tag.OPTION, new Hashtable2 () .add (Tag.SELECT))
          .add (Tag.TR, new Hashtable2 () .add (Tag.TABLE))
          .add (Tag.TD, new Hashtable2 () .add (Tag.TR) .add (Tag.TABLE))
          .add (Tag.TH, new Hashtable2 () .add (Tag.TR) .add (Tag.TABLE))
          ;

    // Merges each tag's forcesClosed set into its context set, so context
    // ends up with an entry for every key of forcesClosed.
    static {
        // "enum" has been a reserved word since Java 5, so the enumeration
        // needs a different name for this file to compile at all
        Enumeration tagNames = forcesClosed.keys ();
        while (tagNames.hasMoreElements ()) {
            Object tagname = tagNames.nextElement ();
            union (context, tagname, (Hashtable)forcesClosed.get (tagname));
        }
    }

    // NIY: handle literal and semi-literal elements (XMP, LISTING, TEXTAREA, OPTION)
    // elements whose content should be treated as plain text
    static Hashtable literal = new Hashtable2 ()
            ;

    // maps link elements to their URL attribute (e.g., A maps to HREF)
    static Hashtable linktag = new Hashtable2 ()
            .add (Tag.A, "href")
            .add (Tag.AREA, "href")
            .add (Tag.APPLET, "code")
            .add (Tag.EMBED, "src")
            .add (Tag.FRAME, "src")
            .add (Tag.FORM, "action")
            .add (Tag.IMG, "src")
            .add (Tag.LINK, "href")
            .add (Tag.SCRIPT, "src")
            ;

    // elements whose text contents are crucial to the crawler
    static Hashtable savetext = new Hashtable2 ()
            .add (Tag.A)
            .add (Tag.TITLE);

    // elements found in <HEAD>
    static Hashtable headtag = new Hashtable2 ()
          .add (Tag.META)
          .add (Tag.TITLE)
          .add (Tag.BASE)
          .add (Tag.LINK)
          .add (Tag.ISINDEX)
          ;

    // Merges tagset into the set stored in map under tagname, creating the
    // entry when there is none yet.
    //   map     - table from tag name to a Hashtable2 set of tag names
    //   tagname - key whose set is extended
    //   tagset  - members to add; never modified and never stored directly
    private static void union (Hashtable map, Object tagname, Hashtable tagset) {
        Hashtable2 currset = (Hashtable2)map.get (tagname);
        if (currset == null) {
            // Store a private copy: Hashtable2.union mutates its receiver, so
            // keeping the caller's instance here would let a later union on
            // one key leak members into every other key sharing that instance.
            map.put (tagname, new Hashtable2 ().union (tagset));
        }
        else {
            // currset is already the value stored under tagname and is
            // extended in place, so no put is needed
            currset.union (tagset);
        }
    }

    // Builds the element tree for page from page.tokens and stores the
    // results in page.root, page.elements, page.links, page.title and (when
    // a BASE tag carries a usable href) page.base.
    private void buildParseTree (Page page) {
        boolean keepText = false;   // true while inside an open savetext element

        elems.setSize (0);
        openPtr = 0;

        Region[] tokens = page.tokens;
        for (int t = 0; t < tokens.length; ++t) {
            Region r = tokens[t];

            if (r instanceof Tag) {
                Tag tag = (Tag)r;
                String tagName = tag.getTagName ();

                if (tag.isStartTag ()) {
                    // start tag <X>
                    // check if <X> forces closure of an open element
                    if (forcesClosed.containsKey (tagName)) {
                        // nearest open element that is either closable by <X>
                        // or one of <X>'s containers; only the former is closed
                        Element e = findOpenElement ((Hashtable)context.get (tagName));
                        if (e != null
                              && ((Hashtable)forcesClosed.get (tagName)).containsKey (e.getTagName ())) {
                            close (e, tag.start);
                        }
                    }

                    // create the element and push it on the elems stack
                    Element e = makeElement (page.base, tag);
                    open (e);

                    if (empty.containsKey (tagName)) {
                        // element has no content
                        // close it off right now
                        close (e, tag.end);
                    }
                    else if (savetext.containsKey (tagName)) {
                        text.setLength (0);
                        keepText = true;
                    }

                    // NOTE(review): identity comparison -- presumably relies on
                    // tag names being interned constants; confirm in Tag.
                    if (tagName == Tag.BASE) {
                        String href = tag.getHTMLAttribute ("href");
                        if (href != null) {
                            try {
                                // make copy to avoid reference to page content
                                page.base = new URL (page.base, new String (href.toCharArray ()));
                            } catch (MalformedURLException ex) {
                                // bad URL: keep the previous base
                            } catch (NullPointerException ex) {
                                // base == null: keep the previous base
                            }
                        }
                    }
                }
                else {
                    // end tag </X>
                    // find matching start tag <X>
                    Element e = findOpenElement (tagName);
                    if (e != null) {
                        close (e, tag);
                        if (savetext.containsKey (tagName)) {
                            // NOTE(review): identity comparison, see BASE above
                            if (tagName == Tag.TITLE)
                                page.title = text.toString ();
                            else if (e instanceof Link)
                                ((Link)e).setText (text.toString ());
                            keepText = false;
                        }
                    }
                    // an end tag with no matching open element is ignored
                }
            }
            else {
                // r is a text token
                if (keepText) {
                    // separate consecutive text tokens with a single space
                    if (text.length () > 0)
                        text.append (' ');
                    text.append (r.toText ());
                }
            }
        }

        // close any remaining open elements
        closeAll (page.end);

        // link together the top-level elements
        if (!elems.empty ()) {
            int nElems = elems.size ();
            Element c = (Element)elems.elementAt (0);
            page.root = c;
            for (int j = 1; j < nElems; ++j) {
                Element d = (Element)elems.elementAt (j);
                c.sibling = d;
                c = d;
            }
        }

        page.elements = new Element[vElements.size ()];
        vElements.copyInto (page.elements);

        page.links = new Link[vLinks.size ()];
        vLinks.copyInto (page.links);
    }

    // Creates the Element for a start tag: a Form for FORM, a FormButton for
    // INPUT with type "submit" or "image", a Link for any linktag element that
    // carries its URL attribute, and a plain Element otherwise.  Forms,
    // buttons and links are also appended to vLinks; every element is
    // appended to vElements and recorded in tag.element.
    //   base - URL passed to the Form and Link constructors
    //   tag  - the start tag
    private Element makeElement (URL base, Tag tag) {
        Element e = null;
        String tagName = tag.getTagName ();
        String hrefAttr = (String)linktag.get (tagName);
        String type;

        try {
            // NOTE(review): identity comparisons on tagName -- presumably rely
            // on tag names being interned constants; confirm in Tag.
            if (tagName == Tag.FORM) {
                e = new Form (tag, null, base);
                vLinks.addElement (e);
            }
            else if (tagName == Tag.INPUT
                     && (type = tag.getHTMLAttribute ("type")) != null
                     && (type.equalsIgnoreCase ("submit") || type.equalsIgnoreCase ("image"))) {
                e = new FormButton (tag, null, currentForm);
                vLinks.addElement (e);
            }
            else if (hrefAttr != null && tag.hasHTMLAttribute (hrefAttr)) {
                e = new Link (tag, null, base);
                vLinks.addElement (e);
            }
        } catch (MalformedURLException f) {
            // bad URL: fall through to an ordinary element
        } catch (NullPointerException ex) {
            // base == null: fall through to an ordinary element
        }

        if (e == null) {
            // just make an ordinary element
            e = new Element (tag, null);
        }

        vElements.addElement (e);
        tag.element = e;
        return e;
    }

    // Stack management

    // stack of Elements appearing before the current element in
    // a preorder traversal, except that completely-visited subtrees
    // are represented by their root.
    Stack elems = new Stack ();

    // stack of indices of open elements in elems; openPtr is the number of
    // entries in use
    int[] openElems = new int[20];
    int openPtr = 0;

    // the Form element currently open, or null; set by open(), cleared by
    // close()
    Form currentForm;

    // Pushes e on elems as a child of the innermost open element (or with a
    // null parent when nothing is open) and marks it open.
    private void open (Element e) {
        if (openPtr > 0)
            e.parent = (Element)elems.elementAt (openElems[openPtr-1]);
        else
            e.parent = null;

        elems.push (e);
        if (e instanceof Form)
            currentForm = (Form)e;

        if (openPtr == openElems.length) {
            // open stack is full: grow it by 10 slots
            int[] newarr = new int[openElems.length + 10];
            System.arraycopy (openElems, 0, newarr, 0, openElems.length);
            openElems = newarr;
        }
        openElems[openPtr] = elems.size ()-1;
        ++openPtr;
    }

    // Returns the most recently opened element that is still open and whose
    // tag name is tagname, or null if there is none.
    // NOTE(review): names are compared by identity -- presumably tag names
    // are interned constants; confirm in Tag.
    private Element findOpenElement (String tagname) {
        for (int i = openPtr-1; i >= 0; --i) {
            Element e = (Element)elems.elementAt (openElems[i]);
            if (tagname == e.getTagName ())
                return e;
        }
        return null;
    }

    // Returns the most recently opened element that is still open and whose
    // tag name is a key of tags, or null if there is none.
    private Element findOpenElement (Hashtable tags) {
        for (int i = openPtr-1; i >= 0; --i) {
            Element e = (Element)elems.elementAt (openElems[i]);
            if (tags.containsKey (e.getTagName ()))
                return e;
        }
        return null;
    }

    // NIY: stack up unclosed flow tags (like <B> and <A>) and reopen them
    // when the next element is opened

    // Closes elem with an explicit end tag.  Elements opened after elem and
    // still open are closed at tag.start; elem itself ends at tag.end.
    private void close (Element elem, Tag tag) {
        elem.endTag = tag;
        tag.element = elem;
        close (elem, tag.start);
        elem.end = tag.end;     // overrides the tag.start set by the call above
    }

    // Closes every open element from the innermost out to and including elem,
    // giving each the end offset end.  For each closed element, the entries
    // that follow it in elems become its children (chained through sibling)
    // and are removed from elems, leaving the element as the root of its
    // subtree.  elem must currently be open.
    private void close (Element elem, int end) {
        int v;
        Element e;
        do {
            v = openElems[--openPtr];
            e = (Element)elems.elementAt (v);
            e.end = end;
            if (e instanceof Form)
                currentForm = null;

            int firstChild = v+1;
            int nElems = elems.size ();
            if (firstChild < nElems) {
                Element c = (Element)elems.elementAt (firstChild);
                e.child = c;
                for (int j = firstChild+1; j < nElems; ++j) {
                    Element d = (Element)elems.elementAt (j);
                    c.sibling = d;
                    c = d;
                }
                // drop the children; e (at index v) is now the top of elems
                elems.setSize (firstChild);
            }
        } while (e != elem);
    }

    // Closes all open elements at offset end by closing the outermost one.
    private void closeAll (int end) {
        if (openPtr > 0)
            close ((Element)elems.elementAt (openElems[0]), end);
    }

    /*
     * Testing interface
     *
     */

    // Parses a page and dumps its tokens, tags, words, element tree and links
    // to System.out.  With one argument the page is built from a Link made
    // from args[0]; with two it is built from the URL args[0] and the string
    // args[1].  Any other argument count prints a usage line and exits.
    public static void main (String[] args) throws Exception {
        if (args.length < 1 || args.length > 2) {
            System.err.println ("usage: HTMLParser <URL>");
            System.exit(-1);
        }

        Page page;
        if (args.length == 1)
            page = new Page (new Link(args[0]), new HTMLParser ());
        else
            page = new Page (new URL(args[0]), args[1], new HTMLParser ());

        System.out.println ("Tokens: ------------------------------------------");
        Region[] tokens = page.tokens;
        for (int i = 0; i < tokens.length; ++i) {
            System.out.println ("[" + tokens[i].getStart() + "," + tokens[i].getEnd() + "]" + tokens[i]);
        }

        System.out.println ("Tags: ------------------------------------------");
        Tag[] tags = page.tags;
        for (int i = 0; i < tags.length; ++i) {
            Tag t = tags[i];
            System.out.print ((t.isStartTag() ? "start tag" : "end tag") + " " + t.getTagName ());

            Enumeration attrs = t.enumerateHTMLAttributes();
            String name, val;
            while (attrs.hasMoreElements()) {
                name = (String)attrs.nextElement();
                val = t.getHTMLAttribute (name);
                System.out.print (" " + name + "=\"" + val + "\"");
            }
            System.out.println ();
            System.out.println ("    " + t);
        }

        System.out.println ("Words: ------------------------------------------");
        Text[] words = page.words;
        for (int i = 0; i < words.length; ++i) {
            System.out.println (words[i]);
        }

        System.out.println ("Elements: ------------------------------------------");
        printout (page.root, 0);

        System.out.println ("Links: ------------------------------------------");
        printout (page.getLinks (), 0);
    }

    // Returns a string of indent copies of four spaces.
    private static String indentation (int indent) {
        StringBuffer s = new StringBuffer();
        for (int i = 0; i < indent; ++i)
            s.append ("    ");
        return s.toString();
    }

    // Prints element and its following siblings, each followed by its
    // children one level deeper and then its end tag, if it has one.
    private static void printout (Element element, int indent) {
        for (Element e = element; e != null; e = e.getSibling ()) {
            Element c = e.getChild();
            System.out.println (indentation(indent) + e.getStartTag() + "[" + e.getStart() + "," + e.getEnd() + "]");
            if (c != null)
                printout (c, indent+1);
            if (e.getEndTag() != null)
                System.out.println (indentation(indent) + e.getEndTag());
        }
    }

    // Prints the description of every link, one per line, at the given
    // indentation.
    private static void printout (Link[] elements, int indent) {
        for (int i = 0; i < elements.length; ++i) {
            Link e = elements[i];
            System.out.println (indentation(indent) + e.toDescription());
        }
    }
}

// Hashtable with chainable insertion methods, so that static tables can be
// built in a single initializer expression.  add(key) stores the key as its
// own value, which lets a table double as a set.
class Hashtable2 extends Hashtable {
    public Hashtable2 () {
    }

    // Adds key as a set member (mapped to itself) and returns this table.
    public Hashtable2 add (Object key) {
        put (key, key);
        return this;
    }

    // Maps key to val and returns this table.
    public Hashtable2 add (Object key, Object val) {
        put (key, val);
        return this;
    }

    // Copies every entry of map into this table, replacing entries that have
    // the same key, and returns this table.  The receiver is modified in
    // place; map is left untouched.
    public Hashtable2 union (Hashtable map) {
        // "enum" has been a reserved word since Java 5, so the enumeration
        // needs a different name for this file to compile at all
        Enumeration keys = map.keys ();
        while (keys.hasMoreElements ()) {
            Object key = keys.nextElement ();
            put (key, map.get (key));
        }
        return this;
    }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -