⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 htmlparser.java

📁 java写的crawler
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
          .add (Tag.TD, new Hashtable2 () .add (Tag.TR) .add (Tag.TABLE))          .add (Tag.TH, new Hashtable2 () .add (Tag.TR) .add (Tag.TABLE))          ;    static {        Enumeration enum = forcesClosed.keys ();        while (enum.hasMoreElements ()) {            Object tagname = enum.nextElement();            union (context, tagname, (Hashtable)forcesClosed.get (tagname));        }    }    // NIY: handle literal and semi-literal elements (XMP, LISTING, TEXTAREA, OPTION)    // elements whose content should be treated as plain text    static Hashtable literal = new Hashtable2()            ;    // maps link elements to their URL attribute (e.g., A maps to HREF)    static Hashtable linktag = new Hashtable2 ()            .add (Tag.A, "href")            .add (Tag.AREA, "href")            .add (Tag.APPLET, "code")            .add (Tag.EMBED, "src")            .add (Tag.FRAME, "src")            .add (Tag.FORM, "action")            .add (Tag.IMG, "src")            .add (Tag.LINK, "href")            .add (Tag.SCRIPT, "src")            ;    // elements whose text contents are crucial to the crawler    static Hashtable savetext = new Hashtable2 ()            .add (Tag.A)            .add (Tag.TITLE);    // elements found in <HEAD>    static Hashtable headtag = new Hashtable2()          .add (Tag.META)          .add (Tag.TITLE)          .add (Tag.BASE)          .add (Tag.LINK)          .add (Tag.ISINDEX)          ;    private static void union (Hashtable map, Object tagname, Hashtable tagset) {        Hashtable2 currset = (Hashtable2)map.get (tagname);        if (currset == null)            map.put (tagname, tagset);        else            map.put (tagname, currset.union (tagset));    }    private void buildParseTree (Page page) {        boolean keepText = false;        elems.setSize (0);        openPtr = 0;        Region[] tokens = page.tokens;        for (int t=0; t<tokens.length; ++t) {            Region r = tokens[t];            if (r instanceof Tag) {                Tag tag = (Tag)r;                String tagName = tag.getTagName();                if (tag.isStartTag()) {                    // start tag <X>                    // check if <X> forces closure of an open element                    if (forcesClosed.containsKey (tagName)) {                        Element e = findOpenElement ((Hashtable)context.get (tagName));                        if (e != null && ((Hashtable)forcesClosed.get (tagName)).containsKey (e.getTagName()))                            close (e, tag.start);                    }                    // create the element and push it on the elems stack                    Element e = makeElement (page.base, tag);                    open (e);                    if (empty.containsKey (tagName)) {                        // element has no content                        // close it off right now                        close (e, tag.end);                    }                    else if (savetext.containsKey (tagName)) {                        text.setLength (0);                        keepText = true;                    }                    if (tagName == Tag.BASE) {                        String href = tag.getHTMLAttribute ("href");                        if (href != null) {                            try {                                                         page.base = new URL (page.base, new String (href.toCharArray())); // make copy to avoid reference to page content                            } catch (MalformedURLException ex) {} // bad URL                              catch (NullPointerException ex) {} // base == null                        }                    }                }                else {                    // end tag </X>                    // find matching start tag <X>                    Element e = findOpenElement (tagName);                    if (e != null) {                        close (e, tag);                        if (savetext.containsKey (tagName)) {                            if (tagName == Tag.TITLE)                                page.title = text.toString();                            else if (e instanceof Link)                                ((Link)e).setText (text.toString());                            keepText = false;                        }                    }                }            }            else { // r is a text token                if (keepText) {                    if (text.length() > 0)                        text.append (' ');                    text.append (r.toText());                }            }        }        // close any remaining open elements        closeAll (page.end);        // link together the top-level elements        if (!elems.empty()) {            int nElems = elems.size ();            Element c = (Element)elems.elementAt (0);            page.root = c;            for (int j=1; j<nElems; ++j) {                Element d = (Element)elems.elementAt (j);                c.sibling = d;                c = d;            }        }        page.elements = new Element[vElements.size()];        vElements.copyInto (page.elements);        page.links = new Link[vLinks.size()];        vLinks.copyInto (page.links);    }    private Element makeElement (URL base, Tag tag) {        Element e = null;        String tagName = tag.getTagName ();        String hrefAttr = (String)linktag.get (tagName);        String type;        try {            if (tagName == Tag.FORM) {                e = new Form (tag, null, base);                vLinks.addElement (e);            }            else if (tagName == Tag.INPUT                      && (type = tag.getHTMLAttribute ("type")) != null                     && (type.equalsIgnoreCase ("submit") || type.equalsIgnoreCase ("image"))) {                e = new FormButton (tag, null, currentForm);                vLinks.addElement (e);            }            else if (hrefAttr != null && tag.hasHTMLAttribute (hrefAttr)) {                e = new Link (tag, null, base);                vLinks.addElement (e);            }        } catch (MalformedURLException f) {} // bad URL          catch (NullPointerException ex) {} // base == null        if (e == null)            // just make an ordinary element            e = new Element (tag, null);                    vElements.addElement (e);        tag.element = e;        return e;    }    // Stack management    Stack elems = new Stack();        // stack of Elements appearing before than the current element in        // a preorder traversal, except that completely-visited subtrees        // are represented by their root.    int[] openElems = new int[20];    int openPtr = 0;        // stack of indices of open elements in elems    Form currentForm;    private void open (Element e) {        if (openPtr > 0)            e.parent = (Element)elems.elementAt (openElems[openPtr-1]);        else            e.parent = null;        elems.push (e);        if (e instanceof Form)            currentForm = (Form)e;        if (openPtr == openElems.length) {            int[] newarr = new int[openElems.length + 10];            System.arraycopy (openElems, 0, newarr, 0, openElems.length);            openElems = newarr;        }        openElems[openPtr] = elems.size()-1;        ++openPtr;    }    private Element findOpenElement (String tagname) {        for (int i=openPtr-1; i >= 0; --i) {            Element e = (Element)elems.elementAt (openElems[i]);            if (tagname == e.getTagName ())                return e;        }        return null;    }    private Element findOpenElement (Hashtable tags) {        for (int i=openPtr-1; i >= 0; --i) {            Element e = (Element)elems.elementAt (openElems[i]);            if (tags.containsKey (e.getTagName ()))                return e;        }        return null;    }    // NIY: stack up unclosed flow tags (like <B> and <A>) and reopen them    // when the next element is opened    private void close (Element elem, Tag tag) {        elem.endTag = tag;        tag.element = elem;        close (elem, tag.start);        elem.end = tag.end;    }    private void close (Element elem, int end) {        int v;        Element e;        do {            v = openElems[--openPtr];            e = (Element)elems.elementAt (v);            e.end = end;            if (e instanceof Form)                currentForm = null;            int firstChild = v+1;            int nElems = elems.size();            if (firstChild < nElems) {                Element c = (Element)elems.elementAt (firstChild);                e.child = c;                for (int j=firstChild+1; j<nElems; ++j) {                    Element d = (Element)elems.elementAt (j);                    c.sibling = d;                    c = d;                }                elems.setSize (firstChild);            }                    } while (e != elem);    }    private void closeAll (int end) {        if (openPtr > 0)            close ((Element)elems.elementAt (openElems[0]), end);    }    /*     * Testing interface     *     */    public static void main (String[] args) throws Exception {        if (args.length < 1 || args.length > 2) {            System.err.println ("usage: HTMLParser <URL>");            System.exit(-1);        }        Page page;        if (args.length == 1)            page = new Page (new Link(args[0]), new DownloadParameters (), new HTMLParser ());        else            page = new Page (new URL(args[0]), args[1], new HTMLParser ());        /*        long tm = System.currentTimeMillis();     //??dk        HTMLParser tokenizer = new HTMLParser ();        tm = System.currentTimeMillis() - tm;       //??dk            System.err.println("[Parsed " + args[0] + " in " + tm + "ms]");        */         System.out.println ("Tokens: ------------------------------------------");        Region[] tokens = page.tokens;        for (int i=0; i<tokens.length; ++i) {            System.out.println ("[" + tokens[i].getStart() + "," + tokens[i].getEnd() + "]" + tokens[i]);        }       System.out.println ("Tags: ------------------------------------------");        Tag[] tags = page.tags;        for (int i=0; i<tags.length; ++i) {            Tag t = tags[i];            System.out.print ((t.isStartTag() ? "start tag" : "end tag") + " " + t.getTagName ());            Enumeration attrs = t.enumerateHTMLAttributes();            String name, val;            while (attrs.hasMoreElements()) {                name = (String)attrs.nextElement();                val = t.getHTMLAttribute (name);                System.out.print (" " + name + "=\"" + val + "\"");            }            System.out.println ();            System.out.println ("    " + t);        }        System.out.println ("Words: ------------------------------------------");        Text[] words = page.words;        for (int i=0; i<words.length; ++i) {            System.out.println (words[i]);        }        System.out.println ("Elements: ------------------------------------------");        printout (page.root, 0);        System.out.println ("Links: ------------------------------------------");        printout (page.getLinks (), 0);    }    private static String indentation (int indent) {        StringBuffer s = new StringBuffer();        for (int i=0; i<indent; ++i)            s.append ("    ");        return s.toString();    }    private static void printout (Element element, int indent) {      for (Element e = element; e != null; e = e.getSibling ()) {          Element c = e.getChild();          System.out.println (indentation(indent) + e.getStartTag() + "[" + e.getStart() + "," + e.getEnd() + "]");          if (c != null)              printout (c, indent+1);          if (e.getEndTag() != null)              System.out.println (indentation(indent) + e.getEndTag());      }    }    private static void printout (Link[] elements, int indent) {        for (int i=0; i<elements.length; ++i) {            Link e = elements[i];            System.out.println (indentation(indent) + e.toDescription());        }    }}class Hashtable2 extends Hashtable {    public Hashtable2 () {    }    public Hashtable2 add (Object key) {        put (key, key);        return this;    }    public Hashtable2 add (Object key, Object val) {        put (key, val);        return this;    }    public Hashtable2 union (Hashtable map) {        Enumeration enum = map.keys ();        while (enum.hasMoreElements ()) {            Object key = enum.nextElement ();            put (key, map.get (key));        }        return this;    }}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -