📄 htmlparser.java
字号:
.add (Tag.TD, new Hashtable2 () .add (Tag.TR) .add (Tag.TABLE)) .add (Tag.TH, new Hashtable2 () .add (Tag.TR) .add (Tag.TABLE)) ; static { Enumeration enum = forcesClosed.keys (); while (enum.hasMoreElements ()) { Object tagname = enum.nextElement(); union (context, tagname, (Hashtable)forcesClosed.get (tagname)); } } // NIY: handle literal and semi-literal elements (XMP, LISTING, TEXTAREA, OPTION) // elements whose content should be treated as plain text static Hashtable literal = new Hashtable2() ; // maps link elements to their URL attribute (e.g., A maps to HREF) static Hashtable linktag = new Hashtable2 () .add (Tag.A, "href") .add (Tag.AREA, "href") .add (Tag.APPLET, "code") .add (Tag.EMBED, "src") .add (Tag.FRAME, "src") .add (Tag.FORM, "action") .add (Tag.IMG, "src") .add (Tag.LINK, "href") .add (Tag.SCRIPT, "src") ; // elements whose text contents are crucial to the crawler static Hashtable savetext = new Hashtable2 () .add (Tag.A) .add (Tag.TITLE); // elements found in <HEAD> static Hashtable headtag = new Hashtable2() .add (Tag.META) .add (Tag.TITLE) .add (Tag.BASE) .add (Tag.LINK) .add (Tag.ISINDEX) ; private static void union (Hashtable map, Object tagname, Hashtable tagset) { Hashtable2 currset = (Hashtable2)map.get (tagname); if (currset == null) map.put (tagname, tagset); else map.put (tagname, currset.union (tagset)); } private void buildParseTree (Page page) { boolean keepText = false; elems.setSize (0); openPtr = 0; Region[] tokens = page.tokens; for (int t=0; t<tokens.length; ++t) { Region r = tokens[t]; if (r instanceof Tag) { Tag tag = (Tag)r; String tagName = tag.getTagName(); if (tag.isStartTag()) { // start tag <X> // check if <X> forces closure of an open element if (forcesClosed.containsKey (tagName)) { Element e = findOpenElement ((Hashtable)context.get (tagName)); if (e != null && ((Hashtable)forcesClosed.get (tagName)).containsKey (e.getTagName())) close (e, tag.start); } // create the element and push it on the elems stack Element e = makeElement (page.base, tag); open (e); if (empty.containsKey (tagName)) { // element has no content // close it off right now close (e, tag.end); } else if (savetext.containsKey (tagName)) { text.setLength (0); keepText = true; } if (tagName == Tag.BASE) { String href = tag.getHTMLAttribute ("href"); if (href != null) { try { page.base = new URL (page.base, new String (href.toCharArray())); // make copy to avoid reference to page content } catch (MalformedURLException ex) {} // bad URL catch (NullPointerException ex) {} // base == null } } } else { // end tag </X> // find matching start tag <X> Element e = findOpenElement (tagName); if (e != null) { close (e, tag); if (savetext.containsKey (tagName)) { if (tagName == Tag.TITLE) page.title = text.toString(); else if (e instanceof Link) ((Link)e).setText (text.toString()); keepText = false; } } } } else { // r is a text token if (keepText) { if (text.length() > 0) text.append (' '); text.append (r.toText()); } } } // close any remaining open elements closeAll (page.end); // link together the top-level elements if (!elems.empty()) { int nElems = elems.size (); Element c = (Element)elems.elementAt (0); page.root = c; for (int j=1; j<nElems; ++j) { Element d = (Element)elems.elementAt (j); c.sibling = d; c = d; } } page.elements = new Element[vElements.size()]; vElements.copyInto (page.elements); page.links = new Link[vLinks.size()]; vLinks.copyInto (page.links); } private Element makeElement (URL base, Tag tag) { Element e = null; String tagName = tag.getTagName (); String hrefAttr = (String)linktag.get (tagName); String type; try { if (tagName == Tag.FORM) { e = new Form (tag, null, base); vLinks.addElement (e); } else if (tagName == Tag.INPUT && (type = tag.getHTMLAttribute ("type")) != null && (type.equalsIgnoreCase ("submit") || type.equalsIgnoreCase ("image"))) { e = new FormButton (tag, null, currentForm); vLinks.addElement (e); } else if (hrefAttr != null && tag.hasHTMLAttribute (hrefAttr)) { e = new Link (tag, null, base); vLinks.addElement (e); } } catch (MalformedURLException f) {} // bad URL catch (NullPointerException ex) {} // base == null if (e == null) // just make an ordinary element e = new Element (tag, null); vElements.addElement (e); tag.element = e; return e; } // Stack management Stack elems = new Stack(); // stack of Elements appearing before than the current element in // a preorder traversal, except that completely-visited subtrees // are represented by their root. int[] openElems = new int[20]; int openPtr = 0; // stack of indices of open elements in elems Form currentForm; private void open (Element e) { if (openPtr > 0) e.parent = (Element)elems.elementAt (openElems[openPtr-1]); else e.parent = null; elems.push (e); if (e instanceof Form) currentForm = (Form)e; if (openPtr == openElems.length) { int[] newarr = new int[openElems.length + 10]; System.arraycopy (openElems, 0, newarr, 0, openElems.length); openElems = newarr; } openElems[openPtr] = elems.size()-1; ++openPtr; } private Element findOpenElement (String tagname) { for (int i=openPtr-1; i >= 0; --i) { Element e = (Element)elems.elementAt (openElems[i]); if (tagname == e.getTagName ()) return e; } return null; } private Element findOpenElement (Hashtable tags) { for (int i=openPtr-1; i >= 0; --i) { Element e = (Element)elems.elementAt (openElems[i]); if (tags.containsKey (e.getTagName ())) return e; } return null; } // NIY: stack up unclosed flow tags (like <B> and <A>) and reopen them // when the next element is opened private void close (Element elem, Tag tag) { elem.endTag = tag; tag.element = elem; close (elem, tag.start); elem.end = tag.end; } private void close (Element elem, int end) { int v; Element e; do { v = openElems[--openPtr]; e = (Element)elems.elementAt (v); e.end = end; if (e instanceof Form) currentForm = null; int firstChild = v+1; int nElems = elems.size(); if (firstChild < nElems) { Element c = (Element)elems.elementAt (firstChild); e.child = c; for (int j=firstChild+1; j<nElems; ++j) { Element d = (Element)elems.elementAt (j); c.sibling = d; c = d; } elems.setSize (firstChild); } } while (e != elem); } private void closeAll (int end) { if (openPtr > 0) close ((Element)elems.elementAt (openElems[0]), end); } /* * Testing interface * */ public static void main (String[] args) throws Exception { if (args.length < 1 || args.length > 2) { System.err.println ("usage: HTMLParser <URL>"); System.exit(-1); } Page page; if (args.length == 1) page = new Page (new Link(args[0]), new DownloadParameters (), new HTMLParser ()); else page = new Page (new URL(args[0]), args[1], new HTMLParser ()); /* long tm = System.currentTimeMillis(); //??dk HTMLParser tokenizer = new HTMLParser (); tm = System.currentTimeMillis() - tm; //??dk System.err.println("[Parsed " + args[0] + " in " + tm + "ms]"); */ System.out.println ("Tokens: ------------------------------------------"); Region[] tokens = page.tokens; for (int i=0; i<tokens.length; ++i) { System.out.println ("[" + tokens[i].getStart() + "," + tokens[i].getEnd() + "]" + tokens[i]); } System.out.println ("Tags: ------------------------------------------"); Tag[] tags = page.tags; for (int i=0; i<tags.length; ++i) { Tag t = tags[i]; System.out.print ((t.isStartTag() ? "start tag" : "end tag") + " " + t.getTagName ()); Enumeration attrs = t.enumerateHTMLAttributes(); String name, val; while (attrs.hasMoreElements()) { name = (String)attrs.nextElement(); val = t.getHTMLAttribute (name); System.out.print (" " + name + "=\"" + val + "\""); } System.out.println (); System.out.println (" " + t); } System.out.println ("Words: ------------------------------------------"); Text[] words = page.words; for (int i=0; i<words.length; ++i) { System.out.println (words[i]); } System.out.println ("Elements: ------------------------------------------"); printout (page.root, 0); System.out.println ("Links: ------------------------------------------"); printout (page.getLinks (), 0); } private static String indentation (int indent) { StringBuffer s = new StringBuffer(); for (int i=0; i<indent; ++i) s.append (" "); return s.toString(); } private static void printout (Element element, int indent) { for (Element e = element; e != null; e = e.getSibling ()) { Element c = e.getChild(); System.out.println (indentation(indent) + e.getStartTag() + "[" + e.getStart() + "," + e.getEnd() + "]"); if (c != null) printout (c, indent+1); if (e.getEndTag() != null) System.out.println (indentation(indent) + e.getEndTag()); } } private static void printout (Link[] elements, int indent) { for (int i=0; i<elements.length; ++i) { Link e = elements[i]; System.out.println (indentation(indent) + e.toDescription()); } }}class Hashtable2 extends Hashtable { public Hashtable2 () { } public Hashtable2 add (Object key) { put (key, key); return this; } public Hashtable2 add (Object key, Object val) { put (key, val); return this; } public Hashtable2 union (Hashtable map) { Enumeration enum = map.keys (); while (enum.hasMoreElements ()) { Object key = enum.nextElement (); put (key, map.get (key)); } return this; }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -