⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 htmlcollector.java

📁 HTML解释器JAVA源码
💻 JAVA
字号:
/* * HtmlCollector.java -- structures an HTML document tree.   * Copyright (C) 1999 Quiotix Corporation.   * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License, version 2, as  * published by the Free Software Foundation.   * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the * GNU General Public License (http://www.gnu.org/copyleft/gpl.txt) * for more details. */package com.quiotix.html.parser;import java.util.*;import java.io.*;/** An HtmlVisitor which modifies the structure of the document so that * begin tags are matched properly with end tags and placed in TagBlock * elements.  Typically, an HtmlDocument is created by the parser, which  * simply returns a flat list of elements.  The HtmlCollector takes this * flat list and gives it the structure that is implied by the HTML content. * * @author Brian Goetz, Quiotix */public class HtmlCollector extends HtmlVisitor {  protected MyVector tagStack = new MyVector();  protected MyVector elements;  protected boolean collected;  protected static Hashtable dontMatch = new Hashtable();  protected static String[] dontMatchStrings    = { "AREA", "BASE", "BASEFONT", "BR", "COL", "HR", "IMG", "INPUT",         "ISINDEX", "LINK", "META", "P", "PARAM" };  static {    Integer dummy = new Integer(0);    for (int i=0; i < dontMatchStrings.length; i++)       dontMatch.put(dontMatchStrings[i], dummy);  };  private static class TagStackEntry {    String tagName;    int index;  };  private static class MyVector extends Vector {    MyVector()      { super();  }    MyVector(int n) { super(n); }    public void popN(int n) { elementCount -= n; }  };  protected int pushNode(HtmlDocument.HtmlElement e) {    elements.addElement(e);    return elements.size()-1;  };  public void visit(HtmlDocument.Comment c)     { pushNode(c); };  public void visit(HtmlDocument.Text t)        { pushNode(t); };  public void visit(HtmlDocument.Newline n)     { pushNode(n); };  public void visit(HtmlDocument.Tag t)         {     TagStackEntry ts = new TagStackEntry();    int index;    // Push the tag onto the element stack, and push an entry on the tag    // stack if it's a tag we care about matching    index = pushNode(t);    if (!t.emptyTag         && !dontMatch.containsKey(t.tagName.toUpperCase())) {      ts.tagName = t.tagName;      ts.index = index;      tagStack.addElement(ts);    };  };  public void visit(HtmlDocument.EndTag t)      {     int i;    for (i=tagStack.size()-1; i >= 0; i--) {      TagStackEntry ts = (TagStackEntry) tagStack.elementAt(i);      if (t.tagName.equalsIgnoreCase(ts.tagName)) {        HtmlDocument.TagBlock block;        HtmlDocument.ElementSequence blockElements;        HtmlDocument.Tag tag;              // Create a new ElementSequence and copy the elements to it        blockElements =           new HtmlDocument.ElementSequence(elements.size() - ts.index - 1);        for (int j=ts.index+1; j<elements.size(); j++)           blockElements.addElement((HtmlDocument.HtmlElement)                                    elements.elementAt(j));        tag = (HtmlDocument.Tag) elements.elementAt(ts.index);        block = new HtmlDocument.TagBlock(tag.tagName,                                           tag.attributeList, blockElements);        // Pop the elements off the stack, push the new block        elements.popN(elements.size() - ts.index);        elements.addElement(block);        // Pop the matched tag and intervening unmatched tags         tagStack.popN(tagStack.size()-i);                collected = true;        break;      };    };    // If we didn't find a match, just push the end tag    if (i < 0)       pushNode(t);  };  public void visit(HtmlDocument.TagBlock bl) {    HtmlCollector c = new HtmlCollector();    c.start();    c.visit(bl.body);    c.finish();    pushNode(bl);  }  public void visit(HtmlDocument.ElementSequence s) {    elements = new MyVector(s.elements.size());    collected = false;    for (Enumeration e = s.elements.elements();          e.hasMoreElements(); )      ((HtmlDocument.HtmlElement)e.nextElement()).accept(this);    if (collected)       s.elements = elements;  }  public void start() {}  public void finish() {}  public static void main (String[] args) throws Exception {    InputStream r = new FileInputStream(args[0]);    HtmlDocument document;        try {       document = new com.quiotix.html.parser.HtmlParser(r).HtmlDocument();      document.accept(new HtmlScrubber());      document.accept(new HtmlCollector());      document.accept(new HtmlDumper(System.out));    }    finally {      r.close();    };  };}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -