📄 abstractnode.java

📁 本程序用于对页面信息进行提取并分析
💻 JAVA
字号:
// HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML// http://sourceforge.org/projects/htmlparser// Copyright (C) 2004 Somik Raha//// Revision Control Information//// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/nodes/AbstractNode.java,v $// $Author: derrickoswald $// $Date: 2005/10/26 22:01:23 $// $Revision: 1.5 $//// This library is free software; you can redistribute it and/or// modify it under the terms of the GNU Lesser General Public// License as published by the Free Software Foundation; either// version 2.1 of the License, or (at your option) any later version.//// This library is distributed in the hope that it will be useful,// but WITHOUT ANY WARRANTY; without even the implied warranty of// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU// Lesser General Public License for more details.//// You should have received a copy of the GNU Lesser General Public// License along with this library; if not, write to the Free Software// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA//package org.htmlparser.nodes;import java.io.Serializable;import org.htmlparser.Node;import org.htmlparser.NodeFilter;import org.htmlparser.lexer.Page;import org.htmlparser.util.NodeList;import org.htmlparser.util.ParserException;import org.htmlparser.visitors.NodeVisitor;/** * The concrete base class for all types of nodes (tags, text remarks). * This class provides basic functionality to hold the {@link Page}, the * starting and ending position in the page, the parent and the list of * {@link NodeList children}. */public abstract class AbstractNode implements Node, Serializable{    /**     * The page this node came from.     */    protected Page mPage;    /**     * The beginning position of the tag in the line     */    protected int nodeBegin;    /**     * The ending position of the tag in the line     */    protected int nodeEnd;    /**     * The parent of this node.     */    protected Node parent;    /**     * The children of this node.     */    protected NodeList children;    /**     * Create an abstract node with the page positions given.     * Remember the page and start & end cursor positions.     * @param page The page this tag was read from.     * @param start The starting offset of this node within the page.     * @param end The ending offset of this node within the page.     */    public AbstractNode (Page page, int start, int end)    {        mPage = page;        nodeBegin = start;        nodeEnd = end;        parent = null;        children = null;    }    /**     * Clone this object.     * Exposes java.lang.Object clone as a public method.     * @return A clone of this object.     * @exception CloneNotSupportedException This shouldn't be thrown since     * the {@link Node} interface extends Cloneable.     */    public Object clone() throws CloneNotSupportedException    {        return (super.clone ());    }    /**     * Returns a string representation of the node.     * It allows a simple string transformation     * of a web page, regardless of node type.<br>     * Typical application code (for extracting only the text from a web page)     * would then be simplified to:<br>     * <pre>     * Node node;     * for (Enumeration e = parser.elements (); e.hasMoreElements (); )     * {     *     node = (Node)e.nextElement();     *     System.out.println (node.toPlainTextString ());     *     // or do whatever processing you wish with the plain text string     * }     * </pre>     * @return The 'browser' content of this node.     */    public abstract String toPlainTextString ();    /**     * Return the HTML that generated this node.     * This method will make it easier when using html parser to reproduce html     * pages (with or without modifications).     * Applications reproducing html can use this method on nodes which are to     * be used or transferred as they were recieved, with the original html.     * @return The HTML code for this node.     */    public abstract String toHtml ();    /**     * Return a string representation of the node.     * Subclasses must define this method, and this is typically to be used in the manner<br>     * <pre>System.out.println(node)</pre>     * @return A textual representation of the node suitable for debugging     */    public abstract String toString ();    /**     * Collect this node and its child nodes (if-applicable) into the collectionList parameter, provided the node     * satisfies the filtering criteria.<P>     *      * This mechanism allows powerful filtering code to be written very easily,     * without bothering about collection of embedded tags separately.     * e.g. when we try to get all the links on a page, it is not possible to     * get it at the top-level, as many tags (like form tags), can contain     * links embedded in them. We could get the links out by checking if the     * current node is a {@link org.htmlparser.tags.CompositeTag}, and going through its children.     * So this method provides a convenient way to do this.<P>     *      * Using collectInto(), programs get a lot shorter. Now, the code to     * extract all links from a page would look like:     * <pre>     * NodeList collectionList = new NodeList();     * NodeFilter filter = new TagNameFilter ("A");     * for (NodeIterator e = parser.elements(); e.hasMoreNodes();)     *      e.nextNode().collectInto(collectionList, filter);     * </pre>     * Thus, collectionList will hold all the link nodes, irrespective of how     * deep the links are embedded.<P>     *      * Another way to accomplish the same objective is:     * <pre>     * NodeList collectionList = new NodeList();     * NodeFilter filter = new TagClassFilter (LinkTag.class);     * for (NodeIterator e = parser.elements(); e.hasMoreNodes();)     *      e.nextNode().collectInto(collectionList, filter);     * </pre>     * This is slightly less specific because the LinkTag class may be     * registered for more than one node name, e.g. &lt;LINK&gt; tags too.     * @param list The node list to collect acceptable nodes into.     * @param filter The filter to determine which nodes are retained.     */    public void collectInto (NodeList list, NodeFilter filter)    {        if (filter.accept (this))            list.add (this);    }    /**     * Get the page this node came from.     * @return The page that supplied this node.     */    public Page getPage ()    {        return (mPage);    }    /**     * Set the page this node came from.     * @param page The page that supplied this node.     */    public void setPage (Page page)    {        mPage = page;    }    /**     * Gets the starting position of the node.     * @return The start position.     */    public int getStartPosition ()    {        return (nodeBegin);    }    /**     * Sets the starting position of the node.     * @param position The new start position.     */    public void setStartPosition (int position)    {        nodeBegin = position;    }    /**     * Gets the ending position of the node.     * @return The end position.     */    public int getEndPosition ()    {        return (nodeEnd);    }    /**     * Sets the ending position of the node.     * @param position The new end position.     */    public void setEndPosition (int position)    {        nodeEnd = position;    }    /**     * Visit this node.     * @param visitor The visitor that is visiting this node.     */    public abstract void accept (NodeVisitor visitor);    /**     * Get the parent of this node.     * This will always return null when parsing without scanners,     * i.e. if semantic parsing was not performed.     * The object returned from this method can be safely cast to a <code>CompositeTag</code>.     * @return The parent of this node, if it's been set, <code>null</code> otherwise.     */    public Node getParent ()    {        return (parent);    }    /**     * Sets the parent of this node.     * @param node The node that contains this node. Must be a <code>CompositeTag</code>.     */    public void setParent (Node node)    {        parent = node;    }    /**     * Get the children of this node.     * @return The list of children contained by this node, if it's been set, <code>null</code> otherwise.     */    public NodeList getChildren ()    {        return (children);    }    /**     * Set the children of this node.     * @param children The new list of children this node contains.     */    public void setChildren (NodeList children)    {        this.children = children;    }        /**     * Get the first child of this node.     * @return The first child in the list of children contained by this node,     * <code>null</code> otherwise.     */    public Node getFirstChild ()    {        if (children == null)            return null;        if (children.size() == 0)            return null;        return children.elementAt(0);    }        /**     * Get the last child of this node.     * @return The last child in the list of children contained by this node,     * <code>null</code> otherwise.     */    public Node getLastChild ()    {        if (children == null)            return null;        int numChildren = children.size();        if (numChildren == 0)            return null;        return children.elementAt(numChildren - 1);    }        /**     * Get the previous sibling to this node.     * @return The previous sibling to this node if one exists,     * <code>null</code> otherwise.     */    public Node getPreviousSibling ()    {        Node parentNode = this.getParent();        if (parentNode == null)//root node            return null;        NodeList siblings = parentNode.getChildren();        if (siblings == null)//this should actually be an error            return null;        int numSiblings = siblings.size();        if (numSiblings < 2)//need at least one other node to have a chance of having any siblings            return null;        int positionInParent = -1;        for (int i = 0; i < numSiblings; i++)        {            if (siblings.elementAt(i) == this)            {                positionInParent = i;                break;            }        }        if (positionInParent < 1)//no previous siblings            return null;        return siblings.elementAt(positionInParent - 1);    }        /**     * Get the next sibling to this node.     * @return The next sibling to this node if one exists,     * <code>null</code> otherwise.     */    public Node getNextSibling ()    {        Node parentNode = this.getParent();        if (parentNode == null)//root node            return null;        NodeList siblings = parentNode.getChildren();        if (siblings == null)//this should actually be an error            return null;        int numSiblings = siblings.size();        if (numSiblings < 2)//need at least one other node to have a chance of having any siblings            return null;        int positionInParent = -1;        for (int i = 0; i < numSiblings; i++)        {            if (siblings.elementAt(i) == this)            {                positionInParent = i;                break;            }        }        if (positionInParent == -1)//this should actually be an error            return null;        if (positionInParent == (numSiblings - 1))//no next sibling            return null;        return siblings.elementAt(positionInParent + 1);    }    /**     * Returns the text of the node.     * @return The text of this node. The default is <code>null</code>.     */    public String getText ()    {        return null;    }    /**     * Sets the string contents of the node.     * @param text The new text for the node.     */    public void setText(String text)    {    }    /**     * Perform the meaning of this tag.     * The default action is to do nothing.     * @exception ParserException <em>Not used.</em> Provides for subclasses     * that may want to indicate an exceptional condition.     */    public void doSemanticAction ()        throws            ParserException    {    }}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -