📄 compositetagscanner.java

📁 本程序用于对页面信息进行提取并分析
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页
// HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML// http://sourceforge.org/projects/htmlparser// Copyright (C) 2003 Somik Raha//// Revision Control Information//// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/CompositeTagScanner.java,v $// $Author: derrickoswald $// $Date: 2005/04/10 23:20:44 $// $Revision: 1.90 $//// This library is free software; you can redistribute it and/or// modify it under the terms of the GNU Lesser General Public// License as published by the Free Software Foundation; either// version 2.1 of the License, or (at your option) any later version.//// This library is distributed in the hope that it will be useful,// but WITHOUT ANY WARRANTY; without even the implied warranty of// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU// Lesser General Public License for more details.//// You should have received a copy of the GNU Lesser General Public// License along with this library; if not, write to the Free Software// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA//package org.htmlparser.scanners;import java.util.Vector;import org.htmlparser.Attribute;import org.htmlparser.Node;import org.htmlparser.Tag;import org.htmlparser.lexer.Lexer;import org.htmlparser.lexer.Page;import org.htmlparser.scanners.Scanner;import org.htmlparser.util.NodeList;import org.htmlparser.util.ParserException;/** * The main scanning logic for nested tags. * When asked to scan, this class gathers nodes into a hierarchy of tags. */public class CompositeTagScanner extends TagScanner{    /**     * Determine whether to use JVM or NodeList stack.     * This can be set to true to get the original behaviour of     * recursion into composite tags on the JVM stack.     * This may lead to StackOverflowError problems in some cases     * e.g. on Windows.     */    private static final boolean mUseJVMStack = false;    /**     * Determine whether unexpected end tags should cause stack roll-up.     
* This can be set to true to get the original behaviour of gathering     * end tags into whatever tag is open.     * This can be expensive, but should only be needed in the presence of     * bad HTML.     */    private static final boolean mLeaveEnds = false;    /**     * Create a composite tag scanner.     */    public CompositeTagScanner ()    {    }    /**     * Collect the children.     * <p>An initial test is performed for an empty XML tag, in which case     * the start tag and end tag of the returned tag are the same and it has     * no children.<p>     * If it's not an empty XML tag, the lexer is repeatedly asked for     * subsequent nodes until an end tag is found or a node is encountered     * that matches the tag ender set or end tag ender set.     * In the latter case, a virtual end tag is created.     * Each node found that is not the end tag is added to     * the list of children. The end tag is special and not a child.<p>     * Nodes that also have a CompositeTagScanner as their scanner are     * recursed into, which provides the nested structure of an HTML page.     * This method operates in two possible modes, depending on a private boolean.     * It can recurse on the JVM stack, which has caused some overflow problems     * in the past, or it can use the supplied stack argument to nest scanning     * of child tags within itself. The former is left as an option in the code,     * mostly to help subsequent modifiers visualize what the internal nesting     * is doing.     * @param tag The tag this scanner is responsible for.     * @param lexer The source of subsequent nodes.     * @param stack The parse stack. May contain pending tags that enclose     * this tag.     * @return The resultant tag (may be unchanged).     
*/    public Tag scan (Tag tag, Lexer lexer, NodeList stack) throws ParserException    {        Node node;        Tag next;        String name;        Scanner scanner;        Tag ret;                ret = tag;        if (ret.isEmptyXmlTag ())            ret.setEndTag (ret);        else            do            {                node = lexer.nextNode (false);                if (null != node)                {                    if (node instanceof Tag)                    {                        next = (Tag)node;                        name = next.getTagName ();                        // check for normal end tag                        if (next.isEndTag () && name.equals (ret.getTagName ()))                        {                            ret.setEndTag (next);                            node = null;                        }                        else if (isTagToBeEndedFor (ret, next)) // check DTD                        {                            // backup one node. insert a virtual end tag later                            lexer.setPosition (next.getStartPosition ());                            node = null;                        }                        else if (!next.isEndTag ())                        {                            // now recurse if there is a scanner for this type of tag                            scanner = next.getThisScanner ();                            if (null != scanner)                            {                                if (mUseJVMStack)                                {   // JVM stack recursion                                    node = scanner.scan (next, lexer, stack);                                    addChild (ret, node);                                }                                else                                {                                    // fake recursion:                                    if (scanner == this)                                    {                                        if 
(next.isEmptyXmlTag ())                                        {                                            next.setEndTag (next);                                            finishTag (next, lexer);                                            addChild (ret, next);                                        }                                        else                                        {                                            stack.add (ret);                                            ret = next;                                        }                                    }                                    else                                    {   // normal recursion if switching scanners                                        node = scanner.scan (next, lexer, stack);                                        addChild (ret, node);                                    }                                }                            }                            else                                addChild (ret, next);                        }                        else                        {                            if (!mUseJVMStack && !mLeaveEnds)                            {                                // Since all non-end tags are consumed by the                                // previous clause, we're here because we have an                                // end tag with no opening tag... this could be bad.                                // There are two cases...                                // 1) The tag hasn't been registered, in which case                                // we just add it as a simple child, like it's                                // opening tag                                // 2) There may be an opening tag further up the                                // parse stack that needs closing.                                
// So, we ask the factory for a node like this one                                // (since end tags never have scanners) and see                                // if its scanner is a composite tag scanner.                                // If it is, we walk up the parse stack looking for                                // something that needs this end tag to finish it.
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -