⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 tagnode.java

📁 本程序用于对页面信息进行提取并分析
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
// HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML// http://sourceforge.org/projects/htmlparser// Copyright (C) 2004 Derrick Oswald//// Revision Control Information//// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/nodes/TagNode.java,v $// $Author: derrickoswald $// $Date: 2005/04/10 23:20:44 $// $Revision: 1.6 $//// This library is free software; you can redistribute it and/or// modify it under the terms of the GNU Lesser General Public// License as published by the Free Software Foundation; either// version 2.1 of the License, or (at your option) any later version.//// This library is distributed in the hope that it will be useful,// but WITHOUT ANY WARRANTY; without even the implied warranty of// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU// Lesser General Public License for more details.//// You should have received a copy of the GNU Lesser General Public// License along with this library; if not, write to the Free Software// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA//package org.htmlparser.nodes;import java.util.Enumeration;import java.util.Hashtable;import java.util.Locale;import java.util.Vector;import org.htmlparser.Attribute;import org.htmlparser.Tag;import org.htmlparser.lexer.Cursor;import org.htmlparser.lexer.Lexer;import org.htmlparser.lexer.Page;import org.htmlparser.scanners.Scanner;import org.htmlparser.scanners.TagScanner;import org.htmlparser.util.ParserException;import org.htmlparser.util.SpecialHashtable;import org.htmlparser.visitors.NodeVisitor;/** * TagNode represents a generic tag. * If no scanner is registered for a given tag name, this is what you get. * This is also the base class for all tags created by the parser. */public class TagNode    extends        AbstractNode    implements        Tag{    /**     * An empty set of tag names.     */    private final static String[] NONE = new String[0];        /**     * The scanner for this tag.     */    private Scanner mScanner;        /**     * The default scanner for non-composite tags.     */    protected final static Scanner mDefaultScanner = new TagScanner ();    /**     * The tag attributes.     * Objects of type {@link Attribute}.     * The first element is the tag name, subsequent elements being either     * whitespace or real attributes.     */    protected Vector mAttributes;    /**     * Set of tags that breaks the flow.     */    protected static Hashtable breakTags;    static    {        breakTags = new Hashtable (30);        breakTags.put ("BLOCKQUOTE", Boolean.TRUE);        breakTags.put ("BODY", Boolean.TRUE);        breakTags.put ("BR", Boolean.TRUE);        breakTags.put ("CENTER", Boolean.TRUE);        breakTags.put ("DD", Boolean.TRUE);        breakTags.put ("DIR", Boolean.TRUE);        breakTags.put ("DIV", Boolean.TRUE);        breakTags.put ("DL", Boolean.TRUE);        breakTags.put ("DT", Boolean.TRUE);        breakTags.put ("FORM", Boolean.TRUE);        breakTags.put ("H1", Boolean.TRUE);        breakTags.put ("H2", Boolean.TRUE);        breakTags.put ("H3", Boolean.TRUE);        breakTags.put ("H4", Boolean.TRUE);        breakTags.put ("H5", Boolean.TRUE);        breakTags.put ("H6", Boolean.TRUE);        breakTags.put ("HEAD", Boolean.TRUE);        breakTags.put ("HR", Boolean.TRUE);        breakTags.put ("HTML", Boolean.TRUE);        breakTags.put ("ISINDEX", Boolean.TRUE);        breakTags.put ("LI", Boolean.TRUE);        breakTags.put ("MENU", Boolean.TRUE);        breakTags.put ("NOFRAMES", Boolean.TRUE);        breakTags.put ("OL", Boolean.TRUE);        breakTags.put ("P", Boolean.TRUE);        breakTags.put ("PRE", Boolean.TRUE);        breakTags.put ("TD", Boolean.TRUE);        breakTags.put ("TH", Boolean.TRUE);        breakTags.put ("TITLE", Boolean.TRUE);        breakTags.put ("UL", Boolean.TRUE);    }    /**     * Create an empty tag.     */    public TagNode ()    {        this (null, -1, -1, new Vector ());    }    /**     * Create a tag with the location and attributes provided     * @param page The page this tag was read from.     * @param start The starting offset of this node within the page.     * @param end The ending offset of this node within the page.     * @param attributes The list of attributes that were parsed in this tag.     * @see Attribute     */    public TagNode (Page page, int start, int end, Vector attributes)    {        super (page, start, end);        mScanner = mDefaultScanner;        mAttributes = attributes;        if ((null == mAttributes) || (0 == mAttributes.size ()))        {            String[] names;            names = getIds ();            if ((null != names) && (0 != names.length))                setTagName (names[0]);            else                setTagName (""); // make sure it's not null        }    }    /**     * Create a tag like the one provided.     * @param tag The tag to emulate.     * @param scanner The scanner for this tag.     */    public TagNode (TagNode tag, TagScanner scanner)    {        this (tag.getPage (), tag.getTagBegin (), tag.getTagEnd (), tag.getAttributesEx ());        setThisScanner (scanner);    }    /**     * Returns the value of an attribute.     * @param name Name of attribute, case insensitive.     * @return The value associated with the attribute or null if it does     * not exist, or is a stand-alone or     */    public String getAttribute (String name)    {        Attribute attribute;        String ret;        ret = null;        if (name.equalsIgnoreCase (SpecialHashtable.TAGNAME))            ret = ((Attribute)getAttributesEx ().elementAt (0)).getName ();        else        {            attribute = getAttributeEx (name);            if (null != attribute)                ret = attribute.getValue ();        }        return (ret);    }    /**     * Set attribute with given key, value pair.     * Figures out a quote character to use if necessary.     * @param key The name of the attribute.     * @param value The value of the attribute.     */    public void setAttribute (String key, String value)    {        char ch;        boolean needed;        boolean singleq;        boolean doubleq;        String ref;        StringBuffer buffer;        char quote;        Attribute attribute;        // first determine if there's whitespace in the value        // and while we'return at it find a suitable quote character        needed = false;        singleq = true;        doubleq = true;        if (null != value)            for (int i = 0; i < value.length (); i++)            {                ch = value.charAt (i);                if (Character.isWhitespace (ch))                    needed = true;                else if ('\'' == ch)                    singleq  = false;                else if ('"' == ch)                    doubleq = false;            }        // now apply quoting        if (needed)        {            if (doubleq)                quote = '"';            else if (singleq)                quote = '\'';            else            {                // uh-oh, we need to convert some quotes into character references                // convert all double quotes into &#34;                quote = '"';                ref = "&quot;"; // Translate.encode (quote);                // JDK 1.4: value = value.replaceAll ("\"", ref);                buffer = new StringBuffer (value.length() * 5);                for (int i = 0; i < value.length (); i++)                {                    ch = value.charAt (i);                    if (quote == ch)                        buffer.append (ref);                    else                        buffer.append (ch);                }                value = buffer.toString ();            }        }        else            quote = 0;        attribute = getAttributeEx (key);        if (null != attribute)        {   // see if we can splice it in rather than replace it            attribute.setValue (value);            if (0 != quote)                attribute.setQuote (quote);        }        else            setAttribute (key, value, quote);    }    /**     * Remove the attribute with the given key, if it exists.     * @param key The name of the attribute.     */    public void removeAttribute (String key)    {        Attribute attribute;        attribute = getAttributeEx (key);        if (null != attribute)            getAttributesEx ().remove (attribute);    }    /**     * Set attribute with given key, value pair where the value is quoted by quote.     * @param key The name of the attribute.     * @param value The value of the attribute.     * @param quote The quote character to be used around value.     * If zero, it is an unquoted value.     */    public void setAttribute (String key, String value, char quote)    {        setAttribute (new Attribute (key, value, quote));    }    /**     * Returns the attribute with the given name.     * @param name Name of attribute, case insensitive.     * @return The attribute or null if it does     * not exist.     */    public Attribute getAttributeEx (String name)    {        Vector attributes;        int size;        Attribute attribute;        String string;        Attribute ret;        ret = null;        attributes = getAttributesEx ();        if (null != attributes)        {            size = attributes.size ();            for (int i = 0; i < size; i++)            {                attribute = (Attribute)attributes.elementAt (i);                string = attribute.getName ();                if ((null != string) && name.equalsIgnoreCase (string))                {                    ret = attribute;                    i = size; // exit fast                }            }        }        return (ret);    }    /**     * Set an attribute.     * @param attribute The attribute to set.     * @see #setAttribute(Attribute)     */    public void setAttributeEx (Attribute attribute)    {        setAttribute (attribute);    }    /**     * Set an attribute.     * This replaces an attribute of the same name.     * To set the zeroth attribute (the tag name), use setTagName().     * @param attribute The attribute to set.     */    public void setAttribute (Attribute attribute)    {        boolean replaced;        Vector attributes;        int length;        String name;        Attribute test;        String test_name;        replaced = false;        attributes = getAttributesEx ();        length =  attributes.size ();        if (0 < length)        {            name = attribute.getName ();            for (int i = 1; i < attributes.size (); i++)            {                test = (Attribute)attributes.elementAt (i);                test_name = test.getName ();                if (null != test_name)                    if (test_name.equalsIgnoreCase (name))                    {                        attributes.setElementAt (attribute, i);                        replaced = true;                    }            }        }        if (!replaced)        {            // add whitespace between attributes            if ((0 != length) && !((Attribute)attributes.elementAt (length - 1)).isWhitespace ())                attributes.addElement (new Attribute (" "));            attributes.addElement (attribute);        }    }    /**     * Gets the attributes in the tag.     * @return Returns the list of {@link Attribute Attributes} in the tag.     * The first element is the tag name, subsequent elements being either     * whitespace or real attributes.     */    public Vector getAttributesEx ()    {        return (mAttributes);    }    /**     * Gets the attributes in the tag.     * This is not the preferred  method to get attributes, see {@link     * #getAttributesEx getAttributesEx} which returns a list of {@link     * Attribute} objects, which offer more information than the simple     * <code>String</code> objects available from this <code>Hashtable</code>.     * @return Returns a list of name/value pairs representing the attributes.     * These are not in order, the keys (names) are converted to uppercase and the values     * are not quoted, even if they need to be. The table <em>will</em> return     * <code>null</code> if there was no value for an attribute (no equals     * sign or nothing to the right of the equals sign). A special entry with     * a key of SpecialHashtable.TAGNAME ("$<TAGNAME>$") holds the tag name.     * The conversion to uppercase is performed with an ENGLISH locale.     */    public Hashtable getAttributes ()    {        Vector attributes;        Attribute attribute;        String value;        Hashtable ret;        ret = new SpecialHashtable ();        attributes = getAttributesEx ();        if (0 < attributes.size ())        {            // special handling for the node name            attribute = (Attribute)attributes.elementAt (0);            ret.put (SpecialHashtable.TAGNAME, attribute.getName ().toUpperCase (Locale.ENGLISH));            // the rest            for (int i = 1; i < attributes.size (); i++)            {                attribute = (Attribute)attributes.elementAt (i);                if (!attribute.isWhitespace ())                {                    value = attribute.getValue ();                    if (attribute.isEmpty ())                        value = SpecialHashtable.NOTHING;                    if (null == value)                        value = SpecialHashtable.NULLVALUE;                    ret.put (attribute.getName ().toUpperCase (Locale.ENGLISH), value);                }            }        }        else            ret.put (SpecialHashtable.TAGNAME, "");        return (ret);    }    /**     * Return the name of this tag.     * <p>     * <em>     * Note: This value is converted to uppercase and does not     * begin with "/" if it is an end tag. Nor does it end with     * a slash in the case of an XML type tag.     * To get at the original text of the tag name use     * {@link #getRawTagName getRawTagName()}.     * The conversion to uppercase is performed with an ENGLISH locale.     * </em>     * @return The tag name.     */    public String getTagName ()    {        String ret;        ret = getRawTagName ();        if (null != ret)        {            ret = ret.toUpperCase (Locale.ENGLISH);            if (ret.startsWith ("/"))                ret = ret.substring (1);            if (ret.endsWith ("/"))                ret = ret.substring (0, ret.length () - 1);        }        return (ret);    }    /**     * Return the name of this tag.     * @return The tag name or null if this tag contains nothing or only     * whitespace.     */    public String getRawTagName ()    {        Vector attributes;        String ret;        ret = null;                attributes = getAttributesEx ();        if (0 != attributes.size ())            ret = ((Attribute)attributes.elementAt (0)).getName ();        return (ret);    }    /**     * Set the name of this tag.     * This creates or replaces the first attribute of the tag (the     * zeroth element of the attribute vector).     * @param name The tag name.     */    public void setTagName (String name)    {        Attribute attribute;        Vector attributes;        Attribute zeroth;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -