stringbean.java

来自「html 解析处理代码」· Java 代码 · 共 717 行 · 第 1/2 页
JAVA
717 行
// HTMLParser Library $Name: v1_6 $ - A java-based parser for HTML// http://sourceforge.org/projects/htmlparser// Copyright (C) 2004 Derrick Oswald//// Revision Control Information//// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/beans/StringBean.java,v $// $Author: derrickoswald $// $Date: 2006/05/30 01:07:14 $// $Revision: 1.45 $//// This library is free software; you can redistribute it and/or// modify it under the terms of the GNU Lesser General Public// License as published by the Free Software Foundation; either// version 2.1 of the License, or (at your option) any later version.//// This library is distributed in the hope that it will be useful,// but WITHOUT ANY WARRANTY; without even the implied warranty of// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU// Lesser General Public License for more details.//// You should have received a copy of the GNU Lesser General Public// License along with this library; if not, write to the Free Software// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA//package org.htmlparser.beans;import java.beans.PropertyChangeListener;import java.beans.PropertyChangeSupport;import java.io.Serializable;import java.net.URLConnection;import org.htmlparser.Parser;import org.htmlparser.Text;import org.htmlparser.tags.LinkTag;import org.htmlparser.Tag;import org.htmlparser.util.ParserException;import org.htmlparser.util.EncodingChangeException;import org.htmlparser.util.Translate;import org.htmlparser.visitors.NodeVisitor;/** * Extract strings from a URL. * <p>Text within &lt;SCRIPT&gt;&lt;/SCRIPT&gt; tags is removed.</p> * <p>The text within &lt;PRE&gt;&lt;/PRE&gt; tags is not altered.</p> * <p>The property <code>Strings</code>, which is the output property is null * until a URL is set. 
So a typical usage is:</p> * <pre> *     StringBean sb = new StringBean (); *     sb.setLinks (false); *     sb.setReplaceNonBreakingSpaces (true); *     sb.setCollapse (true); *     sb.setURL ("http://www.netbeans.org"); // the HTTP is performed here *     String s = sb.getStrings (); * </pre> * You can also use the StringBean as a NodeVisitor on your own parser, * in which case you have to refetch your page if you change one of the * properties because it resets the Strings property:</p> * <pre> *     StringBean sb = new StringBean (); *     Parser parser = new Parser ("http://cbc.ca"); *     parser.visitAllNodesWith (sb); *     String s = sb.getStrings (); *     sb.setLinks (true); *     parser.reset (); *     parser.visitAllNodesWith (sb); *     String sl = sb.getStrings (); * </pre> * According to Nick Burch, who contributed the patch, this is handy if you * don't want StringBean to wander off and get the content itself, either * because you already have it, it's not on a website etc. */public class StringBean extends NodeVisitor implements Serializable{    /**     * Property name in event where the URL contents changes.     */    public static final String PROP_STRINGS_PROPERTY = "strings";    /**     * Property name in event where the 'embed links' state changes.     */    public static final String PROP_LINKS_PROPERTY = "links";    /**     * Property name in event where the URL changes.     */    public static final String PROP_URL_PROPERTY = "URL";    /**     * Property name in event where the 'replace non-breaking spaces'     * state changes.     */    public static final String PROP_REPLACE_SPACE_PROPERTY =        "replaceNonBreakingSpaces";    /**     * Property name in event where the 'collapse whitespace' state changes.     */    public static final String PROP_COLLAPSE_PROPERTY = "collapse";    /**     * Property name in event where the connection changes.     
*/
    public static final String PROP_CONNECTION_PROPERTY = "connection";

    /**
     * A newline: the platform line separator, read once from the
     * <code>line.separator</code> system property.
     */
    private static final String NEWLINE = System.getProperty ("line.separator");

    /**
     * The length of the NEWLINE, in characters.
     */
    private static final int NEWLINE_SIZE = NEWLINE.length ();

    /**
     * Bound property support.
     */
    protected PropertyChangeSupport mPropertySupport;

    /**
     * The parser used to extract strings.
     */
    protected Parser mParser;

    /**
     * The strings extracted from the URL.
     */
    protected String mStrings;

    /**
     * If <code>true</code> the link URLs are embedded in the text output.
     */
    protected boolean mLinks;

    /**
     * If <code>true</code> regular space characters are substituted for
     * non-breaking spaces in the text output.
     */
    protected boolean mReplaceSpace;

    /**
     * If <code>true</code> sequences of whitespace characters are replaced
     * with a single space character.
     */
    protected boolean mCollapse;

    /**
     * The state of the collapse processing state machine
     * (0, 1 or 2, as described for the collapse method).
     */
    protected int mCollapseState;

    /**
     * The buffer text is stored in while traversing the HTML.
     */
    protected StringBuffer mBuffer;

    /**
     * Set <code>true</code> when traversing a SCRIPT tag.
     */
    protected boolean mIsScript;

    /**
     * Set <code>true</code> when traversing a PRE tag.
     */
    protected boolean mIsPre;

    /**
     * Set <code>true</code> when traversing a STYLE tag.
     */
    protected boolean mIsStyle;

    /**
     * Create a StringBean object.
* Default property values are set to 'do the right thing':     * <p><code>Links</code> is set <code>false</code> so text appears like a     * browser would display it, albeit without the colour or underline clues     * normally associated with a link.</p>     * <p><code>ReplaceNonBreakingSpaces</code> is set <code>true</code>, so     * that printing the text works, but the extra information regarding these     * formatting marks is available if you set it false.</p>     * <p><code>Collapse</code> is set <code>true</code>, so text appears     * compact like a browser would display it.</p>     */    public StringBean ()    {        super (true, true);        mPropertySupport = new PropertyChangeSupport (this);        mParser = new Parser ();        mStrings = null;        mLinks = false;        mReplaceSpace = true;        mCollapse = true;        mCollapseState = 0;        mBuffer = new StringBuffer (4096);        mIsScript = false;        mIsPre = false;        mIsStyle = false;    }    //    // internals    //    /**     * Appends a newline to the buffer if there isn't one there already.     * Except if the buffer is empty.     */    protected void carriageReturn ()    {        int length;        length = mBuffer.length ();        if ((0 != length) // don't append newlines to the beginning of a buffer            && ((NEWLINE_SIZE <= length) // not enough chars to hold a NEWLINE            && (!mBuffer.substring (                length - NEWLINE_SIZE, length).equals (NEWLINE))))            mBuffer.append (NEWLINE);        mCollapseState = 0;    }    /**     * Add the given text collapsing whitespace.     
* Use a little finite state machine:     * <pre>     * state 0: whitepace was last emitted character     * state 1: in whitespace     * state 2: in word     * A whitespace character moves us to state 1 and any other character     * moves us to state 2, except that state 0 stays in state 0 until     * a non-whitespace and going from whitespace to word we emit a space     * before the character:     *    input:     whitespace   other-character     * state\next     *    0               0             2     *    1               1        space then 2     *    2               1             2     * </pre>     * @param buffer The buffer to append to.     * @param string The string to append.     */    protected void collapse (StringBuffer buffer, String string)    {        int chars;        char character;        chars = string.length ();        if (0 != chars)        {            for (int i = 0; i < chars; i++)            {                character = string.charAt (i);                switch (character)                {                    // see HTML specification section 9.1 White space                    // http://www.w3.org/TR/html4/struct/text.html#h-9.1                    case '\u0020':                    case '\u0009':                    case '\u000C':                    case '\u200B':                    case '\r':                    case '\n':                        if (0 != mCollapseState)                            mCollapseState = 1;                        break;                    default:                        if (1 == mCollapseState)                            buffer.append (' ');                        mCollapseState = 2;                        buffer.append (character);                }            }        }    }    /**     * Extract the text from a page.     * @return The textual contents of the page.     * @exception ParserException If a parse error occurs.     
*/
    protected String extractStrings ()
        throws
            ParserException
    {
        mCollapseState = 0;
        try
        {
            mParser.visitAllNodesWith (this);
            return (mBuffer.toString ());
        }
        finally
        {
            // always hand the next traversal an empty buffer, even when this
            // one failed part way through (same policy as setStrings)
            mBuffer = new StringBuffer (4096);
        }
    }

    /**
     * Assign the <code>Strings</code> property, firing the property change.
     * Nothing happens when the current value is non-null and equal to the
     * new one.
     * @param strings The new value of the <code>Strings</code> property.
     */
    protected void updateStrings (String strings)
    {
        String oldValue;

        if ((null == mStrings) || !mStrings.equals (strings))
        {
            oldValue = mStrings;
            mStrings = strings;
            mPropertySupport.firePropertyChange (
                PROP_STRINGS_PROPERTY, oldValue, strings);
        }
    }

    /**
     * Fetch the URL contents.
     * Only do work if there is a valid parser with its URL set.
     */
    protected void setStrings ()
    {
        mCollapseState = 0;
        if (null != getURL ())
            try
            {
                try
                {
                    mParser.visitAllNodesWith (this);
                    updateStrings (mBuffer.toString ());
                }
                finally
                {
                    mBuffer = new StringBuffer (4096);
                }
            }
            catch (EncodingChangeException ece)
            {
                mIsPre = false;
                mIsScript = false;
                mIsStyle = false;
                try
                {   // try again with the encoding now in force
                    mParser.reset ();
                    mBuffer = new StringBuffer (4096);
                    mCollapseState = 0;
                    mParser.visitAllNodesWith (this);
                    updateStrings (mBuffer.toString ());
                }
                catch (ParserException pe)
                {
                    updateStrings (pe.toString ());
                }
                finally
                {
                    mBuffer = new StringBuffer
(4096);                }             }            catch (ParserException pe)            {                updateStrings (pe.toString ());            }
stringbean.java - 源码说明

本页面展示了「html 解析处理代码」中的 stringbean.java 源码文件，采用 Java 编程语言编写，共 717 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与html相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?