📄 stringbean.java

📁 html 解析处理代码
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
上一页 12
        else        {            // reset in case this StringBean is used as a visitor            // on another parser, not it's own            mStrings = null;            mBuffer = new StringBuffer (4096);        }    }    /**     * Refetch the URL contents.     * Only need to worry if there is already a valid parser and it's     * been spent fetching the string contents.     */    private void resetStrings ()    {        if (null != mStrings)            try            {                mParser.setURL (getURL ());                setStrings ();            }            catch (ParserException pe)            {                updateStrings (pe.toString ());            }    }    //    // Property change support.    //    /**     * Add a PropertyChangeListener to the listener list.     * The listener is registered for all properties.     * @param listener The PropertyChangeListener to be added.     */    public void addPropertyChangeListener (PropertyChangeListener listener)    {        mPropertySupport.addPropertyChangeListener (listener);    }    /**     * Remove a PropertyChangeListener from the listener list.     * This removes a registered PropertyChangeListener.     * @param listener The PropertyChangeListener to be removed.     */    public void removePropertyChangeListener (PropertyChangeListener listener)    {        mPropertySupport.removePropertyChangeListener (listener);    }    //    // Properties    //    /**     * Return the textual contents of the URL.     * This is the primary output of the bean.     * @return The user visible (what would be seen in a browser) text.     */    public String getStrings ()    {        if (null == mStrings)        if (0 == mBuffer.length ())            setStrings ();        else            updateStrings (mBuffer.toString ());        return (mStrings);    }    /**     * Get the current 'include links' state.     
* @return <code>true</code> when link text is part of the text extracted
     * from the URL, <code>false</code> when it is left out.
     */
    public boolean getLinks ()
    {
        return mLinks;
    }

    /**
     * Change the 'include links' state.
     * Nothing happens when the new value equals the current one. Otherwise
     * the flag is stored, a property change event is fired and
     * resetStrings() is called, so changing this after the URL has been
     * set reacquires the text, which is possibly expensive.
     * @param links Use <code>true</code> if link text is to be included in the
     * text extracted from the URL, <code>false</code> otherwise.
     */
    public void setLinks (boolean links)
    {
        final boolean previous = mLinks;

        if (previous == links)
            return; // unchanged: no event, no refetch

        mLinks = links;
        mPropertySupport.firePropertyChange (
            PROP_LINKS_PROPERTY, previous, links);
        resetStrings ();
    }

    /**
     * Get the current URL.
     * @return The URL reported by the parser, or <code>null</code>
     * if no parser has been created yet.
     */
    public String getURL ()
    {
        if (null == mParser)
            return (null);

        return (mParser.getURL ());
    }

    /**
     * Set the URL to extract strings from.
     * The text from the URL will be fetched, which may be expensive, so this
     * property should be set last.
     * @param url The URL that text should be fetched from.
*/
    public void setURL (String url)
    {
        // snapshot both properties before anything changes, for the events
        final String previousUrl = getURL ();
        final URLConnection previousConnection = getConnection ();

        // changed when exactly one side is null or the two strings differ
        final boolean changed = (null == previousUrl)
            ? (null != url)
            : !previousUrl.equals (url);
        if (!changed)
            return;

        try
        {
            if (null != mParser)
                mParser.setURL (url);
            else
                mParser = new Parser (url);
            mPropertySupport.firePropertyChange (
                PROP_URL_PROPERTY, previousUrl, getURL ());
            mPropertySupport.firePropertyChange (
                PROP_CONNECTION_PROPERTY, previousConnection,
                mParser.getConnection ());
            setStrings ();
        }
        catch (ParserException pe)
        {
            // the exception text is handed to updateStrings rather than thrown
            updateStrings (pe.toString ());
        }
    }

    /**
     * Get the current 'replace non breaking spaces' state.
     * @return <code>true</code> if non-breaking spaces (character '&#92;u00a0',
     * numeric character reference &amp;#160; or character entity
     * reference &amp;nbsp;) are to be replaced with normal
     * spaces (character '&#92;u0020').
     */
    public boolean getReplaceNonBreakingSpaces ()
    {
        return mReplaceSpace;
    }

    /**
     * Set the 'replace non breaking spaces' state.
     * If the setting is changed after the URL has been set, the text from the
     * URL will be reacquired, which is possibly expensive.
     * @param replace <code>true</code> if non-breaking spaces
     * (character '&#92;u00a0', numeric character reference &amp;#160;
     * or character entity reference &amp;nbsp;) are to be replaced with normal
     * spaces (character '&#92;u0020').
*/    public void setReplaceNonBreakingSpaces (boolean replace)    {        boolean oldValue = mReplaceSpace;        if (oldValue != replace)        {            mReplaceSpace = replace;            mPropertySupport.firePropertyChange (PROP_REPLACE_SPACE_PROPERTY,                oldValue, replace);            resetStrings ();        }    }    /**     * Get the current 'collapse whitespace' state.     * If set to <code>true</code> this emulates the operation of browsers     * in interpretting text where <quote>user agents should collapse input     * white space sequences when producing output inter-word space</quote>.     * See HTML specification section 9.1 White space     * <a href="http://www.w3.org/TR/html4/struct/text.html#h-9.1">     * http://www.w3.org/TR/html4/struct/text.html#h-9.1</a>.     * @return <code>true</code> if sequences of whitespace (space '&#92;u0020',     * tab '&#92;u0009', form feed '&#92;u000C', zero-width space '&#92;u200B',     * carriage-return '\r' and NEWLINE '\n') are to be replaced with a single     * space.     */    public boolean getCollapse ()    {        return (mCollapse);    }    /**     * Set the current 'collapse whitespace' state.     * If the setting is changed after the URL has been set, the text from the     * URL will be reacquired, which is possibly expensive.     * The internal state of the collapse state machine can be reset with     * code like this:     * <code>setCollapse (getCollapse ());</code>     * @param collapse If <code>true</code>, sequences of whitespace     * will be reduced to a single space.     */    public void setCollapse (boolean collapse)    {        mCollapseState = 0;        boolean oldValue = mCollapse;        if (oldValue != collapse)        {            mCollapse = collapse;            mPropertySupport.firePropertyChange (                    PROP_COLLAPSE_PROPERTY, oldValue, collapse);            resetStrings ();        }    }    /**     * Get the current connection.     
* @return The connection that the parser has or <code>null</code> if it     * hasn't been set or the parser hasn't been constructed yet.     */    public URLConnection getConnection ()    {        return ((null != mParser) ? mParser.getConnection () : null);    }    /**     * Set the parser's connection.     * The text from the URL will be fetched, which may be expensive, so this     * property should be set last.     * @param connection New value of property Connection.     */    public void setConnection (URLConnection connection)    {        String url;        URLConnection conn;        url = getURL ();        conn = getConnection ();        if (((null == conn) && (null != connection))            || ((null != conn) && !conn.equals (connection)))        {            try            {                if (null == mParser)                    mParser = new Parser (connection);                else                    mParser.setConnection (connection);                mPropertySupport.firePropertyChange (                    PROP_URL_PROPERTY, url, getURL ());                mPropertySupport.firePropertyChange (                    PROP_CONNECTION_PROPERTY, conn, mParser.getConnection ());                setStrings ();            }            catch (ParserException pe)            {                updateStrings (pe.toString ());            }        }    }    //    // NodeVisitor overrides    //    /**     * Appends the text to the output.     * @param string The text node.     
*/    public void visitStringNode (Text string)    {        if (!mIsScript && !mIsStyle)        {            String text = string.getText ();            if (!mIsPre)            {                text = Translate.decode (text);                if (getReplaceNonBreakingSpaces ())                    text = text.replace ('\u00a0', ' ');                if (getCollapse ())                    collapse (mBuffer, text);                else                    mBuffer.append (text);            }            else                mBuffer.append (text);        }    }    /**     * Appends a NEWLINE to the output if the tag breaks flow, and     * possibly sets the state of the PRE and SCRIPT flags.     * @param tag The tag to examine.     */    public void visitTag (Tag tag)    {        String name;        if (tag instanceof LinkTag)            if (getLinks ())            { // appends the link as text between angle brackets to the output.                mBuffer.append ("<");                mBuffer.append (((LinkTag)tag).getLink ());                mBuffer.append (">");            }        name = tag.getTagName ();        if (name.equalsIgnoreCase ("PRE"))            mIsPre = true;        else if (name.equalsIgnoreCase ("SCRIPT"))            mIsScript = true;        else if (name.equalsIgnoreCase ("STYLE"))            mIsStyle = true;        if (tag.breaksFlow ())            carriageReturn ();    }    /**     * Resets the state of the PRE and SCRIPT flags.     * @param tag The end tag to process.     */    public void visitEndTag (Tag tag)    {        String name;        name = tag.getTagName ();        if (name.equalsIgnoreCase ("PRE"))            mIsPre = false;        else if (name.equalsIgnoreCase ("SCRIPT"))            mIsScript = false;        else if (name.equalsIgnoreCase ("STYLE"))            mIsStyle = false;    }    /**     * Unit test.     * @param args Pass arg[0] as the URL to process.     
*/
    public static void main (String[] args)
    {
        // without a URL argument there is nothing to do but show the usage
        if (args.length <= 0)
        {
            System.out.println ("Usage: java -classpath htmlparser.jar"
                + " org.htmlparser.beans.StringBean <http://whatever_url>");
            return;
        }

        final StringBean bean = new StringBean ();
        bean.setLinks (false);
        bean.setReplaceNonBreakingSpaces (true);
        bean.setCollapse (true);
        // the URL is set after the other properties, just before the text is read
        bean.setURL (args[0]);
        System.out.println (bean.getStrings ());
    }
}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -