📄 stringbean.java
字号:
else {
            // Reset in case this StringBean is used as a visitor
            // on another parser, not its own.
            mStrings = null;
            mBuffer = new StringBuffer (4096);
        }
    }

    /**
     * Refetch the URL contents.
     * Work is only needed when text has already been produced (the cached
     * strings are non-null) and there is a parser to refetch with.
     * A {@link ParserException} raised while refetching is not propagated;
     * its text is handed to <code>updateStrings</code> instead.
     */
    private void resetStrings() {
        // Nothing has been extracted yet, so there is nothing to refresh.
        if (null == mStrings) {
            return;
        }
        // getURL(), getConnection(), setURL() and setConnection() all treat
        // mParser as possibly null; do the same here rather than risk a
        // NullPointerException from a property setter.
        if (null == mParser) {
            return;
        }
        try {
            mParser.setURL(getURL());
            setStrings();
        } catch (ParserException pe) {
            updateStrings(pe.toString());
        }
    }

    //
    // Property change support.
    //

    /**
     * Add a PropertyChangeListener to the listener list.
     * The listener is registered for all properties.
     * @param listener The PropertyChangeListener to be added.
     */
    public void addPropertyChangeListener(PropertyChangeListener listener) {
        mPropertySupport.addPropertyChangeListener(listener);
    }

    /**
     * Remove a PropertyChangeListener from the listener list.
     * This removes a registered PropertyChangeListener.
     * @param listener The PropertyChangeListener to be removed.
     */
    public void removePropertyChangeListener(PropertyChangeListener listener) {
        mPropertySupport.removePropertyChangeListener(listener);
    }

    //
    // Properties
    //

    /**
     * Return the textual contents of the URL.
     * This is the primary output of the bean.
     * If no strings are cached yet, they are produced first: via
     * <code>setStrings</code> when the internal buffer is empty, otherwise
     * via <code>updateStrings</code> with the buffer's current contents.
     * @return The user visible (what would be seen in a browser) text.
     */
    public String getStrings() {
        if (null == mStrings) {
            if (0 == mBuffer.length()) {
                setStrings();
            } else {
                updateStrings(mBuffer.toString());
            }
        }
        return mStrings;
    }

    /**
     * Get the current 'include links' state.
     * @return <code>true</code> if link text is included in the text extracted
     * from the URL, <code>false</code> otherwise.
     */
    public boolean getLinks() {
        return mLinks;
    }

    /**
     * Set the 'include links' state.
     * If the setting is changed after the URL has been set, the text from the
     * URL will be reacquired, which is possibly expensive.
     * Setting the value it already has does nothing.
     * @param links Use <code>true</code> if link text is to be included in the
     * text extracted from the URL, <code>false</code> otherwise.
     */
    public void setLinks(boolean links) {
        boolean previous = mLinks;
        if (previous == links) {
            return;
        }
        mLinks = links;
        mPropertySupport.firePropertyChange(PROP_LINKS_PROPERTY, previous, links);
        resetStrings();
    }

    /**
     * Get the current URL.
     * @return The URL from which text has been extracted, or <code>null</code>
     * if this property has not been set yet.
     */
    public String getURL() {
        if (null == mParser) {
            return null;
        }
        return mParser.getURL();
    }

    /**
     * Set the URL to extract strings from.
     * The text from the URL will be fetched, which may be expensive, so this
     * property should be set last.
     * Nothing happens when the new value equals the current one (including
     * both being <code>null</code>). A {@link ParserException} raised while
     * switching URLs is not propagated; its text is handed to
     * <code>updateStrings</code> instead.
     * @param url The URL that text should be fetched from.
     */
    public void setURL(String url) {
        String old = getURL();
        URLConnection conn = getConnection();

        boolean changed = (null == old) ? (null != url) : !old.equals(url);
        if (!changed) {
            return;
        }
        try {
            if (null == mParser) {
                mParser = new Parser(url);
            } else {
                mParser.setURL(url);
            }
            // Both the URL and the connection may have changed; report each.
            mPropertySupport.firePropertyChange(
                PROP_URL_PROPERTY, old, getURL());
            mPropertySupport.firePropertyChange(
                PROP_CONNECTION_PROPERTY, conn, mParser.getConnection());
            setStrings();
        } catch (ParserException pe) {
            updateStrings(pe.toString());
        }
    }

    /**
     * Get the current 'replace non breaking spaces' state.
     * @return <code>true</code> if non-breaking spaces (character '\u00a0',
     * numeric character reference &amp;#160; or character entity
     * reference &amp;nbsp;) are to be replaced with normal
     * spaces (character '\u0020').
     */
    public boolean getReplaceNonBreakingSpaces() {
        return mReplaceSpace;
    }

    /**
     * Set the 'replace non breaking spaces' state.
     * If the setting is changed after the URL has been set, the text from the
     * URL will be reacquired, which is possibly expensive.
     * Setting the value it already has does nothing.
     * @param replace <code>true</code> if non-breaking spaces
     * (character '\u00a0', numeric character reference &amp;#160;
     * or character entity reference &amp;nbsp;) are to be replaced with normal
     * spaces (character '\u0020').
     */
    public void setReplaceNonBreakingSpaces(boolean replace) {
        boolean previous = mReplaceSpace;
        if (previous == replace) {
            return;
        }
        mReplaceSpace = replace;
        mPropertySupport.firePropertyChange(
            PROP_REPLACE_SPACE_PROPERTY, previous, replace);
        resetStrings();
    }

    /**
     * Get the current 'collapse whitespace' state.
     * If set to <code>true</code> this emulates the operation of browsers
     * in interpreting text where <q>user agents should collapse input
     * white space sequences when producing output inter-word space</q>.
     * See HTML specification section 9.1 White space
     * <a href="http://www.w3.org/TR/html4/struct/text.html#h-9.1">
     * http://www.w3.org/TR/html4/struct/text.html#h-9.1</a>.
     * @return <code>true</code> if sequences of whitespace (space '\u0020',
     * tab '\u0009', form feed '\u000C', zero-width space '\u200B',
     * carriage-return '\r' and NEWLINE '\n') are to be replaced with a single
     * space.
     */
    public boolean getCollapse() {
        return mCollapse;
    }

    /**
     * Set the current 'collapse whitespace' state.
     * If the setting is changed after the URL has been set, the text from the
     * URL will be reacquired, which is possibly expensive.
     * The internal state of the collapse state machine is reset on every
     * call, even when the value does not change, so it can be reset with
     * code like this:
     * <code>setCollapse (getCollapse ());</code>
     * @param collapse If <code>true</code>, sequences of whitespace
     * will be reduced to a single space.
     */
    public void setCollapse(boolean collapse) {
        // Unconditional: this is what makes setCollapse(getCollapse()) useful.
        mCollapseState = 0;

        boolean previous = mCollapse;
        if (previous == collapse) {
            return;
        }
        mCollapse = collapse;
        mPropertySupport.firePropertyChange(
            PROP_COLLAPSE_PROPERTY, previous, collapse);
        resetStrings();
    }

    /**
     * Get the current connection.
     * @return The connection that the parser has or <code>null</code> if it
     * hasn't been set or the parser hasn't been constructed yet.
     */
    public URLConnection getConnection() {
        if (null == mParser) {
            return null;
        }
        return mParser.getConnection();
    }

    /**
     * Set the parser's connection.
     * The text from the URL will be fetched, which may be expensive, so this
     * property should be set last.
     * Nothing happens when the new value equals the current one (including
     * both being <code>null</code>). A {@link ParserException} raised while
     * switching connections is not propagated; its text is handed to
     * <code>updateStrings</code> instead.
     * @param connection New value of property Connection.
     */
    public void setConnection(URLConnection connection) {
        String url = getURL();
        URLConnection conn = getConnection();

        boolean changed = (null == conn)
            ? (null != connection)
            : !conn.equals(connection);
        if (!changed) {
            return;
        }
        try {
            if (null == mParser) {
                mParser = new Parser(connection);
            } else {
                mParser.setConnection(connection);
            }
            // Both the URL and the connection may have changed; report each.
            mPropertySupport.firePropertyChange(
                PROP_URL_PROPERTY, url, getURL());
            mPropertySupport.firePropertyChange(
                PROP_CONNECTION_PROPERTY, conn, mParser.getConnection());
            setStrings();
        } catch (ParserException pe) {
            updateStrings(pe.toString());
        }
    }

    //
    // NodeVisitor overrides
    //

    /**
     * Appends the text to the output.
     * Text is ignored while inside SCRIPT or STYLE. Inside PRE it is appended
     * untouched; otherwise it is decoded, optionally has non-breaking spaces
     * replaced, and is optionally whitespace-collapsed before being appended.
     * @param string The text node.
     */
    public void visitStringNode(Text string) {
        if (mIsScript || mIsStyle) {
            return;
        }

        String text = string.getText();
        if (mIsPre) {
            // Preformatted text is emitted exactly as found.
            mBuffer.append(text);
            return;
        }

        text = Translate.decode(text);
        if (getReplaceNonBreakingSpaces()) {
            text = text.replace('\u00a0', ' ');
        }
        if (getCollapse()) {
            collapse(mBuffer, text);
        } else {
            mBuffer.append(text);
        }
    }

    /**
     * Processes a start tag.
     * When links are enabled, a link tag's target is appended to the output
     * between angle brackets. The PRE, SCRIPT and STYLE flags are set when
     * the corresponding tag is seen, and <code>carriageReturn</code> is
     * called if the tag breaks flow.
     * @param tag The tag to examine.
     */
    public void visitTag(Tag tag) {
        if ((tag instanceof LinkTag) && getLinks()) {
            // Appends the link as text between angle brackets to the output.
            mBuffer.append("<");
            mBuffer.append(((LinkTag) tag).getLink());
            mBuffer.append(">");
        }

        String name = tag.getTagName();
        if (name.equalsIgnoreCase("PRE")) {
            mIsPre = true;
        } else if (name.equalsIgnoreCase("SCRIPT")) {
            mIsScript = true;
        } else if (name.equalsIgnoreCase("STYLE")) {
            mIsStyle = true;
        }

        if (tag.breaksFlow()) {
            carriageReturn();
        }
    }

    /**
     * Resets the state of the PRE, SCRIPT and STYLE flags.
     * @param tag The end tag to process.
     */
    public void visitEndTag(Tag tag) {
        String name = tag.getTagName();
        if (name.equalsIgnoreCase("PRE")) {
            mIsPre = false;
        } else if (name.equalsIgnoreCase("SCRIPT")) {
            mIsScript = false;
        } else if (name.equalsIgnoreCase("STYLE")) {
            mIsStyle = false;
        }
    }

    /**
     * Unit test.
     * Prints the text extracted from the given URL with links excluded,
     * non-breaking spaces replaced and whitespace collapsed, or a usage
     * message when no argument is supplied.
     * @param args Pass arg[0] as the URL to process.
     */
    public static void main(String[] args) {
        if (args.length <= 0) {
            System.out.println("Usage: java -classpath htmlparser.jar"
                + " org.htmlparser.beans.StringBean <http://whatever_url>");
            return;
        }

        StringBean sb = new StringBean();
        sb.setLinks(false);
        sb.setReplaceNonBreakingSpaces(true);
        sb.setCollapse(true);
        // Set the URL last: it triggers the (possibly expensive) fetch.
        sb.setURL(args[0]);
        System.out.println(sb.getStrings());
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -