📄 stringbean.java
字号:
// HTMLParser Library $Name: v1_6 $ - A java-based parser for HTML// http://sourceforge.org/projects/htmlparser// Copyright (C) 2004 Derrick Oswald//// Revision Control Information//// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/beans/StringBean.java,v $// $Author: derrickoswald $// $Date: 2006/05/30 01:07:14 $// $Revision: 1.45 $//// This library is free software; you can redistribute it and/or// modify it under the terms of the GNU Lesser General Public// License as published by the Free Software Foundation; either// version 2.1 of the License, or (at your option) any later version.//// This library is distributed in the hope that it will be useful,// but WITHOUT ANY WARRANTY; without even the implied warranty of// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU// Lesser General Public License for more details.//// You should have received a copy of the GNU Lesser General Public// License along with this library; if not, write to the Free Software// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA//package org.htmlparser.beans;import java.beans.PropertyChangeListener;import java.beans.PropertyChangeSupport;import java.io.Serializable;import java.net.URLConnection;import org.htmlparser.Parser;import org.htmlparser.Text;import org.htmlparser.tags.LinkTag;import org.htmlparser.Tag;import org.htmlparser.util.ParserException;import org.htmlparser.util.EncodingChangeException;import org.htmlparser.util.Translate;import org.htmlparser.visitors.NodeVisitor;/** * Extract strings from a URL. * <p>Text within <SCRIPT></SCRIPT> tags is removed.</p> * <p>The text within <PRE></PRE> tags is not altered.</p> * <p>The property <code>Strings</code>, which is the output property is null * until a URL is set. So a typical usage is:</p> * <pre> * StringBean sb = new StringBean (); * sb.setLinks (false); * sb.setReplaceNonBreakingSpaces (true); * sb.setCollapse (true); * sb.setURL ("http://www.netbeans.org"); // the HTTP is performed here * String s = sb.getStrings (); * </pre> * You can also use the StringBean as a NodeVisitor on your own parser, * in which case you have to refetch your page if you change one of the * properties because it resets the Strings property:</p> * <pre> * StringBean sb = new StringBean (); * Parser parser = new Parser ("http://cbc.ca"); * parser.visitAllNodesWith (sb); * String s = sb.getStrings (); * sb.setLinks (true); * parser.reset (); * parser.visitAllNodesWith (sb); * String sl = sb.getStrings (); * </pre> * According to Nick Burch, who contributed the patch, this is handy if you * don't want StringBean to wander off and get the content itself, either * because you already have it, it's not on a website etc. */public class StringBean extends NodeVisitor implements Serializable{ /** * Property name in event where the URL contents changes. */ public static final String PROP_STRINGS_PROPERTY = "strings"; /** * Property name in event where the 'embed links' state changes. */ public static final String PROP_LINKS_PROPERTY = "links"; /** * Property name in event where the URL changes. */ public static final String PROP_URL_PROPERTY = "URL"; /** * Property name in event where the 'replace non-breaking spaces' * state changes. */ public static final String PROP_REPLACE_SPACE_PROPERTY = "replaceNonBreakingSpaces"; /** * Property name in event where the 'collapse whitespace' state changes. */ public static final String PROP_COLLAPSE_PROPERTY = "collapse"; /** * Property name in event where the connection changes. */ public static final String PROP_CONNECTION_PROPERTY = "connection"; /** * A newline. */ private static final String NEWLINE = System.getProperty ("line.separator"); /** * The length of the NEWLINE. */ private static final int NEWLINE_SIZE = NEWLINE.length (); /** * Bound property support. */ protected PropertyChangeSupport mPropertySupport; /** * The parser used to extract strings. */ protected Parser mParser; /** * The strings extracted from the URL. */ protected String mStrings; /** * If <code>true</code> the link URLs are embedded in the text output. */ protected boolean mLinks; /** * If <code>true</code> regular space characters are substituted for * non-breaking spaces in the text output. */ protected boolean mReplaceSpace; /** * If <code>true</code> sequences of whitespace characters are replaced * with a single space character. */ protected boolean mCollapse; /** * The state of the collapse processiung state machine. */ protected int mCollapseState; /** * The buffer text is stored in while traversing the HTML. */ protected StringBuffer mBuffer; /** * Set <code>true</code> when traversing a SCRIPT tag. */ protected boolean mIsScript; /** * Set <code>true</code> when traversing a PRE tag. */ protected boolean mIsPre; /** * Set <code>true</code> when traversing a STYLE tag. */ protected boolean mIsStyle; /** * Create a StringBean object. * Default property values are set to 'do the right thing': * <p><code>Links</code> is set <code>false</code> so text appears like a * browser would display it, albeit without the colour or underline clues * normally associated with a link.</p> * <p><code>ReplaceNonBreakingSpaces</code> is set <code>true</code>, so * that printing the text works, but the extra information regarding these * formatting marks is available if you set it false.</p> * <p><code>Collapse</code> is set <code>true</code>, so text appears * compact like a browser would display it.</p> */ public StringBean () { super (true, true); mPropertySupport = new PropertyChangeSupport (this); mParser = new Parser (); mStrings = null; mLinks = false; mReplaceSpace = true; mCollapse = true; mCollapseState = 0; mBuffer = new StringBuffer (4096); mIsScript = false; mIsPre = false; mIsStyle = false; } // // internals // /** * Appends a newline to the buffer if there isn't one there already. * Except if the buffer is empty. */ protected void carriageReturn () { int length; length = mBuffer.length (); if ((0 != length) // don't append newlines to the beginning of a buffer && ((NEWLINE_SIZE <= length) // not enough chars to hold a NEWLINE && (!mBuffer.substring ( length - NEWLINE_SIZE, length).equals (NEWLINE)))) mBuffer.append (NEWLINE); mCollapseState = 0; } /** * Add the given text collapsing whitespace. * Use a little finite state machine: * <pre> * state 0: whitepace was last emitted character * state 1: in whitespace * state 2: in word * A whitespace character moves us to state 1 and any other character * moves us to state 2, except that state 0 stays in state 0 until * a non-whitespace and going from whitespace to word we emit a space * before the character: * input: whitespace other-character * state\next * 0 0 2 * 1 1 space then 2 * 2 1 2 * </pre> * @param buffer The buffer to append to. * @param string The string to append. */ protected void collapse (StringBuffer buffer, String string) { int chars; char character; chars = string.length (); if (0 != chars) { for (int i = 0; i < chars; i++) { character = string.charAt (i); switch (character) { // see HTML specification section 9.1 White space // http://www.w3.org/TR/html4/struct/text.html#h-9.1 case '\u0020': case '\u0009': case '\u000C': case '\u200B': case '\r': case '\n': if (0 != mCollapseState) mCollapseState = 1; break; default: if (1 == mCollapseState) buffer.append (' '); mCollapseState = 2; buffer.append (character); } } } } /** * Extract the text from a page. * @return The textual contents of the page. * @exception ParserException If a parse error occurs. */ protected String extractStrings () throws ParserException { String ret; mCollapseState = 0; mParser.visitAllNodesWith (this); ret = mBuffer.toString (); mBuffer = new StringBuffer(4096); return (ret); } /** * Assign the <code>Strings</code> property, firing the property change. * @param strings The new value of the <code>Strings</code> property. */ protected void updateStrings (String strings) { String oldValue; if ((null == mStrings) || !mStrings.equals (strings)) { oldValue = mStrings; mStrings = strings; mPropertySupport.firePropertyChange ( PROP_STRINGS_PROPERTY, oldValue, strings); } } /** * Fetch the URL contents. * Only do work if there is a valid parser with it's URL set. */ protected void setStrings () { mCollapseState = 0; if (null != getURL ()) try { try { mParser.visitAllNodesWith (this); updateStrings (mBuffer.toString ()); } finally { mBuffer = new StringBuffer (4096); } } catch (EncodingChangeException ece) { mIsPre = false; mIsScript = false; mIsStyle = false; try { // try again with the encoding now in force mParser.reset (); mBuffer = new StringBuffer (4096); mCollapseState = 0; mParser.visitAllNodesWith (this); updateStrings (mBuffer.toString ()); } catch (ParserException pe) { updateStrings (pe.toString ()); } finally { mBuffer = new StringBuffer (4096); } } catch (ParserException pe) { updateStrings (pe.toString ()); }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -