📄 sitecapturer.java
字号:
// HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML// http://sourceforge.org/projects/htmlparser// Copyright (C) 2003 Derrick Oswald//// Revision Control Information//// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserapplications/SiteCapturer.java,v $// $Author: derrickoswald $// $Date: 2005/04/12 11:27:41 $// $Revision: 1.9 $//// This library is free software; you can redistribute it and/or// modify it under the terms of the GNU Lesser General Public// License as published by the Free Software Foundation; either// version 2.1 of the License, or (at your option) any later version.//// This library is distributed in the hope that it will be useful,// but WITHOUT ANY WARRANTY; without even the implied warranty of// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU// Lesser General Public License for more details.//// You should have received a copy of the GNU Lesser General Public// License along with this library; if not, write to the Free Software// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA//package org.htmlparser.parserapplications;import java.io.File;import java.io.FileNotFoundException;import java.io.FileOutputStream;import java.io.IOException;import java.io.InputStream;import java.io.PrintWriter;import java.net.MalformedURLException;import java.net.URL;import java.net.URLConnection;import java.util.ArrayList;import java.util.HashSet;import javax.swing.JFileChooser;import javax.swing.JOptionPane;import org.htmlparser.NodeFilter;import org.htmlparser.Parser;import org.htmlparser.PrototypicalNodeFactory;import org.htmlparser.filters.AndFilter;import org.htmlparser.filters.HasAttributeFilter;import org.htmlparser.filters.NodeClassFilter;import org.htmlparser.tags.BaseHrefTag;import org.htmlparser.tags.FrameTag;import org.htmlparser.tags.ImageTag;import org.htmlparser.tags.LinkTag;import org.htmlparser.tags.MetaTag;import org.htmlparser.util.EncodingChangeException;import 
org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

/**
 * Save a web site locally.
 * Illustrative program to save a web site contents locally.
 * It was created to demonstrate URL rewriting in it's simplest form.
 * It uses customized tags in the NodeFactory to alter the URLs.
 * This program has a number of limitations:
 * <ul>
 * <li>it doesn't capture forms, this would involve too many assumptions</li>
 * <li>it doesn't capture script references, so funky onMouseOver and other
 * non-static content will not be faithfully reproduced</li>
 * <li>it doesn't handle style sheets</li>
 * <li>it doesn't dig into attributes that might reference resources, so
 * for example, background images won't necessarily be captured</li>
 * <li>worst of all, it gets confused when a URL both has content and is
 * the prefix for other content,
 * i.e. http://whatever.com/top and http://whatever.com/top/sub.html both
 * yield content, since this cannot be faithfully replicated to a static
 * directory structure (this happens a lot with servlet based sites)</li>
 *</ul>
 */
public class SiteCapturer
{
    // NOTE(review): collections below are raw (pre-generics) types,
    // consistent with the library's Java 1.4-era style.

    /**
     * The web site to capture.
     * This is used as the base URL in deciding whether to adjust a link
     * and whether to capture a page or not.
     */
    protected String mSource;

    /**
     * The local directory to capture to.
     * This is used as a base prefix for files saved locally.
     */
    protected String mTarget;

    /**
     * The list of pages to capture.
     * Links are added to this list as they are discovered, and removed in
     * sequential order (FIFO queue) leading to a breadth
     * first traversal of the web site space.
     */
    protected ArrayList mPages;

    /**
     * The set of pages already captured.
     * Used to avoid repeated acquisition of the same page.
     */
    protected HashSet mFinished;

    /**
     * The list of resources to copy.
     * Images and other resources are added to this list as they are discovered.
     */
    protected ArrayList mImages;

    /**
     * The set of resources already copied.
     * Used to avoid repeated acquisition of the same images and other resources.
     */
    protected HashSet mCopied;

    /**
     * The parser to use for processing.
     */
    protected Parser mParser;

    /**
     * If <code>true</code>, save resources locally too,
     * otherwise, leave resource links pointing to original page.
     */
    protected boolean mCaptureResources;

    /**
     * The filter to apply to the nodes retrieved.
     */
    protected NodeFilter mFilter;

    /**
     * Copy buffer size.
     * Resources are moved to disk in chunks this size or less.
     */
    protected final int TRANSFER_SIZE = 4096;

    /**
     * Create a web site capturer.
     */
    public SiteCapturer ()
    {
        PrototypicalNodeFactory factory;

        mSource = null;
        mTarget = null;
        mPages = new ArrayList ();
        mFinished = new HashSet ();
        mImages = new ArrayList ();
        mCopied = new HashSet ();
        mParser = new Parser ();
        // register the customized tag classes so the parser yields nodes
        // whose URLs can be rewritten to local form (see class javadoc)
        factory = new PrototypicalNodeFactory ();
        factory.registerTag (new LocalLinkTag ());
        factory.registerTag (new LocalFrameTag ());
        factory.registerTag (new LocalBaseHrefTag ());
        factory.registerTag (new LocalImageTag ());
        mParser.setNodeFactory (factory);
        mCaptureResources = true;
        mFilter = null;
    }

    /**
     * Getter for property source.
     * @return Value of property source.
     */
    public String getSource ()
    {
        return (mSource);
    }

    /**
     * Setter for property source.
     * This is the base URL to capture. URL's that don't start with this prefix
     * are ignored (left as is), while the ones with this URL as a base are
     * re-homed to the local target.
     * @param source New value of property source.
     */
    public void setSource (String source)
    {
        // strip a trailing slash so later prefix comparisons are uniform
        if (source.endsWith ("/"))
            source = source.substring (0, source.length () - 1);

        mSource = source;
    }

    /**
     * Getter for property target.
     * @return Value of property target.
     */
    public String getTarget ()
    {
        return (mTarget);
    }

    /**
     * Setter for property target.
     * This is the local directory under which to save the site's pages.
     * @param target New value of property target.
     */
    public void setTarget (String target)
    {
        mTarget = target;
    }

    /**
     * Getter for property captureResources.
     * If <code>true</code>, the images and other resources referenced by
     * the site and within the base URL tree are also copied locally to the
     * target directory. If <code>false</code>, the image links are left 'as
     * is', still refering to the original site.
     * @return Value of property captureResources.
     */
    public boolean getCaptureResources ()
    {
        return (mCaptureResources);
    }

    /**
     * Setter for property captureResources.
     * @param capture New value of property captureResources.
     */
    public void setCaptureResources (boolean capture)
    {
        mCaptureResources = capture;
    }

    /** Getter for property filter.
     * @return Value of property filter.
     *
     */
    public NodeFilter getFilter ()
    {
        return (mFilter);
    }

    /** Setter for property filter.
     * @param filter New value of property filter.
     *
     */
    public void setFilter (NodeFilter filter)
    {
        mFilter = filter;
    }

    /**
     * Returns <code>true</code> if the link is one we are interested in.
     * @param link The link to be checked.
     * @return <code>true</code> if the link has the source URL as a prefix
     * and doesn't contain '?' or '#'; the former because we won't be able to
     * handle server side queries in the static target directory structure and
     * the latter because presumably the full page with that reference has
     * already been captured previously. This performs a case insensitive
     * comparison, which is cheating really, but it's cheap.
     */
    protected boolean isToBeCaptured (String link)
    {
        return (
            link.toLowerCase ().startsWith (getSource ().toLowerCase ())
            && (-1 == link.indexOf ("?"))
            && (-1 == link.indexOf ("#")));
    }

    /**
     * Returns <code>true</code> if the link contains text/html content.
     * @param link The URL to check for content type.
     * @return <code>true</code> if the HTTP header indicates the type is
     * "text/html".
     * @exception ParserException If the supplied URL can't be read from.
     */
    protected boolean isHtml (String link)
        throws ParserException
    {
        URL url;
        URLConnection connection;
        String type;
        boolean ret;

        ret = false;

        try
        {
            // opening the connection fetches the headers; only the
            // Content-Type header is consulted, not the body
            url = new URL (link);
            connection = url.openConnection ();
            type = connection.getContentType ();
            if (type == null)
                ret = false;
            else
                ret = type.startsWith ("text/html");
        }
        catch (Exception e)
        {
            // wrap with the cause preserved so callers see the original error
            throw new ParserException ("URL " + link + " has a problem", e);
        }

        return (ret);
    }

    /**
     * Converts a link to local.
     * A relative link can be used to construct both a URL and a file name.
     * Basically, the operation is to strip off the base url, if any,
     * and then prepend as many dot-dots as necessary to make
     * it relative to the current page.
     * A bit of a kludge handles the root page specially by calling it
     * index.html, even though that probably isn't it's real file name.
     * This isn't pretty, but it works for me.
     * @param link The link to make relative.
     * @param current The current page URL, or empty if it's an absolute URL
     * that needs to be converted.
     * @return The URL relative to the current page.
     */
    protected String makeLocalLink (String link, String current)
    {
        int i;
        int j;
        String ret;

        if (link.equals (getSource ()) || (!getSource ().endsWith ("/") && link.equals (getSource () + "/")))
            ret = "index.html"; // handle the root page specially
        else if (link.startsWith (getSource ())
                && (link.length () > getSource ().length ()))
            ret = link.substring (getSource ().length () + 1);
        else
            ret = link; // give up

        // make it relative to the current page by prepending "../" for
        // each '/' in the current local path
        if ((null != current)
            && link.startsWith (getSource ())
            && (current.length () > getSource ().length ()))
        {
            current = current.substring (getSource ().length () + 1);
            i = 0;
            while (-1 != (j = current.indexOf ('/', i)))
            {
                ret = "../" + ret;
                i = j + 1;
            }
        }

        return (ret);
    }

    /**
     * Unescape a URL to form a file name.
     * Very crude.
     * @param raw The escaped URI.
     * @return The native URI.
     */
    protected String decode (String raw)
    {
        int length;
        int start;
        int index;
        int value;
        StringBuffer ret;

        ret = new StringBuffer (raw.length ());
        length = raw.length ();
        start = 0;
        // scan for each '%' escape in turn, copying literal runs verbatim
        while (-1 != (index = raw.indexOf ('%', start)))
        {
            // append the part up to the % sign
            ret.append (raw.substring (start, index));
            // there must be two hex digits after the percent sign
            if (index + 2 < length)
            {
                try
                {
                    value = Integer.parseInt (raw.substring (index + 1, index + 3), 16);
                    ret.append ((char)value);
                    start = index + 3;
                }
                catch (NumberFormatException nfe)
                {
                    // not hex digits; keep the raw '%' and resume after it
                    ret.append ('%');
                    start = index + 1;
                }
            }
            else
            {
                // this case is actually illegal in a URI, but...
                ret.append ('%');
                start = index + 1;
            }
        }
        ret.append (raw.substring (start));

        return (ret.toString ());
    }

    /**
     * Copy a resource (image) locally.
     * Removes one element from the 'to be copied' list and saves the
     * resource it points to locally as a file.
     */
    protected void copy ()
    {
        String link;
        String raw;
        String name;
        File file;
        File dir;
        URL source;
        byte[] data;
        InputStream in;
        FileOutputStream out;
        int read;

        // FIFO: take the oldest pending resource and mark it as handled
        link = (String)mImages.remove (0);
        mCopied.add (link);
        if (getCaptureResources ())
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -