📄 sitecapturer.java
字号:
// HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML// http://sourceforge.org/projects/htmlparser// Copyright (C) 2003 Derrick Oswald//// Revision Control Information//// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserapplications/SiteCapturer.java,v $// $Author: derrickoswald $// $Date: 2005/04/12 11:27:41 $// $Revision: 1.9 $//// This library is free software; you can redistribute it and/or// modify it under the terms of the GNU Lesser General Public// License as published by the Free Software Foundation; either// version 2.1 of the License, or (at your option) any later version.//// This library is distributed in the hope that it will be useful,// but WITHOUT ANY WARRANTY; without even the implied warranty of// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU// Lesser General Public License for more details.//// You should have received a copy of the GNU Lesser General Public// License along with this library; if not, write to the Free Software// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA//package org.htmlparser.parserapplications;import java.io.File;import java.io.FileNotFoundException;import java.io.FileOutputStream;import java.io.IOException;import java.io.InputStream;import java.io.PrintWriter;import java.net.MalformedURLException;import java.net.URL;import java.net.URLConnection;import java.util.ArrayList;import java.util.HashSet;import javax.swing.JFileChooser;import javax.swing.JOptionPane;import org.htmlparser.NodeFilter;import org.htmlparser.Parser;import org.htmlparser.PrototypicalNodeFactory;import org.htmlparser.filters.AndFilter;import org.htmlparser.filters.HasAttributeFilter;import org.htmlparser.filters.NodeClassFilter;import org.htmlparser.tags.BaseHrefTag;import org.htmlparser.tags.FrameTag;import org.htmlparser.tags.ImageTag;import org.htmlparser.tags.LinkTag;import org.htmlparser.tags.MetaTag;import org.htmlparser.util.EncodingChangeException;import 
org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

/**
 * Save a web site locally.
 * Illustrative program to save a web site contents locally.
 * It was created to demonstrate URL rewriting in it's simplest form.
 * It uses customized tags in the NodeFactory to alter the URLs.
 * This program has a number of limitations:
 * <ul>
 * <li>it doesn't capture forms, this would involve too many assumptions</li>
 * <li>it doesn't capture script references, so funky onMouseOver and other
 * non-static content will not be faithfully reproduced</li>
 * <li>it doesn't handle style sheets</li>
 * <li>it doesn't dig into attributes that might reference resources, so
 * for example, background images won't necessarily be captured</li>
 * <li>worst of all, it gets confused when a URL both has content and is
 * the prefix for other content,
 * i.e. http://whatever.com/top and http://whatever.com/top/sub.html both
 * yield content, since this cannot be faithfully replicated to a static
 * directory structure (this happens a lot with servlet based sites)</li>
 *</ul>
 */
public class SiteCapturer
{
    // NOTE(review): collections below are raw (pre-generics) types,
    // consistent with the library's Java 1.4-era style.

    /**
     * The web site to capture.
     * This is used as the base URL in deciding whether to adjust a link
     * and whether to capture a page or not.
     */
    protected String mSource;

    /**
     * The local directory to capture to.
     * This is used as a base prefix for files saved locally.
     */
    protected String mTarget;

    /**
     * The list of pages to capture.
     * Links are added to this list as they are discovered, and removed in
     * sequential order (FIFO queue) leading to a breadth
     * first traversal of the web site space.
     */
    protected ArrayList mPages;

    /**
     * The set of pages already captured.
     * Used to avoid repeated acquisition of the same page.
     */
    protected HashSet mFinished;

    /**
     * The list of resources to copy.
     * Images and other resources are added to this list as they are discovered.
     */
    protected ArrayList mImages;

    /**
     * The set of resources already copied.
     * Used to avoid repeated acquisition of the same images and other resources.
     */
    protected HashSet mCopied;

    /**
     * The parser to use for processing.
     */
    protected Parser mParser;

    /**
     * If <code>true</code>, save resources locally too,
     * otherwise, leave resource links pointing to original page.
     */
    protected boolean mCaptureResources;

    /**
     * The filter to apply to the nodes retrieved.
     */
    protected NodeFilter mFilter;

    /**
     * Copy buffer size.
     * Resources are moved to disk in chunks this size or less.
     */
    protected final int TRANSFER_SIZE = 4096;

    /**
     * Create a web site capturer.
     */
    public SiteCapturer ()
    {
        PrototypicalNodeFactory factory;

        mSource = null;
        mTarget = null;
        mPages = new ArrayList ();
        mFinished = new HashSet ();
        mImages = new ArrayList ();
        mCopied = new HashSet ();
        mParser = new Parser ();
        // register the customized tag classes so the parser yields nodes
        // whose URLs can be rewritten to local form (see class javadoc)
        factory = new PrototypicalNodeFactory ();
        factory.registerTag (new LocalLinkTag ());
        factory.registerTag (new LocalFrameTag ());
        factory.registerTag (new LocalBaseHrefTag ());
        factory.registerTag (new LocalImageTag ());
        mParser.setNodeFactory (factory);
        mCaptureResources = true;
        mFilter = null;
    }

    /**
     * Getter for property source.
     * @return Value of property source.
     */
    public String getSource ()
    {
        return (mSource);
    }

    /**
     * Setter for property source.
     * This is the base URL to capture. URL's that don't start with this prefix
     * are ignored (left as is), while the ones with this URL as a base are
     * re-homed to the local target.
     * @param source New value of property source.
     */
    public void setSource (String source)
    {
        // strip a trailing slash so later prefix comparisons are uniform
        if (source.endsWith ("/"))
            source = source.substring (0, source.length () - 1);

        mSource = source;
    }

    /**
     * Getter for property target.
     * @return Value of property target.
     */
    public String getTarget ()
    {
        return (mTarget);
    }

    /**
     * Setter for property target.
     * This is the local directory under which to save the site's pages.
     * @param target New value of property target.
     */
    public void setTarget (String target)
    {
        mTarget = target;
    }

    /**
     * Getter for property captureResources.
     * If <code>true</code>, the images and other resources referenced by
     * the site and within the base URL tree are also copied locally to the
     * target directory. If <code>false</code>, the image links are left 'as
     * is', still refering to the original site.
     * @return Value of property captureResources.
     */
    public boolean getCaptureResources ()
    {
        return (mCaptureResources);
    }

    /**
     * Setter for property captureResources.
     * @param capture New value of property captureResources.
     */
    public void setCaptureResources (boolean capture)
    {
        mCaptureResources = capture;
    }

    /** Getter for property filter.
     * @return Value of property filter.
     *
     */
    public NodeFilter getFilter ()
    {
        return (mFilter);
    }

    /** Setter for property filter.
     * @param filter New value of property filter.
     *
     */
    public void setFilter (NodeFilter filter)
    {
        mFilter = filter;
    }

    /**
     * Returns <code>true</code> if the link is one we are interested in.
     * @param link The link to be checked.
     * @return <code>true</code> if the link has the source URL as a prefix
     * and doesn't contain '?' or '#'; the former because we won't be able to
     * handle server side queries in the static target directory structure and
     * the latter because presumably the full page with that reference has
     * already been captured previously. This performs a case insensitive
     * comparison, which is cheating really, but it's cheap.
     */
    protected boolean isToBeCaptured (String link)
    {
        return (
            link.toLowerCase ().startsWith (getSource ().toLowerCase ())
            && (-1 == link.indexOf ("?"))
            && (-1 == link.indexOf ("#")));
    }

    /**
     * Returns <code>true</code> if the link contains text/html content.
     * @param link The URL to check for content type.
     * @return <code>true</code> if the HTTP header indicates the type is
     * "text/html".
     * @exception ParserException If the supplied URL can't be read from.
     */
    protected boolean isHtml (String link)
        throws ParserException
    {
        URL url;
        URLConnection connection;
        String type;
        boolean ret;

        ret = false;

        try
        {
            // opening the connection fetches the headers; only the
            // Content-Type header is consulted, not the body
            url = new URL (link);
            connection = url.openConnection ();
            type = connection.getContentType ();
            if (type == null)
                ret = false;
            else
                ret = type.startsWith ("text/html");
        }
        catch (Exception e)
        {
            // wrap with the cause preserved so callers see the original error
            throw new ParserException ("URL " + link + " has a problem", e);
        }

        return (ret);
    }

    /**
     * Converts a link to local.
     * A relative link can be used to construct both a URL and a file name.
     * Basically, the operation is to strip off the base url, if any,
     * and then prepend as many dot-dots as necessary to make
     * it relative to the current page.
     * A bit of a kludge handles the root page specially by calling it
     * index.html, even though that probably isn't it's real file name.
     * This isn't pretty, but it works for me.
     * @param link The link to make relative.
     * @param current The current page URL, or empty if it's an absolute URL
     * that needs to be converted.
     * @return The URL relative to the current page.
     */
    protected String makeLocalLink (String link, String current)
    {
        int i;
        int j;
        String ret;

        if (link.equals (getSource ()) || (!getSource ().endsWith ("/") && link.equals (getSource () + "/")))
            ret = "index.html"; // handle the root page specially
        else if (link.startsWith (getSource ())
                && (link.length () > getSource ().length ()))
            ret = link.substring (getSource ().length () + 1);
        else
            ret = link; // give up

        // make it relative to the current page by prepending "../" for
        // each '/' in the current local path
        if ((null != current)
            && link.startsWith (getSource ())
            && (current.length () > getSource ().length ()))
        {
            current = current.substring (getSource ().length () + 1);
            i = 0;
            while (-1 != (j = current.indexOf ('/', i)))
            {
                ret = "../" + ret;
                i = j + 1;
            }
        }

        return (ret);
    }

    /**
     * Unescape a URL to form a file name.
     * Very crude.
     * @param raw The escaped URI.
     * @return The native URI.
     */
    protected String decode (String raw)
    {
        int length;
        int start;
        int index;
        int value;
        StringBuffer ret;

        ret = new StringBuffer (raw.length ());
        length = raw.length ();
        start = 0;
        // scan for each '%' escape in turn, copying literal runs verbatim
        while (-1 != (index = raw.indexOf ('%', start)))
        {
            // append the part up to the % sign
            ret.append (raw.substring (start, index));
            // there must be two hex digits after the percent sign
            if (index + 2 < length)
            {
                try
                {
                    value = Integer.parseInt (raw.substring (index + 1, index + 3), 16);
                    ret.append ((char)value);
                    start = index + 3;
                }
                catch (NumberFormatException nfe)
                {
                    // not hex digits; keep the raw '%' and resume after it
                    ret.append ('%');
                    start = index + 1;
                }
            }
            else
            {
                // this case is actually illegal in a URI, but...
                ret.append ('%');
                start = index + 1;
            }
        }
        ret.append (raw.substring (start));

        return (ret.toString ());
    }

    /**
     * Copy a resource (image) locally.
     * Removes one element from the 'to be copied' list and saves the
     * resource it points to locally as a file.
     */
    protected void copy ()
    {
        String link;
        String raw;
        String name;
        File file;
        File dir;
        URL source;
        byte[] data;
        InputStream in;
        FileOutputStream out;
        int read;

        // FIFO: take the oldest pending resource and mark it as handled
        link = (String)mImages.remove (0);
        mCopied.add (link);
        if (getCaptureResources ())
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -