⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 sitecapturer.java

📁 本程序用于对页面信息进行提取并分析
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
        {   // NOTE(review): the enclosing method signature is outside this view —
            // presumably the tail of copy(), which downloads one queued resource
            // URL to a file under the target directory; confirm against full source.
            raw = makeLocalLink (link, "");
            name = decode (raw);
            file = new File (getTarget (), name);
            System.out.println ("copying " + link + " to " + file.getAbsolutePath ());
            // ensure directory exists
            dir = file.getParentFile ();
            if (!dir.exists ())
                dir.mkdirs ();
            try
            {
                source = new URL (link);
                data = new byte [TRANSFER_SIZE];
                try
                {
                    in = source.openStream ();
                    try
                    {
                        out = new FileOutputStream (file);
                        try
                        {
                            // raw byte copy from the remote stream to the local file
                            while (-1 != (read = in.read (data, 0, data.length)))
                                out.write (data, 0, read);
                        }
                        finally
                        {
                            out.close ();
                        }
                    }
                    catch (FileNotFoundException fnfe)
                    {
                        // the local file could not be created
                        fnfe.printStackTrace ();
                    }
                    finally
                    {
                        in.close ();
                    }
                }
                catch (FileNotFoundException fnfe)
                {
                    // the remote resource is missing: report it and keep capturing
                    System.err.println ("broken link " + fnfe.getMessage () + " ignored");
                }
            }
            catch (MalformedURLException murle)
            {
                murle.printStackTrace ();
            }
            catch (IOException ioe)
            {
                ioe.printStackTrace ();
            }
        }
    }

    /**
     * Process a single page.
     * @param filter The filter to apply to the collected nodes.
     * @exception ParserException If a parse error occurs.
*/    protected void process (NodeFilter filter)        throws            ParserException    {        String url;        int bookmark;        NodeList list;        NodeList robots;        MetaTag robot;        String content;        File file;        File dir;        PrintWriter out;        // get the next URL and add it to the done pile        url = (String)mPages.remove (0);        System.out.println ("processing " + url);        mFinished.add (url);        try        {            bookmark = mPages.size ();            // fetch the page and gather the list of nodes            mParser.setURL (url);            try            {                list = new NodeList ();                for (NodeIterator e = mParser.elements (); e.hasMoreNodes (); )                    list.add (e.nextNode ()); // URL conversion occurs in the tags            }            catch (EncodingChangeException ece)            {                // fix bug #998195 SiteCatpurer just crashed                // try again with the encoding now set correctly                // hopefully mPages, mImages, mCopied and mFinished won't be corrupted                mParser.reset ();                list = new NodeList ();                for (NodeIterator e = mParser.elements (); e.hasMoreNodes (); )                    list.add (e.nextNode ());            }            // handle robots meta tag according to http://www.robotstxt.org/wc/meta-user.html            // <meta name="robots" content="index,follow" />            // <meta name="robots" content="noindex,nofollow" />            robots = list.extractAllNodesThatMatch (                new AndFilter (                    new NodeClassFilter (MetaTag.class),                    new HasAttributeFilter ("name", "robots")), true);            if (0 != robots.size ())            {                robot = (MetaTag)robots.elementAt (0);                content = robot.getAttribute ("content").toLowerCase ();                if ((-1 != content.indexOf ("none")) || (-1 != 
content.indexOf ("nofollow")))                    // reset mPages                    for (int i = bookmark; i < mPages.size (); i++)                        mPages.remove (i);                if ((-1 != content.indexOf ("none")) || (-1 != content.indexOf ("noindex")))                    return;            }                if (null != filter)                list.keepAllNodesThatMatch (filter, true);            // save the page locally            file = new File (getTarget (), makeLocalLink (url, ""));            dir = file.getParentFile ();            if (!dir.exists ())                dir.mkdirs ();            else if (!dir.isDirectory ())            {                dir = new File (dir.getParentFile (), dir.getName () + ".content");                if (!dir.exists ())                    dir.mkdirs ();                file = new File (dir, file.getName ());            }                            try            {                out = new PrintWriter (new FileOutputStream (file));                for (int i = 0; i < list.size (); i++)                    out.print (list.elementAt (i).toHtml ());                out.close ();            }            catch (FileNotFoundException fnfe)            {                fnfe.printStackTrace ();            }        }        catch (ParserException pe)        {            String message;                        // this exception handling is suboptimal,            // but it recognizes resources that aren't text/html            message = pe.getMessage ();            if ((null != message) && (message.endsWith ("does not contain text")))            {                if (!mCopied.contains (url))                    if (!mImages.contains (url))                        mImages.add (url);                mFinished.remove (url);            }            else                throw pe;        }    }    /**     * Link tag that rewrites the HREF.     * The HREF is changed to a local target if it matches the source.     
*/    class LocalLinkTag extends LinkTag    {        public void doSemanticAction ()            throws                ParserException        {            boolean html;            String link;            // get the link            link = getLink ();            // check if it needs to be captured            if (isToBeCaptured (link))            {                // add the link to a list to be processed                if (mFinished.contains (link))                    html = true;                else if (mPages.contains (link))                    html = true;                else if (mCopied.contains (link))                    html = false;                else if (mImages.contains (link))                    html = false;                else                {   // this test is expensive, do it reluctantly                    html = isHtml (link);                    if (html)                        mPages.add (link);                    else                        mImages.add (link);                }                // alter the link                if (html || (!html && getCaptureResources ()))                    link = makeLocalLink (link, mParser.getLexer ().getPage ().getUrl ());                setLink (link);            }        }    }    /**     * Frame tag that rewrites the SRC URLs.     * The SRC URLs are mapped to local targets if they match the source.     
*/    class LocalFrameTag extends FrameTag    {        public void doSemanticAction ()            throws                ParserException        {            boolean html;            String link;            // get the link            link = getFrameLocation ();            // check if it needs to be captured            if (isToBeCaptured (link))            {                // add the link to a list to be processed                if (mFinished.contains (link))                    html = true;                else if (mPages.contains (link))                    html = true;                else if (mCopied.contains (link))                    html = false;                else if (mImages.contains (link))                    html = false;                else                {   // this test is expensive, do it reluctantly                    html = isHtml (link);                    if (html)                        mPages.add (link);                    else                        mImages.add (link);                }                // alter the link                if (html || (!html && getCaptureResources ()))                    link = makeLocalLink (link, mParser.getLexer ().getPage ().getUrl ());                setFrameLocation (link);            }        }    }    /**     * Image tag that rewrites the SRC URL.     * If resources are being captured the SRC is mapped to a local target if     * it matches the source, otherwise it is convered to a full URL to point     * back to the original site.     
*/    class LocalImageTag extends ImageTag    {        public void doSemanticAction ()            throws                ParserException        {            String image;                        // get the image url            image = getImageURL ();            // check if it needs to be captured            if (isToBeCaptured (image))            {   // add the image to the list needing to be copied                if (!mCopied.contains (image))                    if (!mImages.contains (image))                        mImages.add (image);                if (getCaptureResources ())                    image = makeLocalLink (image, mParser.getLexer ().getPage ().getUrl ());                // alter the link                setImageURL (image);            }        }    }    /**     * Base tag that doesn't show.     * The toHtml() method is overridden to return an empty string,     * effectively shutting off the base reference.     */    class LocalBaseHrefTag extends BaseHrefTag    {        // we don't want to have a base pointing back at the source page        public String toHtml ()        {            return ("");        }    }    /**     * Perform the capture.     
*/    public void capture ()    {               mPages.clear ();        mPages.add (getSource ());        while (0 != mPages.size ())            try            {                process (getFilter ());                while (0 != mImages.size ())                    copy ();            }            catch (ParserException pe)            {   // this exception handling is suboptimal,                // but it messages correctly about broken links                Throwable throwable;                                throwable = pe.getThrowable ();                if (null != throwable)                {                    throwable = throwable.getCause ();                    if (throwable instanceof FileNotFoundException)                        System.err.println ("broken link " + ((FileNotFoundException)throwable).getMessage () + " ignored");                    else                        pe.printStackTrace ();                }                else                    pe.printStackTrace ();            }    }    /**     * Mainline to capture a web site locally.     * @param args The command line arguments.     * There are three arguments the web site to capture, the local directory     * to save it to, and a flag (true or false) to indicate whether resources     * such as images and video are to be captured as well.     * These are requested via dialog boxes if not supplied.     * @exception MalformedURLException If the supplied URL is invalid.     * @exception IOException If an error occurs reading the page or resources.     
*/
    public static void main (String[] args)
        throws
            MalformedURLException,
            IOException
    {
        SiteCapturer worker = new SiteCapturer ();

        // source URL: first argument, or prompt with an input dialog
        if (0 >= args.length)
        {
            String url = (String)JOptionPane.showInputDialog (
                null,
                "Enter the URL to capture:",
                "Web Site",
                JOptionPane.PLAIN_MESSAGE,
                null,
                null,
                "http://htmlparser.sourceforge.net/wiki");
            if (null == url)
                System.exit (1);
            worker.setSource (url);
        }
        else
            worker.setSource (args[0]);

        // target directory: second argument, or prompt with a save dialog
        if (1 >= args.length)
        {
            URL source = new URL (worker.getSource ());
            String path = new File (new File ("." + File.separator), source.getHost () + File.separator).getCanonicalPath ();
            File target = new File (path);
            JFileChooser chooser = new JFileChooser (target);
            chooser.setDialogType (JFileChooser.SAVE_DIALOG);
            chooser.setFileSelectionMode (JFileChooser.DIRECTORIES_ONLY);
            chooser.setSelectedFile (target); // this doesn't frickin' work
            chooser.setMultiSelectionEnabled (false);
            chooser.setDialogTitle ("Target Directory");
            if (JFileChooser.APPROVE_OPTION == chooser.showSaveDialog (null))
                worker.setTarget (chooser.getSelectedFile ().getAbsolutePath ());
            else
                System.exit (1);
        }
        else
            worker.setTarget (args[1]);

        // capture-resources flag: third argument, or prompt with a chooser
        if (2 >= args.length)
        {
            Boolean capture = (Boolean)JOptionPane.showInputDialog (
                null,
                "Should resources be captured:",
                "Capture Resources",
                JOptionPane.PLAIN_MESSAGE,
                null,
                new Object[] { Boolean.TRUE, Boolean.FALSE},
                Boolean.TRUE);
            if (null == capture)
                System.exit (1);
            worker.setCaptureResources (capture.booleanValue ());
        }
        else
            worker.setCaptureResources ((Boolean.valueOf (args[2]).booleanValue ()));

        worker.capture ();
        System.exit (0);
    }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -