📄 sitecapturer.java
字号:
{ raw = makeLocalLink (link, ""); name = decode (raw); file = new File (getTarget (), name); System.out.println ("copying " + link + " to " + file.getAbsolutePath ()); // ensure directory exists dir = file.getParentFile (); if (!dir.exists ()) dir.mkdirs (); try { source = new URL (link); data = new byte [TRANSFER_SIZE]; try { in = source.openStream (); try { out = new FileOutputStream (file); try { while (-1 != (read = in.read (data, 0, data.length))) out.write (data, 0, read); } finally { out.close (); } } catch (FileNotFoundException fnfe) { fnfe.printStackTrace (); } finally { in.close (); } } catch (FileNotFoundException fnfe) { System.err.println ("broken link " + fnfe.getMessage () + " ignored"); } } catch (MalformedURLException murle) { murle.printStackTrace (); } catch (IOException ioe) { ioe.printStackTrace (); } } } /** * Process a single page. * @param filter The filter to apply to the collected nodes. * @exception ParserException If a parse error occurs. */ protected void process (NodeFilter filter) throws ParserException { String url; int bookmark; NodeList list; NodeList robots; MetaTag robot; String content; File file; File dir; PrintWriter out; // get the next URL and add it to the done pile url = (String)mPages.remove (0); System.out.println ("processing " + url); mFinished.add (url); try { bookmark = mPages.size (); // fetch the page and gather the list of nodes mParser.setURL (url); try { list = new NodeList (); for (NodeIterator e = mParser.elements (); e.hasMoreNodes (); ) list.add (e.nextNode ()); // URL conversion occurs in the tags } catch (EncodingChangeException ece) { // fix bug #998195 SiteCatpurer just crashed // try again with the encoding now set correctly // hopefully mPages, mImages, mCopied and mFinished won't be corrupted mParser.reset (); list = new NodeList (); for (NodeIterator e = mParser.elements (); e.hasMoreNodes (); ) list.add (e.nextNode ()); } // handle robots meta tag according to 
http://www.robotstxt.org/wc/meta-user.html // <meta name="robots" content="index,follow" /> // <meta name="robots" content="noindex,nofollow" /> robots = list.extractAllNodesThatMatch ( new AndFilter ( new NodeClassFilter (MetaTag.class), new HasAttributeFilter ("name", "robots")), true); if (0 != robots.size ()) { robot = (MetaTag)robots.elementAt (0); content = robot.getAttribute ("content").toLowerCase (); if ((-1 != content.indexOf ("none")) || (-1 != content.indexOf ("nofollow"))) // reset mPages for (int i = bookmark; i < mPages.size (); i++) mPages.remove (i); if ((-1 != content.indexOf ("none")) || (-1 != content.indexOf ("noindex"))) return; } if (null != filter) list.keepAllNodesThatMatch (filter, true); // save the page locally file = new File (getTarget (), makeLocalLink (url, "")); dir = file.getParentFile (); if (!dir.exists ()) dir.mkdirs (); else if (!dir.isDirectory ()) { dir = new File (dir.getParentFile (), dir.getName () + ".content"); if (!dir.exists ()) dir.mkdirs (); file = new File (dir, file.getName ()); } try { out = new PrintWriter (new FileOutputStream (file)); for (int i = 0; i < list.size (); i++) out.print (list.elementAt (i).toHtml ()); out.close (); } catch (FileNotFoundException fnfe) { fnfe.printStackTrace (); } } catch (ParserException pe) { String message; // this exception handling is suboptimal, // but it recognizes resources that aren't text/html message = pe.getMessage (); if ((null != message) && (message.endsWith ("does not contain text"))) { if (!mCopied.contains (url)) if (!mImages.contains (url)) mImages.add (url); mFinished.remove (url); } else throw pe; } } /** * Link tag that rewrites the HREF. * The HREF is changed to a local target if it matches the source. 
*/ class LocalLinkTag extends LinkTag { public void doSemanticAction () throws ParserException { boolean html; String link; // get the link link = getLink (); // check if it needs to be captured if (isToBeCaptured (link)) { // add the link to a list to be processed if (mFinished.contains (link)) html = true; else if (mPages.contains (link)) html = true; else if (mCopied.contains (link)) html = false; else if (mImages.contains (link)) html = false; else { // this test is expensive, do it reluctantly html = isHtml (link); if (html) mPages.add (link); else mImages.add (link); } // alter the link if (html || (!html && getCaptureResources ())) link = makeLocalLink (link, mParser.getLexer ().getPage ().getUrl ()); setLink (link); } } } /** * Frame tag that rewrites the SRC URLs. * The SRC URLs are mapped to local targets if they match the source. */ class LocalFrameTag extends FrameTag { public void doSemanticAction () throws ParserException { boolean html; String link; // get the link link = getFrameLocation (); // check if it needs to be captured if (isToBeCaptured (link)) { // add the link to a list to be processed if (mFinished.contains (link)) html = true; else if (mPages.contains (link)) html = true; else if (mCopied.contains (link)) html = false; else if (mImages.contains (link)) html = false; else { // this test is expensive, do it reluctantly html = isHtml (link); if (html) mPages.add (link); else mImages.add (link); } // alter the link if (html || (!html && getCaptureResources ())) link = makeLocalLink (link, mParser.getLexer ().getPage ().getUrl ()); setFrameLocation (link); } } } /** * Image tag that rewrites the SRC URL. * If resources are being captured the SRC is mapped to a local target if * it matches the source, otherwise it is convered to a full URL to point * back to the original site. 
*/ class LocalImageTag extends ImageTag { public void doSemanticAction () throws ParserException { String image; // get the image url image = getImageURL (); // check if it needs to be captured if (isToBeCaptured (image)) { // add the image to the list needing to be copied if (!mCopied.contains (image)) if (!mImages.contains (image)) mImages.add (image); if (getCaptureResources ()) image = makeLocalLink (image, mParser.getLexer ().getPage ().getUrl ()); // alter the link setImageURL (image); } } } /** * Base tag that doesn't show. * The toHtml() method is overridden to return an empty string, * effectively shutting off the base reference. */ class LocalBaseHrefTag extends BaseHrefTag { // we don't want to have a base pointing back at the source page public String toHtml () { return (""); } } /** * Perform the capture. */ public void capture () { mPages.clear (); mPages.add (getSource ()); while (0 != mPages.size ()) try { process (getFilter ()); while (0 != mImages.size ()) copy (); } catch (ParserException pe) { // this exception handling is suboptimal, // but it messages correctly about broken links Throwable throwable; throwable = pe.getThrowable (); if (null != throwable) { throwable = throwable.getCause (); if (throwable instanceof FileNotFoundException) System.err.println ("broken link " + ((FileNotFoundException)throwable).getMessage () + " ignored"); else pe.printStackTrace (); } else pe.printStackTrace (); } } /** * Mainline to capture a web site locally. * @param args The command line arguments. * There are three arguments the web site to capture, the local directory * to save it to, and a flag (true or false) to indicate whether resources * such as images and video are to be captured as well. * These are requested via dialog boxes if not supplied. * @exception MalformedURLException If the supplied URL is invalid. * @exception IOException If an error occurs reading the page or resources. 
*/
    public static void main (String[] args)
        throws
            MalformedURLException,
            IOException
    {
        SiteCapturer capturer = new SiteCapturer ();

        // first argument: the site to capture, asked for if absent
        if (args.length > 0)
            capturer.setSource (args[0]);
        else
        {
            String answer = (String)JOptionPane.showInputDialog (
                null,
                "Enter the URL to capture:",
                "Web Site",
                JOptionPane.PLAIN_MESSAGE,
                null,
                null,
                "http://htmlparser.sourceforge.net/wiki");
            if (null == answer)
                System.exit (1); // dialog cancelled
            capturer.setSource (answer);
        }

        // second argument: the target directory, asked for if absent
        if (args.length > 1)
            capturer.setTarget (args[1]);
        else
        {
            // suggest a directory named after the host, below the current directory
            URL site = new URL (capturer.getSource ());
            File here = new File ("." + File.separator);
            String suggested = new File (here, site.getHost () + File.separator).getCanonicalPath ();
            File folder = new File (suggested);

            JFileChooser picker = new JFileChooser (folder);
            picker.setDialogType (JFileChooser.SAVE_DIALOG);
            picker.setFileSelectionMode (JFileChooser.DIRECTORIES_ONLY);
            // original author's note: this preselection does not work as intended
            picker.setSelectedFile (folder);
            picker.setMultiSelectionEnabled (false);
            picker.setDialogTitle ("Target Directory");
            if (JFileChooser.APPROVE_OPTION != picker.showSaveDialog (null))
                System.exit (1); // dialog cancelled
            capturer.setTarget (picker.getSelectedFile ().getAbsolutePath ());
        }

        // third argument: whether resources are captured too, asked for if absent
        if (args.length > 2)
            capturer.setCaptureResources (Boolean.valueOf (args[2]).booleanValue ());
        else
        {
            Object[] choices = new Object[] { Boolean.TRUE, Boolean.FALSE};
            Boolean choice = (Boolean)JOptionPane.showInputDialog (
                null,
                "Should resources be captured:",
                "Capture Resources",
                JOptionPane.PLAIN_MESSAGE,
                null,
                choices,
                Boolean.TRUE);
            if (null == choice)
                System.exit (1); // dialog cancelled
            capturer.setCaptureResources (choice.booleanValue ());
        }

        capturer.capture ();
        System.exit (0);
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -