📄 wikicapturer.java

📁 本程序用于对页面信息进行提取并分析
💻 JAVA
字号:
// HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML// http://sourceforge.org/projects/htmlparser// Copyright (C) 2003 Derrick Oswald//// Revision Control Information//// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserapplications/WikiCapturer.java,v $// $Author: derrickoswald $// $Date: 2005/04/12 11:27:42 $// $Revision: 1.3 $//// This library is free software; you can redistribute it and/or// modify it under the terms of the GNU Lesser General Public// License as published by the Free Software Foundation; either// version 2.1 of the License, or (at your option) any later version.//// This library is distributed in the hope that it will be useful,// but WITHOUT ANY WARRANTY; without even the implied warranty of// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU// Lesser General Public License for more details.//// You should have received a copy of the GNU Lesser General Public// License along with this library; if not, write to the Free Software// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA//package org.htmlparser.parserapplications;import java.io.File;import java.io.IOException;import java.net.MalformedURLException;import java.net.URL;import javax.swing.JFileChooser;import javax.swing.JOptionPane;import org.htmlparser.filters.AndFilter;import org.htmlparser.filters.HasAttributeFilter;import org.htmlparser.filters.NotFilter;import org.htmlparser.filters.OrFilter;import org.htmlparser.filters.TagNameFilter;/** * Save a wikiwikiweb locally. * Illustrative program to save a wiki locally. */public class WikiCapturer    extends        SiteCapturer{    /**     * Create a wikicapturer.     */    public WikiCapturer ()    {    }    /**     * Returns <code>true</code> if the link is one we are interested in.     * @param link The link to be checked.     * @return <code>true</code> if the link has the source URL as a prefix     * and doesn't contain '?' or '#'; the former because we won't be able to     * handle server side queries in the static target directory structure and     * the latter because presumably the full page with that reference has     * already been captured previously. This performs a case insensitive     * comparison, which is cheating really, but it's cheap.     */    protected boolean isToBeCaptured (String link)    {        boolean ret;                ret = super.isToBeCaptured (link);        // eliminate PhpWiki specific pages        if (ret)            if (link.endsWith ("PhpWikiAdministration"))                ret = false;            else if (link.endsWith ("PhpWikiDocumentation"))                ret = false;            else if (link.endsWith ("TextFormattingRules"))                ret = false;            else if (link.endsWith ("NewMarkupTestPage"))                ret = false;            else if (link.endsWith ("OldMarkupTestPage"))                ret = false;            else if (link.endsWith ("OldTextFormattingRules"))                ret = false;            else if (link.endsWith ("PgsrcTranslation"))                ret = false;            else if (link.endsWith ("HowToUseWiki"))                ret = false;            else if (link.endsWith ("MoreAboutMechanics"))                ret = false;            else if (link.endsWith ("AddingPages"))                ret = false;            else if (link.endsWith ("WikiWikiWeb"))                ret = false;            else if (link.endsWith ("UserPreferences"))                ret = false;            else if (link.endsWith ("PhpWiki"))                ret = false;            else if (link.endsWith ("WabiSabi"))                ret = false;            else if (link.endsWith ("EditText"))                ret = false;            else if (link.endsWith ("FindPage"))                ret = false;            else if (link.endsWith ("RecentChanges"))                ret = false;            else if (link.endsWith ("RecentEdits"))                ret = false;            else if (link.endsWith ("RecentVisitors"))                ret = false;            else if (link.endsWith ("SteveWainstead"))                ret = false;        return (ret);    }    /**     * Mainline to capture a web site locally.     * @param args The command line arguments.     * There are three arguments the web site to capture, the local directory     * to save it to, and a flag (true or false) to indicate whether resources     * such as images and video are to be captured as well.     * These are requested via dialog boxes if not supplied.     * @exception MalformedURLException If the supplied URL is invalid.     * @exception IOException If an error occurs reading the pages or resources.     */    public static void main (String[] args)        throws            MalformedURLException,            IOException    {        WikiCapturer worker;        String url;        JFileChooser chooser;        URL source;        String path;        File target;        Boolean capture;        int ret;                worker = new WikiCapturer ();        if (0 >= args.length)        {            url = (String)JOptionPane.showInputDialog (                null,                "Enter the URL to capture:",                "Web Site",                JOptionPane.PLAIN_MESSAGE,                null,                null,                "http://htmlparser.sourceforge.net/wiki");            if (null != url)                worker.setSource (url);            else                System.exit (1);        }        else            worker.setSource (args[0]);        if (1 >= args.length)        {            url = worker.getSource ();            source = new URL (url);            path = new File (new File ("." + File.separator), source.getHost () + File.separator).getCanonicalPath ();            target = new File (path);            chooser = new JFileChooser (target);            chooser.setDialogType (JFileChooser.SAVE_DIALOG);            chooser.setFileSelectionMode (JFileChooser.DIRECTORIES_ONLY);            chooser.setSelectedFile (target); // this doesn't frickin' work            chooser.setMultiSelectionEnabled (false);            chooser.setDialogTitle ("Target Directory");            ret = chooser.showSaveDialog (null);            if (ret == JFileChooser.APPROVE_OPTION)                worker.setTarget (chooser.getSelectedFile ().getAbsolutePath ());            else                System.exit (1);        }        else            worker.setTarget (args[1]);        if (2 >= args.length)        {            capture = (Boolean)JOptionPane.showInputDialog (                null,                "Should resources be captured:",                "Capture Resources",                JOptionPane.PLAIN_MESSAGE,                null,                new Object[] { Boolean.TRUE, Boolean.FALSE},                Boolean.TRUE);            if (null != capture)                worker.setCaptureResources (capture.booleanValue ());            else                System.exit (1);        }        else            worker.setCaptureResources ((Boolean.valueOf (args[2]).booleanValue ()));        worker.setFilter (            new NotFilter (                new OrFilter (                    new AndFilter (                        new TagNameFilter ("DIV"),                        new HasAttributeFilter ("id", "navbar")),                     new OrFilter (                        new AndFilter (                            new TagNameFilter ("DIV"),                            new HasAttributeFilter ("id", "actionbar")),                        new AndFilter (                            new TagNameFilter ("DIV"),                            new HasAttributeFilter ("id", "xhtml-validator"))))));        worker.capture ();                System.exit (0);    }}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -