⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 heritrix.java

📁 Heritrix是一个开源、可扩展的web爬虫项目。Heritrix被设计为严格遵守robots.txt文件的排除指示和META robots标签。
💻 JAVA
📖 第 1 页 / 共 5 页
字号:
    private final static String START_CRAWLING_OPER = "startCrawling";    private final static String STOP_CRAWLING_OPER = "stopCrawling";    private final static String ADD_CRAWL_JOB_OPER = "addJob";    private final static String TERMINATE_CRAWL_JOB_OPER =        "terminateCurrentJob";    private final static String DELETE_CRAWL_JOB_OPER = "deleteJob";    private final static String ALERT_OPER = "alert";    private final static String ADD_CRAWL_JOB_BASEDON_OPER = "addJobBasedon";    private final static String PENDING_JOBS_OPER = "pendingJobs";    private final static String COMPLETED_JOBS_OPER = "completedJobs";    private final static String CRAWLEND_REPORT_OPER = "crawlendReport";    private final static String SHUTDOWN_OPER = "shutdown";    private final static String LOG_OPER = "log";    private final static String REBIND_JNDI_OPER = "rebindJNDI";    private final static List OPERATION_LIST;    static {        OPERATION_LIST = Arrays.asList(new String [] {START_OPER, STOP_OPER,            INTERRUPT_OPER, START_CRAWLING_OPER, STOP_CRAWLING_OPER,            ADD_CRAWL_JOB_OPER, ADD_CRAWL_JOB_BASEDON_OPER,            DELETE_CRAWL_JOB_OPER, ALERT_OPER, PENDING_JOBS_OPER,            COMPLETED_JOBS_OPER, CRAWLEND_REPORT_OPER, SHUTDOWN_OPER,            LOG_OPER, DESTROY_OPER, TERMINATE_CRAWL_JOB_OPER,            REBIND_JNDI_OPER});    }    private CompositeType jobCompositeType = null;    private TabularType jobsTabularType = null;    private static final String [] JOB_KEYS =        new String [] {"uid", "name", "status"};    private static String adminUsername;    private static String adminPassword;        /**     * Constructor.     * Does not register the created instance with JMX.  Assumed this     * constructor is used by such as JMX agent creating an instance of     * Heritrix at the commmand of a remote client (In this case Heritrix will     * be registered by the invoking agent).     
* @throws IOException     */    public Heritrix() throws IOException {        this(null, false);    }        public Heritrix(final boolean jmxregister) throws IOException {        this(null, jmxregister);    }        /**     * Constructor.     * @param name If null, we bring up the default Heritrix instance.     * @param jmxregister True if we are to register this instance with JMX     * agent.     * @throws IOException     */    public Heritrix(final String name, final boolean jmxregister)    throws IOException {        this(name, jmxregister, new CrawlJobHandler(getJobsdir()));    }        /**     * Constructor.     * @param name If null, we bring up the default Heritrix instance.     * @param jmxregister True if we are to register this instance with JMX     * agent.     * @param cjh CrawlJobHandler to use.     * @throws IOException     */    public Heritrix(final String name, final boolean jmxregister,            final CrawlJobHandler cjh)    throws IOException {        super();        containerInitialization();        this.jobHandler = cjh;        this.openMBeanInfo = buildMBeanInfo();        // Set up the alerting system.  SinkHandler is also a global so will        // catch alerts for all running Heritrix instances.  Will need to        // address (Add name of instance that threw the alert to SinkRecord?).        final SinkHandler sinkHandler = SinkHandler.getInstance();        if (sinkHandler == null) {            throw new NullPointerException("SinkHandler not found.");        }        // Adapt the alerting system to use SinkHandler.        
this.alertManager = new AlertManager() {            public void add(SinkHandlerLogRecord record) {                sinkHandler.publish(record);            }            public Vector getAll() {                return sinkHandler.getAll();            }            public Vector getNewAll() {                return sinkHandler.getAllUnread();            }            public SinkHandlerLogRecord get(String alertID) {                return sinkHandler.get(Long.parseLong(alertID));            }                        public int getCount() {                return sinkHandler.getCount();            }            public int getNewCount() {                return sinkHandler.getUnreadCount();            }            public void remove(String alertID) {                sinkHandler.remove(Long.parseLong(alertID));            }            public void read(String alertID) {                sinkHandler.read(Long.parseLong(alertID));            }        };                try {            Heritrix.registerHeritrix(this, name, jmxregister);        } catch (InstanceAlreadyExistsException e) {            throw new RuntimeException(e);        } catch (MBeanRegistrationException e) {            throw new RuntimeException(e);        } catch (NotCompliantMBeanException e) {            throw new RuntimeException(e);        } catch (MalformedObjectNameException e) {            throw new RuntimeException(e);        }    }        /**     * Run setup tasks for this 'container'. Idempotent.     *      * @throws IOException     */    protected static void containerInitialization() throws IOException {        if (Heritrix.containerInitialized) {            return;        }        Heritrix.containerInitialized = true;        // Load up the properties.  This invocation adds heritrix properties        // to system properties so all available via System.getProperty.        // Note, loadProperties and patchLogging have global effects.  
May be an        // issue if we're running inside a container such as tomcat or jboss.        Heritrix.loadProperties();        Heritrix.patchLogging();        Heritrix.configureTrustStore();        // Will run on SIGTERM but not on SIGKILL, unfortunately.        // Otherwise, ensures we cleanup after ourselves (Deregister from        // JMX and JNDI).        Runtime.getRuntime().addShutdownHook(            Heritrix.getShutdownThread(false, 0, "Heritrix shutdown hook"));        // Register this heritrix 'container' though we may be inside another        // tomcat or jboss container.        try {            registerContainerJndi();        } catch (Exception e) {            logger.log(Level.WARNING, "Failed jndi container registration.", e);        }    }        /**     * Do inverse of construction. Used by anyone who does a 'new Heritrix' when     * they want to cleanup the instance.     * Of note, there may be Heritrix threads still hanging around after the     * call to destroy completes.  They'll eventually go down after they've     * finished their cleanup routines.  In particular, if you are watching     * Heritrix via JMX, you can see the Heritrix instance JMX bean unregister     * ahead of the CrawlJob JMX bean that its hosting.     */    public void destroy() {        stop();        try {            Heritrix.unregisterHeritrix(this);        } catch (InstanceNotFoundException e) {            e.printStackTrace();        } catch (MBeanRegistrationException e) {            e.printStackTrace();        } catch (NullPointerException e) {            e.printStackTrace();        }        this.jobHandler = null;        this.openMBeanInfo = null;    }        /**     * Launch program.     * Optionally will launch a web server to host UI.  Will also register     * Heritrix MBean with first found JMX Agent (Usually the 1.5.0 JVM     * Agent).     *      * @param args Command line arguments.     
* @throws Exception     */    public static void main(String[] args)    throws Exception {        Heritrix.commandLine = true;                // Set timezone here.  Would be problematic doing it if we're running        // inside in a container.        TimeZone.setDefault(TimeZone.getTimeZone("GMT"));                File startLog = new File(getHeritrixHome(), STARTLOG);        Heritrix.out = new PrintWriter(isDevelopment()?             System.out: new PrintStream(new FileOutputStream(startLog)));                try {            containerInitialization();            String status = doCmdLineArgs(args);            if (status != null) {                Heritrix.out.println(status);            }        }        catch(Exception e) {            // Show any exceptions in STARTLOG.            e.printStackTrace(Heritrix.out);            throw e;        }        finally {            // If not development, close the file that signals the wrapper            // script that we've started.  Otherwise, just flush it; if in            // development, the output is probably a console.            if (!isDevelopment()) {                if (Heritrix.out != null) {                    Heritrix.out.close();                }                System.out.println("Heritrix version: " +                        Heritrix.getVersion());            } else {                if (Heritrix.out != null) {                    Heritrix.out.flush();                }            }        }    }        protected static String doCmdLineArgs(final String [] args)    throws Exception {        // Get defaults for commandline arguments from the properties file.        String tmpStr = PropertyUtils.            
getPropertyOrNull("heritrix.context");        if (tmpStr != null)  {            Heritrix.adminContext = tmpStr;        }        tmpStr = PropertyUtils.getPropertyOrNull("heritrix.cmdline.port");        if (tmpStr != null) {            Heritrix.guiPort = Integer.parseInt(tmpStr);        }        tmpStr = PropertyUtils.getPropertyOrNull("heritrix.cmdline.admin");        String adminLoginPassword = (tmpStr == null)? "": tmpStr;        String crawlOrderFile =            PropertyUtils.getPropertyOrNull("heritrix.cmdline.order");        tmpStr = PropertyUtils.getPropertyOrNull("heritrix.cmdline.run");        boolean runMode =            PropertyUtils.getBooleanProperty("heritrix.cmdline.run");        boolean selfTest = false;        String selfTestName = null;        CommandLineParser clp = new CommandLineParser(args, Heritrix.out,            Heritrix.getVersion());        List arguments = clp.getCommandLineArguments();        Option [] options = clp.getCommandLineOptions();        // Check passed argument.  Only one argument, the ORDER_FILE is allowed.        // If one argument, make sure exists and xml suffix.        if (arguments.size() > 1) {            clp.usage(1);        } else if (arguments.size() == 1) {            crawlOrderFile = (String)arguments.get(0);            if (!(new File(crawlOrderFile).exists())) {                clp.usage("ORDER.XML <" + crawlOrderFile +                    "> specified does not exist.", 1);            }            // Must end with '.xml'            if (crawlOrderFile.length() > 4 &&                    !crawlOrderFile.substring(crawlOrderFile.length() - 4).                        equalsIgnoreCase(".xml")) {                clp.usage("ORDER.XML <" + crawlOrderFile +                    "> does not have required '.xml' suffix.", 1);            }        }        // Now look at options passed.        
for (int i = 0; i < options.length; i++) {            switch(options[i].getId()) {                case 'h':                    clp.usage();                    break;                case 'a':                    adminLoginPassword = options[i].getValue();                    break;                case 'n':                    if (crawlOrderFile == null) {                        clp.usage("You must specify an ORDER_FILE with" +                            " '--nowui' option.", 1);                    }                    Heritrix.gui = false;                    break;                                case 'b':                    Heritrix.guiHosts = parseHosts(options[i].getValue());                    break;                case 'p':                    try {                        Heritrix.guiPort =                            Integer.parseInt(options[i].getValue());                    } catch (NumberFormatException e) {                        clp.usage("Failed parse of port number: " +                            options[i].getValue(), 1);                    }                    if (Heritrix.guiPort <= 0) {                        clp.usage("Nonsensical port number: " +                            options[i].getValue(), 1);                    }                    break;                case 'r':                    runMode = true;                    break;                case 's':                    selfTestName = options[i].getValue();                    selfTest = true;                    break;                default:                    assert false: options[i].getId();            }        }        // Ok, we should now have everything to launch the program.        String status = null;        if (selfTest) {            // If more than just '--selftest' and '--port' passed, then            // there is confusion on what is being asked of us.  Print usage

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -