⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 crawljob.java

📁 这是个爬虫,和 Lucene 相结合最好了
💻 JAVA
📖 第 1 页 / 共 5 页
字号:
            this.controller.getState().toString(): "Illegal State";    }        /**     * Get the current status of this CrawlJob     *     * @return The current status of this CrawlJob     *         (see constants defined here beginning with STATUS)     */    public String getStatus() {        return this.status;    }    /**     * Returns the settings handler for this job. It will have been initialized.     * @return the settings handler for this job.     */    public XMLSettingsHandler getSettingsHandler() {        return this.settingsHandler;    }    /**     * Is this a new job?     * @return True if is new.     */    public boolean isNew() {        return isNew;    }    /**     * Set if the job is considered to be a profile     * @return True if is a profile.     */    public boolean isProfile() {        return isProfile;    }    /**     * Set if the job is considered a new job or not.     * @param b Is the job considered to be new.     */    public void setNew(boolean b) {        isNew = b;        writeJobFile(); //Save changes    }    /**     * Returns true if the job is being crawled.     * @return true if the job is being crawled     */    public boolean isRunning() {        return isRunning;    }    /**     * Set if job is being crawled.     * @param b Is job being crawled.     */    protected void setRunning(boolean b) {        isRunning = b;        writeJobFile(); // Save changes        //TODO: Job ending -> Save statistics tracker.        //TODO: This is likely to happen as the CrawlEnding event occurs,        // need to ensure that the StatisticsTracker is saved to disk on        // CrawlEnded. Maybe move responsibility for this into the        // StatisticsTracker?    }        protected void unregisterMBean() {        // Unregister current job from JMX agent, if there one.        
if (this.mbeanServer == null) {            return;        }        try {            this.mbeanServer.unregisterMBean(this.mbeanName);            this.mbeanServer = null;        } catch (Exception e) {            logger.log(Level.SEVERE, "Failed with " + this.mbeanName, e);        }    }        /**     * Subclass of crawlcontroller that unregisters beans when stopped.     * Done as subclass so CrawlController doesn't get any JMX (or 'CrawlJob')     * pollution, so for sure CrawlJob is unregistered with JMX and so any     * listeners on the CrawlJob get a chance to get crawl ended message     * (These latter notifications may not actually be getting through -- TBD).     * <p>TODO: This override dirtys the data model since CC knows about CJs.     * The facility provided by this class emitting events and statistics so     * they can be read by JMX needs to go back into CC.  Probably best to     * registering in JMX the CC, rather than CJ.  Lets do this in Heritrix 2.0     * since means changing the JMX API some.     */    public class MBeanCrawlController extends CrawlController    implements Serializable {        private static final long serialVersionUID = -4608537998168407222L;        private CrawlJob cj = null;        private CompositeType ct =  null;                public CrawlJob getCrawlJob() {            return this.cj;        }        public void setCrawlJob(CrawlJob cj) {            this.cj = cj;        }                public void progressStatisticsEvent(final EventObject e) {            super.progressStatisticsEvent(e);            if (this.cj.getMbeanName() == null) {                // Can be null around job startup.  Return w/o doing anything.                return;            }                            Map s = ((StatisticsTracking)e.getSource()).getProgressStatistics();            // Convert the statistics to OpenType CompositeData and add as            // user data to Notification.            
CompositeData cd = null;            try {                if (this.ct == null) {                    this.ct = JmxUtils.createCompositeType(s, PROG_STATS,                        PROG_STATS + " for " + this.cj.getMbeanName());                }                cd = new CompositeDataSupport(this.ct, s);            } catch (OpenDataException ode) {                ode.printStackTrace();            }            if (cd != null) {                Notification n = new Notification(PROG_STATS,                    this.cj.getMbeanName(), getNotificationsSequenceNumber(),                    ((StatisticsTracking)e.getSource()).                        getProgressStatisticsLine());                n.setUserData(cd);                this.cj.sendNotification(n);            }        }                protected void completeStop() {            try {                super.completeStop();            } finally {                if (this.cj != null) {                    this.cj.unregisterMBean();                }                this.cj = null;            }        }    }        protected CrawlController setupCrawlController()    throws InitializationException {        CrawlController controller = null;                // Check if we're to do a checkpoint recover.  If so, deserialize        // the checkpoint's CrawlController and use that in place of a new        // CrawlController instance.        Checkpoint cp = CrawlController.            getCheckpointRecover(getSettingsHandler().getOrder());        if (cp != null) {            try {            	controller = (MBeanCrawlController)CheckpointUtils.                    
readObjectFromFile(MBeanCrawlController.class,                        cp.getDirectory());            } catch (FileNotFoundException e) {                throw new InitializationException(e);            } catch (IOException e) {                throw new InitializationException(e);            } catch (ClassNotFoundException e) {                throw new InitializationException(e);            }        } else {        	controller = new MBeanCrawlController();        }        return controller;    }        protected CrawlController createCrawlController() {    	return new MBeanCrawlController();    }        public void setupForCrawlStart()    throws InitializationException {        try {        	this.controller = setupCrawlController();            // Register as listener to get job finished notice.            this.controller.addCrawlStatusListener(this);            this.controller.initialize(getSettingsHandler());            // Set the crawl job this MBeanCrawlController needs to worry about.            ((MBeanCrawlController)this.controller).setCrawlJob(this);            // Create our mbean description and register our crawljob.            this.openMBeanInfo = buildMBeanInfo();            try {                Heritrix.registerMBean(this, getJmxJobName(),                    CRAWLJOB_JMXMBEAN_TYPE);            } catch (InstanceAlreadyExistsException e) {                throw new InitializationException(e);            } catch (MBeanRegistrationException e) {                throw new InitializationException(e);            } catch (NotCompliantMBeanException e) {                throw new InitializationException(e);            }        } catch (InitializationException e) {            // Can't load current job since it is misconfigured.            
setStatus(CrawlJob.STATUS_MISCONFIGURED);            setErrorMessage("A fatal InitializationException occured when "                    + "loading job:\n" + e.getMessage());            // Log to stdout so its seen in logs as well as in UI.            e.printStackTrace();            this.controller = null;            throw e;        }        setStatus(CrawlJob.STATUS_RUNNING);        setRunning(true);    }        public void stopCrawling() {        if(this.controller != null) {            this.controller.requestCrawlStop();        }    }    /**     * @return One-line Frontier report.     */    public String getFrontierOneLine() {        if (this.controller == null || this.controller.getFrontier() == null) {            return "Crawler not running";        }        return this.controller.getFrontier().singleLineReport();    }        /**     * @param reportName Name of report to write.     * @return A report of the frontier's status.     */    public String getFrontierReport(final String reportName) {        if (this.controller == null || this.controller.getFrontier() == null) {            return "Crawler not running";        }        return ArchiveUtils.writeReportToString(this.controller.getFrontier(),                reportName);    }        /**     * Write the requested frontier report to the given PrintWriter     * @param reportName Name of report to write.     * @param writer Where to write to.     */    public void writeFrontierReport(String reportName, PrintWriter writer) {        if (this.controller == null || this.controller.getFrontier() == null) {            writer.println("Crawler not running.");            return;        }        this.controller.getFrontier().reportTo(reportName,writer);    }    /**     * @return One-line threads report.     
*/    public String getThreadOneLine() {        if (this.controller == null) {            return "Crawler not running";        }        return this.controller.oneLineReportThreads();    }        /**     * Get the CrawlControllers ToeThreads report for the running crawl.     * @return The CrawlControllers ToeThreads report     */    public String getThreadsReport() {        if (this.controller == null) {            return "Crawler not running";        }        return ArchiveUtils.writeReportToString(this.controller.getToePool(),                null);    }        /**     * Write the requested threads report to the given PrintWriter     * @param reportName Name of report to write.     * @param writer Where to write to.     */    public void writeThreadsReport(String reportName, PrintWriter writer) {        if (this.controller == null || this.controller.getFrontier() == null) {            writer.println("Crawler not running.");            return;        }        this.controller.getToePool().reportTo(reportName, writer);    }        /**     * Kills a thread. For details see     * {@link org.archive.crawler.framework.ToePool#killThread(int, boolean)     * ToePool.killThread(int, boolean)}.     * @param threadNumber Thread to kill.     * @param replace Should thread be replaced.     * @see org.archive.crawler.framework.ToePool#killThread(int, boolean)     */    public void killThread(int threadNumber, boolean replace) {        if (this.controller ==  null) {            return;        }        this.controller.killThread(threadNumber, replace);    }    /**     * Get the Processors report for the running crawl.     * @return The Processors report for the running crawl.     
*/    public String getProcessorsReport() {        if (this.controller == null) {            return "Crawler not running";        }        return ArchiveUtils.writeReportToString(this.controller,                CrawlController.PROCESSORS_REPORT);    }        /**     * Returns the directory where the configuration files for this job are     * located.     *     * @return the directory where the configuration files for this job are     *         located     */    public String getSettingsDirectory() {        return settingsHandler.getOrderFile().getPath();    }    /**     * Returns the path of the job's base directory. For profiles this is always

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -