⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 crawljobhandler.java

📁 这是个爬虫和Lucene相结合最好了
💻 JAVA
📖 第 1 页 / 共 4 页
字号:
                        }                    }                }            }        }        // Now add in the default profile.  Its on the CLASSPATH and needs        // special handling.  Don't add if already a default present.        String parent = File.separator + PROFILES_DIR_NAME + File.separator;        if (!loadedDefault) {            loadProfile(new File(parent + DEFAULT_PROFILE, ORDER_FILE_NAME));        }        // Look to see if a default profile system property has been        // supplied. If so, use it instead.        // TODO: Try and read default profile from some permanent storage.        defaultProfile = DEFAULT_PROFILE;    }        /**     * Load one profile.     * @param profile Profile to load.     * @return True if loaded profile was the default profile.     */    protected boolean loadProfile(File profile) {        boolean loadedDefault = false;        // Ok, got the order file for this profile.        try {            // The directory name denotes the profiles UID and name.            XMLSettingsHandler newSettingsHandler =                new XMLSettingsHandler(profile);            CrawlJobErrorHandler cjseh =                new CrawlJobErrorHandler(Level.SEVERE);            newSettingsHandler.                setErrorReportingLevel(cjseh.getLevel());            newSettingsHandler.initialize();            addProfile(new CrawlJob(profile.getParentFile().getName(),                newSettingsHandler, cjseh));            loadedDefault = profile.getParentFile().getName().                equals(DEFAULT_PROFILE);        } catch (InvalidAttributeValueException e) {            System.err.println("Failed to load profile '" +                    profile.getParentFile().getName() +                    "'. 
InvalidAttributeValueException.");        }        return loadedDefault;    }    /**     * Add a new profile     * @param profile The new profile     */    public synchronized void addProfile(CrawlJob profile){        profileJobs.add(profile);    }        public synchronized void deleteProfile(CrawlJob cj) throws IOException {        File d = getProfilesDirectory();        File p = new File(d, cj.getJobName());        if (!p.exists()) {            throw new IOException("No profile named " + cj.getJobName() +                " at " + d.getAbsolutePath());        }        FileUtils.deleteDir(p);        this.profileJobs.remove(cj);    }    /**     * Returns a List of all known profiles.     * @return a List of all known profiles.     */    public synchronized List<CrawlJob> getProfiles(){        ArrayList<CrawlJob> tmp = new ArrayList<CrawlJob>(profileJobs.size());        tmp.addAll(profileJobs);        return tmp;    }    /**     * Submit a job to the handler. Job will be scheduled for crawling. At     * present it will not take the job's priority into consideration.     *     * @param job A new job for the handler     * @return CrawlJob that was added or null.     */    public CrawlJob addJob(CrawlJob job) {        if(job.isProfile()){            return null;     // Can't crawl profiles.        }        job.setStatus(CrawlJob.STATUS_PENDING);        if(job.isNew()){            // Are adding the new job to the pending queue.            this.newJob = null;            job.setNew(false);        }        this.pendingCrawlJobs.add(job);        if(isCrawling() == false && isRunning()) {            // Start crawling            startNextJob();        }        return job;    }    /**     * Returns the default profile. If no default profile has been set it will     * return the first profile that was set/loaded and still exists. If no     * profiles exist it will return null     * @return the default profile.     
*/    public synchronized CrawlJob getDefaultProfile() {        if(defaultProfile != null){            for(Iterator it = profileJobs.iterator(); it.hasNext();) {                CrawlJob item = (CrawlJob)it.next();                if(item.getJobName().equals(defaultProfile)){                    // Found it.                    return item;                }            }        }        if(profileJobs.size() > 0){            return (CrawlJob)profileJobs.first();        }        return null;    }    /**     * Set the default profile.     * @param profile The new default profile. The following must apply to it.     *                profile.isProfile() should return true and     *                this.getProfiles() should contain it.     */    public void setDefaultProfile(CrawlJob profile) {        defaultProfile = profile.getJobName();        // TODO: Make changes to default profile durable across restarts.    }    /**     * A List of all pending jobs     *     * @return A List of all pending jobs.     * No promises are made about the order of the list     */    public List<CrawlJob> getPendingJobs() {        ArrayList<CrawlJob> tmp         = new ArrayList<CrawlJob>(pendingCrawlJobs.size());        tmp.addAll(pendingCrawlJobs);        return tmp;    }    /**     * @return The job currently being crawled.     */    public CrawlJob getCurrentJob() {        return currentJob;    }    /**     * @return A List of all finished jobs.     */    public List<CrawlJob> getCompletedJobs() {        ArrayList<CrawlJob> tmp         = new ArrayList<CrawlJob>(completedCrawlJobs.size());        tmp.addAll(completedCrawlJobs);        return tmp;    }    /**     * Return a job with the given UID.     * Doesn't matter if it's pending, currently running, has finished running     * is new or a profile.     *     * @param jobUID The unique ID of the job.     
* @return The job with the UID or null if no such job is found     */    public CrawlJob getJob(String jobUID) {        if (jobUID == null){            return null; // UID can't be null        }        // First check currently running job        if (currentJob != null && currentJob.getUID().equals(jobUID)) {            return currentJob;        } else if (newJob != null && newJob.getUID().equals(jobUID)) {            // Then check the 'new job'            return newJob;        } else {            // Then check pending jobs.            Iterator itPend = pendingCrawlJobs.iterator();            while (itPend.hasNext()) {                CrawlJob cj = (CrawlJob) itPend.next();                if (cj.getUID().equals(jobUID)) {                    return cj;                }            }            // Next check completed jobs.            Iterator itComp = completedCrawlJobs.iterator();            while (itComp.hasNext()) {                CrawlJob cj = (CrawlJob) itComp.next();                if (cj.getUID().equals(jobUID)) {                    return cj;                }            }            // And finally check the profiles.            for (Iterator i = getProfiles().iterator(); i.hasNext();) {                CrawlJob cj = (CrawlJob) i.next();                if (cj.getUID().equals(jobUID)) {                    return cj;                }            }        }        return null; // Nothing found, return null    }        /**     * @return True if we terminated a current job (False if no job to     * terminate)     */    public boolean terminateCurrentJob() {        if (this.currentJob == null) {            return false;        }        // requestCrawlStop will cause crawlEnding to be invoked.        // It will handle the clean up.        this.currentJob.stopCrawling();        synchronized (this) {            try {                // Take a few moments so that the controller can change                // states before the UI updates. 
The CrawlEnding event                // will wake us if it occurs sooner than this.                wait(3000);            } catch (InterruptedException e) {                // Ignore.            }        }        return true;    }    /**     * The specified job will be removed from the pending queue or aborted if     * currently running.  It will be placed in the list of completed jobs with     * appropriate status info. If the job is already in the completed list or     * no job with the given UID is found, no action will be taken.     *     * @param jobUID The UID (unique ID) of the job that is to be deleted.     *     */    public void deleteJob(String jobUID) {        // First check to see if we are deleting the current job.        if (currentJob != null && jobUID.equals(currentJob.getUID())) {            terminateCurrentJob();            return; // We're not going to find another job with the same UID        }                // Ok, it isn't the current job, let's check the pending jobs.        for(Iterator it = pendingCrawlJobs.iterator(); it.hasNext();) {            CrawlJob cj = (CrawlJob) it.next();            if (cj.getUID().equals(jobUID)) {                // Found the one to delete.                cj.setStatus(CrawlJob.STATUS_DELETED);                it.remove();                return; // We're not going to find another job with the same UID            }        }                // And finally the completed jobs.        for (Iterator it = completedCrawlJobs.iterator(); it.hasNext();) {            CrawlJob cj = (CrawlJob) it.next();            if (cj.getUID().equals(jobUID)) {                // Found the one to delete.                cj.setStatus(CrawlJob.STATUS_DELETED);                it.remove();                return; // No other job will have the same UID            }        }    }    /**     * Cause the current job to pause. If no current job is crawling this     * method will have no effect.      
*/    public void pauseJob() {        if (this.currentJob != null) {            this.currentJob.pause();        }    }    /**     * Cause the current job to resume crawling if it was paused. Will have no     * effect if the current job was not paused or if there is no current job.     * If the current job is still waiting to pause, this will not take effect     * until the job has actually paused. At which time it will immeditatly     * resume crawling.     */    public void resumeJob() {        if (this.currentJob != null) {            this.currentJob.resume();        }    }    /**     * Cause the current job to write a checkpoint to disk. Currently     * requires job to already be paused.     * @throws IllegalStateException Thrown if crawl is not paused.     */    public void checkpointJob() throws IllegalStateException {        if (this.currentJob != null) {            this.currentJob.checkpoint();        }    }    /**     * Returns a unique job ID.     * <p>     * No two calls to this method (on the same instance of this class) can ever     * return the same value. <br>     * Currently implemented to return a time stamp. That is subject to change     * though.     *     * @return A unique job ID.     *     * @see ArchiveUtils#TIMESTAMP17     */    public String getNextJobUID() {        return ArchiveUtils.get17DigitDate();    }    /**     * Creates a new job. The new job will be returned and also registered as     * the handler's 'new job'. The new job will be based on the settings     * provided but created in a new location on disk.     *     * @param baseOn     *            A CrawlJob (with a valid settingshandler) to use as the     *            template for the new job.     * @param recovery Whether to preinitialize new job as recovery of     * <code>baseOn</code> job.  
String holds RECOVER_LOG if we are to     * do the recovery based off the recover.gz log -- See RecoveryJournal in     * the frontier package -- or it holds the name of     * the checkpoint we're to use recoverying.     * @param name     *            The name of the new job.     * @param description     *            Descriptions of the job.     * @param seeds     *            The contents of the new settings' seed file.     * @param priority     *            The priority of the new job.     *     * @return The new crawl job.     * @throws FatalConfigurationException If a problem occurs creating the     *             settings.     */    public CrawlJob newJob(CrawlJob baseOn, String recovery, String name,            String description, String seeds, int priority)

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -