📄 crawljobhandler.java
字号:
} } } } } // Now add in the default profile. Its on the CLASSPATH and needs // special handling. Don't add if already a default present. String parent = File.separator + PROFILES_DIR_NAME + File.separator; if (!loadedDefault) { loadProfile(new File(parent + DEFAULT_PROFILE, ORDER_FILE_NAME)); } // Look to see if a default profile system property has been // supplied. If so, use it instead. // TODO: Try and read default profile from some permanent storage. defaultProfile = DEFAULT_PROFILE; } /** * Load one profile. * @param profile Profile to load. * @return True if loaded profile was the default profile. */ protected boolean loadProfile(File profile) { boolean loadedDefault = false; // Ok, got the order file for this profile. try { // The directory name denotes the profiles UID and name. XMLSettingsHandler newSettingsHandler = new XMLSettingsHandler(profile); CrawlJobErrorHandler cjseh = new CrawlJobErrorHandler(Level.SEVERE); newSettingsHandler. setErrorReportingLevel(cjseh.getLevel()); newSettingsHandler.initialize(); addProfile(new CrawlJob(profile.getParentFile().getName(), newSettingsHandler, cjseh)); loadedDefault = profile.getParentFile().getName(). equals(DEFAULT_PROFILE); } catch (InvalidAttributeValueException e) { System.err.println("Failed to load profile '" + profile.getParentFile().getName() + "'. InvalidAttributeValueException."); } return loadedDefault; } /** * Add a new profile * @param profile The new profile */ public synchronized void addProfile(CrawlJob profile){ profileJobs.add(profile); } public synchronized void deleteProfile(CrawlJob cj) throws IOException { File d = getProfilesDirectory(); File p = new File(d, cj.getJobName()); if (!p.exists()) { throw new IOException("No profile named " + cj.getJobName() + " at " + d.getAbsolutePath()); } FileUtils.deleteDir(p); this.profileJobs.remove(cj); } /** * Returns a List of all known profiles. * @return a List of all known profiles. 
*/ public synchronized List<CrawlJob> getProfiles(){ ArrayList<CrawlJob> tmp = new ArrayList<CrawlJob>(profileJobs.size()); tmp.addAll(profileJobs); return tmp; } /** * Submit a job to the handler. Job will be scheduled for crawling. At * present it will not take the job's priority into consideration. * * @param job A new job for the handler * @return CrawlJob that was added or null. */ public CrawlJob addJob(CrawlJob job) { if(job.isProfile()){ return null; // Can't crawl profiles. } job.setStatus(CrawlJob.STATUS_PENDING); if(job.isNew()){ // Are adding the new job to the pending queue. this.newJob = null; job.setNew(false); } this.pendingCrawlJobs.add(job); if(isCrawling() == false && isRunning()) { // Start crawling startNextJob(); } return job; } /** * Returns the default profile. If no default profile has been set it will * return the first profile that was set/loaded and still exists. If no * profiles exist it will return null * @return the default profile. */ public synchronized CrawlJob getDefaultProfile() { if(defaultProfile != null){ for(Iterator it = profileJobs.iterator(); it.hasNext();) { CrawlJob item = (CrawlJob)it.next(); if(item.getJobName().equals(defaultProfile)){ // Found it. return item; } } } if(profileJobs.size() > 0){ return (CrawlJob)profileJobs.first(); } return null; } /** * Set the default profile. * @param profile The new default profile. The following must apply to it. * profile.isProfile() should return true and * this.getProfiles() should contain it. */ public void setDefaultProfile(CrawlJob profile) { defaultProfile = profile.getJobName(); // TODO: Make changes to default profile durable across restarts. } /** * A List of all pending jobs * * @return A List of all pending jobs. 
* No promises are made about the order of the list */ public List<CrawlJob> getPendingJobs() { ArrayList<CrawlJob> tmp = new ArrayList<CrawlJob>(pendingCrawlJobs.size()); tmp.addAll(pendingCrawlJobs); return tmp; } /** * @return The job currently being crawled. */ public CrawlJob getCurrentJob() { return currentJob; } /** * @return A List of all finished jobs. */ public List<CrawlJob> getCompletedJobs() { ArrayList<CrawlJob> tmp = new ArrayList<CrawlJob>(completedCrawlJobs.size()); tmp.addAll(completedCrawlJobs); return tmp; } /** * Return a job with the given UID. * Doesn't matter if it's pending, currently running, has finished running * is new or a profile. * * @param jobUID The unique ID of the job. * @return The job with the UID or null if no such job is found */ public CrawlJob getJob(String jobUID) { if (jobUID == null){ return null; // UID can't be null } // First check currently running job if (currentJob != null && currentJob.getUID().equals(jobUID)) { return currentJob; } else if (newJob != null && newJob.getUID().equals(jobUID)) { // Then check the 'new job' return newJob; } else { // Then check pending jobs. Iterator itPend = pendingCrawlJobs.iterator(); while (itPend.hasNext()) { CrawlJob cj = (CrawlJob) itPend.next(); if (cj.getUID().equals(jobUID)) { return cj; } } // Next check completed jobs. Iterator itComp = completedCrawlJobs.iterator(); while (itComp.hasNext()) { CrawlJob cj = (CrawlJob) itComp.next(); if (cj.getUID().equals(jobUID)) { return cj; } } // And finally check the profiles. for (Iterator i = getProfiles().iterator(); i.hasNext();) { CrawlJob cj = (CrawlJob) i.next(); if (cj.getUID().equals(jobUID)) { return cj; } } } return null; // Nothing found, return null } /** * @return True if we terminated a current job (False if no job to * terminate) */ public boolean terminateCurrentJob() { if (this.currentJob == null) { return false; } // requestCrawlStop will cause crawlEnding to be invoked. // It will handle the clean up. 
this.currentJob.stopCrawling(); synchronized (this) { try { // Take a few moments so that the controller can change // states before the UI updates. The CrawlEnding event // will wake us if it occurs sooner than this. wait(3000); } catch (InterruptedException e) { // Ignore. } } return true; } /** * The specified job will be removed from the pending queue or aborted if * currently running. It will be placed in the list of completed jobs with * appropriate status info. If the job is already in the completed list or * no job with the given UID is found, no action will be taken. * * @param jobUID The UID (unique ID) of the job that is to be deleted. * */ public void deleteJob(String jobUID) { // First check to see if we are deleting the current job. if (currentJob != null && jobUID.equals(currentJob.getUID())) { terminateCurrentJob(); return; // We're not going to find another job with the same UID } // Ok, it isn't the current job, let's check the pending jobs. for(Iterator it = pendingCrawlJobs.iterator(); it.hasNext();) { CrawlJob cj = (CrawlJob) it.next(); if (cj.getUID().equals(jobUID)) { // Found the one to delete. cj.setStatus(CrawlJob.STATUS_DELETED); it.remove(); return; // We're not going to find another job with the same UID } } // And finally the completed jobs. for (Iterator it = completedCrawlJobs.iterator(); it.hasNext();) { CrawlJob cj = (CrawlJob) it.next(); if (cj.getUID().equals(jobUID)) { // Found the one to delete. cj.setStatus(CrawlJob.STATUS_DELETED); it.remove(); return; // No other job will have the same UID } } } /** * Cause the current job to pause. If no current job is crawling this * method will have no effect. */ public void pauseJob() { if (this.currentJob != null) { this.currentJob.pause(); } } /** * Cause the current job to resume crawling if it was paused. Will have no * effect if the current job was not paused or if there is no current job. 
* If the current job is still waiting to pause, this will not take effect
 * until the job has actually paused. At which time it will immediately
 * resume crawling.
 */
public void resumeJob() {
    // Single read of the field: check and call use the same reference.
    CrawlJob running = this.currentJob;
    if (running != null) {
        running.resume();
    }
}

/**
 * Cause the current job to write a checkpoint to disk. Currently
 * requires job to already be paused. Does nothing if there is no
 * current job.
 * @throws IllegalStateException Thrown if crawl is not paused.
 */
public void checkpointJob() throws IllegalStateException {
    // Single read of the field: check and call use the same reference.
    CrawlJob running = this.currentJob;
    if (running != null) {
        running.checkpoint();
    }
}

/**
 * Returns a unique job ID.
 * <p>
 * No two calls to this method (on the same instance of this class) can ever
 * return the same value. <br>
 * Currently implemented to return a time stamp. That is subject to change
 * though.
 *
 * @return A unique job ID.
 *
 * @see ArchiveUtils#TIMESTAMP17
 */
public String getNextJobUID() {
    return ArchiveUtils.get17DigitDate();
}

/**
 * Creates a new job. The new job will be returned and also registered as
 * the handler's 'new job'. The new job will be based on the settings
 * provided but created in a new location on disk.
 *
 * @param baseOn
 *            A CrawlJob (with a valid settingshandler) to use as the
 *            template for the new job.
 * @param recovery Whether to preinitialize new job as recovery of
 * <code>baseOn</code> job.  String holds RECOVER_LOG if we are to
 * do the recovery based off the recover.gz log -- See RecoveryJournal in
 * the frontier package -- or it holds the name of
 * the checkpoint we're to use recovering.
 * @param name
 *            The name of the new job.
 * @param description
 *            Descriptions of the job.
 * @param seeds
 *            The contents of the new settings' seed file.
 * @param priority
 *            The priority of the new job.
 *
 * @return The new crawl job.
 * @throws FatalConfigurationException If a problem occurs creating the
 *             settings.
 */
public CrawlJob newJob(CrawlJob baseOn, String recovery, String name, String description, String seeds, int priority)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -