📄 crawljob.java
字号:
* <p> Any job created with this constructor will be * considered a profile. Profiles are not stored on disk (only their * settings files are stored on disk). This is because their data is * predictible given any settings files. * @param UIDandName A unique ID for this job. For profiles this is the same * as name * @param settingsHandler The associated settings * @param errorHandler The crawl jobs settings error handler. * <tt>null</tt> means none is set */ protected CrawlJob(final String UIDandName, final XMLSettingsHandler settingsHandler, final CrawlJobErrorHandler errorHandler) { this(UIDandName, UIDandName, settingsHandler, errorHandler, PRIORITY_AVERAGE, null, STATUS_PROFILE, true, false); } public CrawlJob(final String UID, final String name, final XMLSettingsHandler settingsHandler, final CrawlJobErrorHandler errorHandler, final int priority, final File dir, final String status, final boolean isProfile, final boolean isNew) { super(); this.UID = UID; this.name = name; this.settingsHandler = settingsHandler; this.errorHandler = errorHandler; this.status = status; this.isProfile = isProfile; this.isNew = isNew; this.jobDir = dir; this.priority = priority; } /** * A constructor for reloading jobs from disk. Jobs (not profiles) have * their data written to persistent storage in the file system. This method * is used to load the job from such storage. This is done by the * <code>CrawlJobHandler</code>. * <p> * Proper structure of a job file (TODO: Maybe one day make this an XML file) * Line 1. UID <br> * Line 2. Job name (string) <br> * Line 3. Job status (string) <br> * Line 4. is job read only (true/false) <br> * Line 5. is job running (true/false) <br> * Line 6. job priority (int) <br> * Line 7. number of journal entries <br> * Line 8. setting file (with path) <br> * Line 9. statistics tracker file (with path) <br> * Line 10-?. error message (String, empty for null), can be many lines <br> * @param jobFile * a file containing information about the job to load. * @param errorHandler The crawl jobs settings error handler. * null means none is set * @throws InvalidJobFileException * if the specified file does not refer to a valid job file. * @throws IOException * if io operations fail */ protected CrawlJob(final File jobFile, final CrawlJobErrorHandler errorHandler) throws InvalidJobFileException, IOException { this(null, null, null, errorHandler, PRIORITY_AVERAGE, null, null, false, true); this.jobDir = jobFile.getParentFile(); // Check for corrupt job.state files (can be corrupt if we crash). if (jobFile.length() == 0) { throw new InvalidJobFileException(jobFile.getCanonicalPath() + " is corrupt (length is zero)"); } // Open file. Read data and set up class variables accordingly... BufferedReader jobReader = new BufferedReader(new FileReader(jobFile), 4096); // UID this.UID = jobReader.readLine(); // name this.name = jobReader.readLine(); // status this.status = jobReader.readLine(); if(status.equals(STATUS_ABORTED)==false && status.equals(STATUS_CREATED)==false && status.equals(STATUS_DELETED)==false && status.equals(STATUS_FINISHED)==false && status.equals(STATUS_FINISHED_ABNORMAL)==false && status.equals(STATUS_FINISHED_DATA_LIMIT)==false && status.equals(STATUS_FINISHED_DOCUMENT_LIMIT)==false && status.equals(STATUS_FINISHED_TIME_LIMIT)==false && status.equals(STATUS_MISCONFIGURED)==false && status.equals(STATUS_PAUSED)==false && status.equals(STATUS_CHECKPOINTING)==false && status.equals(STATUS_PENDING)==false && status.equals(STATUS_RUNNING)==false && status.equals(STATUS_WAITING_FOR_PAUSE)==false && status.equals(STATUS_PREPARING)==false){ // status is invalid. Must be one of the above throw new InvalidJobFileException("Status (line 3) in job file " + "is not valid: '" + status + "'"); } // isReadOnly String tmp = jobReader.readLine(); if(tmp.equals("true")){ isReadOnly = true; } else if(tmp.equals("false")){ isReadOnly = false; } else { throw new InvalidJobFileException("isReadOnly (line 4) in job" + " file '" + jobFile.getAbsolutePath() + "' is not " + "valid: '" + tmp + "'"); } // isRunning tmp = jobReader.readLine(); if(tmp.equals("true")){ this.isRunning = true; } else if(tmp.equals("false")){ this.isRunning = false; } else { throw new InvalidJobFileException("isRunning (line 5) in job " + "file '" + jobFile.getAbsolutePath() + "' is not valid: " + "'" + tmp + "'"); } // priority tmp = jobReader.readLine(); try{ this.priority = Integer.parseInt(tmp); } catch(NumberFormatException e){ throw new InvalidJobFileException("priority (line 5) in job " + "file '" + jobFile.getAbsolutePath() + "' is not valid: " + "'" + tmp + "'"); } // numberOfJournalEntries tmp = jobReader.readLine(); try{ this.numberOfJournalEntries = Integer.parseInt(tmp); } catch(NumberFormatException e){ throw new InvalidJobFileException("numberOfJournalEntries " + "(line 5) in job file '" + jobFile.getAbsolutePath() + "' is not valid: " + "'" + tmp + "'"); } // settingsHandler tmp = jobReader.readLine(); try { File f = new File(tmp); this.settingsHandler = new XMLSettingsHandler((f.isAbsolute())? f: new File(jobDir, f.getName())); if(this.errorHandler != null){ this.settingsHandler.registerValueErrorHandler(errorHandler); } this.settingsHandler.initialize(); } catch (InvalidAttributeValueException e1) { throw new InvalidJobFileException("Problem reading from settings " + "file (" + tmp + ") specified in job file '" + jobFile.getAbsolutePath() + "'\n" + e1.getMessage()); } // Statistics tracker. jobReader.readLine(); // errorMessage // TODO: Multilines tmp = jobReader.readLine(); errorMessage = ""; while(tmp!=null){ errorMessage+=tmp+'\n'; tmp = jobReader.readLine(); } if(errorMessage.length()==0){ // Empty error message should be null errorMessage = null; } // TODO: Load stattrack if needed. // TODO: This should be inside a finally block. jobReader.close(); } /** * Cause the job to be written to persistent storage. * This will also save the statistics tracker if it is not null and the * job status is finished (regardless of how it's finished) */ private void writeJobFile() { if (isProfile) { return; } final String jobDirAbsolute = jobDir.getAbsolutePath(); if (!jobDir.exists() || !jobDir.canWrite()) { logger.warning("Can't update status on " + jobDirAbsolute + " because file does not" + " exist (or is unwriteable)"); return; } File f = new File(jobDirAbsolute, "state.job"); String settingsFile = getSettingsDirectory(); // Make settingsFile's path relative if order.xml is somewhere in the // job's directory tree if(settingsFile.startsWith(jobDirAbsolute.concat(File.separator))) { settingsFile = settingsFile.substring(jobDirAbsolute.length()+1); } try { FileWriter jobWriter = new FileWriter(f, false); try { jobWriter.write(UID + "\n"); jobWriter.write(name + "\n"); jobWriter.write(status + "\n"); jobWriter.write(isReadOnly + "\n"); jobWriter.write(isRunning + "\n"); jobWriter.write(priority + "\n"); jobWriter.write(numberOfJournalEntries + "\n"); jobWriter.write(settingsFile + "\n"); jobWriter.write(statisticsFileSave + "\n");// TODO: Is this // right? // Can be multiple lines so we keep it last if (errorMessage != null) { jobWriter.write(errorMessage + "\n"); } } finally { if (jobWriter != null) { jobWriter.close(); } } } catch (IOException e) { logger.log(Level.WARNING, "An IOException occured saving job " + name + " (" + UID + ")", e); } } /** * Returns this jobs unique ID (UID) that was issued by the * CrawlJobHandler() when this job was first created. * * @return Job This jobs UID. * @see CrawlJobHandler#getNextJobUID() */ public String getUID(){ return UID; } /** * Returns this job's 'name'. The name comes from the settings for this job, * need not be unique and may change. For a unique identifier use * {@link #getUID() getUID()}. * <p> * The name corrisponds to the value of the 'name' tag in the 'meta' section * of the settings file. * * @return This job's 'name' */ public String getJobName(){ return name; } /** * Return the combination of given name and UID most commonly * used in administrative interface. * * @return Job's name with UID notation */ public String getDisplayName() { return getJobName()+" ["+getUID()+"]"; } /** * Set this job's level of priority. * * @param priority The level of priority * * @see #getJobPriority() * @see #PRIORITY_MINIMAL * @see #PRIORITY_LOW * @see #PRIORITY_AVERAGE * @see #PRIORITY_HIGH * @see #PRIORITY_CRITICAL */ public void setJobPriority(int priority) { this.priority = priority; } /** * Get this job's level of priority. * * @return this job's priority * @see #setJobPriority(int) * @see #PRIORITY_MINIMAL * @see #PRIORITY_LOW * @see #PRIORITY_AVERAGE * @see #PRIORITY_HIGH * @see #PRIORITY_CRITICAL */ public int getJobPriority() { return priority; } /** * Once called no changes can be made to the settings for this job. * Typically this is done once a crawl is completed and further changes * to the crawl order are therefor meaningless. */ public void setReadOnly() { isReadOnly = true; writeJobFile(); //Save changes } /** * Is job read only? * @return false until setReadOnly has been invoked, after that it returns true. */ public boolean isReadOnly(){ return isReadOnly; } /** * Set the status of this CrawlJob. * * @param status Current status of CrawlJob * (see constants defined here beginning with STATUS) */ public void setStatus(String status) { this.status = status; writeJobFile(); //Save changes // TODO: If job finished, save StatisticsTracker! } /** * @return Status of the crawler (Used by JMX). */ public String getCrawlStatus() { return this.controller != null?
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -