📄 crawljob.java

📁 这是个爬虫和Lucene相结合最好了
💻 JAVA
📖 第 1 页 / 共 5 页
字号:
     * <p> Any job created with this constructor will be     * considered a profile. Profiles are not stored on disk (only their     * settings files are stored on disk). This is because their data is     * predictible given any settings files.     * @param UIDandName A unique ID for this job. For profiles this is the same     *           as name     * @param settingsHandler The associated settings     * @param errorHandler The crawl jobs settings error handler.     *           <tt>null</tt> means none is set     */    protected CrawlJob(final String UIDandName,            final XMLSettingsHandler settingsHandler,            final CrawlJobErrorHandler errorHandler) {        this(UIDandName, UIDandName, settingsHandler, errorHandler,            PRIORITY_AVERAGE, null, STATUS_PROFILE, true, false);    }        public CrawlJob(final String UID,            final String name, final XMLSettingsHandler settingsHandler,            final CrawlJobErrorHandler errorHandler, final int priority,            final File dir, final String status, final boolean isProfile,            final boolean isNew) {        super();        this.UID = UID;        this.name = name;        this.settingsHandler = settingsHandler;        this.errorHandler = errorHandler;        this.status = status;        this.isProfile = isProfile;        this.isNew = isNew;        this.jobDir = dir;        this.priority = priority;    }    /**     * A constructor for reloading jobs from disk. Jobs (not profiles) have     * their data written to persistent storage in the file system. This method     * is used to load the job from such storage. This is done by the     * <code>CrawlJobHandler</code>.     * <p>     * Proper structure of a job file (TODO: Maybe one day make this an XML file)     * Line 1. UID <br>     * Line 2. Job name (string) <br>     * Line 3. Job status (string) <br>     * Line 4. is job read only (true/false) <br>     * Line 5. is job running (true/false) <br>     * Line 6. 
job priority (int) <br>     * Line 7. number of journal entries <br>     * Line 8. setting file (with path) <br>     * Line 9. statistics tracker file (with path) <br>     * Line 10-?. error message (String, empty for null), can be many lines <br>     * @param jobFile     *            a file containing information about the job to load.     * @param errorHandler The crawl jobs settings error handler.     *            null means none is set     * @throws InvalidJobFileException     *            if the specified file does not refer to a valid job file.     * @throws IOException     *            if io operations fail     */    protected CrawlJob(final File jobFile,            final CrawlJobErrorHandler errorHandler)            throws InvalidJobFileException, IOException {        this(null, null, null, errorHandler,                PRIORITY_AVERAGE, null, null, false, true);        this.jobDir = jobFile.getParentFile();                // Check for corrupt job.state files (can be corrupt if we crash).        if (jobFile.length() == 0) {            throw new InvalidJobFileException(jobFile.getCanonicalPath() +                " is corrupt (length is zero)");        }                // Open file. Read data and set up class variables accordingly...        
BufferedReader jobReader =            new BufferedReader(new FileReader(jobFile), 4096);        // UID        this.UID = jobReader.readLine();        // name        this.name = jobReader.readLine();        // status        this.status = jobReader.readLine();        if(status.equals(STATUS_ABORTED)==false                && status.equals(STATUS_CREATED)==false                && status.equals(STATUS_DELETED)==false                && status.equals(STATUS_FINISHED)==false                && status.equals(STATUS_FINISHED_ABNORMAL)==false                && status.equals(STATUS_FINISHED_DATA_LIMIT)==false                && status.equals(STATUS_FINISHED_DOCUMENT_LIMIT)==false                && status.equals(STATUS_FINISHED_TIME_LIMIT)==false                && status.equals(STATUS_MISCONFIGURED)==false                && status.equals(STATUS_PAUSED)==false                && status.equals(STATUS_CHECKPOINTING)==false                && status.equals(STATUS_PENDING)==false                && status.equals(STATUS_RUNNING)==false                && status.equals(STATUS_WAITING_FOR_PAUSE)==false                && status.equals(STATUS_PREPARING)==false){            // status is invalid. 
Must be one of the above            throw new InvalidJobFileException("Status (line 3) in job file " +                    "is not valid: '" + status + "'");        }        // isReadOnly        String tmp = jobReader.readLine();        if(tmp.equals("true")){            isReadOnly = true;        } else if(tmp.equals("false")){            isReadOnly = false;        } else {            throw new InvalidJobFileException("isReadOnly (line 4) in job" +                    " file '" + jobFile.getAbsolutePath() + "' is not " +                    "valid: '" + tmp + "'");        }        // isRunning        tmp = jobReader.readLine();        if(tmp.equals("true")){            this.isRunning = true;        } else if(tmp.equals("false")){            this.isRunning = false;        } else {            throw new InvalidJobFileException("isRunning (line 5) in job " +                    "file '" + jobFile.getAbsolutePath() + "' is not valid: " +                    "'" + tmp + "'");        }        // priority        tmp = jobReader.readLine();        try{            this.priority = Integer.parseInt(tmp);        } catch(NumberFormatException e){            throw new InvalidJobFileException("priority (line 5) in job " +                    "file '" + jobFile.getAbsolutePath() + "' is not valid: " +                    "'" + tmp + "'");        }        // numberOfJournalEntries        tmp = jobReader.readLine();        try{            this.numberOfJournalEntries = Integer.parseInt(tmp);        } catch(NumberFormatException e){            throw new InvalidJobFileException("numberOfJournalEntries " +                    "(line 5) in job file '" + jobFile.getAbsolutePath() +                    "' is not valid: " + "'" + tmp + "'");        }        // settingsHandler        tmp = jobReader.readLine();        try {            File f = new File(tmp);            this.settingsHandler = new XMLSettingsHandler((f.isAbsolute())?                
f: new File(jobDir, f.getName()));            if(this.errorHandler != null){                this.settingsHandler.registerValueErrorHandler(errorHandler);            }            this.settingsHandler.initialize();        } catch (InvalidAttributeValueException e1) {            throw new InvalidJobFileException("Problem reading from settings " +                    "file (" + tmp + ") specified in job file '" +                    jobFile.getAbsolutePath() + "'\n" + e1.getMessage());        }        // Statistics tracker.        jobReader.readLine();        // errorMessage        // TODO: Multilines        tmp = jobReader.readLine();        errorMessage = "";        while(tmp!=null){            errorMessage+=tmp+'\n';            tmp = jobReader.readLine();        }        if(errorMessage.length()==0){            // Empty error message should be null            errorMessage = null;        }        // TODO: Load stattrack if needed.        // TODO: This should be inside a finally block.        jobReader.close();    }    /**     * Cause the job to be written to persistent storage.     
* This will also save the statistics tracker if it is not null and the     * job status is finished (regardless of how it's finished)     */    private void writeJobFile() {        if (isProfile) {            return;        }                final String jobDirAbsolute = jobDir.getAbsolutePath();        if (!jobDir.exists() || !jobDir.canWrite()) {            logger.warning("Can't update status on " +                jobDirAbsolute + " because file does not" +                " exist (or is unwriteable)");            return;        }        File f = new File(jobDirAbsolute, "state.job");        String settingsFile = getSettingsDirectory();        // Make settingsFile's path relative if order.xml is somewhere in the        // job's directory tree        if(settingsFile.startsWith(jobDirAbsolute.concat(File.separator))) {            settingsFile = settingsFile.substring(jobDirAbsolute.length()+1);        }        try {            FileWriter jobWriter = new FileWriter(f, false);            try {                jobWriter.write(UID + "\n");                jobWriter.write(name + "\n");                jobWriter.write(status + "\n");                jobWriter.write(isReadOnly + "\n");                jobWriter.write(isRunning + "\n");                jobWriter.write(priority + "\n");                jobWriter.write(numberOfJournalEntries + "\n");                jobWriter.write(settingsFile + "\n");                jobWriter.write(statisticsFileSave + "\n");// TODO: Is this                                                            // right?                
// Can be multiple lines so we keep it last                if (errorMessage != null) {                    jobWriter.write(errorMessage + "\n");                }            } finally {                if (jobWriter != null) {                    jobWriter.close();                }            }        } catch (IOException e) {            logger.log(Level.WARNING, "An IOException occured saving job " +                    name + " (" + UID + ")", e);        }    }      /**     * Returns this jobs unique ID (UID) that was issued by the     * CrawlJobHandler() when this job was first created.     *      * @return Job This jobs UID.     * @see CrawlJobHandler#getNextJobUID()     */    public String getUID(){        return UID;    }    /**     * Returns this job's 'name'. The name comes from the settings for this job,     * need not be unique and may change. For a unique identifier use     * {@link #getUID() getUID()}.     * <p>     * The name corrisponds to the value of the 'name' tag in the 'meta' section     * of the settings file.     *     * @return This job's 'name'     */    public String getJobName(){        return name;    }    /**     * Return the combination of given name and UID most commonly     * used in administrative interface.     *     * @return Job's name with UID notation     */    public String getDisplayName() {        return getJobName()+" ["+getUID()+"]";    }    /**     * Set this job's level of priority.     *     * @param priority The level of priority     *     * @see #getJobPriority()     * @see #PRIORITY_MINIMAL     * @see #PRIORITY_LOW     * @see #PRIORITY_AVERAGE     * @see #PRIORITY_HIGH     * @see #PRIORITY_CRITICAL     */    public void setJobPriority(int priority) {        this.priority = priority;    }    /**     * Get this job's level of priority.     
*     * @return this job's priority     * @see #setJobPriority(int)     * @see #PRIORITY_MINIMAL     * @see #PRIORITY_LOW     * @see #PRIORITY_AVERAGE     * @see #PRIORITY_HIGH     * @see #PRIORITY_CRITICAL     */    public int getJobPriority() {        return priority;    }    /**     * Once called no changes can be made to the settings for this job.     * Typically this is done once a crawl is completed and further changes     * to the crawl order are therefor meaningless.     */    public void setReadOnly() {        isReadOnly = true;        writeJobFile(); //Save changes    }    /**     * Is job read only?     * @return false until setReadOnly has been invoked, after that it returns true.     */    public boolean isReadOnly(){        return isReadOnly;    }    /**     * Set the status of this CrawlJob.     *     * @param status Current status of CrawlJob     *         (see constants defined here beginning with STATUS)     */    public void setStatus(String status) {        this.status = status;        writeJobFile(); //Save changes        // TODO: If job finished, save StatisticsTracker!    }    /**     * @return Status of the crawler (Used by JMX).     */    public String getCrawlStatus() {        return this.controller != null?
💿 文件大小 9430 K
👤 上传用户 zergwyk
📂 所属分类 Internet/网络编程
🏷️ 相关标签

#lucece
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -