CrawlJob.java
 * equal to <code>new File(getSettingsDirectory())</code>.
 * @return the path of the job's base directory.
 */
public File getDirectory() {
    return isProfile ? new File(getSettingsDirectory()) : jobDir;
}

/**
 * Get the error message associated with this job. Returns null if there
 * is no error message.
 * @return the error message associated with this job
 */
public String getErrorMessage() {
    return errorMessage;
}

/**
 * Set an error message for this job. Generally this only occurs if the
 * job is misconfigured.
 * @param string the error message associated with this job
 */
public void setErrorMessage(String string) {
    errorMessage = string;
    writeJobFile(); // Save changes.
}

/**
 * @return Returns the number of journal entries.
 */
public int getNumberOfJournalEntries() {
    return numberOfJournalEntries;
}

/**
 * @param numberOfJournalEntries The number of journal entries to set.
 */
public void setNumberOfJournalEntries(int numberOfJournalEntries) {
    this.numberOfJournalEntries = numberOfJournalEntries;
    writeJobFile();
}

/**
 * @return Returns the error handler for this crawl job.
 */
public CrawlJobErrorHandler getErrorHandler() {
    return errorHandler;
}

/**
 * Read all the checkpoints found in the job's checkpoints directory
 * into Checkpoint instances.
 * @return Collection containing all checkpoints.
 */
public Collection<Checkpoint> scanCheckpoints() {
    File checkpointsDirectory =
        settingsHandler.getOrder().getCheckpointsDirectory();
    File[] perCheckpointDirs = checkpointsDirectory.listFiles();
    Collection<Checkpoint> checkpoints = new ArrayList<Checkpoint>();
    if (perCheckpointDirs != null) {
        for (int i = 0; i < perCheckpointDirs.length; i++) {
            checkpoints.add(new Checkpoint(perCheckpointDirs[i]));
        }
    }
    return checkpoints;
}

/**
 * Returns the absolute path of the specified log.
 * Note: If the crawl has not begun, this file may not exist.
 * @param log
 * @return the absolute path for the specified log.
 * @throws AttributeNotFoundException
 * @throws ReflectionException
 * @throws MBeanException
 */
public String getLogPath(String log)
        throws AttributeNotFoundException, MBeanException,
        ReflectionException {
    String logsPath = (String)settingsHandler.getOrder()
        .getAttribute(CrawlOrder.ATTR_LOGS_PATH);
    CrawlOrder order = settingsHandler.getOrder();
    String diskPath = (String)order.getAttribute(null,
        CrawlOrder.ATTR_DISK_PATH);
    File disk = settingsHandler.getPathRelativeToWorkingDirectory(diskPath);
    File f = new File(logsPath, log);
    if (!f.isAbsolute()) {
        f = new File(disk.getPath(), f.getPath());
    }
    return f.getAbsolutePath();
}

// OpenMBean implementation.

protected void pause() {
    if (this.controller != null && !this.controller.isPaused()) {
        this.controller.requestCrawlPause();
    }
}

protected void resume() {
    if (this.controller != null) {
        this.controller.requestCrawlResume();
    }
}

/**
 * @throws IllegalStateException Thrown if crawl is not paused.
 */
protected void checkpoint() throws IllegalStateException {
    if (this.controller != null) {
        this.controller.requestCrawlCheckpoint();
    }
}

/**
 * @return True if checkpointing.
 */
public boolean isCheckpointing() {
    return this.controller != null && this.controller.isCheckpointing();
}

/**
 * If it's a HostQueuesFrontier, it needs to be flushed for the queued
 * URIs to take effect.
 */
protected void flush() {
    // Nothing to do.
}
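// A minimal sketch (not part of the original class) of the intended
// pause -> checkpoint -> resume sequence using the methods above.
// requestCrawlPause() is asynchronous, so this example polls isPaused()
// before checkpointing; the method name and polling interval are
// illustrative assumptions, not Heritrix API.
protected void checkpointWhenPausedSketch() throws InterruptedException {
    pause();
    // Wait until the controller has actually reached the paused state.
    while (this.controller != null && !this.controller.isPaused()) {
        Thread.sleep(500);
    }
    checkpoint();
    // Wait for the checkpoint to complete before resuming the crawl.
    while (isCheckpointing()) {
        Thread.sleep(500);
    }
    resume();
}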
/**
 * Delete any URIs from the frontier of the current (paused) job that
 * match the specified regular expression. If the current job is not
 * paused (or there is no current job) nothing will be done.
 * @param regexpr Regular expression to delete URIs by.
 * @return the number of URIs deleted
 */
public long deleteURIsFromPending(String regexpr) {
    return (this.controller != null && this.controller.getFrontier() != null
            && this.controller.isPaused())
        ? this.controller.getFrontier().deleteURIs(regexpr)
        : 0;
}

public String importUris(String file, String style, String force) {
    return importUris(file, style, "true".equals(force));
}

public String importUris(final String fileOrUrl, final String style,
        final boolean forceRevisit) {
    return importUris(fileOrUrl, style, forceRevisit, false);
}

/**
 * @param fileOrUrl Name of file w/ seeds.
 * @param style What style of seeds -- crawl log, recovery journal, or
 * seeds file.
 * @param forceRevisit Should we revisit even if seen before?
 * @param areSeeds Is the file exclusively seeds?
 * @return A display string that has a count of all added.
 */
public String importUris(final String fileOrUrl, final String style,
        final boolean forceRevisit, final boolean areSeeds) {
    InputStream is =
        IoUtils.getInputStream(this.controller.getDisk(), fileOrUrl);
    String message = null;
    // Do we have an input stream?
    if (is == null) {
        message = "Failed to get inputstream from " + fileOrUrl;
        logger.severe(message);
    } else {
        int addedCount = importUris(is, style, forceRevisit, areSeeds);
        message = Integer.toString(addedCount) + " URIs added from " +
            fileOrUrl;
    }
    return message;
}

protected int importUris(InputStream is, String style,
        boolean forceRevisit) {
    return importUris(is, style, forceRevisit, false);
}

/**
 * Import URIs.
 * @param is Stream to use as URI source.
 * @param style Style in which URIs are rendered. Currently supported are
 * <code>recoveryJournal</code>, <code>crawlLog</code>, and seeds-file
 * format (i.e. <code>default</code>), where <code>default</code> style is
 * one UURI per line (comments allowed).
 * @param forceRevisit Whether we should revisit this URI even if we've
 * visited it previously.
 * @param areSeeds Are the imported URIs seeds?
 * @return Count of added URIs.
 */
protected int importUris(InputStream is, String style,
        boolean forceRevisit, final boolean areSeeds) {
    // Figure the regex to use parsing each line of the input stream.
    String extractor;
    String output;
    if (CRAWL_LOG_STYLE.equals(style)) {
        // Skip the first three fields.
        extractor = "\\S+\\s+\\S+\\s+\\S+\\s+(\\S+\\s+\\S+\\s+\\S+\\s+).*";
        output = "$1";
    } else if (RECOVERY_JOURNAL_STYLE.equals(style)) {
        // Skip the begin-of-line directive.
        extractor = "\\S+\\s+((\\S+)(?:\\s+\\S+\\s+\\S+)?)\\s*";
        output = "$1";
    } else {
        extractor = RegexpLineIterator.NONWHITESPACE_ENTRY_TRAILING_COMMENT;
        output = RegexpLineIterator.ENTRY;
    }

    // Read the input stream line by line, scheduling each extracted URI.
    BufferedReader br = null;
    int addedCount = 0;
    try {
        br = new BufferedReader(new InputStreamReader(is));
        Iterator iter = new RegexpLineIterator(new LineReadingIterator(br),
            RegexpLineIterator.COMMENT_LINE, extractor, output);
        while (iter.hasNext()) {
            try {
                importUri((String)iter.next(), forceRevisit, areSeeds,
                    false);
                addedCount++;
            } catch (URIException e) {
                e.printStackTrace();
            }
        }
        br.close();
        flush();
    } catch (IOException e) {
        e.printStackTrace();
    }
    return addedCount;
}

/**
 * Schedule a URI.
 * @param uri URI to schedule.
 * @param forceFetch Whether it should be force-fetched.
 * @param isSeed True if seed.
 * @throws URIException
 */
public void importUri(final String uri, final boolean forceFetch,
        final boolean isSeed) throws URIException {
    importUri(uri, forceFetch, isSeed, true);
}
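// A minimal sketch (not part of the original class) of what the
// crawl-log extractor above captures: the first three whitespace-
// delimited fields (timestamp, status code, size) are skipped and the
// next three (URI, discovery path, via) are kept, which is the kind of
// crawl.log snippet importUri() accepts. The sample line is hypothetical.
static void crawlLogExtractorSketch() {
    String line = "2007-05-01T12:00:00.000Z 200 1234 "
        + "http://example.com/page.html L http://example.com/ "
        + "text/html #042";
    String extractor =
        "\\S+\\s+\\S+\\s+\\S+\\s+(\\S+\\s+\\S+\\s+\\S+\\s+).*";
    java.util.regex.Matcher m =
        java.util.regex.Pattern.compile(extractor).matcher(line);
    if (m.matches()) {
        // Prints: "http://example.com/page.html L http://example.com/ "
        System.out.println(m.group(1));
    }
}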
/**
 * Schedule a URI.
 * @param str String that can be: 1. a UURI, 2. a snippet of a crawl.log
 * line, or 3. a snippet from a recover log. See
 * {@link #importUris(InputStream, String, boolean)} for how it subparses
 * the lines from crawl.log and recover.log.
 * @param forceFetch Whether it should be force-fetched.
 * @param isSeed True if seed.
 * @param isFlush If true, flush the frontier IF it implements flushing.
 * @throws URIException
 */
public void importUri(final String str, final boolean forceFetch,
        final boolean isSeed, final boolean isFlush) throws URIException {
    CandidateURI caUri = CandidateURI.fromString(str);
    caUri.setForceFetch(forceFetch);
    if (isSeed) {
        caUri.setIsSeed(isSeed);
        if (caUri.getVia() == null || caUri.getVia().length() <= 0) {
            // Danger of double-add of seeds because of this code here.
            // Only call addSeed if no via. If there is a via, the
            // schedule will take care of updating scope.
            this.controller.getScope().addSeed(caUri);
        }
    }
    this.controller.getFrontier().schedule(caUri);
    if (isFlush) {
        flush();
    }
}

/**
 * @return Our mbean info (needed for CrawlJob to qualify as a
 * DynamicMBean).
 */
public MBeanInfo getMBeanInfo() {
    return this.openMBeanInfo;
}

/**
 * Build up the MBean info for Heritrix main.
 * @return Return created mbean info instance.
 * @throws InitializationException
 */
protected OpenMBeanInfoSupport buildMBeanInfo()
        throws InitializationException {
    // Start adding my attributes.
    List<OpenMBeanAttributeInfo> attributes =
        new ArrayList<OpenMBeanAttributeInfo>();

    // Attributes.
    attributes.add(new OpenMBeanAttributeInfoSupport(NAME_ATTR,
        "Crawl job name", SimpleType.STRING, true, false, false));
    attributes.add(new OpenMBeanAttributeInfoSupport(STATUS_ATTR,
        "Short basic status message", SimpleType.STRING, true, false,
        false));
    attributes.add(new OpenMBeanAttributeInfoSupport(
        FRONTIER_SHORT_REPORT_ATTR, "Short frontier report",
        SimpleType.STRING, true, false, false));
    attributes.add(new OpenMBeanAttributeInfoSupport(
        THREADS_SHORT_REPORT_ATTR, "Short threads report",
        SimpleType.STRING, true, false, false));
    attributes.add(new OpenMBeanAttributeInfoSupport(UID_ATTR,
        "Crawl job UID", SimpleType.STRING, true, false, false));
    attributes.add(new OpenMBeanAttributeInfoSupport(TOTAL_DATA_ATTR,
        "Total data received", SimpleType.LONG, true, false, false));
    attributes.add(new OpenMBeanAttributeInfoSupport(CRAWL_TIME_ATTR,
        "Crawl time", SimpleType.LONG, true, false, false));
    attributes.add(new OpenMBeanAttributeInfoSupport(CURRENT_DOC_RATE_ATTR,
        "Current crawling rate (Docs/sec)", SimpleType.DOUBLE, true,
        false, false));
    attributes.add(new OpenMBeanAttributeInfoSupport(CURRENT_KB_RATE_ATTR,
        "Current crawling rate (Kb/sec)", SimpleType.LONG, true, false,
        false));
    attributes.add(new OpenMBeanAttributeInfoSupport(THREAD_COUNT_ATTR,
        "Active thread count", SimpleType.INTEGER, true, false, false));
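    /*
     * A minimal sketch, not part of the original source and kept inside a
     * comment because the listing above is truncated mid-method: how a JMX
     * client might read the attributes declared above. The connector URL,
     * port, and ObjectName domain are assumptions; the attribute names are
     * the *_ATTR constants used above, whose literal values this listing
     * does not show.
     *
     * JMXServiceURL url = new JMXServiceURL(
     *     "service:jmx:rmi:///jndi/rmi://localhost:8849/jmxrmi");
     * JMXConnector jmxc = JMXConnectorFactory.connect(url);
     * MBeanServerConnection mbsc = jmxc.getMBeanServerConnection();
     * for (ObjectName name : mbsc.queryNames(
     *         new ObjectName("org.archive.crawler:*"), null)) {
     *     System.out.println(mbsc.getAttribute(name, NAME_ATTR) + ": "
     *         + mbsc.getAttribute(name, STATUS_ATTR));
     * }
     * jmxc.close();
     */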