CrawlJob.java
 * equal to <code>new File(getSettingsDirectory())</code>.
 * @return the path of the job's base directory.
 */
public File getDirectory() {
    return isProfile ? new File(getSettingsDirectory()) : jobDir;
}

/**
 * Get the error message associated with this job. Returns null if there
 * is no error message.
 * @return the error message associated with this job
 */
public String getErrorMessage() {
    return errorMessage;
}

/**
 * Set an error message for this job. Generally this only occurs if the
 * job is misconfigured.
 * @param string the error message associated with this job
 */
public void setErrorMessage(String string) {
    errorMessage = string;
    writeJobFile(); // Save changes.
}

/**
 * @return Returns the number of journal entries.
 */
public int getNumberOfJournalEntries() {
    return numberOfJournalEntries;
}

/**
 * @param numberOfJournalEntries The number of journal entries to set.
 */
public void setNumberOfJournalEntries(int numberOfJournalEntries) {
    this.numberOfJournalEntries = numberOfJournalEntries;
    writeJobFile();
}

/**
 * @return Returns the error handler for this crawl job.
 */
public CrawlJobErrorHandler getErrorHandler() {
    return errorHandler;
}

/**
 * Read all the checkpoints found in the job's checkpoints directory
 * into Checkpoint instances.
 * @return Collection containing all checkpoints.
 */
public Collection<Checkpoint> scanCheckpoints() {
    File checkpointsDirectory =
        settingsHandler.getOrder().getCheckpointsDirectory();
    File[] perCheckpointDirs = checkpointsDirectory.listFiles();
    Collection<Checkpoint> checkpoints = new ArrayList<Checkpoint>();
    if (perCheckpointDirs != null) {
        for (int i = 0; i < perCheckpointDirs.length; i++) {
            checkpoints.add(new Checkpoint(perCheckpointDirs[i]));
        }
    }
    return checkpoints;
}

/**
 * Returns the absolute path of the specified log.
 * Note: If the crawl has not begun, this file may not exist.
 * @param log
 * @return the absolute path for the specified log.
 * @throws AttributeNotFoundException
 * @throws ReflectionException
 * @throws MBeanException
 */
public String getLogPath(String log)
        throws AttributeNotFoundException, MBeanException,
        ReflectionException {
    String logsPath = (String)settingsHandler.getOrder()
        .getAttribute(CrawlOrder.ATTR_LOGS_PATH);
    CrawlOrder order = settingsHandler.getOrder();
    String diskPath = (String)order.getAttribute(null,
        CrawlOrder.ATTR_DISK_PATH);
    File disk = settingsHandler.getPathRelativeToWorkingDirectory(diskPath);
    File f = new File(logsPath, log);
    if (!f.isAbsolute()) {
        f = new File(disk.getPath(), f.getPath());
    }
    return f.getAbsolutePath();
}

// OpenMBean implementation.

protected void pause() {
    if (this.controller != null && !this.controller.isPaused()) {
        this.controller.requestCrawlPause();
    }
}

protected void resume() {
    if (this.controller != null) {
        this.controller.requestCrawlResume();
    }
}

/**
 * @throws IllegalStateException Thrown if crawl is not paused.
 */
protected void checkpoint() throws IllegalStateException {
    if (this.controller != null) {
        this.controller.requestCrawlCheckpoint();
    }
}

/**
 * @return True if checkpointing.
 */
public boolean isCheckpointing() {
    return this.controller != null && this.controller.isCheckpointing();
}

/**
 * If it's a HostQueuesFrontier, it needs to be flushed for the queued
 * URIs to take effect.
 */
protected void flush() {
    // Nothing to do.
}
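// A minimal sketch (not part of the original class) of the intended
// pause -> checkpoint -> resume sequence using the methods above.
// requestCrawlPause() is asynchronous, so this example polls isPaused()
// before checkpointing; the method name and polling interval are
// illustrative assumptions, not Heritrix API.
protected void checkpointWhenPausedSketch() throws InterruptedException {
    pause();
    // Wait until the controller has actually reached the paused state.
    while (this.controller != null && !this.controller.isPaused()) {
        Thread.sleep(500);
    }
    checkpoint();
    // Wait for the checkpoint to complete before resuming the crawl.
    while (isCheckpointing()) {
        Thread.sleep(500);
    }
    resume();
}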
/**
 * Delete any URIs from the frontier of the current (paused) job that
 * match the specified regular expression. If the current job is not
 * paused (or there is no current job) nothing will be done.
 * @param regexpr Regular expression to delete URIs by.
 * @return the number of URIs deleted
 */
public long deleteURIsFromPending(String regexpr) {
    return (this.controller != null && this.controller.getFrontier() != null
            && this.controller.isPaused())
        ? this.controller.getFrontier().deleteURIs(regexpr)
        : 0;
}

public String importUris(String file, String style, String force) {
    return importUris(file, style, "true".equals(force));
}

public String importUris(final String fileOrUrl, final String style,
        final boolean forceRevisit) {
    return importUris(fileOrUrl, style, forceRevisit, false);
}

/**
 * @param fileOrUrl Name of file w/ seeds.
 * @param style What style of seeds -- crawl log, recovery journal, or
 * seeds file.
 * @param forceRevisit Should we revisit even if seen before?
 * @param areSeeds Is the file exclusively seeds?
 * @return A display string that has a count of all added.
 */
public String importUris(final String fileOrUrl, final String style,
        final boolean forceRevisit, final boolean areSeeds) {
    InputStream is =
        IoUtils.getInputStream(this.controller.getDisk(), fileOrUrl);
    String message = null;
    // Do we have an input stream?
    if (is == null) {
        message = "Failed to get inputstream from " + fileOrUrl;
        logger.severe(message);
    } else {
        int addedCount = importUris(is, style, forceRevisit, areSeeds);
        message = Integer.toString(addedCount) + " URIs added from " +
            fileOrUrl;
    }
    return message;
}

protected int importUris(InputStream is, String style,
        boolean forceRevisit) {
    return importUris(is, style, forceRevisit, false);
}

/**
 * Import URIs.
 * @param is Stream to use as URI source.
 * @param style Style in which URIs are rendered. Currently supported are
 * <code>recoveryJournal</code>, <code>crawlLog</code>, and seeds-file
 * format (i.e. <code>default</code>), where <code>default</code> style is
 * one UURI per line (comments allowed).
 * @param forceRevisit Whether we should revisit this URI even if we've
 * visited it previously.
 * @param areSeeds Are the imported URIs seeds?
 * @return Count of added URIs.
 */
protected int importUris(InputStream is, String style,
        boolean forceRevisit, final boolean areSeeds) {
    // Figure the regex to use parsing each line of the input stream.
    String extractor;
    String output;
    if (CRAWL_LOG_STYLE.equals(style)) {
        // Skip the first three fields.
        extractor = "\\S+\\s+\\S+\\s+\\S+\\s+(\\S+\\s+\\S+\\s+\\S+\\s+).*";
        output = "$1";
    } else if (RECOVERY_JOURNAL_STYLE.equals(style)) {
        // Skip the begin-of-line directive.
        extractor = "\\S+\\s+((\\S+)(?:\\s+\\S+\\s+\\S+)?)\\s*";
        output = "$1";
    } else {
        extractor = RegexpLineIterator.NONWHITESPACE_ENTRY_TRAILING_COMMENT;
        output = RegexpLineIterator.ENTRY;
    }

    // Read the input stream line by line, scheduling each extracted URI.
    BufferedReader br = null;
    int addedCount = 0;
    try {
        br = new BufferedReader(new InputStreamReader(is));
        Iterator iter = new RegexpLineIterator(new LineReadingIterator(br),
            RegexpLineIterator.COMMENT_LINE, extractor, output);
        while (iter.hasNext()) {
            try {
                importUri((String)iter.next(), forceRevisit, areSeeds,
                    false);
                addedCount++;
            } catch (URIException e) {
                e.printStackTrace();
            }
        }
        br.close();
        flush();
    } catch (IOException e) {
        e.printStackTrace();
    }
    return addedCount;
}

/**
 * Schedule a URI.
 * @param uri URI to schedule.
 * @param forceFetch Whether it should be force-fetched.
 * @param isSeed True if seed.
 * @throws URIException
 */
public void importUri(final String uri, final boolean forceFetch,
        final boolean isSeed) throws URIException {
    importUri(uri, forceFetch, isSeed, true);
}
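// A minimal sketch (not part of the original class) of what the
// crawl-log extractor above captures: the first three whitespace-
// delimited fields (timestamp, status code, size) are skipped and the
// next three (URI, discovery path, via) are kept, which is the kind of
// crawl.log snippet importUri() accepts. The sample line is hypothetical.
static void crawlLogExtractorSketch() {
    String line = "2007-05-01T12:00:00.000Z 200 1234 "
        + "http://example.com/page.html L http://example.com/ "
        + "text/html #042";
    String extractor =
        "\\S+\\s+\\S+\\s+\\S+\\s+(\\S+\\s+\\S+\\s+\\S+\\s+).*";
    java.util.regex.Matcher m =
        java.util.regex.Pattern.compile(extractor).matcher(line);
    if (m.matches()) {
        // Prints: "http://example.com/page.html L http://example.com/ "
        System.out.println(m.group(1));
    }
}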
/**
 * Schedule a URI.
 * @param str String that can be: 1. a UURI, 2. a snippet of a crawl.log
 * line, or 3. a snippet from a recover log. See
 * {@link #importUris(InputStream, String, boolean)} for how it subparses
 * the lines from crawl.log and recover.log.
 * @param forceFetch Whether it should be force-fetched.
 * @param isSeed True if seed.
 * @param isFlush If true, flush the frontier IF it implements flushing.
 * @throws URIException
 */
public void importUri(final String str, final boolean forceFetch,
        final boolean isSeed, final boolean isFlush) throws URIException {
    CandidateURI caUri = CandidateURI.fromString(str);
    caUri.setForceFetch(forceFetch);
    if (isSeed) {
        caUri.setIsSeed(isSeed);
        if (caUri.getVia() == null || caUri.getVia().length() <= 0) {
            // Danger of double-add of seeds because of this code here.
            // Only call addSeed if no via. If there is a via, the
            // schedule will take care of updating scope.
            this.controller.getScope().addSeed(caUri);
        }
    }
    this.controller.getFrontier().schedule(caUri);
    if (isFlush) {
        flush();
    }
}

/**
 * @return Our mbean info (needed for CrawlJob to qualify as a
 * DynamicMBean).
 */
public MBeanInfo getMBeanInfo() {
    return this.openMBeanInfo;
}

/**
 * Build up the MBean info for Heritrix main.
 * @return Return created mbean info instance.
 * @throws InitializationException
 */
protected OpenMBeanInfoSupport buildMBeanInfo()
        throws InitializationException {
    // Start adding my attributes.
    List<OpenMBeanAttributeInfo> attributes =
        new ArrayList<OpenMBeanAttributeInfo>();

    // Attributes.
    attributes.add(new OpenMBeanAttributeInfoSupport(NAME_ATTR,
        "Crawl job name", SimpleType.STRING, true, false, false));
    attributes.add(new OpenMBeanAttributeInfoSupport(STATUS_ATTR,
        "Short basic status message", SimpleType.STRING, true, false,
        false));
    attributes.add(new OpenMBeanAttributeInfoSupport(
        FRONTIER_SHORT_REPORT_ATTR, "Short frontier report",
        SimpleType.STRING, true, false, false));
    attributes.add(new OpenMBeanAttributeInfoSupport(
        THREADS_SHORT_REPORT_ATTR, "Short threads report",
        SimpleType.STRING, true, false, false));
    attributes.add(new OpenMBeanAttributeInfoSupport(UID_ATTR,
        "Crawl job UID", SimpleType.STRING, true, false, false));
    attributes.add(new OpenMBeanAttributeInfoSupport(TOTAL_DATA_ATTR,
        "Total data received", SimpleType.LONG, true, false, false));
    attributes.add(new OpenMBeanAttributeInfoSupport(CRAWL_TIME_ATTR,
        "Crawl time", SimpleType.LONG, true, false, false));
    attributes.add(new OpenMBeanAttributeInfoSupport(CURRENT_DOC_RATE_ATTR,
        "Current crawling rate (Docs/sec)", SimpleType.DOUBLE, true,
        false, false));
    attributes.add(new OpenMBeanAttributeInfoSupport(CURRENT_KB_RATE_ATTR,
        "Current crawling rate (Kb/sec)", SimpleType.LONG, true, false,
        false));
    attributes.add(new OpenMBeanAttributeInfoSupport(THREAD_COUNT_ATTR,
        "Active thread count", SimpleType.INTEGER, true, false, false));
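    /*
     * A minimal sketch, not part of the original source and kept inside a
     * comment because the listing above is truncated mid-method: how a JMX
     * client might read the attributes declared above. The connector URL,
     * port, and ObjectName domain are assumptions; the attribute names are
     * the *_ATTR constants used above, whose literal values this listing
     * does not show.
     *
     * JMXServiceURL url = new JMXServiceURL(
     *     "service:jmx:rmi:///jndi/rmi://localhost:8849/jmxrmi");
     * JMXConnector jmxc = JMXConnectorFactory.connect(url);
     * MBeanServerConnection mbsc = jmxc.getMBeanServerConnection();
     * for (ObjectName name : mbsc.queryNames(
     *         new ObjectName("org.archive.crawler:*"), null)) {
     *     System.out.println(mbsc.getAttribute(name, NAME_ATTR) + ": "
     *         + mbsc.getAttribute(name, STATUS_ATTR));
     * }
     * jmxc.close();
     */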