📄 CrawlOrder.java
"Note, you must preserve the '+' before the 'http'.", "Mozilla/5.0 (compatible; heritrix/@VERSION@ +PROJECT_URL_HERE)")); e = httpHeaders.addElementToDefinition(new SimpleType(ATTR_FROM, "Contact information. This field must contain a valid " + "e-mail address for the person or organization responsible" + "for this crawl: e.g. 'webmaster@loc.gov'", "CONTACT_EMAIL_ADDRESS_HERE")); addElementToDefinition(new RobotsHonoringPolicy()); e = addElementToDefinition(new ModuleType( Frontier.ATTR_NAME, "Frontier")); e.setLegalValueType(Frontier.class); e = (MapType) addElementToDefinition(new MapType(ATTR_RULES, "Ordered list of url canonicalization rules. " + "Rules are applied in the order listed from top to bottom.", BaseRule.class)); e.setOverrideable(true); e.setExpertSetting(true); e = addElementToDefinition(new MapType( ATTR_PRE_FETCH_PROCESSORS, "Processors to run prior to" + " fetching anything from the network.", Processor.class)); e.setOverrideable(false); e = addElementToDefinition(new MapType( ATTR_FETCH_PROCESSORS, "Processors that fetch documents." , Processor.class)); e.setOverrideable(false); e = addElementToDefinition(new MapType( ATTR_EXTRACT_PROCESSORS, "Processors that extract new URIs" + " from fetched documents.", Processor.class)); e.setOverrideable(false); e = addElementToDefinition(new MapType( ATTR_WRITE_PROCESSORS, "Processors that write documents" + " to archives.", Processor.class)); e.setOverrideable(false); e = addElementToDefinition(new MapType( ATTR_POST_PROCESSORS, "Processors that do cleanup and feed" + " the frontier with new URIs.", Processor.class)); e.setOverrideable(false); loggers = (MapType) addElementToDefinition(new MapType(ATTR_LOGGERS, "Statistics tracking modules. Any number of specialized " + "statistics tracker that monitor a crawl and write logs, " + "reports and/or provide information to the user interface.")); e = addElementToDefinition(new SimpleType(ATTR_RECOVER_PATH, "Optional. Points at recover log (or recover.gz log) OR " + "the checkpoint directory to use recovering a crawl.", "")); e.setOverrideable(false); e.setExpertSetting(true); e = addElementToDefinition(new SimpleType( ATTR_CHECKPOINT_COPY_BDBJE_LOGS, "When true, on a checkpoint, we copy off the bdbje log files to " + "the checkpoint directory. To recover a checkpoint, just " + "set the " + ATTR_RECOVER_PATH + " to point at the checkpoint " + "directory to recover. This is default setting. " + "But if crawl is large, " + "copying bdbje log files can take tens of minutes and even " + "upwards of an hour (Copying bdbje log files will consume bulk " + "of time checkpointing). If this setting is false, we do NOT copy " + "bdbje logs on checkpoint AND we set bdbje to NEVER delete log " + "files (instead we have it rename files-to-delete with a '.del'" + "extension). Assumption is that when this setting is false, " + "an external process is managing the removal of bdbje log files " + "and that come time to recover from a checkpoint, the files that " + "comprise a checkpoint are manually assembled. This is an expert " + "setting.", DEFAULT_CHECKPOINT_COPY_BDBJE_LOGS)); e.setOverrideable(false); e.setExpertSetting(true); e = addElementToDefinition(new SimpleType(ATTR_RECOVER_RETAIN_FAILURES, "When recovering via the recover.log, should failures " + "in the log be retained in the recovered crawl, " + "preventing the corresponding URIs from being retried. 
" + "Default is false, meaning failures are forgotten, and " + "the corresponding URIs will be retried in the recovered " + "crawl.", Boolean.FALSE)); e.setOverrideable(false); e.setExpertSetting(true); e = addElementToDefinition( new CredentialStore(CredentialStore.ATTR_NAME)); e.setOverrideable(true); e.setExpertSetting(true); } /** * @param curi * @return user-agent header value to use */ public String getUserAgent(CrawlURI curi) { return ((String) httpHeaders.getUncheckedAttribute(curi, ATTR_USER_AGENT)); } /** * @param curi * @return from header value to use */ public String getFrom(CrawlURI curi) { String res = null; try { res = (String) httpHeaders.getAttribute(ATTR_FROM, curi); } catch (AttributeNotFoundException e) { logger.severe(e.getMessage()); } return res; } /** * Returns the set number of maximum toe threads. * @return Number of maximum toe threads */ public int getMaxToes() { Integer res = null; try { res = (Integer) getAttribute(null, ATTR_MAX_TOE_THREADS); } catch (AttributeNotFoundException e) { logger.severe(e.getMessage()); } return res.intValue(); } /** * This method gets the RobotsHonoringPolicy object from the orders file. * * @return the new RobotsHonoringPolicy */ public RobotsHonoringPolicy getRobotsHonoringPolicy() { try { return (RobotsHonoringPolicy) getAttribute(null, RobotsHonoringPolicy.ATTR_NAME); } catch (AttributeNotFoundException e) { logger.severe(e.getMessage()); return null; } } /** Get the name of the order file. * * @return the name of the order file. */ public String getCrawlOrderName() { return getSettingsHandler().getSettingsObject(null).getName(); } /** * @return The crawl controller. */ public CrawlController getController() { return controller; } /** * @param controller */ public void setController(CrawlController controller) { this.controller = controller; } /** * Returns the Map of the StatisticsTracking modules that are included in the * configuration that the current instance of this class is representing. * @return Map of the StatisticsTracking modules */ public MapType getLoggers() { return loggers; } /** * Checks if the User Agent and From field are set 'correctly' in * the specified Crawl Order. * * @throws FatalConfigurationException */ public void checkUserAgentAndFrom() throws FatalConfigurationException { // don't start the crawl if they're using the default user-agent String userAgent = this.getUserAgent(null); String from = this.getFrom(null); if (!(userAgent.matches(ACCEPTABLE_USER_AGENT) && from.matches(ACCEPTABLE_FROM))) { throw new FatalConfigurationException("unacceptable user-agent " + " or from (Reedit your order file)."); } } /** * @return Checkpoint directory. */ public File getCheckpointsDirectory() { try { return getDirectoryRelativeToDiskPath((String) getAttribute(null, CrawlOrder.ATTR_CHECKPOINTS_PATH)); } catch (AttributeNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); return null; } } private File getDirectoryRelativeToDiskPath(String subpath) { File disk; try { disk = getSettingsHandler().getPathRelativeToWorkingDirectory( (String) getAttribute(null, CrawlOrder.ATTR_DISK_PATH)); return new File(disk, subpath); } catch (AttributeNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); return null; } } /** * Return fullpath to the directory named by <code>key</code> * in settings. * If directory does not exist, it and all intermediary dirs * will be created. * @param key Key to use going to settings. * @return Full path to directory named by <code>key</code>. 
* @throws AttributeNotFoundException */ public File getSettingsDir(String key) throws AttributeNotFoundException { String path = (String)getAttribute(null, key); File f = new File(path); if (!f.isAbsolute()) { f = getDirectoryRelativeToDiskPath(path); } if (!f.exists()) { f.mkdirs(); } return f; } }
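
The checkUserAgentAndFrom() method above is what stops a crawl from starting while the order file still contains the template placeholders for the user-agent and contact address. Below is a minimal, self-contained sketch of that kind of check using plain java.util.regex. The class name and the two patterns are assumptions for illustration only: they approximate, but are not copied from, the ACCEPTABLE_USER_AGENT and ACCEPTABLE_FROM constants that CrawlOrder defines elsewhere and that are not visible in this listing.

import java.util.regex.Pattern;

/**
 * Standalone sketch of the user-agent / from validation idea behind
 * CrawlOrder.checkUserAgentAndFrom(). The regexes below are assumed shapes,
 * not the actual Heritrix constants.
 */
public class HeaderCheckSketch {

    // Assumed shape: the user-agent must carry a "+http..." project URL in parentheses.
    private static final Pattern USER_AGENT_OK =
        Pattern.compile("\\S+.*\\(.*\\+http(s)?://\\S+\\.\\S+.*\\).*");

    // Assumed shape: the 'from' field must look like an e-mail address.
    private static final Pattern FROM_OK =
        Pattern.compile("\\S+@\\S+\\.\\S+");

    static void check(String userAgent, String from) {
        if (!(USER_AGENT_OK.matcher(userAgent).matches()
                && FROM_OK.matcher(from).matches())) {
            // CrawlOrder throws FatalConfigurationException at this point instead.
            throw new IllegalStateException(
                "unacceptable user-agent or from (re-edit your order file)");
        }
    }

    public static void main(String[] args) {
        // Would fail: these are still the template placeholders from the order file.
        // check("Mozilla/5.0 (compatible; heritrix/1.14 +PROJECT_URL_HERE)",
        //       "CONTACT_EMAIL_ADDRESS_HERE");

        // Passes: project URL and contact address have been filled in.
        check("Mozilla/5.0 (compatible; heritrix/1.14 +http://example.org/crawl)",
              "webmaster@example.org");
        System.out.println("user-agent and from look acceptable");
    }
}

The practical point matches the doc strings in the constructor: the user-agent should advertise a '+http...' project URL (the '+' must be preserved) and the from field should be a real contact e-mail, otherwise configuration fails before any URI is fetched.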