
crawlorder.java

Heritrix is an open-source, extensible web crawler project. Heritrix is designed to strictly obey the exclusion directives in robots.txt files and in META robots tags.
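The exclusion rule that description refers to is easy to illustrate. The sketch below is not Heritrix's own implementation (Heritrix's behavior is governed by the RobotsHonoringPolicy registered in the listing that follows); it is a deliberately simplified stand-in: collect the Disallow prefixes for a matching User-agent group and refuse any path that starts with one of them.

import java.util.ArrayList;
import java.util.List;

// Minimal, simplified robots.txt exclusion check (illustration only;
// ignores comments, Allow lines, and multi-agent group subtleties).
public class RobotsExclusionSketch {
    // Disallow path prefixes collected for the matching User-agent group.
    private final List<String> disallowed = new ArrayList<>();

    // Parse a robots.txt body, keeping rules for "*" or the given agent.
    public RobotsExclusionSketch(String robotsTxt, String agent) {
        boolean applies = false;
        for (String line : robotsTxt.split("\n")) {
            line = line.trim();
            int colon = line.indexOf(':');
            if (colon < 0) continue;
            String field = line.substring(0, colon).trim().toLowerCase();
            String value = line.substring(colon + 1).trim();
            if (field.equals("user-agent")) {
                applies = value.equals("*")
                        || agent.toLowerCase().contains(value.toLowerCase());
            } else if (applies && field.equals("disallow") && !value.isEmpty()) {
                disallowed.add(value);
            }
        }
    }

    // A path is fetchable unless it starts with a disallowed prefix.
    public boolean allows(String path) {
        for (String prefix : disallowed) {
            if (path.startsWith(prefix)) return false;
        }
        return true;
    }

    public static void main(String[] args) {
        String robots = "User-agent: *\nDisallow: /private/\n";
        RobotsExclusionSketch r = new RobotsExclusionSketch(robots, "heritrix");
        System.out.println(r.allows("/index.html"));  // true
        System.out.println(r.allows("/private/x"));   // false
    }
}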
                "Note, you must preserve the '+' before the 'http'.",          "Mozilla/5.0 (compatible; heritrix/@VERSION@ +PROJECT_URL_HERE)"));        e = httpHeaders.addElementToDefinition(new SimpleType(ATTR_FROM,                "Contact information. This field must contain a valid " +                "e-mail address for the person or organization responsible" +                "for this crawl: e.g. 'webmaster@loc.gov'",                "CONTACT_EMAIL_ADDRESS_HERE"));        addElementToDefinition(new RobotsHonoringPolicy());        e = addElementToDefinition(new ModuleType(                Frontier.ATTR_NAME, "Frontier"));        e.setLegalValueType(Frontier.class);        e = (MapType) addElementToDefinition(new MapType(ATTR_RULES,            "Ordered list of url canonicalization rules. " +            "Rules are applied in the order listed from top to bottom.",            BaseRule.class));        e.setOverrideable(true);        e.setExpertSetting(true);                e = addElementToDefinition(new MapType(                ATTR_PRE_FETCH_PROCESSORS, "Processors to run prior to" +                        " fetching anything from the network.",                        Processor.class));        e.setOverrideable(false);        e = addElementToDefinition(new MapType(                ATTR_FETCH_PROCESSORS, "Processors that fetch documents."                , Processor.class));        e.setOverrideable(false);        e = addElementToDefinition(new MapType(                ATTR_EXTRACT_PROCESSORS, "Processors that extract new URIs" +                        " from fetched documents.", Processor.class));        e.setOverrideable(false);        e = addElementToDefinition(new MapType(                ATTR_WRITE_PROCESSORS, "Processors that write documents" +                        " to archives.", Processor.class));        e.setOverrideable(false);        e = addElementToDefinition(new MapType(                ATTR_POST_PROCESSORS, "Processors that do cleanup and feed" +                        " the frontier with new URIs.", Processor.class));        e.setOverrideable(false);        loggers = (MapType) addElementToDefinition(new MapType(ATTR_LOGGERS,                "Statistics tracking modules. Any number of specialized " +                "statistics tracker that monitor a crawl and write logs, " +                "reports and/or provide information to the user interface."));        e = addElementToDefinition(new SimpleType(ATTR_RECOVER_PATH,                "Optional. Points at recover log (or recover.gz log) OR " +                "the checkpoint directory to use recovering a crawl.", ""));        e.setOverrideable(false);        e.setExpertSetting(true);                e = addElementToDefinition(new SimpleType(            ATTR_CHECKPOINT_COPY_BDBJE_LOGS,            "When true, on a checkpoint, we copy off the bdbje log files to " +            "the checkpoint directory. To recover a checkpoint, just " +            "set the " + ATTR_RECOVER_PATH + " to point at the checkpoint " +            "directory to recover.  This is default setting. " +            "But if crawl is large, " +            "copying bdbje log files can take tens of minutes and even " +            "upwards of an hour (Copying bdbje log files will consume bulk " +            "of time checkpointing). If this setting is false, we do NOT copy " +            "bdbje logs on checkpoint AND we set bdbje to NEVER delete log " +            "files (instead we have it rename files-to-delete with a '.del'" +            "extension). 
Assumption is that when this setting is false, " +            "an external process is managing the removal of bdbje log files " +            "and that come time to recover from a checkpoint, the files that " +            "comprise a checkpoint are manually assembled. This is an expert " +            "setting.",            DEFAULT_CHECKPOINT_COPY_BDBJE_LOGS));        e.setOverrideable(false);        e.setExpertSetting(true);        e = addElementToDefinition(new SimpleType(ATTR_RECOVER_RETAIN_FAILURES,                "When recovering via the recover.log, should failures " +                "in the log be retained in the recovered crawl, " +                "preventing the corresponding URIs from being retried. " +                "Default is false, meaning failures are forgotten, and " +                "the corresponding URIs will be retried in the recovered " +                "crawl.", Boolean.FALSE));        e.setOverrideable(false);        e.setExpertSetting(true);                e = addElementToDefinition(           new CredentialStore(CredentialStore.ATTR_NAME));        e.setOverrideable(true);        e.setExpertSetting(true);    }    /**     * @param curi     * @return user-agent header value to use     */    public String getUserAgent(CrawlURI curi) {        return ((String) httpHeaders.getUncheckedAttribute(curi, ATTR_USER_AGENT));    }    /**     * @param curi     * @return from header value to use     */    public String getFrom(CrawlURI curi) {        String res = null;        try {            res = (String) httpHeaders.getAttribute(ATTR_FROM, curi);        } catch (AttributeNotFoundException e) {            logger.severe(e.getMessage());        }        return res;    }    /**     * Returns the set number of maximum toe threads.     * @return Number of maximum toe threads     */    public int getMaxToes() {        Integer res = null;        try {            res = (Integer) getAttribute(null, ATTR_MAX_TOE_THREADS);        } catch (AttributeNotFoundException e) {            logger.severe(e.getMessage());        }        return res.intValue();    }    /**     * This method gets the RobotsHonoringPolicy object from the orders file.     *     * @return the new RobotsHonoringPolicy     */    public RobotsHonoringPolicy getRobotsHonoringPolicy() {        try {            return (RobotsHonoringPolicy) getAttribute(null, RobotsHonoringPolicy.ATTR_NAME);        } catch (AttributeNotFoundException e) {            logger.severe(e.getMessage());            return null;        }     }    /** Get the name of the order file.     *     * @return the name of the order file.     */    public String getCrawlOrderName() {        return getSettingsHandler().getSettingsObject(null).getName();    }    /**     * @return The crawl controller.     */    public CrawlController getController() {        return controller;    }    /**     * @param controller     */    public void setController(CrawlController controller) {        this.controller = controller;    }    /**     * Returns the Map of the StatisticsTracking modules that are included in the     * configuration that the current instance of this class is representing.     * @return Map of the StatisticsTracking modules     */    public MapType getLoggers() {        return loggers;    }    /**     * Checks if the User Agent and From field are set 'correctly' in     * the specified Crawl Order.     
*     * @throws FatalConfigurationException     */    public void checkUserAgentAndFrom() throws FatalConfigurationException {        // don't start the crawl if they're using the default user-agent        String userAgent = this.getUserAgent(null);        String from = this.getFrom(null);        if (!(userAgent.matches(ACCEPTABLE_USER_AGENT)            && from.matches(ACCEPTABLE_FROM))) {            throw new FatalConfigurationException("unacceptable user-agent " +                    " or from (Reedit your order file).");        }    }    /**     * @return Checkpoint directory.     */    public File getCheckpointsDirectory() {        try {            return getDirectoryRelativeToDiskPath((String) getAttribute(null,                    CrawlOrder.ATTR_CHECKPOINTS_PATH));        } catch (AttributeNotFoundException e) {            // TODO Auto-generated catch block            e.printStackTrace();            return null;        }    }    private File getDirectoryRelativeToDiskPath(String subpath) {        File disk;        try {            disk = getSettingsHandler().getPathRelativeToWorkingDirectory(                    (String) getAttribute(null, CrawlOrder.ATTR_DISK_PATH));            return new File(disk, subpath);        } catch (AttributeNotFoundException e) {            // TODO Auto-generated catch block            e.printStackTrace();            return null;        }    }        /**     * Return fullpath to the directory named by <code>key</code>     * in settings.     * If directory does not exist, it and all intermediary dirs     * will be created.     * @param key Key to use going to settings.     * @return Full path to directory named by <code>key</code>.     * @throws AttributeNotFoundException     */    public File getSettingsDir(String key)    throws AttributeNotFoundException {        String path = (String)getAttribute(null, key);        File f = new File(path);        if (!f.isAbsolute()) {            f = getDirectoryRelativeToDiskPath(path);        }        if (!f.exists()) {            f.mkdirs();        }        return f;    }        }
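The two constants referenced by checkUserAgentAndFrom(), ACCEPTABLE_USER_AGENT and ACCEPTABLE_FROM, are declared in the part of CrawlOrder not shown on this page. The standalone sketch below reproduces the shape of that check under assumed values: the regex patterns, class name, and sample strings are illustrative stand-ins, not Heritrix's actual constants. The idea is that the user-agent must carry a '+'-prefixed project URL in parentheses, and the from field must look like an e-mail address; otherwise the crawl is refused before it starts.

// Standalone sketch of the validation performed by checkUserAgentAndFrom().
// The regex values below are assumptions for illustration only.
public class UserAgentCheckSketch {
    // Hypothetical: user-agent must include "(... +http(s)://host.tld ...)".
    static final String ACCEPTABLE_USER_AGENT =
        "\\S+.*\\(.*\\+http(s)?://\\S+\\.\\S+.*\\).*";
    // Hypothetical: from must look like an e-mail address.
    static final String ACCEPTABLE_FROM = "\\S+@\\S+\\.\\S+";

    static void check(String userAgent, String from) {
        if (!(userAgent.matches(ACCEPTABLE_USER_AGENT)
                && from.matches(ACCEPTABLE_FROM))) {
            throw new IllegalStateException(
                "unacceptable user-agent or from (re-edit your order file)");
        }
    }

    public static void main(String[] args) {
        // Rejected: the from field still contains the template placeholder.
        try {
            check("Mozilla/5.0 (compatible; heritrix/1.14 +http://example.org)",
                  "CONTACT_EMAIL_ADDRESS_HERE");
        } catch (IllegalStateException e) {
            System.out.println("rejected: " + e.getMessage());
        }
        // Accepted: project URL after '+' and a real-looking e-mail address.
        check("Mozilla/5.0 (compatible; heritrix/1.14 +http://example.org)",
              "webmaster@example.org");
        System.out.println("accepted");
    }
}

This mirrors why the default order file refuses to run unmodified: the shipped template values ("PROJECT_URL_HERE", "CONTACT_EMAIL_ADDRESS_HERE") deliberately fail the patterns until the operator supplies real contact information.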
