📄 CrawlOrder.java
/*
 * CrawlOrder
 *
 * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/java/org/archive/crawler/datamodel/CrawlOrder.java,v 1.57 2006/08/11 05:29:08 gojomo Exp $
 *
 * Created on May 15, 2003
 *
 * Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
package org.archive.crawler.datamodel;

import java.io.File;
import java.io.Serializable;
import java.util.logging.Logger;

import javax.management.AttributeNotFoundException;
import javax.management.MBeanException;
import javax.management.ReflectionException;

import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.CrawlScope;
import org.archive.crawler.framework.Frontier;
import org.archive.crawler.framework.Processor;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;
import org.archive.crawler.settings.MapType;
import org.archive.crawler.settings.ModuleType;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;
import org.archive.crawler.url.canonicalize.BaseRule;

/**
 * Represents the 'root' of the settings hierarchy. Contains those settings
 * that do not belong to any specific module, but rather relate to the crawl
 * as a whole (much of this is used by the CrawlController directly or
 * indirectly).
 *
 * @see org.archive.crawler.settings.ModuleType
 */
public class CrawlOrder extends ModuleType implements Serializable {
    private static Logger logger =
        Logger.getLogger("org.archive.crawler.datamodel.CrawlOrder");

    public static final String ATTR_NAME = "crawl-order";
    public static final String ATTR_SETTINGS_DIRECTORY = "settings-directory";
    public static final String ATTR_DISK_PATH = "disk-path";
    public static final String ATTR_LOGS_PATH = "logs-path";
    public static final String ATTR_CHECKPOINTS_PATH = "checkpoints-path";
    public static final String ATTR_STATE_PATH = "state-path";
    public static final String ATTR_SCRATCH_PATH = "scratch-path";
    public static final String ATTR_RECOVER_PATH = "recover-path";
    public static final String ATTR_RECOVER_RETAIN_FAILURES =
        "recover-retain-failures";
    public static final String ATTR_MAX_BYTES_DOWNLOAD = "max-bytes-download";
    public static final String ATTR_MAX_DOCUMENT_DOWNLOAD =
        "max-document-download";
    public static final String ATTR_MAX_TIME_SEC = "max-time-sec";
    public static final String ATTR_MAX_TOE_THREADS = "max-toe-threads";
    public static final String ATTR_HTTP_HEADERS = "http-headers";
    public static final String ATTR_USER_AGENT = "user-agent";
    public static final String ATTR_FROM = "from";
    public static final String ATTR_PRE_FETCH_PROCESSORS =
        "pre-fetch-processors";
    public static final String ATTR_FETCH_PROCESSORS = "fetch-processors";
    public static final String ATTR_EXTRACT_PROCESSORS = "extract-processors";
    public static final String ATTR_WRITE_PROCESSORS = "write-processors";
    public static final String ATTR_POST_PROCESSORS = "post-processors";
    public static final String ATTR_LOGGERS = "loggers";
    public static final String ATTR_RULES = "uri-canonicalization-rules";
    public static final String ATTR_RECORDER_OUT_BUFFER =
        "recorder-out-buffer-bytes";
    public static final String ATTR_RECORDER_IN_BUFFER =
        "recorder-in-buffer-bytes";

    /** Percentage of heap to allocate to bdb cache */
    public static final String ATTR_BDB_CACHE_PERCENT = "bdb-cache-percent";

    /**
     * When checkpointing, copy the bdb logs.
     * Default is true. If false, then we do not copy logs on checkpoint AND
     * we tell bdbje never to delete log files; instead it renames
     * files-to-delete with a '.del' extension. Assumption is that when this
     * setting is false, an external process is managing the removing of
     * bdbje log files and that come time to recover from a checkpoint, the
     * files that comprise a checkpoint are manually assembled.
     */
    public static final String ATTR_CHECKPOINT_COPY_BDBJE_LOGS =
        "checkpoint-copy-bdbje-logs";
    public static final Boolean DEFAULT_CHECKPOINT_COPY_BDBJE_LOGS =
        Boolean.TRUE;

    /**
     * Default size of bdb cache.
     */
    private final static Integer DEFAULT_BDB_CACHE_PERCENT = new Integer(0);

    private transient String caseFlattenedUserAgent;
    private transient MapType httpHeaders;
    private transient MapType loggers;
    private transient CrawlController controller;

    /**
     * Regex for acceptable user-agent format.
     */
    private static String ACCEPTABLE_USER_AGENT =
        "\\S+.*\\(.*\\+http(s)?://\\S+\\.\\S+.*\\).*";

    /**
     * Regex for acceptable from address.
     */
    private static String ACCEPTABLE_FROM = "\\S+@\\S+\\.\\S+";

    /** Construct a CrawlOrder. */
    public CrawlOrder() {
        super(ATTR_NAME, "Heritrix crawl order. This forms the root of " +
            "the settings framework.");
        Type e;

        e = addElementToDefinition(new SimpleType(ATTR_SETTINGS_DIRECTORY,
            "Directory where override settings are kept. The settings " +
            "for many modules can be overridden based on the domain or " +
            "subdomain of the URI being processed. This setting specifies" +
            " a file level directory to store those settings. The path" +
            " is relative to 'disk-path' unless" +
            " an absolute path is provided.", "settings"));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_DISK_PATH,
            "Directory where logs, arcs and other run time files will " +
            "be kept. If this path is a relative path, it will be " +
            "relative to the crawl order.", ""));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_LOGS_PATH,
            "Directory where crawler log files will be kept. If this path " +
            "is a relative path, it will be relative to the 'disk-path'.",
            "logs"));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_CHECKPOINTS_PATH,
            "Directory where crawler checkpoint files will be kept. " +
            "If this path " +
            "is a relative path, it will be relative to the 'disk-path'.",
            "checkpoints"));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_STATE_PATH,
            "Directory where crawler-state files will be kept. If this path " +
            "is a relative path, it will be relative to the 'disk-path'.",
            "state"));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_SCRATCH_PATH,
            "Directory where discardable temporary files will be kept. " +
            "If this path " +
            "is a relative path, it will be relative to the 'disk-path'.",
            "scratch"));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_MAX_BYTES_DOWNLOAD,
            "Maximum number of bytes to download. Once this number is" +
            " exceeded the crawler will stop. " +
            "A value of zero means no upper limit.", new Long(0)));
        e.setOverrideable(false);

        e = addElementToDefinition(new SimpleType(ATTR_MAX_DOCUMENT_DOWNLOAD,
            "Maximum number of documents to download. Once this number" +
            " is exceeded the crawler will stop. " +
            "A value of zero means no upper limit.", new Long(0)));
        e.setOverrideable(false);

        e = addElementToDefinition(new SimpleType(ATTR_MAX_TIME_SEC,
            "Maximum amount of time to crawl (in seconds). Once this" +
            " much time has elapsed the crawler will stop. A value of" +
            " zero means no upper limit.", new Long(0)));
        e.setOverrideable(false);

        e = addElementToDefinition(new SimpleType(ATTR_MAX_TOE_THREADS,
            "Maximum number of threads processing URIs at the same time.",
            new Integer(100)));
        e.setOverrideable(false);

        e = addElementToDefinition(new SimpleType(ATTR_RECORDER_OUT_BUFFER,
            "Size in bytes of in-memory buffer to record outbound " +
            "traffic. One such buffer is reserved for every ToeThread.",
            new Integer(4096)));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_RECORDER_IN_BUFFER,
            "Size in bytes of in-memory buffer to record inbound " +
            "traffic. One such buffer is reserved for every ToeThread.",
            new Integer(65536)));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_BDB_CACHE_PERCENT,
            "Percentage of heap to allocate to BerkeleyDB JE cache. " +
            "Default of zero means no preference (accept BDB's default, " +
            "usually 60%, or the je.maxMemoryPercent property value).",
            DEFAULT_BDB_CACHE_PERCENT));
        e.setExpertSetting(true);
        e.setOverrideable(false);

        addElementToDefinition(new CrawlScope());

        httpHeaders = (MapType) addElementToDefinition(new MapType(
            ATTR_HTTP_HEADERS, "HTTP headers. Information that will " +
            "be used when constructing the HTTP headers of " +
            "the crawler's HTTP requests."));

        e = httpHeaders.addElementToDefinition(new SimpleType(ATTR_USER_AGENT,
            "User agent to act as. Field must contain valid URL " +
            "that links to website of person or organization " +
            "running the crawl. Replace 'PROJECT_URL_HERE' in " +
            "initial template. E.g. If organization " +
            "is Library of Congress, a valid user agent would be:" +
            "'Mozilla/5.0 (compatible; loc-crawler/0.11.0 " +
            "+http://loc.gov)'. " +
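The 'bdb-cache-percent' description says a value of zero means "no preference", in which case BerkeleyDB JE typically uses 60% of the heap (or whatever je.maxMemoryPercent is set to). A rough sketch of how such a percent-of-heap setting translates into a byte budget; the helper and its fixed 60% fallback are assumptions for illustration, not Heritrix or BDB JE API:

/** Illustrative only; not Heritrix or BDB JE code. */
public class CachePercentDemo {
    /**
     * Translate a 'bdb-cache-percent' style value into bytes.
     * Zero means "no preference"; per the setting's description,
     * BDB JE then usually takes 60% of the JVM heap (this sketch
     * ignores the je.maxMemoryPercent override for simplicity).
     */
    static long cacheBytes(int percent) {
        int effective = (percent == 0) ? 60 : percent;
        return Runtime.getRuntime().maxMemory() / 100 * effective;
    }

    public static void main(String[] args) {
        long heap = Runtime.getRuntime().maxMemory();
        System.out.printf(
            "heap=%d bytes; cache at default (0) = %d bytes; at 25%% = %d bytes%n",
            heap, cacheBytes(0), cacheBytes(25));
    }
}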