CrawlController.java
/* Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * CrawlController.java
 * Created on May 14, 2003
 *
 * $Id: CrawlController.java 4917 2007-02-20 22:15:23Z gojomo $
 */
package org.archive.crawler.framework;

import java.io.File;
import java.io.FileOutputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.PrintWriter;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.EventObject;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.locks.ReentrantLock;
import java.util.logging.FileHandler;
import java.util.logging.Formatter;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.management.AttributeNotFoundException;
import javax.management.InvalidAttributeValueException;
import javax.management.MBeanException;
import javax.management.ReflectionException;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.admin.CrawlJob;
import org.archive.crawler.admin.StatisticsTracker;
import org.archive.crawler.datamodel.Checkpoint;
import org.archive.crawler.datamodel.CrawlOrder;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.ServerCache;
import org.archive.crawler.event.CrawlStatusListener;
import org.archive.crawler.event.CrawlURIDispositionListener;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;
import org.archive.crawler.framework.exceptions.InitializationException;
import org.archive.crawler.io.LocalErrorFormatter;
import org.archive.crawler.io.RuntimeErrorFormatter;
import org.archive.crawler.io.StatisticsLogFormatter;
import org.archive.crawler.io.UriErrorFormatter;
import org.archive.crawler.io.UriProcessingFormatter;
import org.archive.crawler.settings.MapType;
import org.archive.crawler.settings.SettingsHandler;
import org.archive.crawler.util.CheckpointUtils;
import org.archive.io.GenerationFileHandler;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.util.ArchiveUtils;
import org.archive.util.CachedBdbMap;
import org.archive.util.FileUtils;
import org.archive.util.Reporter;
import org.archive.util.bdbje.EnhancedEnvironment;
import org.xbill.DNS.DClass;
import org.xbill.DNS.Lookup;

import com.sleepycat.bind.serial.StoredClassCatalog;
import com.sleepycat.je.CheckpointConfig;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.DbInternal;
import com.sleepycat.je.EnvironmentConfig;
import com.sleepycat.je.dbi.EnvironmentImpl;
import com.sleepycat.je.utilint.DbLsn;

/**
 * CrawlController collects all the classes which cooperate to
 * perform a crawl and provides a high-level interface to the
 * running crawl.
 *
 * As the "global context" for a crawl, subcomponents will
 * often reach each other through the CrawlController.
 *
 * @author Gordon Mohr
 */
public class CrawlController implements Serializable, Reporter {
    // be robust against trivial implementation changes
    private static final long serialVersionUID =
        ArchiveUtils.classnameBasedUID(CrawlController.class, 1);

    /**
     * Messages from the CrawlController.
     *
     * They appear on console.
     */
    private final static Logger LOGGER =
        Logger.getLogger(CrawlController.class.getName());

    // manifest support
    /** abbreviation label for config files in manifest */
    public static final char MANIFEST_CONFIG_FILE = 'C';
    /** abbreviation label for report files in manifest */
    public static final char MANIFEST_REPORT_FILE = 'R';
    /** abbreviation label for log files in manifest */
    public static final char MANIFEST_LOG_FILE = 'L';

    // key log names
    private static final String LOGNAME_PROGRESS_STATISTICS =
        "progress-statistics";
    private static final String LOGNAME_URI_ERRORS = "uri-errors";
    private static final String LOGNAME_RUNTIME_ERRORS = "runtime-errors";
    private static final String LOGNAME_LOCAL_ERRORS = "local-errors";
    private static final String LOGNAME_CRAWL = "crawl";

    // key subcomponents which define and implement a crawl in progress
    private transient CrawlOrder order;
    private transient CrawlScope scope;
    private transient ProcessorChainList processorChains;
    private transient Frontier frontier;
    private transient ToePool toePool;
    private transient ServerCache serverCache;
    // This gets passed into the initialize method.
    private transient SettingsHandler settingsHandler;

    // Used to enable/disable single-threaded operation after OOM
    private volatile transient boolean singleThreadMode = false;
    private transient ReentrantLock singleThreadLock = null;

    // emergency reserve of memory to allow some progress/reporting after OOM
    private transient LinkedList<char[]> reserveMemory;
    private static final int RESERVE_BLOCKS = 1;
    private static final int RESERVE_BLOCK_SIZE = 6 * 1024 * 1024; // 6MB

    // crawl state: as requested or actual

    /**
     * Crawl exit status.
     */
    private transient String sExit;

    private static final Object NASCENT = "NASCENT".intern();
    private static final Object RUNNING = "RUNNING".intern();
    private static final Object PAUSED = "PAUSED".intern();
    private static final Object PAUSING = "PAUSING".intern();
    private static final Object CHECKPOINTING = "CHECKPOINTING".intern();
    private static final Object STOPPING = "STOPPING".intern();
    private static final Object FINISHED = "FINISHED".intern();
    private static final Object STARTED = "STARTED".intern();
    private static final Object PREPARING = "PREPARING".intern();

    transient private Object state = NASCENT;

    // disk paths
    private transient File disk;       // overall disk path
    private transient File logsDisk;   // for log files

    /**
     * For temp files representing state of crawler (eg queues)
     */
    private transient File stateDisk;

    /**
     * For discardable temp files (eg fetch buffers).
     */
    private transient File scratchDisk;

    /**
     * Directory that holds checkpoint.
     */
    private transient File checkpointsDisk;

    /**
     * Checkpointer.
     * Knows if a checkpoint is in progress and what the checkpoint's name
     * is. Also runs checkpoints.
     */
    private Checkpointer checkpointer;

    /**
     * Gets set to the checkpoint we're recovering from, if in
     * checkpoint-recover mode. Gets set up by
     * {@link #getCheckpointRecover()}.
     */
    private transient Checkpoint checkpointRecover = null;

    // crawl limits
    private long maxBytes;
    private long maxDocument;
    private long maxTime;

    /**
     * A manifest of all files used/created during this crawl. Written to
     * file at the end of the crawl (the absolutely last thing done).
     */
    private StringBuffer manifest;

    /**
     * Record of fileHandlers established for loggers,
     * assisting file rotation.
     */
    transient private Map<Logger,FileHandler> fileHandlers;

    /** suffix to use on active logs */
    public static final String CURRENT_LOG_SUFFIX = ".log";

    /**
     * Crawl progress logger.
     *
     * No exceptions. Logs summary result of each url processing.
     */
    public transient Logger uriProcessing;

    /**
     * This logger contains unexpected runtime errors.
     *
     * Would contain errors trying to set up a job or failures inside
     * processors that they are not prepared to recover from.
     */
    public transient Logger runtimeErrors;

    /**
     * This logger is for job-scoped logging, specifically errors which
     * happen and are handled within a particular processor.
     *
     * Examples would be socket timeouts, exceptions thrown by extractors,
     * etc.
     */
    public transient Logger localErrors;

    /**
     * Special log for URI format problems, wherever they may occur.
     */
    public transient Logger uriErrors;

    /**
     * Statistics tracker writes here at regular intervals.
     */
    private transient Logger progressStats;

    /**
     * Logger to hold job summary report.
     *
     * Large state reports made at infrequent intervals (e.g. job ending)
     * go here.
     */
    public transient Logger reports;

    protected StatisticsTracking statistics = null;

    /**
     * List of crawl status listeners.
     *
     * All iterations need to synchronize on this object if they're to avoid
     * concurrent modification exceptions.
     * See {@link java.util.Collections#synchronizedList(List)}.
     */
    private transient List<CrawlStatusListener> registeredCrawlStatusListeners =
        Collections.synchronizedList(new ArrayList<CrawlStatusListener>());

    // Since there is a high probability that there will only ever be one
    // CrawlURIDispositionListener, we will use this while there is only one:
    private transient CrawlURIDispositionListener
        registeredCrawlURIDispositionListener;

    // And then switch to the array once there is more than one.
    protected transient ArrayList<CrawlURIDispositionListener>
        registeredCrawlURIDispositionListeners;

    /** Shared bdb Environment for Frontier subcomponents */
    // TODO: investigate using multiple environments to split disk accesses
    // across separate physical disks
    private transient EnhancedEnvironment bdbEnvironment = null;

    /**
     * Keep a list of all BigMap instances made -- shouldn't be many -- so
     * that we can checkpoint.
     */
    private transient Map<String,CachedBdbMap<?,?>> bigmaps = null;

    /**
     * Default constructor
     */
    public CrawlController() {
        super();
        // Defer most setup to initialize methods
    }

    /**
     * Starting from nothing, set up CrawlController and associated
     * classes to be ready for a first crawl.
     *
     * @param sH Settings handler.
     * @throws InitializationException
     */
    public void initialize(SettingsHandler sH)
    throws InitializationException {
        sendCrawlStateChangeEvent(PREPARING, CrawlJob.STATUS_PREPARING);

        this.singleThreadLock = new ReentrantLock();
        this.settingsHandler = sH;
        this.order = settingsHandler.getOrder();
        this.order.setController(this);
        this.bigmaps = new Hashtable<String,CachedBdbMap<?,?>>();
        sExit = "";
        this.manifest = new StringBuffer();
        String onFailMessage = "";
        try {
            onFailMessage = "You must set the User-Agent and From HTTP" +
                " header values to acceptable strings. \n" +
                " User-Agent: [software-name](+[info-url])[misc]\n" +
                " From: [email-address]\n";
            order.checkUserAgentAndFrom();

            onFailMessage = "Unable to setup disk";
            if (disk == null) {
                setupDisk();
            }

            onFailMessage = "Unable to create log file(s)";
            setupLogs();

            // Figure if we're to do a checkpoint restore. If so, get the
            // checkpointRecover instance and then put into place the old
            // bdb log files. If any of the log files already exist in the
            // target state directory, WE DO NOT OVERWRITE (makes for faster
            // recovery). CrawlController checkpoint recovery code manages
            // restoration of the old StatisticsTracker, any BigMaps used by
            // the Crawler, and the moving of bdb log files into place only.
            // Other objects interested in recovery need to ask if
            // CrawlController#isCheckpointRecover is set to figure if in
            // recovery and then take appropriate recovery action
            // (these objects can call CrawlController#getCheckpointRecover
            // to get the directory that might hold files/objects dropped
            // checkpointing). Such objects will need to use a technique
            // other than object serialization restoring settings because
            // they'll