CrawlController.java
/* Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * CrawlController.java
 * Created on May 14, 2003
 *
 * $Id: CrawlController.java 4917 2007-02-20 22:15:23Z gojomo $
 */
package org.archive.crawler.framework;

import java.io.File;
import java.io.FileOutputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.PrintWriter;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.EventObject;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.locks.ReentrantLock;
import java.util.logging.FileHandler;
import java.util.logging.Formatter;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.management.AttributeNotFoundException;
import javax.management.InvalidAttributeValueException;
import javax.management.MBeanException;
import javax.management.ReflectionException;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.admin.CrawlJob;
import org.archive.crawler.admin.StatisticsTracker;
import org.archive.crawler.datamodel.Checkpoint;
import org.archive.crawler.datamodel.CrawlOrder;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.ServerCache;
import org.archive.crawler.event.CrawlStatusListener;
import org.archive.crawler.event.CrawlURIDispositionListener;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;
import org.archive.crawler.framework.exceptions.InitializationException;
import org.archive.crawler.io.LocalErrorFormatter;
import org.archive.crawler.io.RuntimeErrorFormatter;
import org.archive.crawler.io.StatisticsLogFormatter;
import org.archive.crawler.io.UriErrorFormatter;
import org.archive.crawler.io.UriProcessingFormatter;
import org.archive.crawler.settings.MapType;
import org.archive.crawler.settings.SettingsHandler;
import org.archive.crawler.util.CheckpointUtils;
import org.archive.io.GenerationFileHandler;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.util.ArchiveUtils;
import org.archive.util.CachedBdbMap;
import org.archive.util.FileUtils;
import org.archive.util.Reporter;
import org.archive.util.bdbje.EnhancedEnvironment;
import org.xbill.DNS.DClass;
import org.xbill.DNS.Lookup;

import com.sleepycat.bind.serial.StoredClassCatalog;
import com.sleepycat.je.CheckpointConfig;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.DbInternal;
import com.sleepycat.je.EnvironmentConfig;
import com.sleepycat.je.dbi.EnvironmentImpl;
import com.sleepycat.je.utilint.DbLsn;

/**
 * CrawlController collects all the classes which cooperate to
 * perform a crawl and provides a high-level interface to the
 * running crawl.
 *
 * As the "global context" for a crawl, subcomponents will
 * often reach each other through the CrawlController.
 *
 * @author Gordon Mohr
 */
public class CrawlController implements Serializable, Reporter {
    // be robust against trivial implementation changes
    private static final long serialVersionUID =
        ArchiveUtils.classnameBasedUID(CrawlController.class, 1);

    /**
     * Messages from the CrawlController.
     *
     * They appear on console.
     */
    private final static Logger LOGGER =
        Logger.getLogger(CrawlController.class.getName());

    // manifest support
    /** abbreviation label for config files in manifest */
    public static final char MANIFEST_CONFIG_FILE = 'C';
    /** abbreviation label for report files in manifest */
    public static final char MANIFEST_REPORT_FILE = 'R';
    /** abbreviation label for log files in manifest */
    public static final char MANIFEST_LOG_FILE = 'L';

    // key log names
    private static final String LOGNAME_PROGRESS_STATISTICS =
        "progress-statistics";
    private static final String LOGNAME_URI_ERRORS = "uri-errors";
    private static final String LOGNAME_RUNTIME_ERRORS = "runtime-errors";
    private static final String LOGNAME_LOCAL_ERRORS = "local-errors";
    private static final String LOGNAME_CRAWL = "crawl";

    // key subcomponents which define and implement a crawl in progress
    private transient CrawlOrder order;
    private transient CrawlScope scope;
    private transient ProcessorChainList processorChains;
    private transient Frontier frontier;
    private transient ToePool toePool;
    private transient ServerCache serverCache;
    // This gets passed into the initialize method.
    private transient SettingsHandler settingsHandler;

    // Used to enable/disable single-threaded operation after OOM
    private volatile transient boolean singleThreadMode = false;
    private transient ReentrantLock singleThreadLock = null;

    // emergency reserve of memory to allow some progress/reporting after OOM
    private transient LinkedList<char[]> reserveMemory;
    private static final int RESERVE_BLOCKS = 1;
    private static final int RESERVE_BLOCK_SIZE = 6 * 1024 * 1024; // 6MB

    // crawl state: as requested or actual

    /**
     * Crawl exit status.
     */
    private transient String sExit;

    private static final Object NASCENT = "NASCENT".intern();
    private static final Object RUNNING = "RUNNING".intern();
    private static final Object PAUSED = "PAUSED".intern();
    private static final Object PAUSING = "PAUSING".intern();
    private static final Object CHECKPOINTING = "CHECKPOINTING".intern();
    private static final Object STOPPING = "STOPPING".intern();
    private static final Object FINISHED = "FINISHED".intern();
    private static final Object STARTED = "STARTED".intern();
    private static final Object PREPARING = "PREPARING".intern();

    transient private Object state = NASCENT;

    // disk paths
    private transient File disk;       // overall disk path
    private transient File logsDisk;   // for log files

    /**
     * For temp files representing state of crawler (eg queues)
     */
    private transient File stateDisk;

    /**
     * For discardable temp files (eg fetch buffers).
     */
    private transient File scratchDisk;

    /**
     * Directory that holds checkpoint.
     */
    private transient File checkpointsDisk;

    /**
     * Checkpointer.
     * Knows if a checkpoint is in progress and what the checkpoint's name
     * is. Also runs checkpoints.
     */
    private Checkpointer checkpointer;

    /**
     * Gets set to the checkpoint we're recovering from, if in
     * checkpoint-recover mode. Gets set up by
     * {@link #getCheckpointRecover()}.
     */
    private transient Checkpoint checkpointRecover = null;

    // crawl limits
    private long maxBytes;
    private long maxDocument;
    private long maxTime;

    /**
     * A manifest of all files used/created during this crawl. Written to
     * file at the end of the crawl (the absolutely last thing done).
     */
    private StringBuffer manifest;

    /**
     * Record of fileHandlers established for loggers,
     * assisting file rotation.
     */
    transient private Map<Logger,FileHandler> fileHandlers;

    /** suffix to use on active logs */
    public static final String CURRENT_LOG_SUFFIX = ".log";

    /**
     * Crawl progress logger.
     *
     * No exceptions. Logs summary result of each url processing.
     */
    public transient Logger uriProcessing;

    /**
     * This logger contains unexpected runtime errors.
     *
     * Would contain errors trying to set up a job or failures inside
     * processors that they are not prepared to recover from.
     */
    public transient Logger runtimeErrors;

    /**
     * This logger is for job-scoped logging, specifically errors which
     * happen and are handled within a particular processor.
     *
     * Examples would be socket timeouts, exceptions thrown by extractors,
     * etc.
     */
    public transient Logger localErrors;

    /**
     * Special log for URI format problems, wherever they may occur.
     */
    public transient Logger uriErrors;

    /**
     * Statistics tracker writes here at regular intervals.
     */
    private transient Logger progressStats;

    /**
     * Logger to hold job summary report.
     *
     * Large state reports made at infrequent intervals (e.g. job ending)
     * go here.
     */
    public transient Logger reports;

    protected StatisticsTracking statistics = null;

    /**
     * List of crawl status listeners.
     *
     * All iterations need to synchronize on this object if they're to avoid
     * concurrent modification exceptions.
     * See {@link java.util.Collections#synchronizedList(List)}.
     */
    private transient List<CrawlStatusListener> registeredCrawlStatusListeners =
        Collections.synchronizedList(new ArrayList<CrawlStatusListener>());

    // Since there is a high probability that there will only ever be one
    // CrawlURIDispositionListener, we will use this while there is only one:
    private transient CrawlURIDispositionListener
        registeredCrawlURIDispositionListener;

    // And then switch to the array once there is more than one.
    protected transient ArrayList<CrawlURIDispositionListener>
        registeredCrawlURIDispositionListeners;

    /** Shared bdb Environment for Frontier subcomponents */
    // TODO: investigate using multiple environments to split disk accesses
    // across separate physical disks
    private transient EnhancedEnvironment bdbEnvironment = null;

    /**
     * Keep a list of all BigMap instances made -- shouldn't be many -- so
     * that we can checkpoint.
     */
    private transient Map<String,CachedBdbMap<?,?>> bigmaps = null;

    /**
     * Default constructor
     */
    public CrawlController() {
        super();
        // Defer most setup to initialize methods
    }

    /**
     * Starting from nothing, set up CrawlController and associated
     * classes to be ready for a first crawl.
     *
     * @param sH Settings handler.
     * @throws InitializationException
     */
    public void initialize(SettingsHandler sH)
    throws InitializationException {
        sendCrawlStateChangeEvent(PREPARING, CrawlJob.STATUS_PREPARING);

        this.singleThreadLock = new ReentrantLock();
        this.settingsHandler = sH;
        this.order = settingsHandler.getOrder();
        this.order.setController(this);
        this.bigmaps = new Hashtable<String,CachedBdbMap<?,?>>();
        sExit = "";
        this.manifest = new StringBuffer();
        String onFailMessage = "";
        try {
            onFailMessage = "You must set the User-Agent and From HTTP" +
                " header values to acceptable strings. \n" +
                " User-Agent: [software-name](+[info-url])[misc]\n" +
                " From: [email-address]\n";
            order.checkUserAgentAndFrom();

            onFailMessage = "Unable to setup disk";
            if (disk == null) {
                setupDisk();
            }

            onFailMessage = "Unable to create log file(s)";
            setupLogs();

            // Figure if we're to do a checkpoint restore. If so, get the
            // checkpointRecover instance and then put into place the old
            // bdb log files. If any of the log files already exist in the
            // target state directory, WE DO NOT OVERWRITE (makes for faster
            // recovery). CrawlController checkpoint recovery code manages
            // restoration of the old StatisticsTracker, any BigMaps used by
            // the Crawler, and the moving of bdb log files into place only.
            // Other objects interested in recovery need to ask if
            // CrawlController#isCheckpointRecover is set to figure if in
            // recovery and then take appropriate recovery action
            // (these objects can call CrawlController#getCheckpointRecover
            // to get the directory that might hold files/objects dropped
            // checkpointing). Such objects will need to use a technique
            // other than object serialization restoring settings because
            // they'll