📄 crawljob.java
字号:
/* CrawlJob * * Copyright (C) 2003 Internet Archive. * * This file is part of the Heritrix web crawler (crawler.archive.org). * * Heritrix is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * Heritrix is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser Public License for more details. * * You should have received a copy of the GNU Lesser Public License * along with Heritrix; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */package org.archive.crawler.admin;import java.io.BufferedReader;import java.io.File;import java.io.FileNotFoundException;import java.io.FileReader;import java.io.FileWriter;import java.io.IOException;import java.io.InputStream;import java.io.InputStreamReader;import java.io.PrintWriter;import java.io.Serializable;import java.io.StringWriter;import java.util.ArrayList;import java.util.Arrays;import java.util.Collection;import java.util.EventObject;import java.util.Hashtable;import java.util.Iterator;import java.util.List;import java.util.Map;import java.util.logging.Level;import java.util.logging.Logger;import javax.management.Attribute;import javax.management.AttributeList;import javax.management.AttributeNotFoundException;import javax.management.DynamicMBean;import javax.management.InstanceAlreadyExistsException;import javax.management.InvalidAttributeValueException;import javax.management.MBeanAttributeInfo;import javax.management.MBeanException;import javax.management.MBeanInfo;import javax.management.MBeanNotificationInfo;import javax.management.MBeanOperationInfo;import javax.management.MBeanParameterInfo;import javax.management.MBeanRegistration;import javax.management.MBeanRegistrationException;import javax.management.MBeanServer;import javax.management.NotCompliantMBeanException;import javax.management.Notification;import javax.management.NotificationBroadcasterSupport;import javax.management.ObjectName;import javax.management.ReflectionException;import javax.management.RuntimeOperationsException;import javax.management.openmbean.CompositeData;import javax.management.openmbean.CompositeDataSupport;import javax.management.openmbean.CompositeType;import javax.management.openmbean.OpenDataException;import javax.management.openmbean.OpenMBeanAttributeInfo;import javax.management.openmbean.OpenMBeanAttributeInfoSupport;import javax.management.openmbean.OpenMBeanConstructorInfoSupport;import javax.management.openmbean.OpenMBeanInfoSupport;import javax.management.openmbean.OpenMBeanOperationInfo;import javax.management.openmbean.OpenMBeanOperationInfoSupport;import javax.management.openmbean.OpenMBeanParameterInfo;import javax.management.openmbean.OpenMBeanParameterInfoSupport;import javax.management.openmbean.SimpleType;import org.apache.commons.httpclient.URIException;import org.archive.crawler.Heritrix;import org.archive.crawler.datamodel.CandidateURI;import org.archive.crawler.datamodel.Checkpoint;import org.archive.crawler.datamodel.CrawlOrder;import org.archive.crawler.event.CrawlStatusListener;import org.archive.crawler.framework.CrawlController;import org.archive.crawler.framework.FrontierMarker;import org.archive.crawler.framework.StatisticsTracking;import org.archive.crawler.framework.exceptions.InitializationException;import org.archive.crawler.framework.exceptions.InvalidFrontierMarkerException;import org.archive.crawler.frontier.AbstractFrontier;import org.archive.crawler.settings.ComplexType;import org.archive.crawler.settings.ModuleAttributeInfo;import org.archive.crawler.settings.TextField;import org.archive.crawler.settings.XMLSettingsHandler;import org.archive.crawler.util.CheckpointUtils;import org.archive.crawler.util.IoUtils;import org.archive.util.ArchiveUtils;import org.archive.util.FileUtils;import org.archive.util.JEMBeanHelper;import org.archive.util.JmxUtils;import org.archive.util.iterator.LineReadingIterator;import org.archive.util.iterator.RegexpLineIterator;import com.sleepycat.je.DatabaseException;import com.sleepycat.je.Environment;/** * A CrawlJob encapsulates a 'crawl order' with any and all information and * methods needed by a CrawlJobHandler to accept and execute them. * * <p>A given crawl job may also be a 'profile' for a crawl. In that case it * should not be executed as a crawl but can be edited and used as a template * for creating new CrawlJobs. * * <p>All of it's constructors are protected since only a CrawlJobHander * should construct new CrawlJobs. * * @author Kristinn Sigurdsson * * @see org.archive.crawler.admin.CrawlJobHandler#newJob(CrawlJob, String, * String, String, String, int) * @see org.archive.crawler.admin.CrawlJobHandler#newProfile(CrawlJob, * String, String, String) */public class CrawlJob extends NotificationBroadcasterSupportimplements DynamicMBean, MBeanRegistration, CrawlStatusListener, Serializable { /** * Eclipse generated serial number. */ private static final long serialVersionUID = 3411161000452525856L; private static final Logger logger = Logger.getLogger(CrawlJob.class.getName()); /* * Possible values for Priority */ /** lowest */ public static final int PRIORITY_MINIMAL = 0; /** low */ public static final int PRIORITY_LOW = 1; /** average */ public static final int PRIORITY_AVERAGE = 2; /** high */ public static final int PRIORITY_HIGH = 3; /** highest */ public static final int PRIORITY_CRITICAL = 4; /* * Possible states for a Job. */ /** Inital value. May not be ready to run/incomplete. */ public static final String STATUS_CREATED = "Created"; /** Job has been successfully submitted to a CrawlJobHandler */ public static final String STATUS_PENDING = "Pending"; /** Job is being crawled */ public static final String STATUS_RUNNING = "Running"; /** Job was deleted by user, will not be displayed in UI. */ public static final String STATUS_DELETED = "Deleted"; /** Job was terminted by user input while crawling */ public static final String STATUS_ABORTED = "Finished - Ended by operator"; /** Something went very wrong */ public static final String STATUS_FINISHED_ABNORMAL = "Finished - Abnormal exit from crawling"; /** Job finished normally having completed its crawl. */ public static final String STATUS_FINISHED = "Finished"; /** Job finished normally when the specified timelimit was hit. */ public static final String STATUS_FINISHED_TIME_LIMIT = "Finished - Timelimit hit"; /** Job finished normally when the specifed amount of * data (MB) had been downloaded */ public static final String STATUS_FINISHED_DATA_LIMIT = "Finished - Maximum amount of data limit hit"; /** Job finished normally when the specified number of documents had been * fetched. */ public static final String STATUS_FINISHED_DOCUMENT_LIMIT = "Finished - Maximum number of documents limit hit"; /** Job is going to be temporarly stopped after active threads are finished. */ public static final String STATUS_WAITING_FOR_PAUSE = "Pausing - " + "Waiting for threads to finish"; /** Job was temporarly stopped. State is kept so it can be resumed */ public static final String STATUS_PAUSED = "Paused"; /** * Job is being checkpointed. When finished checkpointing, job is set * back to STATUS_PAUSED (Job must be first paused before checkpointing * will run). */ public static final String STATUS_CHECKPOINTING = "Checkpointing"; /** Job could not be launced due to an InitializationException */ public static final String STATUS_MISCONFIGURED = "Could not launch job " + "- Fatal InitializationException"; /** Job is actually a profile */ public static final String STATUS_PROFILE = "Profile"; public static final String STATUS_PREPARING = "Preparing"; // Class variables private String UID; //A UID issued by the CrawlJobHandler. private String name; private String status; private boolean isReadOnly = false; private boolean isNew = true; private boolean isProfile = false; private boolean isRunning = false; private int priority; private int numberOfJournalEntries = 0; private String statisticsFileSave = ""; private String errorMessage = null; private File jobDir = null; private transient CrawlJobErrorHandler errorHandler = null; protected transient XMLSettingsHandler settingsHandler; private transient CrawlController controller = null; private static final String RECOVERY_JOURNAL_STYLE = "recoveryJournal"; private static final String CRAWL_LOG_STYLE = "crawlLog"; // OpenMBean support. /** * Server we registered with. Maybe null. */ private transient MBeanServer mbeanServer = null; private transient ObjectName mbeanName = null; private static final String CRAWLJOB_JMXMBEAN_TYPE = JmxUtils.SERVICE + ".Job"; private transient JEMBeanHelper bdbjeMBeanHelper = null; private transient List<String> bdbjeAttributeNameList = null; private transient List<String> bdbjeOperationsNameList = null; /** * The MBean we've registered ourselves with (May be null * throughout life of Heritrix). */ private transient OpenMBeanInfoSupport openMBeanInfo; private final static String NAME_ATTR = "Name"; private final static String UID_ATTR = "UID"; private final static String STATUS_ATTR = "Status"; private final static String FRONTIER_SHORT_REPORT_ATTR = "FrontierShortReport"; private final static String THREADS_SHORT_REPORT_ATTR = "ThreadsShortReport"; private final static String TOTAL_DATA_ATTR = "TotalData"; private final static String CRAWL_TIME_ATTR = "CrawlTime"; private final static String DOC_RATE_ATTR = "DocRate"; private final static String CURRENT_DOC_RATE_ATTR = "CurrentDocRate"; private final static String KB_RATE_ATTR = "KbRate"; private final static String CURRENT_KB_RATE_ATTR = "CurrentKbRate"; private final static String THREAD_COUNT_ATTR = "ThreadCount"; private final static String DOWNLOAD_COUNT_ATTR = "DownloadedCount"; private final static String DISCOVERED_COUNT_ATTR = "DiscoveredCount"; private final static String [] ATTRIBUTE_ARRAY = {NAME_ATTR, UID_ATTR, STATUS_ATTR, FRONTIER_SHORT_REPORT_ATTR, THREADS_SHORT_REPORT_ATTR, TOTAL_DATA_ATTR, CRAWL_TIME_ATTR, DOC_RATE_ATTR, CURRENT_DOC_RATE_ATTR, KB_RATE_ATTR, CURRENT_KB_RATE_ATTR, THREAD_COUNT_ATTR, DOWNLOAD_COUNT_ATTR, DISCOVERED_COUNT_ATTR}; private final static List ATTRIBUTE_LIST = Arrays.asList(ATTRIBUTE_ARRAY); private final static String IMPORT_URI_OPER = "importUri"; private final static String IMPORT_URIS_OPER = "importUris"; private final static String PAUSE_OPER = "pause"; private final static String RESUME_OPER = "resume"; private final static String FRONTIER_REPORT_OPER = "frontierReport"; private final static String THREADS_REPORT_OPER = "threadsReport"; private final static String SEEDS_REPORT_OPER = "seedsReport"; private final static String CHECKPOINT_OPER = "startCheckpoint"; private final static String PROGRESS_STATISTICS_OPER = "progressStatistics"; private final static String PROGRESS_STATISTICS_LEGEND_OPER = "progressStatisticsLegend"; private final static String PROG_STATS = "progressStatistics"; // Same as JEMBeanHelper.OP_DB_STAT private final static String OP_DB_STAT = "getDatabaseStats"; /** * Don't add the following crawl-order items. */ private final static List ORDER_EXCLUDE; static { ORDER_EXCLUDE = Arrays.asList(new String [] {"bdb-cache-percent", "extract-processors", "DNS", "uri-included-structure"}); } /** * Sequence number for jmx notifications. */ private static int notificationsSequenceNumber = 1; /** * A shutdown Constructor. */ protected CrawlJob() { super(); } /** * A constructor for jobs. * * <p> Create, ready to crawl, jobs. * @param UID A unique ID for this job. Typically emitted by the * CrawlJobHandler. * @param name The name of the job * @param settingsHandler The associated settings * @param errorHandler The crawl jobs settings error handler. * <tt>null</tt> means none is set * @param priority job priority. * @param dir The directory that is considered this jobs working directory. */ public CrawlJob(final String UID, final String name, final XMLSettingsHandler settingsHandler, final CrawlJobErrorHandler errorHandler, final int priority, final File dir) { this(UID, name, settingsHandler, errorHandler, priority, dir, null, false, true); } /** * A constructor for profiles. *
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -