
📄 CrawlJob.java

📁 A crawler source file (Heritrix), best used in combination with Lucene for indexing; see the indexing sketch after the listing.
💻 JAVA
📖 Page 1 of 5
/* CrawlJob
 *
 * Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
package org.archive.crawler.admin;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.io.Serializable;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.EventObject;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.management.Attribute;
import javax.management.AttributeList;
import javax.management.AttributeNotFoundException;
import javax.management.DynamicMBean;
import javax.management.InstanceAlreadyExistsException;
import javax.management.InvalidAttributeValueException;
import javax.management.MBeanAttributeInfo;
import javax.management.MBeanException;
import javax.management.MBeanInfo;
import javax.management.MBeanNotificationInfo;
import javax.management.MBeanOperationInfo;
import javax.management.MBeanParameterInfo;
import javax.management.MBeanRegistration;
import javax.management.MBeanRegistrationException;
import javax.management.MBeanServer;
import javax.management.NotCompliantMBeanException;
import javax.management.Notification;
import javax.management.NotificationBroadcasterSupport;
import javax.management.ObjectName;
import javax.management.ReflectionException;
import javax.management.RuntimeOperationsException;
import javax.management.openmbean.CompositeData;
import javax.management.openmbean.CompositeDataSupport;
import javax.management.openmbean.CompositeType;
import javax.management.openmbean.OpenDataException;
import javax.management.openmbean.OpenMBeanAttributeInfo;
import javax.management.openmbean.OpenMBeanAttributeInfoSupport;
import javax.management.openmbean.OpenMBeanConstructorInfoSupport;
import javax.management.openmbean.OpenMBeanInfoSupport;
import javax.management.openmbean.OpenMBeanOperationInfo;
import javax.management.openmbean.OpenMBeanOperationInfoSupport;
import javax.management.openmbean.OpenMBeanParameterInfo;
import javax.management.openmbean.OpenMBeanParameterInfoSupport;
import javax.management.openmbean.SimpleType;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.Heritrix;
import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.datamodel.Checkpoint;
import org.archive.crawler.datamodel.CrawlOrder;
import org.archive.crawler.event.CrawlStatusListener;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.FrontierMarker;
import org.archive.crawler.framework.StatisticsTracking;
import org.archive.crawler.framework.exceptions.InitializationException;
import org.archive.crawler.framework.exceptions.InvalidFrontierMarkerException;
import org.archive.crawler.frontier.AbstractFrontier;
import org.archive.crawler.settings.ComplexType;
import org.archive.crawler.settings.ModuleAttributeInfo;
import org.archive.crawler.settings.TextField;
import org.archive.crawler.settings.XMLSettingsHandler;
import org.archive.crawler.util.CheckpointUtils;
import org.archive.crawler.util.IoUtils;
import org.archive.util.ArchiveUtils;
import org.archive.util.FileUtils;
import org.archive.util.JEMBeanHelper;
import org.archive.util.JmxUtils;
import org.archive.util.iterator.LineReadingIterator;
import org.archive.util.iterator.RegexpLineIterator;

import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.Environment;

/**
 * A CrawlJob encapsulates a 'crawl order' with any and all information and
 * methods needed by a CrawlJobHandler to accept and execute them.
 *
 * <p>A given crawl job may also be a 'profile' for a crawl. In that case it
 * should not be executed as a crawl but can be edited and used as a template
 * for creating new CrawlJobs.
 *
 * <p>All of it's constructors are protected since only a CrawlJobHander
 * should construct new CrawlJobs.
 *
 * @author Kristinn Sigurdsson
 *
 * @see org.archive.crawler.admin.CrawlJobHandler#newJob(CrawlJob, String,
 *      String, String, String, int)
 * @see org.archive.crawler.admin.CrawlJobHandler#newProfile(CrawlJob,
 *      String, String, String)
 */
public class CrawlJob extends NotificationBroadcasterSupport
implements DynamicMBean, MBeanRegistration, CrawlStatusListener, Serializable {
    /**
     * Eclipse generated serial number.
     */
    private static final long serialVersionUID = 3411161000452525856L;

    private static final Logger logger =
        Logger.getLogger(CrawlJob.class.getName());

    /*
     * Possible values for Priority
     */
    /** lowest */
    public static final int PRIORITY_MINIMAL = 0;
    /** low */
    public static final int PRIORITY_LOW = 1;
    /** average */
    public static final int PRIORITY_AVERAGE = 2;
    /** high */
    public static final int PRIORITY_HIGH = 3;
    /** highest */
    public static final int PRIORITY_CRITICAL = 4;

    /*
     * Possible states for a Job.
     */
    /** Inital value. May not be ready to run/incomplete. */
    public static final String STATUS_CREATED = "Created";
    /** Job has been successfully submitted to a CrawlJobHandler */
    public static final String STATUS_PENDING = "Pending";
    /** Job is being crawled */
    public static final String STATUS_RUNNING = "Running";
    /** Job was deleted by user, will not be displayed in UI. */
    public static final String STATUS_DELETED = "Deleted";
    /** Job was terminted by user input while crawling */
    public static final String STATUS_ABORTED = "Finished - Ended by operator";
    /** Something went very wrong */
    public static final String STATUS_FINISHED_ABNORMAL =
        "Finished - Abnormal exit from crawling";
    /** Job finished normally having completed its crawl. */
    public static final String STATUS_FINISHED = "Finished";
    /** Job finished normally when the specified timelimit was hit. */
    public static final String STATUS_FINISHED_TIME_LIMIT =
        "Finished - Timelimit hit";
    /** Job finished normally when the specifed amount of
     * data (MB) had been downloaded */
    public static final String STATUS_FINISHED_DATA_LIMIT =
        "Finished - Maximum amount of data limit hit";
    /** Job finished normally when the specified number of documents had been
     * fetched.
     */
    public static final String STATUS_FINISHED_DOCUMENT_LIMIT =
        "Finished - Maximum number of documents limit hit";
    /** Job is going to be temporarly stopped after active threads are finished. */
    public static final String STATUS_WAITING_FOR_PAUSE = "Pausing - " +
        "Waiting for threads to finish";
    /** Job was temporarly stopped. State is kept so it can be resumed */
    public static final String STATUS_PAUSED = "Paused";
    /**
     * Job is being checkpointed.  When finished checkpointing, job is set
     * back to STATUS_PAUSED (Job must be first paused before checkpointing
     * will run).
     */
    public static final String STATUS_CHECKPOINTING = "Checkpointing";
    /** Job could not be launced due to an InitializationException */
    public static final String STATUS_MISCONFIGURED = "Could not launch job " +
        "- Fatal InitializationException";
    /** Job is actually a profile */
    public static final String STATUS_PROFILE = "Profile";

    public static final String STATUS_PREPARING = "Preparing";

    // Class variables
    private String UID;       //A UID issued by the CrawlJobHandler.
    private String name;
    private String status;
    private boolean isReadOnly = false;
    private boolean isNew = true;
    private boolean isProfile = false;
    private boolean isRunning = false;
    private int priority;
    private int numberOfJournalEntries = 0;

    private String statisticsFileSave = "";
    private String errorMessage = null;
    private File jobDir = null;
    private transient CrawlJobErrorHandler errorHandler = null;

    protected transient XMLSettingsHandler settingsHandler;

    private transient CrawlController controller = null;

    private static final String RECOVERY_JOURNAL_STYLE = "recoveryJournal";
    private static final String CRAWL_LOG_STYLE = "crawlLog";

    // OpenMBean support.

    /**
     * Server we registered with. Maybe null.
     */
    private transient MBeanServer mbeanServer = null;
    private transient ObjectName mbeanName = null;
    private static final String CRAWLJOB_JMXMBEAN_TYPE =
        JmxUtils.SERVICE + ".Job";
    private transient JEMBeanHelper bdbjeMBeanHelper = null;
    private transient List<String> bdbjeAttributeNameList = null;
    private transient List<String> bdbjeOperationsNameList = null;

    /**
     * The MBean we've registered ourselves with (May be null
     * throughout life of Heritrix).
     */
    private transient OpenMBeanInfoSupport openMBeanInfo;

    private final static String NAME_ATTR = "Name";
    private final static String UID_ATTR = "UID";
    private final static String STATUS_ATTR = "Status";
    private final static String FRONTIER_SHORT_REPORT_ATTR =
        "FrontierShortReport";
    private final static String THREADS_SHORT_REPORT_ATTR =
        "ThreadsShortReport";
    private final static String TOTAL_DATA_ATTR = "TotalData";
    private final static String CRAWL_TIME_ATTR = "CrawlTime";
    private final static String DOC_RATE_ATTR = "DocRate";
    private final static String CURRENT_DOC_RATE_ATTR = "CurrentDocRate";
    private final static String KB_RATE_ATTR = "KbRate";
    private final static String CURRENT_KB_RATE_ATTR = "CurrentKbRate";
    private final static String THREAD_COUNT_ATTR = "ThreadCount";
    private final static String DOWNLOAD_COUNT_ATTR = "DownloadedCount";
    private final static String DISCOVERED_COUNT_ATTR = "DiscoveredCount";
    private final static String [] ATTRIBUTE_ARRAY = {NAME_ATTR, UID_ATTR,
        STATUS_ATTR, FRONTIER_SHORT_REPORT_ATTR, THREADS_SHORT_REPORT_ATTR,
        TOTAL_DATA_ATTR, CRAWL_TIME_ATTR, DOC_RATE_ATTR,
        CURRENT_DOC_RATE_ATTR, KB_RATE_ATTR, CURRENT_KB_RATE_ATTR,
        THREAD_COUNT_ATTR, DOWNLOAD_COUNT_ATTR, DISCOVERED_COUNT_ATTR};
    private final static List ATTRIBUTE_LIST = Arrays.asList(ATTRIBUTE_ARRAY);

    private final static String IMPORT_URI_OPER = "importUri";
    private final static String IMPORT_URIS_OPER = "importUris";
    private final static String PAUSE_OPER = "pause";
    private final static String RESUME_OPER = "resume";
    private final static String FRONTIER_REPORT_OPER = "frontierReport";
    private final static String THREADS_REPORT_OPER = "threadsReport";
    private final static String SEEDS_REPORT_OPER = "seedsReport";
    private final static String CHECKPOINT_OPER = "startCheckpoint";
    private final static String PROGRESS_STATISTICS_OPER =
        "progressStatistics";
    private final static String PROGRESS_STATISTICS_LEGEND_OPER =
        "progressStatisticsLegend";

    private final static String PROG_STATS = "progressStatistics";

    // Same as JEMBeanHelper.OP_DB_STAT
    private final static String OP_DB_STAT = "getDatabaseStats";

    /**
     * Don't add the following crawl-order items.
     */
    private final static List ORDER_EXCLUDE;
    static {
        ORDER_EXCLUDE = Arrays.asList(new String [] {"bdb-cache-percent",
            "extract-processors", "DNS", "uri-included-structure"});
    }

    /**
     * Sequence number for jmx notifications.
     */
    private static int notificationsSequenceNumber = 1;

    /**
     * A shutdown Constructor.
     */
    protected CrawlJob() {
        super();
    }

    /**
     * A constructor for jobs.
     *
     * <p> Create, ready to crawl, jobs.
     * @param UID A unique ID for this job. Typically emitted by the
     *            CrawlJobHandler.
     * @param name The name of the job
     * @param settingsHandler The associated settings
     * @param errorHandler The crawl jobs settings error handler.
     *           <tt>null</tt> means none is set
     * @param priority job priority.
     * @param dir The directory that is considered this jobs working directory.
     */
    public CrawlJob(final String UID,
            final String name, final XMLSettingsHandler settingsHandler,
            final CrawlJobErrorHandler errorHandler, final int priority,
            final File dir) {
        this(UID, name, settingsHandler, errorHandler,
                priority, dir, null, false, true);
    }

    /**
     * A constructor for profiles.
     *
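For orientation, here is a minimal sketch of how the public constructor shown above could be used. It is not taken from this file: the order.xml path, UID, job name and directory are placeholders, and the step of building an XMLSettingsHandler from an order file and calling initialize() reflects typical Heritrix 1.x usage rather than anything on this page; passing null for the CrawlJobErrorHandler is allowed per the constructor javadoc.

import java.io.File;

import org.archive.crawler.admin.CrawlJob;
import org.archive.crawler.settings.XMLSettingsHandler;

public class CrawlJobSketch {
    public static void main(String[] args) throws Exception {
        // Assumption: jobs/demo-job/order.xml is an existing Heritrix crawl order.
        XMLSettingsHandler settings =
            new XMLSettingsHandler(new File("jobs/demo-job/order.xml"));
        settings.initialize();

        // The UID would normally be issued by the CrawlJobHandler; this one is made up.
        CrawlJob job = new CrawlJob(
            "1196029210763",            // UID
            "demo-job",                 // job name
            settings,                   // associated settings
            null,                       // no CrawlJobErrorHandler ("null means none is set")
            CrawlJob.PRIORITY_AVERAGE,  // one of the priority constants defined above
            new File("jobs/demo-job")); // the job's working directory

        // The job is now ready to be handed to a CrawlJobHandler, which
        // accepts and executes jobs per the class javadoc.
    }
}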

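The page description pitches this crawler source alongside Lucene. Purely as an illustration of that pairing, and not part of CrawlJob itself, the sketch below indexes one crawled page using the Lucene 2.x-era API that was current in the Heritrix 1.x period; the index path, field names and sample text are placeholders.

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;

public class SimpleIndexer {
    public static void main(String[] args) throws Exception {
        // Create (or overwrite) a file-system index named "lucene-index".
        IndexWriter writer =
            new IndexWriter("lucene-index", new StandardAnalyzer(), true);

        // One Lucene Document per crawled page: store the URL untokenized,
        // tokenize the extracted body text for full-text search.
        Document doc = new Document();
        doc.add(new Field("url", "http://example.com/",
            Field.Store.YES, Field.Index.UN_TOKENIZED));
        doc.add(new Field("content", "text extracted from the crawled page",
            Field.Store.YES, Field.Index.TOKENIZED));

        writer.addDocument(doc);
        writer.optimize();
        writer.close();
    }
}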