📄 CrawlJobHandler.java
/* CrawlJobHandler
 *
 * $Id: CrawlJobHandler.java 5055 2007-04-10 22:12:56Z gojomo $
 *
 * Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
package org.archive.crawler.admin;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URI;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Date;
import java.util.Enumeration;
import java.util.Iterator;
import java.util.List;
import java.util.TreeSet;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.management.Attribute;
import javax.management.AttributeNotFoundException;
import javax.management.InvalidAttributeValueException;
import javax.management.MBeanException;
import javax.management.ReflectionException;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.Heritrix;
import org.archive.crawler.datamodel.CrawlOrder;
import org.archive.crawler.event.CrawlStatusListener;
import org.archive.crawler.framework.FrontierMarker;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;
import org.archive.crawler.framework.exceptions.InitializationException;
import org.archive.crawler.framework.exceptions.InvalidFrontierMarkerException;
import org.archive.crawler.frontier.FrontierJournal;
import org.archive.crawler.frontier.RecoveryJournal;
import org.archive.crawler.settings.ComplexType;
import org.archive.crawler.settings.CrawlerSettings;
import org.archive.crawler.settings.SettingsHandler;
import org.archive.crawler.settings.XMLSettingsHandler;
import org.archive.util.ArchiveUtils;
import org.archive.util.FileUtils;

/**
 * This class manages CrawlJobs. Submitted crawl jobs are queued up and run
 * in order when the crawler is running.
 * <p>Basically this provides a layer between any potential user interface and
 * the CrawlJobs. It keeps the lists of completed jobs, pending jobs, etc.
 * <p>
 * The jobs managed by the handler can be divided into the following:
 * <ul>
 *  <li><code>Pending</code> - Jobs that are ready to run and are waiting
 *                             their turn. These can be edited, viewed,
 *                             deleted etc.
 *  <li><code>Running</code> - Only one job can be running at a time. There
 *                             may be no job running. The running job can be
 *                             viewed and edited to some extent. It can also
 *                             be terminated. This job should have a
 *                             StatisticsTracking module attached to it for
 *                             more details on the crawl.
 *  <li><code>Completed</code> - Jobs that have finished crawling or have been
 *                               deleted from the pending queue or terminated
 *                               while running. They can not be edited but can
 *                               be viewed. They retain the StatisticsTracking
 *                               module from their run.
 *  <li><code>New job</code> - At any given time there can be one 'new job'.
 *                             The new job is not considered ready to run. It
 *                             can be edited or discarded (in which case it
 *                             will be totally destroyed, including any files
 *                             on disk). Once an operator deems the job ready
 *                             to run it can be moved to the pending queue.
 *  <li><code>Profiles</code> - Jobs under profiles are not actual jobs. They
 *                              can be edited normally but can not be
 *                              submitted to the pending queue. New jobs can
 *                              be created using a profile as their template.
 * </ul>
 *
 * @author Kristinn Sigurdsson
 *
 * @see org.archive.crawler.admin.CrawlJob
 */
public class CrawlJobHandler implements CrawlStatusListener {
    private static final Logger logger =
        Logger.getLogger(CrawlJobHandler.class.getName());

    /**
     * Name of the system property that, when set, overrides the default
     * profile used.
     */
    public static final String DEFAULT_PROFILE_NAME =
        "heritrix.default.profile";

    /**
     * Default profile name.
     */
    public static final String DEFAULT_PROFILE = "default";

    /**
     * Name of the profiles directory.
     */
    public static final String PROFILES_DIR_NAME = "profiles";

    public static final String ORDER_FILE_NAME = "order.xml";

    /**
     * Job currently being crawled.
     */
    private CrawlJob currentJob = null;

    /**
     * A new job that is being created/configured. Not yet ready for crawling.
     */
    private CrawlJob newJob = null;

    /**
     * Thread to start the next job in the background.
     */
    private Thread startingNextJob = null;

    /**
     * A list of pending CrawlJobs.
     */
    private TreeSet<CrawlJob> pendingCrawlJobs;

    /**
     * A list of completed CrawlJobs.
     */
    //private Vector completedCrawlJobs = new Vector();
    private TreeSet<CrawlJob> completedCrawlJobs;

    /**
     * A list of profile CrawlJobs.
     */
    private TreeSet<CrawlJob> profileJobs;

    // The UIDs of profiles should NOT be timestamps. A descriptive name is
    // ideal.
    private String defaultProfile = null;

    /**
     * If true the crawler is 'running'. That is, the next pending job will
     * start crawling as soon as the current job (if any) is completed.
     */
    private boolean running = false;

    /**
     * String to indicate recovery should be based on the recovery log, not
     * based on checkpointing.
     */
    public static final String RECOVER_LOG = "recover";

    /**
     * Jobs directory.
     */
    private final File jobsDir;

    /**
     * Constructor.
     * @param jobsDir Jobs directory.
     */
    public CrawlJobHandler(final File jobsDir) {
        this(jobsDir, true, true);
    }

    /**
     * Constructor allowing for optional loading of profiles and jobs.
     * @param jobsDir Jobs directory.
     * @param loadJobs If true then any applicable jobs will be loaded.
     * @param loadProfiles If true then any applicable profiles will be
     * loaded.
     */
    public CrawlJobHandler(final File jobsDir, final boolean loadJobs,
            final boolean loadProfiles) {
        this.jobsDir = jobsDir;
        // Make a comparator for CrawlJobs: order by priority first, then by
        // UID for jobs of equal priority.
        Comparator<CrawlJob> comp = new Comparator<CrawlJob>() {
            public int compare(CrawlJob job1, CrawlJob job2) {
                if (job1.getJobPriority() < job2.getJobPriority()) {
                    return -1;
                } else if (job1.getJobPriority() > job2.getJobPriority()) {
                    return 1;
                } else {
                    // Same priority, use UID (which should be a timestamp).
                    // Lower UID (string compare) means earlier time.
                    return job1.getUID().compareTo(job2.getUID());
                }
            }
        };
        this.pendingCrawlJobs = new TreeSet<CrawlJob>(comp);
        this.completedCrawlJobs = new TreeSet<CrawlJob>(comp);
        // Profiles always have the same priority, so they sort by UID (name).
        this.profileJobs = new TreeSet<CrawlJob>(comp);
        if (loadProfiles) {
            loadProfiles();
        }
        if (loadJobs) {
            loadJobs();
        }
    }

    /**
     * Find the state.job file in the job directory.
     * @param jobDir Directory to look in.
     * @return Full path to 'state.job' file or null if none found.
     */
    protected File getStateJobFile(final File jobDir) {
        // Need to find job file ('state.job').
        File[] jobFiles = jobDir.listFiles(new FilenameFilter() {
            public boolean accept(File dir, String name) {
                return name.toLowerCase().endsWith(".job") &&
                    (new File(dir, name)).canRead();
            }
        });
        // listFiles returns null if jobDir is not a readable directory.
        return (jobFiles != null && jobFiles.length == 1)?
            jobFiles[0]: null;
    }

    /**
     * Loads any available jobs in the jobs directory.
     * <p>
     * Available jobs are any directories containing a file called
     * <code>state.job</code>. The file must contain valid job information.
     */
    private void loadJobs() {
        this.jobsDir.mkdirs();
        File[] jobs = this.jobsDir.listFiles();
        if (jobs == null) {
            // Jobs directory could not be created or read.
            return;
        }
        for (int i = 0; i < jobs.length; i++) {
            if (jobs[i].isDirectory()) {
                File jobFile = getStateJobFile(jobs[i]);
                if (jobFile != null) {
                    loadJob(jobFile);
                }
            }
        }
    }

    /**
     * Loads a job given a specific job file. The loaded job will be placed in
     * the list of completed jobs or pending queue depending on its status.
     * Running jobs will have their status set to 'finished abnormally' and
     * put into the completed list.
     * @param job The job file of the job to load.
     */
    protected void loadJob(final File job) {
        CrawlJob cjob = null;
        try {
            // Load the CrawlJob.
            cjob = new CrawlJob(job, new CrawlJobErrorHandler());
        } catch (InvalidJobFileException e) {
            logger.log(Level.INFO,
                "Invalid job file for " + job.getAbsolutePath(), e);
            return;
        } catch (IOException e) {
            logger.log(Level.INFO, "IOException for " + job.getName() +
                ", " + job.getAbsolutePath(), e);
            return;
        }

        // TODO: Move test into CrawlJob.
        // Check job status and place it accordingly.
        if (cjob.getStatus().equals(CrawlJob.STATUS_RUNNING) ||
                cjob.getStatus().equals(CrawlJob.STATUS_PAUSED) ||
                cjob.getStatus().equals(CrawlJob.STATUS_CHECKPOINTING) ||
                cjob.getStatus().equals(CrawlJob.STATUS_WAITING_FOR_PAUSE)) {
            // Was a running job.
            cjob.setStatus(CrawlJob.STATUS_FINISHED_ABNORMAL);
            this.completedCrawlJobs.add(cjob);
        } else if (cjob.getStatus().equals(CrawlJob.STATUS_PENDING)) {
            // Was a pending job.
            this.pendingCrawlJobs.add(cjob);
        } else if (cjob.getStatus().equals(CrawlJob.STATUS_CREATED) ||
                cjob.getStatus().equals(CrawlJob.STATUS_DELETED)) {
            // Ignore for now. TODO: Add to 'recycle bin'.
        } else {
            // Must have been completed.
            this.completedCrawlJobs.add(cjob);
        }
    }

    /**
     * Looks in the conf dir for a profiles dir.
     * @return the directory where profiles are stored, or null if none is
     * available
     * @throws IOException
     */
    private File getProfilesDirectory() throws IOException {
        URL webappProfilePath = Heritrix.class.getResource("/" +
            PROFILES_DIR_NAME);
        if (webappProfilePath != null) {
            try {
                return new File(new URI(webappProfilePath.toString()));
            } catch (java.lang.IllegalArgumentException e) {
                // e.g. "profiles" within a jar file.
                // Try Heritrix.getConfdir() in this case.
            } catch (java.net.URISyntaxException e) {
                e.printStackTrace();
            }
        }
        return (Heritrix.getConfdir(false) == null)? null:
            new File(Heritrix.getConfdir().getAbsolutePath(),
                PROFILES_DIR_NAME);
    }

    /**
     * Loads the default profile and all other profiles found on disk.
     */
    private void loadProfiles() {
        boolean loadedDefault = false;
        File profileDir = null;
        try {
            profileDir = getProfilesDirectory();
        } catch (IOException e) {
            e.printStackTrace();
        }
        if (profileDir != null) {
            File[] ps = profileDir.listFiles();
            if (ps != null && ps.length > 0) {
                for (int i = 0; i < ps.length; i++) {
                    File f = ps[i];
                    if (f.isDirectory()) {
                        // Each directory in the profiles directory should
                        // contain the file order.xml.
                        File profile = new File(f, ORDER_FILE_NAME);
                        if (profile.canRead()) {
                            boolean b = loadProfile(profile);
                            if (b) {
                                loadedDefault = b;
                            }
                            // ... (remainder of the source file is truncated
                            // in the original listing)
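
Below is a minimal, hedged usage sketch of the class above. It exercises only the two constructors actually shown in this listing; the demo class itself, its jobs-directory path, and the `main` driver are hypothetical illustration, not part of Heritrix.

// A minimal sketch, assuming a local "jobs" directory. Constructor
// signatures are taken directly from the listing above; everything
// else here is hypothetical scaffolding.
import java.io.File;
import org.archive.crawler.admin.CrawlJobHandler;

public class CrawlJobHandlerDemo {
    public static void main(String[] args) {
        File jobsDir = new File("jobs"); // hypothetical jobs directory

        // Skip loading jobs and profiles up front. Useful for tests or
        // tooling that only needs the handler's in-memory bookkeeping.
        CrawlJobHandler bare = new CrawlJobHandler(jobsDir, false, false);

        // The one-argument constructor is equivalent to
        // new CrawlJobHandler(jobsDir, true, true): it scans jobsDir for
        // subdirectories containing a readable '*.job' state file (see
        // getStateJobFile/loadJobs) and files each job into the pending or
        // completed set, ordered by priority and then by UID (a timestamp),
        // per the Comparator built in the constructor.
        CrawlJobHandler loaded = new CrawlJobHandler(jobsDir);
    }
}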