
📄 CrawlJobHandler.java

📁 A web crawler combined with Lucene, a good pairing
💻 JAVA
📖 Page 1 of 4
/* CrawlJobHandler
 *
 * $Id: CrawlJobHandler.java 5055 2007-04-10 22:12:56Z gojomo $
 *
 * Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
package org.archive.crawler.admin;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URI;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Date;
import java.util.Enumeration;
import java.util.Iterator;
import java.util.List;
import java.util.TreeSet;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.management.Attribute;
import javax.management.AttributeNotFoundException;
import javax.management.InvalidAttributeValueException;
import javax.management.MBeanException;
import javax.management.ReflectionException;
import org.apache.commons.httpclient.URIException;
import org.archive.crawler.Heritrix;
import org.archive.crawler.datamodel.CrawlOrder;
import org.archive.crawler.event.CrawlStatusListener;
import org.archive.crawler.framework.FrontierMarker;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;
import org.archive.crawler.framework.exceptions.InitializationException;
import org.archive.crawler.framework.exceptions.InvalidFrontierMarkerException;
import org.archive.crawler.frontier.FrontierJournal;
import org.archive.crawler.frontier.RecoveryJournal;
import org.archive.crawler.settings.ComplexType;
import org.archive.crawler.settings.CrawlerSettings;
import org.archive.crawler.settings.SettingsHandler;
import org.archive.crawler.settings.XMLSettingsHandler;
import org.archive.util.ArchiveUtils;
import org.archive.util.FileUtils;

/**
 * This class manages CrawlJobs. Submitted crawl jobs are queued up and run
 * in order when the crawler is running.
 * <p>Basically this provides a layer between any potential user interface and
 * the CrawlJobs.  It keeps the lists of completed jobs, pending jobs, etc.
 * <p>
 * The jobs managed by the handler can be divided into the following:
 * <ul>
 *  <li> <code>Pending</code> - Jobs that are ready to run and are waiting their
 *                              turn. These can be edited, viewed, deleted etc.
 *  <li> <code>Running</code> - Only one job can be running at a time. There may
 *                              be no job running. The running job can be viewed
 *                              and edited to some extent. It can also be
 *                              terminated. This job should have a
 *                              StatisticsTracking module attached to it for more
 *                              details on the crawl.
 *  <li> <code>Completed</code> - Jobs that have finished crawling or have been
 *                              deleted from the pending queue or terminated
 *                              while running. They cannot be edited but can be
 *                              viewed. They retain the StatisticsTracking
 *                              module from their run.
 *  <li> <code>New job</code> - At any given time there can be one 'new job'. The
 *                              new job is not considered ready to run. It can
 *                              be edited or discarded (in which case it will be
 *                              totally destroyed, including any files on disk).
 *                              Once an operator deems the job ready to run it
 *                              can be moved to the pending queue.
 *  <li> <code>Profiles</code> - Jobs under profiles are not actual jobs. They can
 *                              be edited normally but cannot be submitted to
 *                              the pending queue. New jobs can be created
 *                              using a profile as their template.
 * </ul>
 *
 * @author Kristinn Sigurdsson
 *
 * @see org.archive.crawler.admin.CrawlJob
 */
public class CrawlJobHandler implements CrawlStatusListener {
    private static final Logger logger =
        Logger.getLogger(CrawlJobHandler.class.getName());

    /**
     * Name of system property whose specification overrides default profile
     * used.
     */
    public static final String DEFAULT_PROFILE_NAME
        = "heritrix.default.profile";

    /**
     * Default profile name.
     */
    public static final String DEFAULT_PROFILE = "default";

    /**
     * Name of the profiles directory.
     */
    public static final String PROFILES_DIR_NAME = "profiles";

    public static final String ORDER_FILE_NAME = "order.xml";

    /**
     * Job currently being crawled.
     */
    private CrawlJob currentJob = null;

    /**
     * A new job that is being created/configured. Not yet ready for crawling.
     */
    private CrawlJob newJob = null;

    /**
     * Thread to start the next job in background.
     */
    private Thread startingNextJob = null;

    /**
     * A list of pending CrawlJobs.
     */
    private TreeSet<CrawlJob> pendingCrawlJobs;

    /**
     * A list of completed CrawlJobs.
     */
    //private Vector completedCrawlJobs = new Vector();
    private TreeSet<CrawlJob> completedCrawlJobs;

    /**
     * A list of profile CrawlJobs.
     */
    private TreeSet<CrawlJob> profileJobs;

    // The UIDs of profiles should NOT be timestamps. A descriptive name is
    // ideal.
    private String defaultProfile = null;

    /**
     * If true the crawler is 'running'. That is, the next pending job will start
     * crawling as soon as the current job (if any) is completed.
     */
    private boolean running = false;

    /**
     * String to indicate recovery should be based on the recovery log, not
     * based on checkpointing.
     */
    public static final String RECOVER_LOG = "recover";

    /**
     * Jobs directory.
     */
    private final File jobsDir;

    /**
     * Constructor.
     * @param jobsDir Jobs directory.
     */
    public CrawlJobHandler(final File jobsDir) {
        this(jobsDir, true, true);
    }

    /**
     * Constructor allowing for optional loading of profiles and jobs.
     * @param jobsDir Jobs directory.
     * @param loadJobs If true then any applicable jobs will be loaded.
     * @param loadProfiles If true then any applicable profiles will be loaded.
     */
    public CrawlJobHandler(final File jobsDir,
            final boolean loadJobs, final boolean loadProfiles) {
        this.jobsDir = jobsDir;
        // Make a comparator for CrawlJobs.
        Comparator<CrawlJob> comp = new Comparator<CrawlJob>() {
            public int compare(CrawlJob job1, CrawlJob job2) {
                if (job1.getJobPriority() < job2.getJobPriority()) {
                    return -1;
                } else if (job1.getJobPriority() > job2.getJobPriority()) {
                    return 1;
                } else {
                    // Same priority, use UID (which should be a timestamp).
                    // Lower UID (string compare) means earlier time.
                    return job1.getUID().compareTo(job2.getUID());
                }
            }
        };
        this.pendingCrawlJobs = new TreeSet<CrawlJob>(comp);
        this.completedCrawlJobs = new TreeSet<CrawlJob>(comp);
        // Profiles always have the same priority so they will be sorted by name.
        this.profileJobs = new TreeSet<CrawlJob>(comp);
        if (loadProfiles) {
            loadProfiles();
        }
        if (loadJobs) {
            loadJobs();
        }
    }

    /**
     * Find the state.job file in the job directory.
     * @param jobDir Directory to look in.
     * @return Full path to 'state.job' file or null if none found.
     */
    protected File getStateJobFile(final File jobDir) {
        // Need to find job file ('state.job').
        File[] jobFiles = jobDir.listFiles(new FilenameFilter() {
            public boolean accept(File dir, String name) {
                return name.toLowerCase().endsWith(".job") &&
                    (new File(dir, name)).canRead();
            }
        });
        return (jobFiles.length == 1)? jobFiles[0]: null;
    }

    /**
     * Loads any available jobs in the jobs directory.
     * <p>
     * Available jobs are any directory containing a file called
     * <code>state.job</code>. The file must contain valid job information.
     */
    private void loadJobs() {
        this.jobsDir.mkdirs();
        File[] jobs = this.jobsDir.listFiles();
        for (int i = 0; i < jobs.length; i++) {
            if (jobs[i].isDirectory()) {
                File jobFile = getStateJobFile(jobs[i]);
                if (jobFile != null) {
                    loadJob(jobFile);
                }
            }
        }
    }

    /**
     * Loads a job given a specific job file. The loaded job will be placed in
     * the list of completed jobs or pending queue depending on its status.
     * Running jobs will have their status set to 'finished abnormally' and put
     * into the completed list.
     * @param job The job file of the job to load.
     */
    protected void loadJob(final File job) {
        CrawlJob cjob = null;
        try {
            // Load the CrawlJob.
            cjob = new CrawlJob(job, new CrawlJobErrorHandler());
        } catch (InvalidJobFileException e) {
            logger.log(Level.INFO,
                    "Invalid job file for " + job.getAbsolutePath(), e);
            return;
        } catch (IOException e) {
            logger.log(Level.INFO, "IOException for " + job.getName() +
                    ", " + job.getAbsolutePath(), e);
            return;
        }

        // TODO: Move test into CrawlJob.
        // Check job status and place it accordingly.
        if (cjob.getStatus().equals(CrawlJob.STATUS_RUNNING)
                || cjob.getStatus().equals(CrawlJob.STATUS_PAUSED)
                || cjob.getStatus().equals(CrawlJob.STATUS_CHECKPOINTING)
                || cjob.getStatus().equals(CrawlJob.STATUS_WAITING_FOR_PAUSE)) {
            // Was a running job.
            cjob.setStatus(CrawlJob.STATUS_FINISHED_ABNORMAL);
            this.completedCrawlJobs.add(cjob);
        } else if (cjob.getStatus().equals(CrawlJob.STATUS_PENDING)) {
            // Was a pending job.
            this.pendingCrawlJobs.add(cjob);
        } else if (cjob.getStatus().equals(CrawlJob.STATUS_CREATED)
                || cjob.getStatus().equals(CrawlJob.STATUS_DELETED)) {
            // Ignore for now. TODO: Add to 'recycle bin'.
        } else {
            // Must have been completed.
            this.completedCrawlJobs.add(cjob);
        }
    }

    /**
     * Looks in conf dir for a profiles dir.
     * @return the directory where profiles are stored else null if none
     * available
     * @throws IOException
     */
    private File getProfilesDirectory() throws IOException {
        URL webappProfilePath = Heritrix.class.getResource("/" +
            PROFILES_DIR_NAME);
        if (webappProfilePath != null) {
            try {
                return new File(new URI(webappProfilePath.toString()));
            } catch (java.lang.IllegalArgumentException e) {
                // e.g. "profiles" within a jar file;
                // try Heritrix.getConfdir() in this case.
            } catch (java.net.URISyntaxException e) {
                e.printStackTrace();
            }
        }
        return (Heritrix.getConfdir(false) == null)? null:
            new File(Heritrix.getConfdir().getAbsolutePath(),
                PROFILES_DIR_NAME);
    }

    /**
     * Loads the default profile and all other profiles found on disk.
     */
    private void loadProfiles() {
        boolean loadedDefault = false;
        File profileDir = null;
        try {
            profileDir = getProfilesDirectory();
        } catch (IOException e) {
            e.printStackTrace();
        }
        if (profileDir != null) {
            File[] ps = profileDir.listFiles();
            if (ps != null && ps.length > 0) {
                for (int i = 0; i < ps.length; i++) {
                    File f = ps[i];
                    if (f.isDirectory()) {
                        // Each directory in the profiles directory should
                        // contain the file order.xml.
                        File profile = new File(f, ORDER_FILE_NAME);
                        if (profile.canRead()) {
                            boolean b = loadProfile(profile);
                            if (b) {
                                loadedDefault = b;
                            }
    // ... (listing truncated here; the remainder of the file is on pages 2-4)
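
As a quick orientation for the portion shown, here is a minimal usage sketch. It assumes Heritrix 1.x is on the classpath; the jobs directory path and the demo class name are hypothetical, and only the two constructors visible in the listing above are used.

import java.io.File;

import org.archive.crawler.admin.CrawlJobHandler;

public class CrawlJobHandlerDemo {
    public static void main(String[] args) {
        // Hypothetical jobs directory. CrawlJobHandler(File) delegates to
        // CrawlJobHandler(File, true, true), so any profiles and previously
        // saved jobs found on disk are loaded at construction time.
        File jobsDir = new File("heritrix/jobs");
        CrawlJobHandler handler = new CrawlJobHandler(jobsDir);

        // Skip the disk scan entirely (e.g. for tests): neither profiles
        // nor saved jobs are loaded.
        CrawlJobHandler bare = new CrawlJobHandler(jobsDir, false, false);
    }
}

Note the ordering established by the comparator in the three-argument constructor: pending and completed jobs sort by priority first, then by UID, so jobs of equal priority come back in timestamp (string-compare) order.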
