📄 heritrix.java
字号:
/* Heritrix * * $Id: Heritrix.java,v 1.142.2.1 2006/09/18 20:42:55 stack-sf Exp $ * * Created on May 15, 2003 * * Copyright (C) 2003 Internet Archive. * * This file is part of the Heritrix web crawler (crawler.archive.org). * * Heritrix is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * Heritrix is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser Public License for more details. * * You should have received a copy of the GNU Lesser Public License * along with Heritrix; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */package org.archive.crawler;import java.io.File;import java.io.FileInputStream;import java.io.FileNotFoundException;import java.io.FileOutputStream;import java.io.IOException;import java.io.InputStream;import java.io.PrintStream;import java.io.PrintWriter;import java.net.HttpURLConnection;import java.net.InetAddress;import java.net.URL;import java.net.URLConnection;import java.net.UnknownHostException;import java.util.ArrayList;import java.util.Arrays;import java.util.Collection;import java.util.Collections;import java.util.Enumeration;import java.util.Hashtable;import java.util.Iterator;import java.util.List;import java.util.Map;import java.util.Properties;import java.util.StringTokenizer;import java.util.TimeZone;import java.util.Vector;import java.util.logging.Level;import java.util.logging.LogManager;import java.util.logging.Logger;import javax.management.Attribute;import javax.management.AttributeList;import javax.management.AttributeNotFoundException;import javax.management.DynamicMBean;import javax.management.InstanceAlreadyExistsException;import 
javax.management.InstanceNotFoundException;import javax.management.InvalidAttributeValueException;import javax.management.MBeanInfo;import javax.management.MBeanNotificationInfo;import javax.management.MBeanOperationInfo;import javax.management.MBeanRegistration;import javax.management.MBeanRegistrationException;import javax.management.MBeanServer;import javax.management.MBeanServerFactory;import javax.management.MalformedObjectNameException;import javax.management.NotCompliantMBeanException;import javax.management.ObjectName;import javax.management.ReflectionException;import javax.management.RuntimeOperationsException;import javax.management.openmbean.CompositeData;import javax.management.openmbean.CompositeDataSupport;import javax.management.openmbean.CompositeType;import javax.management.openmbean.OpenDataException;import javax.management.openmbean.OpenMBeanAttributeInfoSupport;import javax.management.openmbean.OpenMBeanConstructorInfoSupport;import javax.management.openmbean.OpenMBeanInfoSupport;import javax.management.openmbean.OpenMBeanOperationInfoSupport;import javax.management.openmbean.OpenMBeanParameterInfo;import javax.management.openmbean.OpenMBeanParameterInfoSupport;import javax.management.openmbean.OpenType;import javax.management.openmbean.SimpleType;import javax.management.openmbean.TabularData;import javax.management.openmbean.TabularDataSupport;import javax.management.openmbean.TabularType;import javax.naming.CompoundName;import javax.naming.Context;import javax.naming.NameNotFoundException;import javax.naming.NamingException;import javax.naming.NoInitialContextException;import org.apache.commons.cli.Option;import org.archive.crawler.admin.CrawlJob;import org.archive.crawler.admin.CrawlJobErrorHandler;import org.archive.crawler.admin.CrawlJobHandler;import org.archive.crawler.datamodel.CredentialStore;import org.archive.crawler.datamodel.credential.Credential;import org.archive.crawler.event.CrawlStatusListener;import 
org.archive.crawler.framework.AlertManager;import org.archive.crawler.framework.CrawlController;import org.archive.crawler.framework.exceptions.FatalConfigurationException;import org.archive.crawler.framework.exceptions.InitializationException;import org.archive.crawler.selftest.SelfTestCrawlJobHandler;import org.archive.crawler.settings.XMLSettingsHandler;import org.archive.io.SinkHandler;import org.archive.io.SinkHandlerLogRecord;import org.archive.net.UURI;import org.archive.util.FileUtils;import org.archive.util.IoUtils;import org.archive.util.JmxUtils;import org.archive.util.JndiUtils;import org.archive.util.PropertyUtils;import org.archive.util.TextUtils;import sun.net.www.protocol.file.FileURLConnection;/** * Main class for Heritrix crawler. * * Heritrix is usually launched by a shell script that backgrounds heritrix and * redirects all stdout and stderr emitted by heritrix to a log file. So that * startup messages emitted subsequent to the redirection of stdout and stderr * show on the console, this class prints usage or startup output such as where * the web UI can be found, etc., to a STARTLOG that the shell script is waiting * on. As soon as the shell script sees output in this file, it prints its * content and breaks out of its wait. See ${HERITRIX_HOME}/bin/heritrix. * * <p> * Heritrix can also be embedded or launched by webapp initialization or by JMX * bootstrapping. So far I count 4 methods of instantiation: * <ol> * <li>From this class's main -- the method usually used;</li> * <li>From the Heritrix UI (The local-instances.jsp) page;</li> * <li>A creation by a JMX agent at the behest of a remote JMX client; and</li> * <li>A container such as tomcat or jboss.</li> * </ol> * * @author gojomo * @author Kristinn Sigurdsson * @author Stack */public class Heritrix implements DynamicMBean, MBeanRegistration{ /** * Heritrix logging instance. 
*/ private static final Logger logger = Logger.getLogger(Heritrix.class .getName()); /** * Temporary directory: the value of the <code>java.io.tmpdir</code> system * property, or <code>/tmp</code> if that property is not set. */ private static final File TMPDIR = new File(System.getProperty( "java.io.tmpdir", "/tmp")); /** * Name of the heritrix properties file. */ private static final String PROPERTIES = "heritrix.properties"; /** * Name of the key to use specifying alternate heritrix properties on * command line. */ private static final String PROPERTIES_KEY = PROPERTIES; /** * Prefix used on properties we'll add to the System.properties list. */ private static final String HERITRIX_PROPERTIES_PREFIX = "heritrix."; /** * Instance of web server if one was started. */ private static SimpleHttpServer httpServer = null; /** * CrawlJob handler. Manages multiple crawl jobs at runtime. */ private CrawlJobHandler jobHandler = null; /** * Heritrix start log file. * * This file contains standard out produced by this main class for startup * only. Used by heritrix shell script. Name here MUST match that in the * <code>bin/heritrix</code> shell script. This is a DEPENDENCY the shell * wrapper has on this here java heritrix. */ private static final String STARTLOG = "heritrix_dmesg.log"; /** * Default encoding. * * Used for content when fetching if none specified. */ public static final String DEFAULT_ENCODING = "ISO-8859-1"; /** * Heritrix stderr/stdout log file. * * This file should have nothing in it except messages over which we have no * control (JVM stacktrace, 3rd-party lib emissions). The wrapper startup * script directs stderr/stdout here. This is an INTERDEPENDENCY this * program has with the wrapper shell script. Shell can actually pass us an * alternate to use for this file. */ private static String DEFAULT_HERITRIX_OUT = "heritrix_out.log"; /** * Where to write this class's startup output. * * This out should only be used if Heritrix is being run from the * command-line. 
*/ private static PrintWriter out = null; /** * The org.archive package. */ private static final String ARCHIVE_PACKAGE = "org.archive."; /** * The crawler package. */ private static final String CRAWLER_PACKAGE = Heritrix.class.getName() .substring(0, Heritrix.class.getName().lastIndexOf('.')); /** * The root context for a webapp. */ private static final String ROOT_CONTEXT = "/"; /** * Set to true if application is started from command line. */ private static boolean commandLine = false; /** * True if container initialization has been run. */ private static boolean containerInitialized = false; /** * File name suffix of jar files. */ private static final String JAR_SUFFIX = ".jar"; /** * Alert manager of this instance. Assigned in the constructor. */ private AlertManager alertManager; /** * The context of the GUI webapp. Default is root. */ private static String adminContext = ROOT_CONTEXT; /** * True if we're to put up a GUI. Cmdline processing can override. */ private static boolean gui = !PropertyUtils .getBooleanProperty("heritrix.cmdline.nowui"); /** * Port to put the GUI up on. Cmdline processing can override. */ private static int guiPort = SimpleHttpServer.DEFAULT_PORT; /** * A collection containing only localhost. Used as default value for * guiHosts, and passed to SimpleHttpServer when doing selftest. */ final private static Collection<String> LOCALHOST_ONLY = Collections .unmodifiableList(Arrays.asList(new String[] { "127.0.0.1" })); /** * Hosts to bind the GUI webserver to. By default, only contains localhost. * Set to an empty collection to indicate that all available network * interfaces should be used for the webserver. */ private static Collection<String> guiHosts = LOCALHOST_ONLY; /** * Web UI server, realm, context name. */ private static String ADMIN = "admin"; // OpenMBean support. /** * The MBean server we're registered with (May be null). */ private MBeanServer mbeanServer = null; /** * MBean name we were registered as. */ private ObjectName mbeanName = null; /** * Keep reference to all instances of Heritrix. 
Used by the UI to figure * which of the local Heritrice it should be going against and to figure * what to shut down on the way out (If there was always a JMX Agent, we * wouldn't need to keep this list. We could always ask the JMX Agent for * all instances. UPDATE: True we could always ask the JMX Agent but we * might keep around this local reference because it will allow faster, less * awkward -- think of marshalling the args for JMX invoke operation -- * access to local Heritrix instances. A new usage for this instances Map is * in CrawlJob#preRegister to find the hosting Heritrix instance). */ private static Map instances = new Hashtable(); /** * MBean info of this instance. Built by buildMBeanInfo() in the * constructor. */ private OpenMBeanInfoSupport openMBeanInfo; /** * Attribute names. Both are gathered into ATTRIBUTE_LIST. */ private final static String STATUS_ATTR = "Status"; private final static String VERSION_ATTR = "Version"; /** * List of the attribute names above. */ private final static List ATTRIBUTE_LIST; static { ATTRIBUTE_LIST = Arrays .asList(new String[] { STATUS_ATTR, VERSION_ATTR }); } /** * Operation names. All of the *_OPER constants that follow are gathered * into OPERATION_LIST. */ private final static String START_OPER = "start"; private final static String STOP_OPER = "stop"; private final static String DESTROY_OPER = "destroy"; private final static String INTERRUPT_OPER = "interrupt"; private final static String START_CRAWLING_OPER = "startCrawling"; private final static String STOP_CRAWLING_OPER = "stopCrawling"; private final static String ADD_CRAWL_JOB_OPER = "addJob"; private final static String TERMINATE_CRAWL_JOB_OPER = "terminateCurrentJob"; private final static String DELETE_CRAWL_JOB_OPER = "deleteJob"; private final static String ALERT_OPER = "alert"; private final static String ADD_CRAWL_JOB_BASEDON_OPER = "addJobBasedon"; private final static String PENDING_JOBS_OPER = "pendingJobs"; private final static String COMPLETED_JOBS_OPER = "completedJobs"; private final static String CRAWLEND_REPORT_OPER = "crawlendReport"; private final static String SHUTDOWN_OPER = "shutdown"; private final static String LOG_OPER = "log"; private final static String REBIND_JNDI_OPER = "rebindJNDI"; private final static List 
OPERATION_LIST; static { OPERATION_LIST = Arrays.asList(new String[] { START_OPER, STOP_OPER, INTERRUPT_OPER, START_CRAWLING_OPER, STOP_CRAWLING_OPER, ADD_CRAWL_JOB_OPER, ADD_CRAWL_JOB_BASEDON_OPER, DELETE_CRAWL_JOB_OPER, ALERT_OPER, PENDING_JOBS_OPER, COMPLETED_JOBS_OPER, CRAWLEND_REPORT_OPER, SHUTDOWN_OPER, LOG_OPER, DESTROY_OPER, TERMINATE_CRAWL_JOB_OPER, REBIND_JNDI_OPER }); } private CompositeType jobCompositeType = null; private TabularType jobsTabularType = null; private static final String[] JOB_KEYS = new String[] { "uid", "name", "status" }; private static String adminUsername; private static String adminPassword; /** * Constructor. Does not register the created instance with JMX. Assumed * this constructor is used by, for example, a JMX agent creating an * instance of Heritrix at the command of a remote client (In this case * Heritrix will be registered by the invoking agent). * * @throws IOException */ public Heritrix() throws IOException { this(null, false); } /** * Constructor. Passes a null name on, so brings up the default Heritrix * instance. * * @param jmxregister * True if we are to register this instance with JMX agent. * @throws IOException */ public Heritrix(final boolean jmxregister) throws IOException { this(null, jmxregister); } /** * Constructor. Uses a new CrawlJobHandler created from getJobsdir(). * * @param name * If null, we bring up the default Heritrix instance. * @param jmxregister * True if we are to register this instance with JMX agent. * @throws IOException */ public Heritrix(final String name, final boolean jmxregister) throws IOException { this(name, jmxregister, new CrawlJobHandler(getJobsdir())); } /** * Constructor. * * @param name * If null, we bring up the default Heritrix instance. * @param jmxregister * True if we are to register this instance with JMX agent. * @param cjh * CrawlJobHandler to use. * @throws IOException */ public Heritrix(final String name, final boolean jmxregister, final CrawlJobHandler cjh) throws IOException { super(); containerInitialization(); this.jobHandler = cjh; this.openMBeanInfo = buildMBeanInfo(); // Set up the alerting system. SinkHandler is also a global so will // catch alerts for all running Heritrix instances. 
Will need to // address (Add name of instance that threw the alert to SinkRecord?). final SinkHandler sinkHandler = SinkHandler.getInstance(); if (sinkHandler == null) { throw new NullPointerException("SinkHandler not found."); } // Adapt the alerting system to use SinkHandler. this.alertManager = new AlertManager() { public void add(SinkHandlerLogRecord record) { sinkHandler.publish(record); } public Vector getAll() { return sinkHandler.getAll(); } public Vector getNewAll() {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -