⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 heritrix.java

📁 爬虫
💻 JAVA
📖 第 1 页 / 共 5 页
字号:
/* Heritrix * * $Id: Heritrix.java,v 1.142.2.1 2006/09/18 20:42:55 stack-sf Exp $ * * Created on May 15, 2003 * * Copyright (C) 2003 Internet Archive. * * This file is part of the Heritrix web crawler (crawler.archive.org). * * Heritrix is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * Heritrix is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the * GNU Lesser Public License for more details. * * You should have received a copy of the GNU Lesser Public License * along with Heritrix; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */package org.archive.crawler;import java.io.File;import java.io.FileInputStream;import java.io.FileNotFoundException;import java.io.FileOutputStream;import java.io.IOException;import java.io.InputStream;import java.io.PrintStream;import java.io.PrintWriter;import java.net.HttpURLConnection;import java.net.InetAddress;import java.net.URL;import java.net.URLConnection;import java.net.UnknownHostException;import java.util.ArrayList;import java.util.Arrays;import java.util.Collection;import java.util.Collections;import java.util.Enumeration;import java.util.Hashtable;import java.util.Iterator;import java.util.List;import java.util.Map;import java.util.Properties;import java.util.StringTokenizer;import java.util.TimeZone;import java.util.Vector;import java.util.logging.Level;import java.util.logging.LogManager;import java.util.logging.Logger;import javax.management.Attribute;import javax.management.AttributeList;import javax.management.AttributeNotFoundException;import javax.management.DynamicMBean;import javax.management.InstanceAlreadyExistsException;import 
javax.management.InstanceNotFoundException;import javax.management.InvalidAttributeValueException;import javax.management.MBeanInfo;import javax.management.MBeanNotificationInfo;import javax.management.MBeanOperationInfo;import javax.management.MBeanRegistration;import javax.management.MBeanRegistrationException;import javax.management.MBeanServer;import javax.management.MBeanServerFactory;import javax.management.MalformedObjectNameException;import javax.management.NotCompliantMBeanException;import javax.management.ObjectName;import javax.management.ReflectionException;import javax.management.RuntimeOperationsException;import javax.management.openmbean.CompositeData;import javax.management.openmbean.CompositeDataSupport;import javax.management.openmbean.CompositeType;import javax.management.openmbean.OpenDataException;import javax.management.openmbean.OpenMBeanAttributeInfoSupport;import javax.management.openmbean.OpenMBeanConstructorInfoSupport;import javax.management.openmbean.OpenMBeanInfoSupport;import javax.management.openmbean.OpenMBeanOperationInfoSupport;import javax.management.openmbean.OpenMBeanParameterInfo;import javax.management.openmbean.OpenMBeanParameterInfoSupport;import javax.management.openmbean.OpenType;import javax.management.openmbean.SimpleType;import javax.management.openmbean.TabularData;import javax.management.openmbean.TabularDataSupport;import javax.management.openmbean.TabularType;import javax.naming.CompoundName;import javax.naming.Context;import javax.naming.NameNotFoundException;import javax.naming.NamingException;import javax.naming.NoInitialContextException;import org.apache.commons.cli.Option;import org.archive.crawler.admin.CrawlJob;import org.archive.crawler.admin.CrawlJobErrorHandler;import org.archive.crawler.admin.CrawlJobHandler;import org.archive.crawler.datamodel.CredentialStore;import org.archive.crawler.datamodel.credential.Credential;import org.archive.crawler.event.CrawlStatusListener;import 
org.archive.crawler.framework.AlertManager;import org.archive.crawler.framework.CrawlController;import org.archive.crawler.framework.exceptions.FatalConfigurationException;import org.archive.crawler.framework.exceptions.InitializationException;import org.archive.crawler.selftest.SelfTestCrawlJobHandler;import org.archive.crawler.settings.XMLSettingsHandler;import org.archive.io.SinkHandler;import org.archive.io.SinkHandlerLogRecord;import org.archive.net.UURI;import org.archive.util.FileUtils;import org.archive.util.IoUtils;import org.archive.util.JmxUtils;import org.archive.util.JndiUtils;import org.archive.util.PropertyUtils;import org.archive.util.TextUtils;import sun.net.www.protocol.file.FileURLConnection;/** * Main class for Heritrix crawler. *  * Heritrix is usually launched by a shell script that backgrounds heritrix that * redirects all stdout and stderr emitted by heritrix to a log file. So that * startup messages emitted subsequent to the redirection of stdout and stderr * show on the console, this class prints usage or startup output such as where * the web UI can be found, etc., to a STARTLOG that the shell script is waiting * on. As soon as the shell script sees output in this file, it prints its * content and breaks out of its wait. See ${HERITRIX_HOME}/bin/heritrix. *  * <p> * Heritrix can also be embedded or launched by webapp initialization or by JMX * bootstrapping. So far I count 4 methods of instantiation: * <ol> * <li>From this classes main -- the method usually used;</li> * <li>From the Heritrix UI (The local-instances.jsp) page;</li> * <li>A creation by a JMX agent at the behest of a remote JMX client; and</li> * <li>A container such as tomcat or jboss.</li> * </ol> *  * @author gojomo * @author Kristinn Sigurdsson * @author Stack */public class Heritrix implements DynamicMBean, MBeanRegistration{	/**	 * Heritrix logging instance.	 
*/
	private static final Logger logger = Logger.getLogger(Heritrix.class
		.getName());

	/**
	 * Temporary directory: the value of the <code>java.io.tmpdir</code>
	 * system property, or <code>/tmp</code> when that property is not set.
	 */
	private static final File TMPDIR = new File(System.getProperty(
		"java.io.tmpdir", "/tmp"));

	/**
	 * Name of the heritrix properties file.
	 */
	private static final String PROPERTIES = "heritrix.properties";

	/**
	 * Name of the key used to specify an alternate heritrix properties file
	 * on the command line. Has the same value as {@link #PROPERTIES}.
	 */
	private static final String PROPERTIES_KEY = PROPERTIES;

	/**
	 * Prefix carried by the properties we add to the System.properties list.
	 */
	private static final String HERITRIX_PROPERTIES_PREFIX = "heritrix.";

	/**
	 * Instance of the web server if one was started. Initially null.
	 */
	private static SimpleHttpServer httpServer = null;

	/**
	 * CrawlJob handler. Manages multiple crawl jobs at runtime.
	 */
	private CrawlJobHandler jobHandler = null;

	/**
	 * Heritrix start log file.
	 * 
	 * This file contains the standard out produced by this main class during
	 * startup only. It is read by the heritrix shell script, so the name
	 * here MUST match the one used in the <code>bin/heritrix</code> shell
	 * script. This is a DEPENDENCY the shell wrapper has on this Java class.
	 */
	private static final String STARTLOG = "heritrix_dmesg.log";

	/**
	 * Default encoding.
	 * 
	 * Used for content when fetching if none is specified.
	 */
	public static final String DEFAULT_ENCODING = "ISO-8859-1";

	/**
	 * Heritrix stderr/stdout log file.
	 * 
	 * This file should have nothing in it except messages over which we have
	 * no control (JVM stacktraces, 3rd-party library emissions). The wrapper
	 * startup script directs stderr/stdout here. This is an INTERDEPENDENCY
	 * this program has with the wrapper shell script. The shell can pass us
	 * an alternate name to use for this file.
	 * 
	 * NOTE(review): declared non-final despite the constant-style name;
	 * whether it is ever reassigned is not visible in this part of the file.
	 */
	private static String DEFAULT_HERITRIX_OUT = "heritrix_out.log";

	/**
	 * Where to write this classes startup output.
	 * 
	 * This out should only be used if Heritrix is being run from the
	 * command-line.
	 
*/
	private static PrintWriter out = null;

	/**
	 * Prefix of the org.archive package (note the trailing dot).
	 */
	private static final String ARCHIVE_PACKAGE = "org.archive.";

	/**
	 * The crawler package: this class's fully-qualified name cut off at its
	 * last dot, i.e. with the simple class name removed.
	 */
	private static final String CRAWLER_PACKAGE = Heritrix.class.getName()
		.substring(0, Heritrix.class.getName().lastIndexOf('.'));

	/**
	 * The root context for a webapp.
	 */
	private static final String ROOT_CONTEXT = "/";

	/**
	 * Set to true if the application is started from the command line.
	 */
	private static boolean commandLine = false;

	/**
	 * True if container initialization has been run.
	 */
	private static boolean containerInitialized = false;

	/**
	 * File-name suffix of jar files.
	 */
	private static final String JAR_SUFFIX = ".jar";

	/**
	 * Alert manager of this instance. The constructor installs an adapter
	 * that forwards to the SinkHandler.
	 */
	private AlertManager alertManager;

	/**
	 * The context of the GUI webapp. Default is root.
	 */
	private static String adminContext = ROOT_CONTEXT;

	/**
	 * True if we're to put up a GUI. Defaults to the negation of the
	 * <code>heritrix.cmdline.nowui</code> boolean property. Cmdline
	 * processing can override.
	 */
	private static boolean gui = !PropertyUtils
		.getBooleanProperty("heritrix.cmdline.nowui");

	/**
	 * Port to put the GUI up on. Cmdline processing can override.
	 */
	private static int guiPort = SimpleHttpServer.DEFAULT_PORT;

	/**
	 * An unmodifiable collection containing only localhost (127.0.0.1). Used
	 * as the default value for guiHosts, and passed to SimpleHttpServer when
	 * doing selftest.
	 */
	final private static Collection<String> LOCALHOST_ONLY = Collections
		.unmodifiableList(Arrays.asList(new String[] { "127.0.0.1" }));

	/**
	 * Hosts to bind the GUI webserver to. By default, only contains
	 * localhost. Set to an empty collection to indicate that all available
	 * network interfaces should be used for the webserver.
	 */
	private static Collection<String> guiHosts = LOCALHOST_ONLY;

	/**
	 * Web UI server, realm, context name.
	 */
	private static String ADMIN = "admin";

	/*
	 * NOTE(review): the line breaks of this copy of the file were lost, so
	 * the line comment below swallows the rest of its physical line,
	 * including the mbeanServer and mbeanName declarations. Restore the
	 * original line breaks before compiling.
	 */
	// OpenMBean support.	/**	 * The MBean server we're registered with (May be null).	 */	private MBeanServer mbeanServer = null;	/**	 * MBean name we were registered as.	 */	private ObjectName mbeanName = null;	/**	 * Keep reference to all instances of Heritrix. 
Used by the UI to figure
	 * which of the local Heritrice it should be going against and to figure
	 * what to shutdown on the way out (If there was always a JMX Agent, we
	 * wouldn't need to keep this list. We could always ask the JMX Agent for
	 * all instances. UPDATE: True we could always ask the JMX Agent but we
	 * might keep around this local reference because it will allow faster, less
	 * awkward -- think of marshalling the args for JMX invoke operation --
	 * access to local Heritrix instances. A new usage for this instances Map is
	 * in CrawlJob#preRegister to find the hosting Heritrix instance).
	 */
	private static Map instances = new Hashtable();

	/**
	 * MBean metadata of this instance; the constructor sets it from
	 * buildMBeanInfo().
	 */
	private OpenMBeanInfoSupport openMBeanInfo;

	/** Name of the "Status" attribute. */
	private final static String STATUS_ATTR = "Status";

	/** Name of the "Version" attribute. */
	private final static String VERSION_ATTR = "Version";

	/**
	 * List of the attribute names, filled in by the static initializer that
	 * follows.
	 */
	private final static List ATTRIBUTE_LIST;
	static
	{
		// Holds exactly the two attribute-name constants declared above.
		ATTRIBUTE_LIST = Arrays
			.asList(new String[] { STATUS_ATTR, VERSION_ATTR });
	}

	/*
	 * Operation-name constants. Presumably these are the names of the
	 * operations this MBean exposes -- confirm against the invoke and
	 * buildMBeanInfo code, which is not visible in this part of the file.
	 */
	private final static String START_OPER = "start";
	private final static String STOP_OPER = "stop";
	private final static String DESTROY_OPER = "destroy";
	private final static String INTERRUPT_OPER = "interrupt";
	private final static String START_CRAWLING_OPER = "startCrawling";
	private final static String STOP_CRAWLING_OPER = "stopCrawling";
	private final static String ADD_CRAWL_JOB_OPER = "addJob";
	private final static String TERMINATE_CRAWL_JOB_OPER = "terminateCurrentJob";
	private final static String DELETE_CRAWL_JOB_OPER = "deleteJob";
	private final static String ALERT_OPER = "alert";
	private final static String ADD_CRAWL_JOB_BASEDON_OPER = "addJobBasedon";
	private final static String PENDING_JOBS_OPER = "pendingJobs";
	private final static String COMPLETED_JOBS_OPER = "completedJobs";
	private final static String CRAWLEND_REPORT_OPER = "crawlendReport";
	private final static String SHUTDOWN_OPER = "shutdown";
	private final static String LOG_OPER = "log";
	private final static String REBIND_JNDI_OPER = "rebindJNDI";

	/**
	 * List of the operation-name constants. The field name and the static
	 * initializer that fills it follow on the next line.
	 */
	private final static List
OPERATION_LIST;
	static
	{
		// Gathers all seventeen *_OPER operation-name constants.
		OPERATION_LIST = Arrays.asList(new String[] { START_OPER, STOP_OPER,
				INTERRUPT_OPER, START_CRAWLING_OPER, STOP_CRAWLING_OPER,
				ADD_CRAWL_JOB_OPER, ADD_CRAWL_JOB_BASEDON_OPER,
				DELETE_CRAWL_JOB_OPER, ALERT_OPER, PENDING_JOBS_OPER,
				COMPLETED_JOBS_OPER, CRAWLEND_REPORT_OPER, SHUTDOWN_OPER,
				LOG_OPER, DESTROY_OPER, TERMINATE_CRAWL_JOB_OPER,
				REBIND_JNDI_OPER });
	}

	/**
	 * Composite open type for a job. Starts out null; where it is built is
	 * not visible in this part of the file.
	 */
	private CompositeType jobCompositeType = null;

	/**
	 * Tabular open type for jobs. Starts out null; where it is built is not
	 * visible in this part of the file.
	 */
	private TabularType jobsTabularType = null;

	/**
	 * The keys "uid", "name" and "status". Presumably the item names of the
	 * job composite type -- confirm against the code that builds it.
	 */
	private static final String[] JOB_KEYS = new String[] { "uid", "name",
			"status" };

	/** Admin user name; where it is set is not visible here. */
	private static String adminUsername;

	/** Admin password; where it is set is not visible here. */
	private static String adminPassword;

	/**
	 * Constructor. Does not register the created instance with JMX. This
	 * constructor is assumed to be used by, for example, a JMX agent creating
	 * an instance of Heritrix at the command of a remote client (in which
	 * case Heritrix will be registered by the invoking agent).
	 * 
	 * Delegates to {@link #Heritrix(String, boolean)} with a null name and
	 * <code>jmxregister</code> false.
	 * 
	 * @throws IOException
	 *             Declared by the constructor this one delegates to.
	 */
	public Heritrix() throws IOException
	{
		this(null, false);
	}

	/**
	 * Constructor. Delegates to {@link #Heritrix(String, boolean)} with a
	 * null name.
	 * 
	 * @param jmxregister
	 *            True if we are to register this instance with JMX agent.
	 * @throws IOException
	 *             Declared by the constructor this one delegates to.
	 */
	public Heritrix(final boolean jmxregister) throws IOException
	{
		this(null, jmxregister);
	}

	/**
	 * Constructor. Delegates to
	 * {@link #Heritrix(String, boolean, CrawlJobHandler)} with a new
	 * CrawlJobHandler created on the directory returned by getJobsdir().
	 * 
	 * @param name
	 *            If null, we bring up the default Heritrix instance.
	 * @param jmxregister
	 *            True if we are to register this instance with JMX agent.
	 * @throws IOException
	 *             Declared by the constructor this one delegates to.
	 */
	public Heritrix(final String name, final boolean jmxregister)
			throws IOException
	{
		this(name, jmxregister, new CrawlJobHandler(getJobsdir()));
	}

	/**
	 * Constructor. Runs container initialization, stores the passed job
	 * handler and builds the MBean info before setting up alerting.
	 * 
	 * @param name
	 *            If null, we bring up the default Heritrix instance.
	 * @param jmxregister
	 *            True if we are to register this instance with JMX agent.
	 * @param cjh
	 *            CrawlJobHandler to use.
	 * @throws IOException
	 */
	public Heritrix(final String name, final boolean jmxregister,
			final CrawlJobHandler cjh) throws IOException
	{
		super();
		containerInitialization();
		this.jobHandler = cjh;
		this.openMBeanInfo = buildMBeanInfo();
		// Set up the alerting system. 
SinkHandler is also a global so will		// catch alerts for all running Heritrix instances. Will need to		// address (Add name of instance that threw the alert to SinkRecord?).		final SinkHandler sinkHandler = SinkHandler.getInstance();		if (sinkHandler == null)		{			throw new NullPointerException("SinkHandler not found.");		}		// Adapt the alerting system to use SinkHandler.		this.alertManager = new AlertManager()		{			public void add(SinkHandlerLogRecord record)			{				sinkHandler.publish(record);			}			public Vector getAll()			{				return sinkHandler.getAll();			}			public Vector getNewAll()			{

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -