⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 heritrix.java

📁 高性能分词算法
💻 JAVA
📖 第 1 页 / 共 5 页
字号:
/* Heritrix * * $Id: Heritrix.java 6081 2008-12-09 00:58:14Z gojomo $ * * Created on May 15, 2003 * * Copyright (C) 2003 Internet Archive. * * This file is part of the Heritrix web crawler (crawler.archive.org). * * Heritrix is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * Heritrix is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the * GNU Lesser Public License for more details. * * You should have received a copy of the GNU Lesser Public License * along with Heritrix; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */package org.archive.crawler;import java.io.File;import java.io.FileInputStream;import java.io.FileNotFoundException;import java.io.FileOutputStream;import java.io.IOException;import java.io.InputStream;import java.io.PrintStream;import java.io.PrintWriter;import java.net.HttpURLConnection;import java.net.InetAddress;import java.net.URL;import java.net.URLConnection;import java.net.UnknownHostException;import java.util.ArrayList;import java.util.Arrays;import java.util.Collection;import java.util.Collections;import java.util.Enumeration;import java.util.Hashtable;import java.util.Iterator;import java.util.List;import java.util.Map;import java.util.Properties;import java.util.StringTokenizer;import java.util.TimeZone;import java.util.Vector;import java.util.logging.Level;import java.util.logging.LogManager;import java.util.logging.Logger;import javax.management.Attribute;import javax.management.AttributeList;import javax.management.AttributeNotFoundException;import javax.management.DynamicMBean;import javax.management.InstanceAlreadyExistsException;import 
javax.management.InstanceNotFoundException;import javax.management.InvalidAttributeValueException;import javax.management.MBeanInfo;import javax.management.MBeanNotificationInfo;import javax.management.MBeanOperationInfo;import javax.management.MBeanRegistration;import javax.management.MBeanRegistrationException;import javax.management.MBeanServer;import javax.management.MBeanServerFactory;import javax.management.MalformedObjectNameException;import javax.management.NotCompliantMBeanException;import javax.management.ObjectName;import javax.management.ReflectionException;import javax.management.RuntimeOperationsException;import javax.management.openmbean.CompositeData;import javax.management.openmbean.CompositeDataSupport;import javax.management.openmbean.CompositeType;import javax.management.openmbean.OpenDataException;import javax.management.openmbean.OpenMBeanAttributeInfoSupport;import javax.management.openmbean.OpenMBeanConstructorInfoSupport;import javax.management.openmbean.OpenMBeanInfoSupport;import javax.management.openmbean.OpenMBeanOperationInfoSupport;import javax.management.openmbean.OpenMBeanParameterInfo;import javax.management.openmbean.OpenMBeanParameterInfoSupport;import javax.management.openmbean.OpenType;import javax.management.openmbean.SimpleType;import javax.management.openmbean.TabularData;import javax.management.openmbean.TabularDataSupport;import javax.management.openmbean.TabularType;import javax.naming.CompoundName;import javax.naming.Context;import javax.naming.NameNotFoundException;import javax.naming.NamingException;import javax.naming.NoInitialContextException;import org.apache.commons.cli.Option;import org.archive.crawler.admin.CrawlJob;import org.archive.crawler.admin.CrawlJobErrorHandler;import org.archive.crawler.admin.CrawlJobHandler;import org.archive.crawler.datamodel.CredentialStore;import org.archive.crawler.datamodel.credential.Credential;import org.archive.crawler.event.CrawlStatusListener;import 
org.archive.crawler.framework.AlertManager;import org.archive.crawler.framework.CrawlController;import org.archive.crawler.framework.exceptions.FatalConfigurationException;import org.archive.crawler.framework.exceptions.InitializationException;import org.archive.crawler.selftest.SelfTestCrawlJobHandler;import org.archive.crawler.settings.XMLSettingsHandler;import org.archive.io.SinkHandler;import org.archive.io.SinkHandlerLogRecord;import org.archive.net.UURI;import org.archive.util.FileUtils;import org.archive.util.IoUtils;import org.archive.util.JmxUtils;import org.archive.util.JndiUtils;import org.archive.util.PropertyUtils;import org.archive.util.TextUtils;import sun.net.www.protocol.file.FileURLConnection;/** * Main class for Heritrix crawler. * * Heritrix is usually launched by a shell script that backgrounds heritrix * that redirects all stdout and stderr emitted by heritrix to a log file.  So * that startup messages emitted subsequent to the redirection of stdout and * stderr show on the console, this class prints usage or startup output * such as where the web UI can be found, etc., to a STARTLOG that the shell * script is waiting on.  As soon as the shell script sees output in this file, * it prints its content and breaks out of its wait. * See ${HERITRIX_HOME}/bin/heritrix. *  * <p>Heritrix can also be embedded or launched by webapp initialization or * by JMX bootstrapping.  So far I count 4 methods of instantiation: * <ol> * <li>From this classes main -- the method usually used;</li> * <li>From the Heritrix UI (The local-instances.jsp) page;</li> * <li>A creation by a JMX agent at the behest of a remote JMX client; and</li> * <li>A container such as tomcat or jboss.</li> * </ol> * * @author gojomo * @author Kristinn Sigurdsson * @author Stack */public class Heritrix implements DynamicMBean, MBeanRegistration {    /**     * Heritrix logging instance.     
*/    private static final Logger logger =        Logger.getLogger(Heritrix.class.getName());        private static final File TMPDIR =        new File(System.getProperty("java.io.tmpdir", "/tmp"));    /**     * Name of the heritrix properties file.     */    private static final String PROPERTIES = "heritrix.properties";    /**     * Name of the key to use specifying alternate heritrix properties on     * command line.     */    private static final String PROPERTIES_KEY = PROPERTIES;        /**     * Prefix used on our properties we'll add to the System.properties list.     */    private static final String HERITRIX_PROPERTIES_PREFIX = "heritrix.";    /**     * Prefix used on other properties we'll add to the System.properties      * list (after stripping this prefix).      */    private static final String SYSTEM_PREFIX = "system.";    /**     * Instance of web server if one was started.     */    private static SimpleHttpServer httpServer = null;    /**     * CrawlJob handler. Manages multiple crawl jobs at runtime.     */    private CrawlJobHandler jobHandler = null;    /**     * Heritrix start log file.     *     * This file contains standard out produced by this main class for startup     * only.  Used by heritrix shell script.  Name here MUST match that in the     * <code>bin/heritrix</code> shell script.  This is a DEPENDENCY the shell     * wrapper has on this here java heritrix.     */    private static final String STARTLOG = "heritrix_dmesg.log";    /**     * Default encoding.     *      * Used for content when fetching if none specified.     */	public static final String DEFAULT_ENCODING = "ISO-8859-1";    /**     * Heritrix stderr/stdout log file.     *     * This file should have nothing in it except messages over which we have     * no control (JVM stacktrace, 3rd-party lib emissions).  The wrapper     * startup script directs stderr/stdout here. This is an INTERDEPENDENCY     * this program has with the wrapper shell script.  
Shell can actually     * pass us an alternate to use for this file.     */    private static String DEFAULT_HERITRIX_OUT = "heritrix_out.log";    /**     * Where to write this class's startup output.     *      * This out should only be used if Heritrix is being run from the     * command-line.     */    private static PrintWriter out = null;    /**     * The org.archive package     */    private static final String ARCHIVE_PACKAGE = "org.archive.";    /**     * The crawler package.     */	private static final String CRAWLER_PACKAGE = Heritrix.class.getName().        substring(0, Heritrix.class.getName().lastIndexOf('.'));        /**     * The root context for a webapp.     */    private static final String ROOT_CONTEXT = "/";    /**     * Set to true if application is started from command line.     */    private static boolean commandLine = false;        /**     * True if container initialization has been run.     */    private static boolean containerInitialized = false;        /**     * True if properties have been loaded.     */    private static boolean propertiesLoaded = false;        private static final String JAR_SUFFIX = ".jar";        private AlertManager alertManager;    /**     * The context of the GUI webapp.  Default is root.     */    private static String adminContext = ROOT_CONTEXT;        /**     * True if we're to put up a GUI.     * Cmdline processing can override.     */    private static boolean gui =        !PropertyUtils.getBooleanProperty("heritrix.cmdline.nowui");        /**     * Port to put the GUI up on.     * Cmdline processing can override.     */    private static int guiPort = SimpleHttpServer.DEFAULT_PORT;        /**     * A collection containing only localhost.  Used as default value     * for guiHosts, and passed to SimpleHttpServer when doing selftest.     
*/    final private static Collection<String> LOCALHOST_ONLY =     Collections.unmodifiableList(Arrays.asList(new String[] { "127.0.0.1" }));        /**     * Hosts to bind the GUI webserver to.     * By default, only contains localhost.     * Set to an empty collection to indicate that all available network     * interfaces should be used for the webserver.     */    private static Collection<String> guiHosts = LOCALHOST_ONLY;            /**     * Web UI server, realm, context name.     */    private static String ADMIN = "admin";        // OpenMBean support.    /**     * The MBean server we're registered with (May be null).     */    private MBeanServer mbeanServer = null;        /**     * MBean name we were registered as.     */    private ObjectName mbeanName = null;        /**     * Keep reference to all instances of Heritrix.     * Used by the UI to figure which of the local Heritrice it should     * be going against and to figure what to shutdown on the way out (If     * there was always a JMX Agent, we wouldn't need to keep this list.  We     * could always ask the JMX Agent for all instances. UPDATE: True we could     * always ask the JMX Agent but we might keep around this local reference     * because it will allow faster, less awkward -- think of marshalling the args     * for JMX invoke operation -- access to local Heritrix instances.  A new     * usage for this instances Map is in CrawlJob#preRegister to find the hosting     * Heritrix instance).     
*/    private static Map<String,Heritrix> instances     = new Hashtable<String,Heritrix>();        private OpenMBeanInfoSupport openMBeanInfo;    private final static String STATUS_ATTR = "Status";    private final static String VERSION_ATTR = "Version";    private final static String ISRUNNING_ATTR = "IsRunning";    private final static String ISCRAWLING_ATTR = "IsCrawling";    private final static String ALERTCOUNT_ATTR = "AlertCount";    private final static String NEWALERTCOUNT_ATTR = "NewAlertCount";    private final static String CURRENTJOB_ATTR = "CurrentJob";    private final static List ATTRIBUTE_LIST;    static {        ATTRIBUTE_LIST = Arrays.asList(new String [] {STATUS_ATTR,            VERSION_ATTR, ISRUNNING_ATTR, ISCRAWLING_ATTR,            ALERTCOUNT_ATTR, NEWALERTCOUNT_ATTR, CURRENTJOB_ATTR});

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -