⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 heritrix.java

📁 爬虫
💻 JAVA
📖 第 1 页 / 共 5 页
字号:
				return sinkHandler.getAllUnread();			}			public SinkHandlerLogRecord get(String alertID)			{				return sinkHandler.get(Long.parseLong(alertID));			}			public int getCount()			{				return sinkHandler.getCount();			}			public int getNewCount()			{				return sinkHandler.getUnreadCount();			}			public void remove(String alertID)			{				sinkHandler.remove(Long.parseLong(alertID));			}			public void read(String alertID)			{				sinkHandler.read(Long.parseLong(alertID));			}		};		try		{			Heritrix.registerHeritrix(this, name, jmxregister);		}		catch (InstanceAlreadyExistsException e)		{			throw new RuntimeException(e);		}		catch (MBeanRegistrationException e)		{			throw new RuntimeException(e);		}		catch (NotCompliantMBeanException e)		{			throw new RuntimeException(e);		}		catch (MalformedObjectNameException e)		{			throw new RuntimeException(e);		}	}	/**	 * Run setup tasks for this 'container'. Idempotent.	 * 	 * @throws IOException	 */	protected static void containerInitialization() throws IOException	{		if (Heritrix.containerInitialized)		{			return;		}		Heritrix.containerInitialized = true;		// Load up the properties. This invocation adds heritrix properties		// to system properties so all available via System.getProperty.		// Note, loadProperties and patchLogging have global effects. May be an		// issue if we're running inside a container such as tomcat or jboss.		Heritrix.loadProperties();		Heritrix.patchLogging();		Heritrix.configureTrustStore();		// Will run on SIGTERM but not on SIGKILL, unfortunately.		// Otherwise, ensures we cleanup after ourselves (Deregister from		// JMX and JNDI).		Runtime.getRuntime().addShutdownHook(			Heritrix.getShutdownThread(false, 0, "Heritrix shutdown hook"));		// Register this heritrix 'container' though we may be inside another		// tomcat or jboss container.		try		{			registerContainerJndi();		}		catch (Exception e)		{			logger.log(Level.WARNING, "Failed jndi container registration.", e);		}	}	/**	 * Do inverse of construction. Used by anyone who does a 'new Heritrix' when	 * they want to cleanup the instance. Of note, there may be Heritrix threads	 * still hanging around after the call to destroy completes. They'll	 * eventually go down after they've finished their cleanup routines. In	 * particular, if you are watching Heritrix via JMX, you can see the	 * Heritrix instance JMX bean unregister ahead of the CrawlJob JMX bean that	 * its hosting.	 */	public void destroy()	{		stop();		try		{			Heritrix.unregisterHeritrix(this);		}		catch (InstanceNotFoundException e)		{			e.printStackTrace();		}		catch (MBeanRegistrationException e)		{			e.printStackTrace();		}		catch (NullPointerException e)		{			e.printStackTrace();		}		this.jobHandler = null;		this.openMBeanInfo = null;	}	/**	 * Launch program. Optionally will launch a web server to host UI. Will also	 * register Heritrix MBean with first found JMX Agent (Usually the 1.5.0 JVM	 * Agent).	 * 	 * @param args	 *            Command line arguments.	 * @throws Exception	 */	public static void main(String[] args) throws Exception	{		Heritrix.commandLine = true;		// Set timezone here. Would be problematic doing it if we're running		// inside in a container.		TimeZone.setDefault(TimeZone.getTimeZone("GMT+8"));		File startLog = new File(getHeritrixHome(), STARTLOG);		Heritrix.out = new PrintWriter(isDevelopment() ? System.out				: new PrintStream(new FileOutputStream(startLog)));		try		{			containerInitialization();			String status = doCmdLineArgs(args);			if (status != null)			{				Heritrix.out.println(status);			}		}		catch (Exception e)		{			// Show any exceptions in STARTLOG.			e.printStackTrace(Heritrix.out);			throw e;		}		finally		{			// If not development, close the file that signals the wrapper			// script that we've started. Otherwise, just flush it; if in			// development, the output is probably a console.			if (!isDevelopment())			{				if (Heritrix.out != null)				{					Heritrix.out.close();				}				System.out					.println("Heritrix version: " + Heritrix.getVersion());			}			else			{				if (Heritrix.out != null)				{					Heritrix.out.flush();				}			}		}	}	protected static String doCmdLineArgs(final String[] args) throws Exception	{		// Get defaults for commandline arguments from the properties file.		String tmpStr = PropertyUtils.getPropertyOrNull("heritrix.context");		if (tmpStr != null)		{			Heritrix.adminContext = tmpStr;		}		tmpStr = PropertyUtils.getPropertyOrNull("heritrix.cmdline.port");		if (tmpStr != null)		{			Heritrix.guiPort = Integer.parseInt(tmpStr);		}		tmpStr = PropertyUtils.getPropertyOrNull("heritrix.cmdline.admin");		String adminLoginPassword = (tmpStr == null) ? "" : tmpStr;		String crawlOrderFile = PropertyUtils			.getPropertyOrNull("heritrix.cmdline.order");		tmpStr = PropertyUtils.getPropertyOrNull("heritrix.cmdline.run");		boolean runMode = PropertyUtils			.getBooleanProperty("heritrix.cmdline.run");		boolean selfTest = false;		String selfTestName = null;		CommandLineParser clp = new CommandLineParser(args, Heritrix.out,			Heritrix.getVersion());		List arguments = clp.getCommandLineArguments();		Option[] options = clp.getCommandLineOptions();		// Check passed argument. Only one argument, the ORDER_FILE is allowed.		// If one argument, make sure exists and xml suffix.		if (arguments.size() > 1)		{			clp.usage(1);		}		else if (arguments.size() == 1)		{			crawlOrderFile = (String) arguments.get(0);			if (!(new File(crawlOrderFile).exists()))			{				clp.usage("ORDER.XML <" + crawlOrderFile						+ "> specified does not exist.", 1);			}			// Must end with '.xml'			if (crawlOrderFile.length() > 4					&& !crawlOrderFile.substring(crawlOrderFile.length() - 4)						.equalsIgnoreCase(".xml"))			{				clp.usage("ORDER.XML <" + crawlOrderFile						+ "> does not have required '.xml' suffix.", 1);			}		}		// Now look at options passed.		for (int i = 0; i < options.length; i++)		{			switch (options[i].getId())			{				case 'h':				clp.usage();					break;				case 'a':				adminLoginPassword = options[i].getValue();					break;				case 'n':				if (crawlOrderFile == null)				{					clp.usage("You must specify an ORDER_FILE with"							+ " '--nowui' option.", 1);				}				Heritrix.gui = false;					break;				case 'b':				Heritrix.guiHosts = parseHosts(options[i].getValue());					break;				case 'p':				try				{					Heritrix.guiPort = Integer.parseInt(options[i].getValue());				}				catch (NumberFormatException e)				{					clp.usage("Failed parse of port number: "							+ options[i].getValue(), 1);				}				if (Heritrix.guiPort <= 0)				{					clp.usage("Nonsensical port number: "							+ options[i].getValue(), 1);				}					break;				case 'r':				runMode = true;					break;				case 's':				selfTestName = options[i].getValue();				selfTest = true;					break;				default:				assert false : options[i].getId();			}		}		// Ok, we should now have everything to launch the program.		String status = null;		if (selfTest)		{			// If more than just '--selftest' and '--port' passed, then			// there is confusion on what is being asked of us. Print usage			// rather than proceed.			for (int i = 0; i < options.length; i++)			{				if (options[i].getId() != 'p' && options[i].getId() != 's')				{					clp.usage(1);				}			}			if (arguments.size() > 0)			{				// No arguments accepted by selftest.				clp.usage(1);			}			status = selftest(selfTestName, Heritrix.guiPort);		}		else		{			if (!isValidLoginPasswordString(adminLoginPassword))			{				clp.usage("Invalid admin login:password value, or none "						+ "specified. ", 1);			}			if (!Heritrix.gui)			{				if (options.length > 1)				{					// If more than just '--nowui' passed, then there is					// confusion on what is being asked of us. Print usage					// rather than proceed.					clp.usage(1);				}				Heritrix h = new Heritrix(true);				status = h.doOneCrawl(crawlOrderFile);			}			else			{				status = startEmbeddedWebserver(Heritrix.guiHosts,					Heritrix.guiPort, adminLoginPassword);				Heritrix h = new Heritrix(true);				String tmp = h.launch(crawlOrderFile, runMode);				if (tmp != null)				{					status += ('\n' + tmp);				}			}		}		return status;	}	/**	 * @return The file we dump stdout and stderr into.	 */	public static String getHeritrixOut()	{		String tmp = System.getProperty("heritrix.out");		if (tmp == null || tmp.length() == 0)		{			tmp = Heritrix.DEFAULT_HERITRIX_OUT;		}		return tmp;	}	/**	 * Exploit <code>-Dheritrix.home</code> if available to us. Is current	 * working dir if no heritrix.home property supplied.	 * 	 * @return Heritrix home directory.	 * @throws IOException	 */	protected static File getHeritrixHome() throws IOException	{		File heritrixHome = null;		String home = System.getProperty("heritrix.home");		if (home != null && home.length() > 0)		{			heritrixHome = new File(home);			if (!heritrixHome.exists())			{				throw new IOException("HERITRIX_HOME <" + home						+ "> does not exist.");			}		}		else		{			heritrixHome = new File(new File("").getAbsolutePath());		}		return heritrixHome;	}	/**	 * @return The directory into which we put jobs. If the system property	 *         'heritrix.jobsdir' is set, we will use its value in place of the	 *         default 'jobs' directory in the current working directory.	 * @throws IOException	 */	public static File getJobsdir() throws IOException	{		String jobsdirStr = System.getProperty("heritrix.jobsdir", "jobs");		return jobsdirStr.startsWith(File.separator) ? new File(jobsdirStr)				: new File(getHeritrixHome(), jobsdirStr);	}	/**	 * Get and check for existence of expected subdir.	 * 	 * If development flag set, then look for dir under src dir.	 * 	 * @param subdirName	 *            Dir to look for.	 * @return The extant subdir. Otherwise null if we're running in a webapp	 *         context where there is no conf directory available.	 * @throws IOException	 *             if unable to find expected subdir.	 */	protected static File getSubDir(String subdirName) throws IOException	{		return getSubDir(subdirName, true);	}	/**	 * Get and optionally check for existence of subdir.	 * 	 * If development flag set, then look for dir under src dir.	 * 	 * @param subdirName	 *            Dir to look for.	 * @param fail	 *            True if we are to fail if directory does not exist; false if	 *            we are to return false if the directory does not exist.	 * @return The extant subdir. Otherwise null if we're running in a webapp	 *         context where there is no subdir directory available.	 * @throws IOException	 *             if unable to find expected subdir.	 */	protected static File getSubDir(String subdirName, boolean fail)			throws IOException	{		String path = isDevelopment() ? "src" + File.separator + subdirName				: subdirName;		File dir = new File(getHeritrixHome(), path);		if (!dir.exists())		{			if (fail)			{				throw new IOException("Cannot find subdir: " + subdirName);			}			dir = null;		}		return dir;	}	/**	 * Test string is valid login/password string.	 * 	 * A valid login/password string has the login and password compounded w/ a

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -