⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 heritrix.java

📁 爬虫
💻 JAVA
📖 第 1 页 / 共 5 页
字号:
	 * ':' delimiter.	 * 	 * @param str	 *            String to test.	 * @return True if valid password/login string.	 */	protected static boolean isValidLoginPasswordString(String str)	{		boolean isValid = false;		StringTokenizer tokenizer = new StringTokenizer(str, ":");		if (tokenizer.countTokens() == 2)		{			String login = ((String) tokenizer.nextElement()).trim();			String password = ((String) tokenizer.nextElement()).trim();			if (login.length() > 0 && password.length() > 0)			{				isValid = true;			}		}		return isValid;	}	protected static boolean isDevelopment()	{		return System.getProperty("heritrix.development") != null;	}	/**	 * Load the heritrix.properties file.	 * 	 * Adds any property that starts with	 * <code>HERITRIX_PROPERTIES_PREFIX</code> or <code>ARCHIVE_PACKAGE</code>	 * into system properties (except logging '.level' directives).	 * 	 * @return Loaded properties.	 * @throws IOException	 */	protected static Properties loadProperties() throws IOException	{		Properties properties = new Properties();		properties.load(getPropertiesInputStream());		// Any property that begins with ARCHIVE_PACKAGE, make it		// into a system property. While iterating, check to see if anything		// defined on command-line, and if so, it overrules whats in		// heritrix.properties.		for (Enumeration e = properties.keys(); e.hasMoreElements();)		{			String key = ((String) e.nextElement()).trim();			if (key.startsWith(ARCHIVE_PACKAGE)					|| key.startsWith(HERITRIX_PROPERTIES_PREFIX))			{				// Don't add the heritrix.properties entries that are				// changing the logging level of particular classes.				
if (key.indexOf(".level") < 0)				{					if (System.getProperty(key) == null							|| System.getProperty(key).length() == 0)					{						System.setProperty(key, properties.getProperty(key)							.trim());					}				}			}		}		return properties;	}	protected static InputStream getPropertiesInputStream() throws IOException	{		File file = null;		// Look to see if properties have been passed on the cmd-line.		String alternateProperties = System.getProperty(PROPERTIES_KEY);		if (alternateProperties != null && alternateProperties.length() > 0)		{			file = new File(alternateProperties);		}		// Get properties from conf directory if one available.		if ((file == null || !file.exists()) && getConfdir(false) != null)		{			file = new File(getConfdir(), PROPERTIES);			if (!file.exists())			{				// If no properties file in the conf dir, set file back to				// null so we go looking for heritrix.properties on classpath.				file = null;			}		}		// If not on the command-line, there is no conf dir. Then get the		// properties from the CLASSPATH (Classpath file separator is always		// '/', whatever the platform.		InputStream is = (file != null) ? new FileInputStream(file)				: Heritrix.class.getResourceAsStream("/" + PROPERTIES_KEY);		if (is == null)		{			throw new IOException("Failed to load properties file from"					+ " filesystem or from classpath.");		}		return is;	}	/**	 * If the user hasn't altered the default logging parameters, tighten them	 * up somewhat: some of our libraries are way too verbose at the INFO or	 * WARNING levels.	 * 	 * This might be a problem running inside in someone else's container.	 * Container's seem to prefer commons logging so we ain't messing them doing	 * the below.	 
* 	 * @throws IOException	 * @throws SecurityException	 */	protected static void patchLogging() throws SecurityException, IOException	{		if (System.getProperty("java.util.logging.config.class") != null)		{			return;		}		if (System.getProperty("java.util.logging.config.file") != null)		{			return;		}		// No user-set logging properties established; use defaults		// from distribution-packaged 'heritrix.properties'.		LogManager.getLogManager()			.readConfiguration(getPropertiesInputStream());	}	/**	 * Configure our trust store.	 * 	 * If system property is defined, then use it for our truststore. Otherwise	 * use the heritrix truststore under conf directory if it exists.	 * 	 * <p>	 * If we're not launched from the command-line, we will not be able to find	 * our truststore. The truststore is nor normally used so rare should this	 * be a problem (In case where we don't use find our trust store, we'll use	 * the 'default' -- either the JVMs or the containers).	 */	protected static void configureTrustStore()	{		// Below must be defined in jsse somewhere but can' find it.		final String TRUSTSTORE_KEY = "javax.net.ssl.trustStore";		String value = System.getProperty(TRUSTSTORE_KEY);		File confdir = null;		try		{			confdir = getConfdir(false);		}		catch (IOException e)		{			logger.log(Level.WARNING, "Failed to get confdir.", e);		}		if ((value == null || value.length() <= 0) && confdir != null)		{			// Use the heritrix store if it exists on disk.			File heritrixStore = new File(confdir, "heritrix.cacerts");			if (heritrixStore.exists())			{				value = heritrixStore.getAbsolutePath();			}		}		if (value != null && value.length() > 0)		{			System.setProperty(TRUSTSTORE_KEY, value);		}	}	/**	 * Run the selftest	 * 	 * @param oneSelfTestName	 *            Name of a test if we are to run one only rather than the	 *            default running all tests.	 * @param port	 *            Port number to use for web UI.	 
* 	 * @exception Exception	 * @return Status of how selftest startup went.	 */	protected static String selftest(final String oneSelfTestName,			final int port) throws Exception	{		// Put up the webserver w/ the root and selftest webapps only.		final String SELFTEST = "selftest";		Heritrix.httpServer = new SimpleHttpServer(SELFTEST,			Heritrix.adminContext, LOCALHOST_ONLY, port, true);		// Set up digest auth for a section of the server so selftest can run		// auth tests. Looks like can only set one login realm going by the		// web.xml dtd. Otherwise, would be nice to selftest basic and digest.		// Have login, password and role all be SELFTEST. Must match what is		// in the selftest order.xml file.		Heritrix.httpServer.setAuthentication(SELFTEST, Heritrix.adminContext,			SELFTEST, SELFTEST, SELFTEST);		Heritrix.httpServer.startServer();		// Get the order file from the CLASSPATH unless we're running in dev		// environment.		File selftestDir = (isDevelopment()) ? new File(getConfdir(), SELFTEST)				: new File(File.separator + SELFTEST);		File crawlOrderFile = new File(selftestDir, "order.xml");		// Create a job based off the selftest order file. Then use this as		// a template to pass jobHandler.newJob(). Doing this gets our		// selftest output to show under the jobs directory.		// Pass as a seed a pointer to the webserver we just put up.		
final String ROOTURI = "127.0.0.1:" + Integer.toString(port);		String selfTestUrl = "http://" + ROOTURI + '/';		if (oneSelfTestName != null && oneSelfTestName.length() > 0)		{			selfTestUrl += (oneSelfTestName + '/');		}		CrawlJobHandler cjh = new SelfTestCrawlJobHandler(getJobsdir(),			oneSelfTestName, selfTestUrl);		Heritrix h = new Heritrix("Selftest", true, cjh);		CrawlJob job = createCrawlJob(cjh, crawlOrderFile, "Template");		job = h.getJobHandler().newJob(job, null, SELFTEST,			"Integration self test", selfTestUrl, CrawlJob.PRIORITY_CRITICAL);		h.getJobHandler().addJob(job);		// Before we start, need to change some items in the settings file.		CredentialStore cs = (CredentialStore) job.getSettingsHandler()			.getOrder().getAttribute(CredentialStore.ATTR_NAME);		for (Iterator i = cs.iterator(null); i.hasNext();)		{			((Credential) i.next()).setCredentialDomain(null, ROOTURI);		}		h.getJobHandler().startCrawler();		StringBuffer buffer = new StringBuffer();		buffer.append("Heritrix " + Heritrix.getVersion()				+ " selftest started.");		buffer.append("\nSelftest first crawls " + selfTestUrl				+ " and then runs an analysis.");		buffer.append("\nResult of analysis printed to " + getHeritrixOut()				+ " when done.");		buffer.append("\nSelftest job directory for logs and arcs:\n"				+ job.getDirectory().getAbsolutePath());		return buffer.toString();	}	/**	 * Launch the crawler without a web UI and run the passed crawl only.	 * 	 * Specialized version of {@link #launch()}.	 * 	 * @param crawlOrderFile	 *            The crawl order to crawl.	 * @throws InitializationException	 * @throws InvalidAttributeValueException	 * @return Status string.	 */	protected String doOneCrawl(String crawlOrderFile)			throws InitializationException, InvalidAttributeValueException	{		return doOneCrawl(crawlOrderFile, null);	}	/**	 * Launch the crawler without a web UI and run passed crawl only.	 * 	 * Specialized version of {@link #launch()}.	 
* 	 * @param crawlOrderFile	 *            The crawl order to crawl.	 * @param listener	 *            Register this crawl status listener before starting crawl (You	 *            can use this listener to notice end-of-crawl).	 * @throws InitializationException	 * @throws InvalidAttributeValueException	 * @return Status string.	 */	protected String doOneCrawl(String crawlOrderFile,			CrawlStatusListener listener) throws InitializationException,			InvalidAttributeValueException	{		XMLSettingsHandler handler = new XMLSettingsHandler(new File(			crawlOrderFile));		handler.initialize();		CrawlController controller = new CrawlController();		controller.initialize(handler);		if (listener != null)		{			controller.addCrawlStatusListener(listener);		}		controller.requestCrawlStart();		return "Crawl started using " + crawlOrderFile + ".";	}	/**	 * Launch the crawler for a web UI.	 * 	 * Crawler hangs around waiting on jobs.	 * 	 * @exception Exception	 * @return A status string describing how the launch went.	 * @throws Exception	 */	public String launch() throws Exception	{		return launch(null, false);	}	/**	 * Launch the crawler for a web UI.	 * 	 * Crawler hangs around waiting on jobs.	 * 	 * @param crawlOrderFile	 *            File to crawl. May be null.	 * @param runMode	 *            Whether crawler should be set to run mode.	 * 	 * @exception Exception	 * @return A status string describing how the launch went.	 
*/	public String launch(String crawlOrderFile, boolean runMode)			throws Exception	{		String status = null;		if (crawlOrderFile != null)		{			addCrawlJob(crawlOrderFile, "Autolaunched", "", "");			if (runMode)			{				this.jobHandler.startCrawler();				status = "Job being crawled: " + crawlOrderFile;			}			else			{				status = "Crawl job ready and pending: " + crawlOrderFile;			}		}		else if (runMode)		{			// The use case is that jobs are to be run on a schedule and that			// if the crawler is in run mode, then the scheduled job will be			// run at appropriate time. Otherwise, not.			this.jobHandler.startCrawler();			status = "Crawler set to run mode.";		}		return status;	}	/**	 * Start up the embedded Jetty webserver instance. This is done when we're	 * run from the command-line.	 * 	 * @param port	 *            Port number to use for web UI.	 * @param adminLoginPassword	 *            Compound of login and password.	 * @throws Exception	 * @return Status on webserver startup.	 * @deprecated Use startEmbeddedWebserver(hosts, port, adminLoginPassword)	 */	protected static String startEmbeddedWebserver(final int port,			final boolean lho, final String adminLoginPassword)			throws Exception	{		ArrayList<String> hosts = new ArrayList<String>();		if (lho)		{			hosts.add("127.0.0.1");		}		return startEmbeddedWebserver(hosts, port, adminLoginPassword);	}	/**	 * Parses a list of host names.	 * 	 * <p>	 * If the given string is <code>/</code>, then an empty collection is	 * returned. This indicates that all available network interfaces should be	 * used.	 * 	 * <p>	 * Otherwise, the string must contain a comma-separated list of IP addresses	 * or host names. The parsed list is then returned.	 
*
	 * @param hosts
	 *            the string to parse; must not be null
	 * @return the parsed collection of hosts
	 */
	private static Collection<String> parseHosts(String hosts)
	{
		hosts = hosts.trim();
		// A lone "/" stands for "no explicit hosts": return an empty list.
		if (hosts.equals("/"))
		{
			return new ArrayList<String>(1);
		}
		String[] hostArray = hosts.split(",");
		// Strip whitespace around each comma-separated entry.
		for (int i = 0; i < hostArray.length; i++)
		{
			hostArray[i] = hostArray[i].trim();
		}
		// Arrays.asList gives a fixed-size list view over the array.
		return Arrays.asList(hostArray);
	}

	/**
	 * Start up the embedded Jetty webserver instance. This is done when we're
	 * run from the command-line.
	 *
	 * @param hosts
	 *            a list of IP addresses or hostnames to bind to, or an empty
	 *            collection to bind to all available network interfaces
	 * @param port
	 *            Port number to use for web UI.
	 * @param adminLoginPassword
	 *            Compound of login and password, separated by ':'.
	 * @throws Exception
	 * @return Status on webserver startup.
	 */
	protected static String startEmbeddedWebserver(Collection<String> hosts,
			int port, String adminLoginPassword) throws Exception
	{
		// Split the compound at the first ':' into login and password.
		// NOTE(review): if there is no ':' indexOf returns -1 and the first
		// substring call throws -- presumably callers validate the string
		// beforehand; confirm.
		adminUsername = adminLoginPassword.substring(0, adminLoginPassword
			.indexOf(":"));
		adminPassword = adminLoginPassword.substring(adminLoginPassword
			.indexOf(":") + 1);
		Heritrix.httpServer = new SimpleHttpServer("admin",
			Heritrix.adminContext, hosts, port, false);

		final String DOTWAR = ".war";
		final String SELFTEST = "selftest";

		// Look for additional WAR files beyond 'selftest' and 'admin'.
		// NOTE(review): File.listFiles() returns null when the path is not a
		// readable directory -- confirm getWarsdir() always exists here.
		File[] wars = getWarsdir().listFiles();
		for (int i = 0; i < wars.length; i++)
		{
			if (wars[i].isFile())
			{
				final String warName = wars[i].getName();
				// Lower-cased copy used for the case-insensitive checks.
				final String warNameNC = warName.toLowerCase();
				if (warNameNC.endsWith(DOTWAR)
						&& !warNameNC.equals(ADMIN + DOTWAR)
						&& !warNameNC.equals(SELFTEST + DOTWAR))
				{
					// The webapp name is the file name up to its first '.'.
					// NOTE(review): a name with several dots (e.g.
					// my.app.war) is cut at the first one -- confirm that
					// is intended.
					int dot = warName.indexOf('.');
					Heritrix.httpServer.addWebapp(warName.substring(0, dot),
						null, true);
				}
			}
		}

		// Name of passed 'realm' must match what is configured in web.xml.
		// We'll use ROLE for 'realm' and 'role'.
		final String ROLE = ADMIN;
		Heritrix.httpServer.setAuthentication(ROLE, Heritrix.adminContext,
			adminUsername, adminPassword, ROLE);
		Heritrix.httpServer.startServer();

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -