
📄 heritrix.java

📁 Crawler
💻 JAVA
📖 Page 1 of 5
        StringBuffer buffer = new StringBuffer();
        buffer.append("Heritrix " + Heritrix.getVersion() + " is running.");
        for (String host : httpServer.getHosts())
        {
            buffer.append("\nWeb console is at: http://");
            buffer.append(host).append(':').append(port);
        }
        buffer.append("\nWeb console login and password: " + adminUsername
                + "/" + adminPassword);
        return buffer.toString();
    }

    /**
     * Replace existing administrator login info with new info.
     *
     * @param newUsername
     *            new administrator login username
     * @param newPassword
     *            new administrator login password
     */
    public static void resetAuthentication(String newUsername,
            String newPassword)
    {
        Heritrix.httpServer.resetAuthentication(ADMIN, adminUsername,
            newUsername, newPassword);
        adminUsername = newUsername;
        adminPassword = newPassword;
        logger.info("administrative login changed to " + newUsername + ":"
                + newPassword);
    }

    protected static CrawlJob createCrawlJob(CrawlJobHandler handler,
            File crawlOrderFile, String name)
            throws InvalidAttributeValueException
    {
        // Build a high-priority job whose working directory is the parent of
        // the order file.
        XMLSettingsHandler settings = new XMLSettingsHandler(crawlOrderFile);
        settings.initialize();
        return new CrawlJob(handler.getNextJobUID(), name, settings,
            new CrawlJobErrorHandler(Level.SEVERE), CrawlJob.PRIORITY_HIGH,
            crawlOrderFile.getAbsoluteFile().getParentFile());
    }

    /**
     * This method is called when we have an order file in hand that we want
     * to base a job on. It leaves the order file in place and just starts up
     * a job that uses all the locations the order points to for logs, etc.
     *
     * @param orderPathOrUrl
     *            Path to an order file or to a seeds file.
     * @param name
     *            Name to use for this job.
     * @param description
     * @param seeds
     * @return A status string.
     * @throws IOException
     * @throws FatalConfigurationException
     */
    public String addCrawlJob(String orderPathOrUrl, String name,
            String description, String seeds) throws IOException,
            FatalConfigurationException
    {
        if (!UURI.hasScheme(orderPathOrUrl))
        {
            // Assume it's a file path.
            return addCrawlJob(new File(orderPathOrUrl), name, description,
                seeds);
        }

        // Otherwise, it must be a URL.
        URL url = new URL(orderPathOrUrl);

        // Handle http and file only for now. (Tried to handle JarUrlConnection
        // but it is too awkward undoing the jar stream; rather just look for
        // URLs that end in '.jar'.)
        String result = null;
        URLConnection connection = url.openConnection();
        if (connection instanceof HttpURLConnection)
        {
            result = addCrawlJob(url, (HttpURLConnection) connection, name,
                description, seeds);
        }
        else if (connection instanceof FileURLConnection)
        {
            result = addCrawlJob(new File(url.getPath()), name, description,
                seeds);
        }
        else
        {
            throw new UnsupportedOperationException("No support for "
                    + connection);
        }
        return result;
    }

    protected String addCrawlJob(final URL url,
            final HttpURLConnection connection, final String name,
            final String description, final String seeds) throws IOException,
            FatalConfigurationException
    {
        // Look to see if it's a jar file. If it is, unpack it.
        boolean isJar = url.getPath() != null
                && url.getPath().toLowerCase().endsWith(JAR_SUFFIX);
        // If http url connection, bring the resource down locally.
        File localFile = File.createTempFile(Heritrix.class.getName(),
            isJar ? JAR_SUFFIX : null, TMPDIR);
        connection.connect();
        String result = null;
        try
        {
            IoUtils.readFullyToFile(connection.getInputStream(), localFile);
            result = addCrawlJob(localFile, name, description, seeds);
        }
        catch (IOException ioe)
        {
            // Clean up the partial download, then rethrow so the caller sees
            // the failure.
            localFile.delete();
            localFile = null;
            throw ioe;
        }
        finally
        {
            connection.disconnect();
            // If it's a jar file, then we made a job based on the jar
            // contents; it's no longer needed, so remove it. If it's not a
            // jar file, leave the file around because the job depends on it.
            if (isJar && localFile != null && localFile.exists())
            {
                localFile.delete();
            }
        }
        return result;
    }

    protected String addCrawlJob(final File order, final String name,
            final String description, final String seeds)
            throws FatalConfigurationException, IOException
    {
        CrawlJob addedJob = null;
        if (this.jobHandler == null)
        {
            throw new NullPointerException("Heritrix jobhandler is null.");
        }
        try
        {
            if (order.getName().toLowerCase().endsWith(JAR_SUFFIX))
            {
                return addCrawlJobBasedonJar(order, name, description, seeds);
            }
            addedJob = this.jobHandler.addJob(createCrawlJob(this.jobHandler,
                order, name));
        }
        catch (InvalidAttributeValueException e)
        {
            FatalConfigurationException fce = new FatalConfigurationException(
                "Converted InvalidAttributeValueException on "
                        + order.getAbsolutePath() + ": " + e.getMessage());
            fce.setStackTrace(e.getStackTrace());
            // Propagate the converted exception instead of silently
            // swallowing it.
            throw fce;
        }
        return addedJob != null ? addedJob.getUID() : null;
    }

    /**
     * Unpack a jar file and use it as the basis for a new job.
     *
     * @param jarFile
     *            Pointer to the file that holds the jar.
     * @param name
     *            Name to use for the new job.
     * @param description
     * @param seeds
     * @return Message.
     * @throws IOException
     * @throws FatalConfigurationException
     */
    protected String addCrawlJobBasedonJar(final File jarFile,
            final String name, final String description, final String seeds)
            throws IOException, FatalConfigurationException
    {
        if (jarFile == null || !jarFile.exists())
        {
            throw new FileNotFoundException(jarFile.getAbsolutePath());
        }
        // Create a directory with a tmp name. Do it by first creating a file,
        // removing it, then creating the directory. There is a hole during
        // which the OS may put a file of the same exact name in our way, but
        // that is unlikely.
        File dir = File.createTempFile(Heritrix.class.getName(),
            ".expandedjar", TMPDIR);
        dir.delete();
        dir.mkdir();
        try
        {
            org.archive.crawler.util.IoUtils.unzip(jarFile, dir);
            // Expect to find an order file at least.
            File orderFile = new File(dir, "order.xml");
            if (!orderFile.exists())
            {
                throw new IOException("Missing order: "
                        + orderFile.getAbsolutePath());
            }
            CrawlJob job = createCrawlJobBasedOn(orderFile, name, description,
                seeds);
            // Copy into place any seeds and settings directories before we
            // add the job to Heritrix to crawl.
            File seedsFile = new File(dir, "seeds.txt");
            if (seedsFile.exists())
            {
                FileUtils.copyFiles(seedsFile, new File(job.getDirectory(),
                    seedsFile.getName()));
            }
            File settingsDir = new File(dir, "settings");
            if (settingsDir.exists())
            {
                FileUtils.copyFiles(settingsDir, job.getDirectory());
            }
            addCrawlJob(job);
            return job.getUID();
        }
        finally
        {
            // After the job has been added, there is no more need of the
            // expanded content. (Let the caller be responsible for cleanup of
            // the jar itself. Sometimes it should be deleted -- when it's a
            // local copy of a jar pulled across the net -- whereas other
            // times, if it's a jar passed in with a 'file' scheme, it
            // shouldn't be deleted.)
            org.archive.util.FileUtils.deleteDir(dir);
        }
    }

    public String addCrawlJobBasedOn(String jobUidOrProfile, String name,
            String description, String seeds)
    {
        try
        {
            CrawlJob cj = getJobHandler().getJob(jobUidOrProfile);
            if (cj == null)
            {
                throw new InvalidAttributeValueException(jobUidOrProfile
                        + " is not a job UID or profile name (Job UIDs are"
                        + " usually the 14 digit date portion of the job"
                        + " name).");
            }
            CrawlJob job = addCrawlJobBasedOn(cj.getSettingsHandler()
                .getOrderFile(), name, description, seeds);
            return job.getUID();
        }
        catch (Exception e)
        {
            e.printStackTrace();
            return "Exception on " + jobUidOrProfile + ": " + e.getMessage();
        }
    }

    protected CrawlJob addCrawlJobBasedOn(final File orderFile,
            final String name, final String description, final String seeds)
            throws FatalConfigurationException
    {
        return addCrawlJob(createCrawlJobBasedOn(orderFile, name, description,
            seeds));
    }

    protected CrawlJob createCrawlJobBasedOn(final File orderFile,
            final String name, final String description, final String seeds)
            throws FatalConfigurationException
    {
        CrawlJob job = getJobHandler().newJob(orderFile, name, description,
            seeds);
        return CrawlJobHandler.ensureNewJobWritten(job, name, description);
    }

    protected CrawlJob addCrawlJob(final CrawlJob job)
    {
        return getJobHandler().addJob(job);
    }

    public void startCrawling()
    {
        if (getJobHandler() == null)
        {
            throw new NullPointerException("Heritrix jobhandler is null.");
        }
        getJobHandler().startCrawler();
    }

    public void stopCrawling()
    {
        if (getJobHandler() == null)
        {
            throw new NullPointerException("Heritrix jobhandler is null.");
        }
        getJobHandler().stopCrawler();
    }

    /**
     * Get the Heritrix version.
     *
     * @return The Heritrix version. May be null.
     */
    public static String getVersion()
    {
        return System.getProperty("heritrix.version");
    }

    /**
     * Get the job handler.
     *
     * @return The CrawlJobHandler being used.
     */
    public CrawlJobHandler getJobHandler()
    {
        return this.jobHandler;
    }

    /**
     * Get the configuration directory.
     *
     * @return The conf directory under HERITRIX_HOME or null if none can be
     *         found.
     * @throws IOException
     */
    public static File getConfdir() throws IOException
    {
        return getConfdir(true);
    }

    /**
     * Get the configuration directory.
     *
     * @param fail
     *            If true, throw an IOException when the directory can't be
     *            found; otherwise just return null.
     * @return The conf directory under HERITRIX_HOME, or null (or an
     *         IOException) if it can't be found.
     * @throws IOException
     */
    public static File getConfdir(final boolean fail) throws IOException
    {
        final String key = "heritrix.conf";
        // Look to see if the heritrix.conf property was passed on the
        // command line.
        String tmp = System.getProperty(key);
        // If not, fall back to the default $HERITRIX_HOME/conf.
        if (tmp == null || tmp.length() == 0)
        {
            return getSubDir("conf", fail);
        }
        File dir = new File(tmp);
        if (!dir.exists())
        {
            if (fail)
            {
                throw new IOException("Cannot find conf dir: " + tmp);
            }
            else
            {
                logger.log(Level.WARNING, "Specified " + key
                        + " dir does not exist. Falling back on default");
            }
            dir = getSubDir("conf", fail);
        }
        return dir;
    }

    /**
     * @return Returns the httpServer. May be null if one was not started.
     */
    public static SimpleHttpServer getHttpServer()
    {
        return Heritrix.httpServer;
    }

    /**
     * @throws IOException
     * @return Returns the directory under which reside the WAR files we're to
     *         load into the servlet container.
     */
    public static File getWarsdir() throws IOException
    {
        return getSubDir("webapps");
    }

    /**
     * Prepares for program shutdown. This method does its best to prepare the
     * program so that it can exit normally. It will kill the httpServer and
     * terminate any running job.<br>
     * It is advisable to wait a few (~1000) milliseconds after calling this
     * method and before calling performHeritrixShutDown() to allow as many
     * threads as possible to finish what they are doing.
     */
    public static void prepareHeritrixShutDown()
    {
        // Stop and destroy all running Heritrix instances.
        // Iterate over an array copy of the key set to avoid
        // ConcurrentModificationExceptions for the case where the call to
        // destroy removes an instance from Heritrix.instances.
        final Object[] keys = Heritrix.instances.keySet().toArray();
        for (int i = 0; i < keys.length; i++)
        {
            ((Heritrix) Heritrix.instances.get(keys[i])).destroy();
        }

        try
        {
            deregisterJndi(getJndiContainerName());
        }
        catch (NameNotFoundException e)
        {
            // We were probably unbound already. Ignore.
            logger.log(Level.WARNING, "deregistration of jndi", e);
        }
        catch (Exception e)
        {
            e.printStackTrace();
        }

        if (Heritrix.httpServer != null)
        {
            // Shut down the web access.
            try
            {
                Heritrix.httpServer.stopServer();
            }
            catch (InterruptedException e)
            {
                // Generally this can be ignored, but we'll print a stack
                // trace just in case.
                e.printStackTrace();
            }
            finally
            {
                Heritrix.httpServer = null;
            }
        }
    }

    /**
     * Exit the program. It is recommended that prepareHeritrixShutDown() be
     * invoked prior to this method.
     */
    public static void performHeritrixShutDown()
    {
        performHeritrixShutDown(0);
    }

    /**
     * Exit the program. It is recommended that prepareHeritrixShutDown() be
     * invoked prior to this method.
     *
     * @param exitCode
     *            Code to pass to System.exit.
     */
    public static void performHeritrixShutDown(int exitCode)
    {
        System.exit(exitCode);
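
The fragment above is only page 1 of 5 of the class, so the constructor and launch sequence are not shown here. Purely as a hedged sketch, the job-management methods that are shown might be driven from embedding code roughly as follows; how the Heritrix instance is obtained, the org.archive.crawler package path, and the null seeds argument are assumptions, not taken from this page.

import java.io.File;

import org.archive.crawler.Heritrix;

public class CrawlLauncherSketch
{
    // Drives the methods shown above: add a job from an order.xml on disk,
    // start crawling, and later shut the whole process down.
    public static void runOneCrawl(Heritrix heritrix, File orderXml)
            throws Exception
    {
        // addCrawlJob(String, ...) treats a scheme-less string as a local
        // file path; seeds is passed as null here on the assumption that the
        // order file already names a seeds list.
        String jobUid = heritrix.addCrawlJob(orderXml.getAbsolutePath(),
            "sketch-job", "Job created from " + orderXml.getName(), null);
        System.out.println("Added job: " + jobUid);

        // Ask the job handler to start working through pending jobs.
        heritrix.startCrawling();

        // ... wait for the crawl to finish (not covered by this fragment) ...

        // Shut down in the order the javadoc above recommends: prepare first,
        // pause briefly, then exit.
        Heritrix.prepareHeritrixShutDown();
        Thread.sleep(1000);
        Heritrix.performHeritrixShutDown(0);
    }
}

If the process is meant to stay up after the job, stopCrawling() would be the counterpart to startCrawling() instead of the shutdown calls.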
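
The comment inside addCrawlJobBasedonJar points out the race window in its create-file/delete/mkdir sequence for obtaining a temporary expansion directory. On Java 7 and later the same step can be done in one atomic call; a minimal sketch, assuming TMPDIR is the class's existing java.io.File temp-directory field:

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;

final class TempDirSketch
{
    // Creates the expansion directory in a single call, so there is no
    // window in which another process can grab the chosen name first.
    static File createExpansionDir(File tmpDir) throws IOException
    {
        return Files.createTempDirectory(tmpDir.toPath(), "expandedjar")
            .toFile();
    }
}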
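
getVersion() and getConfdir(boolean) read plain system properties ("heritrix.version" and "heritrix.conf"). When embedding or testing, those can be supplied with -D flags on the JVM command line or set programmatically before the calls are made; a small sketch in which the property values are placeholders, not real defaults:

import java.io.File;

final class ConfigPropertiesSketch
{
    // Same keys the fragment reads via System.getProperty(...).
    static void pointHeritrixAt(File confDir)
    {
        System.setProperty("heritrix.conf", confDir.getAbsolutePath());
        System.setProperty("heritrix.version", "dev-build"); // placeholder
    }
}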
