📄 heritrix.java
字号:
StringBuffer buffer = new StringBuffer(); buffer.append("Heritrix " + Heritrix.getVersion() + " is running."); for (String host : httpServer.getHosts()) { buffer.append("\nWeb console is at: http://"); buffer.append(host).append(':').append(port); } buffer.append("\nWeb console login and password: " + adminUsername + "/" + adminPassword); return buffer.toString(); } /** * Replace existing administrator login info with new info. * * @param newUsername * new administrator login username * @param newPassword * new administrator login password */ public static void resetAuthentication(String newUsername, String newPassword) { Heritrix.httpServer.resetAuthentication(ADMIN, adminUsername, newUsername, newPassword); adminUsername = newUsername; adminPassword = newPassword; logger.info("administrative login changed to " + newUsername + ":" + newPassword); } protected static CrawlJob createCrawlJob(CrawlJobHandler handler, File crawlOrderFile, String name) throws InvalidAttributeValueException { XMLSettingsHandler settings = new XMLSettingsHandler(crawlOrderFile); settings.initialize(); return new CrawlJob(handler.getNextJobUID(), name, settings, new CrawlJobErrorHandler(Level.SEVERE), CrawlJob.PRIORITY_HIGH, crawlOrderFile.getAbsoluteFile().getParentFile()); } /** * This method is called when we have an order file to hand that we want to * base a job on. It leaves the order file in place and just starts up a job * that uses all the order points to for locations for logs, etc. * * @param orderPathOrUrl * Path to an order file or to a seeds file. * @param name * Name to use for this job. * @param description * @param seeds * @return A status string. * @throws IOException * @throws FatalConfigurationException */ public String addCrawlJob(String orderPathOrUrl, String name, String description, String seeds) throws IOException, FatalConfigurationException { if (!UURI.hasScheme(orderPathOrUrl)) { // Assume its a file path. return addCrawlJob(new File(orderPathOrUrl), name, description, seeds); } // Otherwise, must be an URL. URL url = new URL(orderPathOrUrl); // Handle http and file only for now (Tried to handle JarUrlConnection // but too awkward undoing jar stream. Rather just look for URLs that // end in '.jar'). String result = null; URLConnection connection = url.openConnection(); if (connection instanceof HttpURLConnection) { result = addCrawlJob(url, (HttpURLConnection) connection, name, description, seeds); } else if (connection instanceof FileURLConnection) { result = addCrawlJob(new File(url.getPath()), name, description, seeds); } else { throw new UnsupportedOperationException("No support for " + connection); } return result; } protected String addCrawlJob(final URL url, final HttpURLConnection connection, final String name, final String description, final String seeds) throws IOException, FatalConfigurationException { // Look see if its a jar file. If it is undo it. boolean isJar = url.getPath() != null && url.getPath().toLowerCase().endsWith(JAR_SUFFIX); // If http url connection, bring down the resource local. File localFile = File.createTempFile(Heritrix.class.getName(), isJar ? JAR_SUFFIX : null, TMPDIR); connection.connect(); String result = null; try { IoUtils.readFullyToFile(connection.getInputStream(), localFile); result = addCrawlJob(localFile, name, description, seeds); } catch (IOException ioe) { // Cleanup if an Exception. localFile.delete(); localFile = null; } finally { connection.disconnect(); // If its a jar file, then we made a job based on the jar contents. // Its no longer needed. Remove it. If not a jar file, then leave // the file around because the job depends on it. if (isJar && localFile != null && localFile.exists()) { localFile.delete(); } } return result; } protected String addCrawlJob(final File order, final String name, final String description, final String seeds) throws FatalConfigurationException, IOException { CrawlJob addedJob = null; if (this.jobHandler == null) { throw new NullPointerException("Heritrix jobhandler is null."); } try { if (order.getName().toLowerCase().endsWith(JAR_SUFFIX)) { return addCrawlJobBasedonJar(order, name, description, seeds); } addedJob = this.jobHandler.addJob(createCrawlJob(this.jobHandler, order, name)); } catch (InvalidAttributeValueException e) { FatalConfigurationException fce = new FatalConfigurationException( "Converted InvalidAttributeValueException on " + order.getAbsolutePath() + ": " + e.getMessage()); fce.setStackTrace(e.getStackTrace()); } return addedJob != null ? addedJob.getUID() : null; } /** * Undo jar file and use as basis for a new job. * * @param jarFile * Pointer to file that holds jar. * @param name * Name to use for new job. * @param description * @param seeds * @return Message. * @throws IOException * @throws FatalConfigurationException */ protected String addCrawlJobBasedonJar(final File jarFile, final String name, final String description, final String seeds) throws IOException, FatalConfigurationException { if (jarFile == null || !jarFile.exists()) { throw new FileNotFoundException(jarFile.getAbsolutePath()); } // Create a directory with a tmp name. Do it by first creating file, // removing it, then creating the directory. There is a hole during // which the OS may put a file of same exact name in our way but // unlikely. File dir = File.createTempFile(Heritrix.class.getName(), ".expandedjar", TMPDIR); dir.delete(); dir.mkdir(); try { org.archive.crawler.util.IoUtils.unzip(jarFile, dir); // Expect to find an order file at least. File orderFile = new File(dir, "order.xml"); if (!orderFile.exists()) { throw new IOException("Missing order: " + orderFile.getAbsolutePath()); } CrawlJob job = createCrawlJobBasedOn(orderFile, name, description, seeds); // Copy into place any seeds and settings directories before we // add job to Heritrix to crawl. File seedsFile = new File(dir, "seeds.txt"); if (seedsFile.exists()) { FileUtils.copyFiles(seedsFile, new File(job.getDirectory(), seedsFile.getName())); } File settingsDir = new File(dir, "settings"); if (settingsDir.exists()) { FileUtils.copyFiles(settingsDir, job.getDirectory()); } addCrawlJob(job); return job.getUID(); } finally { // After job has been added, no more need of expanded content. // (Let the caller be responsible for cleanup of jar. Sometimes // its should be deleted -- when its a local copy of a jar pulled // across the net -- wherease other times, if its a jar passed // in w/ a 'file' scheme, it shouldn't be deleted. org.archive.util.FileUtils.deleteDir(dir); } } public String addCrawlJobBasedOn(String jobUidOrProfile, String name, String description, String seeds) { try { CrawlJob cj = getJobHandler().getJob(jobUidOrProfile); if (cj == null) { throw new InvalidAttributeValueException(jobUidOrProfile + " is not a job UID or profile name (Job UIDs are " + " usually the 14 digit date portion of job name)."); } CrawlJob job = addCrawlJobBasedOn(cj.getSettingsHandler() .getOrderFile(), name, description, seeds); return job.getUID(); } catch (Exception e) { e.printStackTrace(); return "Exception on " + jobUidOrProfile + ": " + e.getMessage(); } } protected CrawlJob addCrawlJobBasedOn(final File orderFile, final String name, final String description, final String seeds) throws FatalConfigurationException { return addCrawlJob(createCrawlJobBasedOn(orderFile, name, description, seeds)); } protected CrawlJob createCrawlJobBasedOn(final File orderFile, final String name, final String description, final String seeds) throws FatalConfigurationException { CrawlJob job = getJobHandler().newJob(orderFile, name, description, seeds); return CrawlJobHandler.ensureNewJobWritten(job, name, description); } protected CrawlJob addCrawlJob(final CrawlJob job) { return getJobHandler().addJob(job); } public void startCrawling() { if (getJobHandler() == null) { throw new NullPointerException("Heritrix jobhandler is null."); } getJobHandler().startCrawler(); } public void stopCrawling() { if (getJobHandler() == null) { throw new NullPointerException("Heritrix jobhandler is null."); } getJobHandler().stopCrawler(); } /** * Get the heritrix version. * * @return The heritrix version. May be null. */ public static String getVersion() { return System.getProperty("heritrix.version"); } /** * Get the job handler * * @return The CrawlJobHandler being used. */ public CrawlJobHandler getJobHandler() { return this.jobHandler; } /** * Get the configuration directory. * * @return The conf directory under HERITRIX_HOME or null if none can be * found. * @throws IOException */ public static File getConfdir() throws IOException { return getConfdir(true); } /** * Get the configuration directory. * * @param fail * Throw IOE if can't find directory if true, else just return * null. * @return The conf directory under HERITRIX_HOME or null (or an IOE) if * can't be found. * @throws IOException */ public static File getConfdir(final boolean fail) throws IOException { final String key = "heritrix.conf"; // Look to see if heritrix.conf property passed on the cmd-line. String tmp = System.getProperty(key); // if not fall back to default $HERITIX_HOME/conf if (tmp == null || tmp.length() == 0) { return getSubDir("conf", fail); } File dir = new File(tmp); if (!dir.exists()) { if (fail) { throw new IOException("Cannot find conf dir: " + tmp); } else { logger.log(Level.WARNING, "Specified " + key + " dir does not exist. Falling back on default"); } dir = getSubDir("conf", fail); } return dir; } /** * @return Returns the httpServer. May be null if one was not started. */ public static SimpleHttpServer getHttpServer() { return Heritrix.httpServer; } /** * @throws IOException * @return Returns the directory under which reside the WAR files we're to * load into the servlet container. */ public static File getWarsdir() throws IOException { return getSubDir("webapps"); } /** * Prepars for program shutdown. This method does it's best to prepare the * program so that it can exit normally. It will kill the httpServer and * terminate any running job.<br> * It is advisible to wait a few (~1000) millisec after calling this method * and before calling performHeritrixShutDown() to allow as many threads as * possible to finish what they are doing. */ public static void prepareHeritrixShutDown() { // Stop and destroy all running Heritrix instances. // Get array of the key set to avoid CCEs for case where call to // destroy does a remove of an instance from Heritrix.instances. final Object[] keys = Heritrix.instances.keySet().toArray(); for (int i = 0; i < keys.length; i++) { ((Heritrix) Heritrix.instances.get(keys[i])).destroy(); } try { deregisterJndi(getJndiContainerName()); } catch (NameNotFoundException e) { // We were probably unbound already. Ignore. logger.log(Level.WARNING, "deregistration of jndi", e); } catch (Exception e) { e.printStackTrace(); } if (Heritrix.httpServer != null) { // Shut down the web access. try { Heritrix.httpServer.stopServer(); } catch (InterruptedException e) { // Generally this can be ignored, but we'll print a stack trace // just in case. e.printStackTrace(); } finally { Heritrix.httpServer = null; } } } /** * Exit program. Recommended that prepareHeritrixShutDown() be invoked prior * to this method. */ public static void performHeritrixShutDown() { performHeritrixShutDown(0); } /** * Exit program. Recommended that prepareHeritrixShutDown() be invoked prior * to this method. * * @param exitCode * Code to pass System.exit. * */ public static void performHeritrixShutDown(int exitCode) { System.exit(exitCode);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -