📄 heritrix.java
字号:
* ':' delimiter. * * @param str * String to test. * @return True if valid password/login string. */ protected static boolean isValidLoginPasswordString(String str) { boolean isValid = false; StringTokenizer tokenizer = new StringTokenizer(str, ":"); if (tokenizer.countTokens() == 2) { String login = ((String) tokenizer.nextElement()).trim(); String password = ((String) tokenizer.nextElement()).trim(); if (login.length() > 0 && password.length() > 0) { isValid = true; } } return isValid; } protected static boolean isDevelopment() { return System.getProperty("heritrix.development") != null; } /** * Load the heritrix.properties file. * * Adds any property that starts with * <code>HERITRIX_PROPERTIES_PREFIX</code> or <code>ARCHIVE_PACKAGE</code> * into system properties (except logging '.level' directives). * * @return Loaded properties. * @throws IOException */ protected static Properties loadProperties() throws IOException { Properties properties = new Properties(); properties.load(getPropertiesInputStream()); // Any property that begins with ARCHIVE_PACKAGE, make it // into a system property. While iterating, check to see if anything // defined on command-line, and if so, it overrules whats in // heritrix.properties. for (Enumeration e = properties.keys(); e.hasMoreElements();) { String key = ((String) e.nextElement()).trim(); if (key.startsWith(ARCHIVE_PACKAGE) || key.startsWith(HERITRIX_PROPERTIES_PREFIX)) { // Don't add the heritrix.properties entries that are // changing the logging level of particular classes. if (key.indexOf(".level") < 0) { if (System.getProperty(key) == null || System.getProperty(key).length() == 0) { System.setProperty(key, properties.getProperty(key) .trim()); } } } } return properties; } protected static InputStream getPropertiesInputStream() throws IOException { File file = null; // Look to see if properties have been passed on the cmd-line. String alternateProperties = System.getProperty(PROPERTIES_KEY); if (alternateProperties != null && alternateProperties.length() > 0) { file = new File(alternateProperties); } // Get properties from conf directory if one available. if ((file == null || !file.exists()) && getConfdir(false) != null) { file = new File(getConfdir(), PROPERTIES); if (!file.exists()) { // If no properties file in the conf dir, set file back to // null so we go looking for heritrix.properties on classpath. file = null; } } // If not on the command-line, there is no conf dir. Then get the // properties from the CLASSPATH (Classpath file separator is always // '/', whatever the platform. InputStream is = (file != null) ? new FileInputStream(file) : Heritrix.class.getResourceAsStream("/" + PROPERTIES_KEY); if (is == null) { throw new IOException("Failed to load properties file from" + " filesystem or from classpath."); } return is; } /** * If the user hasn't altered the default logging parameters, tighten them * up somewhat: some of our libraries are way too verbose at the INFO or * WARNING levels. * * This might be a problem running inside in someone else's container. * Container's seem to prefer commons logging so we ain't messing them doing * the below. * * @throws IOException * @throws SecurityException */ protected static void patchLogging() throws SecurityException, IOException { if (System.getProperty("java.util.logging.config.class") != null) { return; } if (System.getProperty("java.util.logging.config.file") != null) { return; } // No user-set logging properties established; use defaults // from distribution-packaged 'heritrix.properties'. LogManager.getLogManager() .readConfiguration(getPropertiesInputStream()); } /** * Configure our trust store. * * If system property is defined, then use it for our truststore. Otherwise * use the heritrix truststore under conf directory if it exists. * * <p> * If we're not launched from the command-line, we will not be able to find * our truststore. The truststore is nor normally used so rare should this * be a problem (In case where we don't use find our trust store, we'll use * the 'default' -- either the JVMs or the containers). */ protected static void configureTrustStore() { // Below must be defined in jsse somewhere but can' find it. final String TRUSTSTORE_KEY = "javax.net.ssl.trustStore"; String value = System.getProperty(TRUSTSTORE_KEY); File confdir = null; try { confdir = getConfdir(false); } catch (IOException e) { logger.log(Level.WARNING, "Failed to get confdir.", e); } if ((value == null || value.length() <= 0) && confdir != null) { // Use the heritrix store if it exists on disk. File heritrixStore = new File(confdir, "heritrix.cacerts"); if (heritrixStore.exists()) { value = heritrixStore.getAbsolutePath(); } } if (value != null && value.length() > 0) { System.setProperty(TRUSTSTORE_KEY, value); } } /** * Run the selftest * * @param oneSelfTestName * Name of a test if we are to run one only rather than the * default running all tests. * @param port * Port number to use for web UI. * * @exception Exception * @return Status of how selftest startup went. */ protected static String selftest(final String oneSelfTestName, final int port) throws Exception { // Put up the webserver w/ the root and selftest webapps only. final String SELFTEST = "selftest"; Heritrix.httpServer = new SimpleHttpServer(SELFTEST, Heritrix.adminContext, LOCALHOST_ONLY, port, true); // Set up digest auth for a section of the server so selftest can run // auth tests. Looks like can only set one login realm going by the // web.xml dtd. Otherwise, would be nice to selftest basic and digest. // Have login, password and role all be SELFTEST. Must match what is // in the selftest order.xml file. Heritrix.httpServer.setAuthentication(SELFTEST, Heritrix.adminContext, SELFTEST, SELFTEST, SELFTEST); Heritrix.httpServer.startServer(); // Get the order file from the CLASSPATH unless we're running in dev // environment. File selftestDir = (isDevelopment()) ? new File(getConfdir(), SELFTEST) : new File(File.separator + SELFTEST); File crawlOrderFile = new File(selftestDir, "order.xml"); // Create a job based off the selftest order file. Then use this as // a template to pass jobHandler.newJob(). Doing this gets our // selftest output to show under the jobs directory. // Pass as a seed a pointer to the webserver we just put up. final String ROOTURI = "127.0.0.1:" + Integer.toString(port); String selfTestUrl = "http://" + ROOTURI + '/'; if (oneSelfTestName != null && oneSelfTestName.length() > 0) { selfTestUrl += (oneSelfTestName + '/'); } CrawlJobHandler cjh = new SelfTestCrawlJobHandler(getJobsdir(), oneSelfTestName, selfTestUrl); Heritrix h = new Heritrix("Selftest", true, cjh); CrawlJob job = createCrawlJob(cjh, crawlOrderFile, "Template"); job = h.getJobHandler().newJob(job, null, SELFTEST, "Integration self test", selfTestUrl, CrawlJob.PRIORITY_CRITICAL); h.getJobHandler().addJob(job); // Before we start, need to change some items in the settings file. CredentialStore cs = (CredentialStore) job.getSettingsHandler() .getOrder().getAttribute(CredentialStore.ATTR_NAME); for (Iterator i = cs.iterator(null); i.hasNext();) { ((Credential) i.next()).setCredentialDomain(null, ROOTURI); } h.getJobHandler().startCrawler(); StringBuffer buffer = new StringBuffer(); buffer.append("Heritrix " + Heritrix.getVersion() + " selftest started."); buffer.append("\nSelftest first crawls " + selfTestUrl + " and then runs an analysis."); buffer.append("\nResult of analysis printed to " + getHeritrixOut() + " when done."); buffer.append("\nSelftest job directory for logs and arcs:\n" + job.getDirectory().getAbsolutePath()); return buffer.toString(); } /** * Launch the crawler without a web UI and run the passed crawl only. * * Specialized version of {@link #launch()}. * * @param crawlOrderFile * The crawl order to crawl. * @throws InitializationException * @throws InvalidAttributeValueException * @return Status string. */ protected String doOneCrawl(String crawlOrderFile) throws InitializationException, InvalidAttributeValueException { return doOneCrawl(crawlOrderFile, null); } /** * Launch the crawler without a web UI and run passed crawl only. * * Specialized version of {@link #launch()}. * * @param crawlOrderFile * The crawl order to crawl. * @param listener * Register this crawl status listener before starting crawl (You * can use this listener to notice end-of-crawl). * @throws InitializationException * @throws InvalidAttributeValueException * @return Status string. */ protected String doOneCrawl(String crawlOrderFile, CrawlStatusListener listener) throws InitializationException, InvalidAttributeValueException { XMLSettingsHandler handler = new XMLSettingsHandler(new File( crawlOrderFile)); handler.initialize(); CrawlController controller = new CrawlController(); controller.initialize(handler); if (listener != null) { controller.addCrawlStatusListener(listener); } controller.requestCrawlStart(); return "Crawl started using " + crawlOrderFile + "."; } /** * Launch the crawler for a web UI. * * Crawler hangs around waiting on jobs. * * @exception Exception * @return A status string describing how the launch went. * @throws Exception */ public String launch() throws Exception { return launch(null, false); } /** * Launch the crawler for a web UI. * * Crawler hangs around waiting on jobs. * * @param crawlOrderFile * File to crawl. May be null. * @param runMode * Whether crawler should be set to run mode. * * @exception Exception * @return A status string describing how the launch went. */ public String launch(String crawlOrderFile, boolean runMode) throws Exception { String status = null; if (crawlOrderFile != null) { addCrawlJob(crawlOrderFile, "Autolaunched", "", ""); if (runMode) { this.jobHandler.startCrawler(); status = "Job being crawled: " + crawlOrderFile; } else { status = "Crawl job ready and pending: " + crawlOrderFile; } } else if (runMode) { // The use case is that jobs are to be run on a schedule and that // if the crawler is in run mode, then the scheduled job will be // run at appropriate time. Otherwise, not. this.jobHandler.startCrawler(); status = "Crawler set to run mode."; } return status; } /** * Start up the embedded Jetty webserver instance. This is done when we're * run from the command-line. * * @param port * Port number to use for web UI. * @param adminLoginPassword * Compound of login and password. * @throws Exception * @return Status on webserver startup. * @deprecated Use startEmbeddedWebserver(hosts, port, adminLoginPassword) */ protected static String startEmbeddedWebserver(final int port, final boolean lho, final String adminLoginPassword) throws Exception { ArrayList<String> hosts = new ArrayList<String>(); if (lho) { hosts.add("127.0.0.1"); } return startEmbeddedWebserver(hosts, port, adminLoginPassword); } /** * Parses a list of host names. * * <p> * If the given string is <code>/</code>, then an empty collection is * returned. This indicates that all available network interfaces should be * used. * * <p> * Otherwise, the string must contain a comma-separated list of IP addresses * or host names. The parsed list is then returned. * * @param hosts * the string to parse * @return the parsed collection of hosts */ private static Collection<String> parseHosts(String hosts) { hosts = hosts.trim(); if (hosts.equals("/")) { return new ArrayList<String>(1); } String[] hostArray = hosts.split(","); for (int i = 0; i < hostArray.length; i++) { hostArray[i] = hostArray[i].trim(); } return Arrays.asList(hostArray); } /** * Start up the embedded Jetty webserver instance. This is done when we're * run from the command-line. * * @param hosts * a list of IP addresses or hostnames to bind to, or an empty * collection to bind to all available network interfaces * @param port * Port number to use for web UI. * @param adminLoginPassword * Compound of login and password. * @throws Exception * @return Status on webserver startup. */ protected static String startEmbeddedWebserver(Collection<String> hosts, int port, String adminLoginPassword) throws Exception { adminUsername = adminLoginPassword.substring(0, adminLoginPassword .indexOf(":")); adminPassword = adminLoginPassword.substring(adminLoginPassword .indexOf(":") + 1); Heritrix.httpServer = new SimpleHttpServer("admin", Heritrix.adminContext, hosts, port, false); final String DOTWAR = ".war"; final String SELFTEST = "selftest"; // Look for additional WAR files beyond 'selftest' and 'admin'. File[] wars = getWarsdir().listFiles(); for (int i = 0; i < wars.length; i++) { if (wars[i].isFile()) { final String warName = wars[i].getName(); final String warNameNC = warName.toLowerCase(); if (warNameNC.endsWith(DOTWAR) && !warNameNC.equals(ADMIN + DOTWAR) && !warNameNC.equals(SELFTEST + DOTWAR)) { int dot = warName.indexOf('.'); Heritrix.httpServer.addWebapp(warName.substring(0, dot), null, true); } } } // Name of passed 'realm' must match what is in configured in web.xml. // We'll use ROLE for 'realm' and 'role'. final String ROLE = ADMIN; Heritrix.httpServer.setAuthentication(ROLE, Heritrix.adminContext, adminUsername, adminPassword, ROLE); Heritrix.httpServer.startServer();
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -