📄 heritrix.java
字号:
return sinkHandler.getAllUnread(); } public SinkHandlerLogRecord get(String alertID) { return sinkHandler.get(Long.parseLong(alertID)); } public int getCount() { return sinkHandler.getCount(); } public int getNewCount() { return sinkHandler.getUnreadCount(); } public void remove(String alertID) { sinkHandler.remove(Long.parseLong(alertID)); } public void read(String alertID) { sinkHandler.read(Long.parseLong(alertID)); } }; try { Heritrix.registerHeritrix(this, name, jmxregister); } catch (InstanceAlreadyExistsException e) { throw new RuntimeException(e); } catch (MBeanRegistrationException e) { throw new RuntimeException(e); } catch (NotCompliantMBeanException e) { throw new RuntimeException(e); } catch (MalformedObjectNameException e) { throw new RuntimeException(e); } }

    /**
     * One-time setup for this Heritrix 'container'.
     *
     * The first call flips {@code Heritrix.containerInitialized} and performs
     * the setup; every later call returns immediately, so calling this more
     * than once is harmless.
     *
     * @throws IOException
     */
    protected static void containerInitialization() throws IOException {
        if (Heritrix.containerInitialized) {
            // Setup already ran (or is running); nothing more to do.
            return;
        }
        Heritrix.containerInitialized = true;

        // Property loading merges the heritrix properties into the system
        // properties so they are all reachable through System.getProperty.
        // Both loadProperties and patchLogging have global effects, which
        // may matter when hosted inside a container such as tomcat or jboss.
        Heritrix.loadProperties();
        Heritrix.patchLogging();
        Heritrix.configureTrustStore();

        // The hook fires on SIGTERM but, unfortunately, not on SIGKILL.
        // When it does fire it makes sure we clean up after ourselves
        // (deregister from JMX and JNDI).
        final Thread shutdownHook =
            Heritrix.getShutdownThread(false, 0, "Heritrix shutdown hook");
        Runtime.getRuntime().addShutdownHook(shutdownHook);

        // Register this heritrix 'container' even though we may ourselves
        // be running inside another tomcat or jboss container. A failure
        // here is logged and otherwise ignored.
        try {
            registerContainerJndi();
        } catch (Exception jndiFailure) {
            logger.log(Level.WARNING, "Failed jndi container registration.",
                jndiFailure);
        }
    }

    /**
     * Do inverse of construction. Used by anyone who does a 'new Heritrix' when
     * they want to cleanup the instance.
Of note, there may be Heritrix threads * still hanging around after the call to destroy completes. They'll * eventually go down after they've finished their cleanup routines. In * particular, if you are watching Heritrix via JMX, you can see the * Heritrix instance JMX bean unregister ahead of the CrawlJob JMX bean that * its hosting. */ public void destroy() { stop(); try { Heritrix.unregisterHeritrix(this); } catch (InstanceNotFoundException e) { e.printStackTrace(); } catch (MBeanRegistrationException e) { e.printStackTrace(); } catch (NullPointerException e) { e.printStackTrace(); } this.jobHandler = null; this.openMBeanInfo = null; } /** * Launch program. Optionally will launch a web server to host UI. Will also * register Heritrix MBean with first found JMX Agent (Usually the 1.5.0 JVM * Agent). * * @param args * Command line arguments. * @throws Exception */ public static void main(String[] args) throws Exception { Heritrix.commandLine = true; // Set timezone here. Would be problematic doing it if we're running // inside in a container. TimeZone.setDefault(TimeZone.getTimeZone("GMT+8")); File startLog = new File(getHeritrixHome(), STARTLOG); Heritrix.out = new PrintWriter(isDevelopment() ? System.out : new PrintStream(new FileOutputStream(startLog))); try { containerInitialization(); String status = doCmdLineArgs(args); if (status != null) { Heritrix.out.println(status); } } catch (Exception e) { // Show any exceptions in STARTLOG. e.printStackTrace(Heritrix.out); throw e; } finally { // If not development, close the file that signals the wrapper // script that we've started. Otherwise, just flush it; if in // development, the output is probably a console. 
if (!isDevelopment()) { if (Heritrix.out != null) { Heritrix.out.close(); } System.out .println("Heritrix version: " + Heritrix.getVersion()); } else { if (Heritrix.out != null) { Heritrix.out.flush(); } } } } protected static String doCmdLineArgs(final String[] args) throws Exception { // Get defaults for commandline arguments from the properties file. String tmpStr = PropertyUtils.getPropertyOrNull("heritrix.context"); if (tmpStr != null) { Heritrix.adminContext = tmpStr; } tmpStr = PropertyUtils.getPropertyOrNull("heritrix.cmdline.port"); if (tmpStr != null) { Heritrix.guiPort = Integer.parseInt(tmpStr); } tmpStr = PropertyUtils.getPropertyOrNull("heritrix.cmdline.admin"); String adminLoginPassword = (tmpStr == null) ? "" : tmpStr; String crawlOrderFile = PropertyUtils .getPropertyOrNull("heritrix.cmdline.order"); tmpStr = PropertyUtils.getPropertyOrNull("heritrix.cmdline.run"); boolean runMode = PropertyUtils .getBooleanProperty("heritrix.cmdline.run"); boolean selfTest = false; String selfTestName = null; CommandLineParser clp = new CommandLineParser(args, Heritrix.out, Heritrix.getVersion()); List arguments = clp.getCommandLineArguments(); Option[] options = clp.getCommandLineOptions(); // Check passed argument. Only one argument, the ORDER_FILE is allowed. // If one argument, make sure exists and xml suffix. if (arguments.size() > 1) { clp.usage(1); } else if (arguments.size() == 1) { crawlOrderFile = (String) arguments.get(0); if (!(new File(crawlOrderFile).exists())) { clp.usage("ORDER.XML <" + crawlOrderFile + "> specified does not exist.", 1); } // Must end with '.xml' if (crawlOrderFile.length() > 4 && !crawlOrderFile.substring(crawlOrderFile.length() - 4) .equalsIgnoreCase(".xml")) { clp.usage("ORDER.XML <" + crawlOrderFile + "> does not have required '.xml' suffix.", 1); } } // Now look at options passed. 
for (int i = 0; i < options.length; i++) { switch (options[i].getId()) { case 'h': clp.usage(); break; case 'a': adminLoginPassword = options[i].getValue(); break; case 'n': if (crawlOrderFile == null) { clp.usage("You must specify an ORDER_FILE with" + " '--nowui' option.", 1); } Heritrix.gui = false; break; case 'b': Heritrix.guiHosts = parseHosts(options[i].getValue()); break; case 'p': try { Heritrix.guiPort = Integer.parseInt(options[i].getValue()); } catch (NumberFormatException e) { clp.usage("Failed parse of port number: " + options[i].getValue(), 1); } if (Heritrix.guiPort <= 0) { clp.usage("Nonsensical port number: " + options[i].getValue(), 1); } break; case 'r': runMode = true; break; case 's': selfTestName = options[i].getValue(); selfTest = true; break; default: assert false : options[i].getId(); } } // Ok, we should now have everything to launch the program. String status = null; if (selfTest) { // If more than just '--selftest' and '--port' passed, then // there is confusion on what is being asked of us. Print usage // rather than proceed. for (int i = 0; i < options.length; i++) { if (options[i].getId() != 'p' && options[i].getId() != 's') { clp.usage(1); } } if (arguments.size() > 0) { // No arguments accepted by selftest. clp.usage(1); } status = selftest(selfTestName, Heritrix.guiPort); } else { if (!isValidLoginPasswordString(adminLoginPassword)) { clp.usage("Invalid admin login:password value, or none " + "specified. ", 1); } if (!Heritrix.gui) { if (options.length > 1) { // If more than just '--nowui' passed, then there is // confusion on what is being asked of us. Print usage // rather than proceed. 
clp.usage(1); } Heritrix h = new Heritrix(true); status = h.doOneCrawl(crawlOrderFile); } else { status = startEmbeddedWebserver(Heritrix.guiHosts, Heritrix.guiPort, adminLoginPassword); Heritrix h = new Heritrix(true); String tmp = h.launch(crawlOrderFile, runMode); if (tmp != null) { status += ('\n' + tmp); } } } return status; } /** * @return The file we dump stdout and stderr into. */ public static String getHeritrixOut() { String tmp = System.getProperty("heritrix.out"); if (tmp == null || tmp.length() == 0) { tmp = Heritrix.DEFAULT_HERITRIX_OUT; } return tmp; } /** * Exploit <code>-Dheritrix.home</code> if available to us. Is current * working dir if no heritrix.home property supplied. * * @return Heritrix home directory. * @throws IOException */ protected static File getHeritrixHome() throws IOException { File heritrixHome = null; String home = System.getProperty("heritrix.home"); if (home != null && home.length() > 0) { heritrixHome = new File(home); if (!heritrixHome.exists()) { throw new IOException("HERITRIX_HOME <" + home + "> does not exist."); } } else { heritrixHome = new File(new File("").getAbsolutePath()); } return heritrixHome; } /** * @return The directory into which we put jobs. If the system property * 'heritrix.jobsdir' is set, we will use its value in place of the * default 'jobs' directory in the current working directory. * @throws IOException */ public static File getJobsdir() throws IOException { String jobsdirStr = System.getProperty("heritrix.jobsdir", "jobs"); return jobsdirStr.startsWith(File.separator) ? new File(jobsdirStr) : new File(getHeritrixHome(), jobsdirStr); } /** * Get and check for existence of expected subdir. * * If development flag set, then look for dir under src dir. * * @param subdirName * Dir to look for. * @return The extant subdir. Otherwise null if we're running in a webapp * context where there is no conf directory available. * @throws IOException * if unable to find expected subdir. 
*/ protected static File getSubDir(String subdirName) throws IOException { return getSubDir(subdirName, true); } /** * Get and optionally check for existence of subdir. * * If development flag set, then look for dir under src dir. * * @param subdirName * Dir to look for. * @param fail * True if we are to fail if directory does not exist; false if * we are to return false if the directory does not exist. * @return The extant subdir. Otherwise null if we're running in a webapp * context where there is no subdir directory available. * @throws IOException * if unable to find expected subdir. */ protected static File getSubDir(String subdirName, boolean fail) throws IOException { String path = isDevelopment() ? "src" + File.separator + subdirName : subdirName; File dir = new File(getHeritrixHome(), path); if (!dir.exists()) { if (fail) { throw new IOException("Cannot find subdir: " + subdirName); } dir = null; } return dir; } /** * Test string is valid login/password string. * * A valid login/password string has the login and password compounded w/ a
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -