⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 heritrix.java

📁 高性能分词算法
💻 JAVA
📖 第 1 页 / 共 5 页
字号:
                addJob(createCrawlJob(this.jobHandler, order, name));        } catch (InvalidAttributeValueException e) {            FatalConfigurationException fce = new FatalConfigurationException(                "Converted InvalidAttributeValueException on " +                order.getAbsolutePath() + ": " + e.getMessage());            fce.setStackTrace(e.getStackTrace());        }        return addedJob != null? addedJob.getUID(): null;    }        /**     * Undo jar file and use as basis for a new job.     * @param jarFile Pointer to file that holds jar.     * @param name Name to use for new job.     * @param description      * @param seeds      * @return Message.     * @throws IOException     * @throws FatalConfigurationException     */    protected String addCrawlJobBasedonJar(final File jarFile,            final String name, final String description, final String seeds)    throws IOException, FatalConfigurationException {        if (jarFile == null || !jarFile.exists()) {            throw new FileNotFoundException(jarFile.getAbsolutePath());        }        // Create a directory with a tmp name.  Do it by first creating file,        // removing it, then creating the directory. There is a hole during        // which the OS may put a file of same exact name in our way but        // unlikely.        File dir = File.createTempFile(Heritrix.class.getName(), ".expandedjar",            TMPDIR);        dir.delete();        dir.mkdir();        try {            org.archive.crawler.util.IoUtils.unzip(jarFile, dir);            // Expect to find an order file at least.            
File orderFile = new File(dir, "order.xml");            if (!orderFile.exists()) {                throw new IOException("Missing order: " +                    orderFile.getAbsolutePath());            }            CrawlJob job =                createCrawlJobBasedOn(orderFile, name, description, seeds);            // Copy into place any seeds and settings directories before we            // add job to Heritrix to crawl.            File seedsFile = new File(dir, "seeds.txt");            if (seedsFile.exists()) {                FileUtils.copyFiles(seedsFile, new File(job.getDirectory(),                    seedsFile.getName()));            }            addCrawlJob(job);            return job.getUID();         } finally {             // After job has been added, no more need of expanded content.             // (Let the caller be responsible for cleanup of jar. Sometimes             // its should be deleted -- when its a local copy of a jar pulled             // across the net -- wherease other times, if its a jar passed             // in w/ a 'file' scheme, it shouldn't be deleted.             
org.archive.util.FileUtils.deleteDir(dir);         }    }        public String addCrawlJobBasedOn(String jobUidOrProfile,            String name, String description, String seeds) {        try {            CrawlJob cj = getJobHandler().getJob(jobUidOrProfile);            if (cj == null) {                throw new InvalidAttributeValueException(jobUidOrProfile +                    " is not a job UID or profile name (Job UIDs are " +                    " usually the 14 digit date portion of job name).");            }            CrawlJob job = addCrawlJobBasedOn(                cj.getSettingsHandler().getOrderFile(), name, description,                    seeds);            return job.getUID();        } catch (Exception e) {            e.printStackTrace();            return "Exception on " + jobUidOrProfile + ": " + e.getMessage();        }     }        protected CrawlJob addCrawlJobBasedOn(final File orderFile,        final String name, final String description, final String seeds)    throws FatalConfigurationException {        return addCrawlJob(createCrawlJobBasedOn(orderFile, name, description,                seeds));    }        protected CrawlJob createCrawlJobBasedOn(final File orderFile,            final String name, final String description, final String seeds)    throws FatalConfigurationException {        CrawlJob job = getJobHandler().newJob(orderFile, name, description,                seeds);        return CrawlJobHandler.ensureNewJobWritten(job, name, description);    }        protected CrawlJob addCrawlJob(final CrawlJob job) {        return getJobHandler().addJob(job);    }        public void startCrawling() {        if (getJobHandler() == null) {            throw new NullPointerException("Heritrix jobhandler is null.");        }        getJobHandler().startCrawler();    }    public void stopCrawling() {        if (getJobHandler() == null) {            throw new NullPointerException("Heritrix jobhandler is null.");        }        
getJobHandler().stopCrawler();    }        /**     * Get the heritrix version.     *     * @return The heritrix version.  May be null.     */    public static String getVersion() {        return System.getProperty("heritrix.version");    }    /**     * Get the job handler     *     * @return The CrawlJobHandler being used.     */    public CrawlJobHandler getJobHandler() {        return this.jobHandler;    }    /**     * Get the configuration directory.     * @return The conf directory under HERITRIX_HOME or null if none can     * be found.     * @throws IOException     */    public static File getConfdir()    throws IOException {        return getConfdir(true);    }    /**     * Get the configuration directory.     * @param fail Throw IOE if can't find directory if true, else just     * return null.     * @return The conf directory under HERITRIX_HOME or null (or an IOE) if     * can't be found.     * @throws IOException     */    public static File getConfdir(final boolean fail)    throws IOException {        final String key = "heritrix.conf";        // Look to see if heritrix.conf property passed on the cmd-line.        String tmp = System.getProperty(key);        // if not fall back to default $HERITIX_HOME/conf        if (tmp == null || tmp.length() == 0) {            return getSubDir("conf", fail);        }        File dir = new File(tmp);        if (!dir.exists()) {            if (fail) {                throw new IOException("Cannot find conf dir: " + tmp);            } else {                logger.log(Level.WARNING, "Specified " + key +                    " dir does not exist.  Falling back on default");            }            dir = getSubDir("conf", fail);        }        return dir;    }    /**     * @return Returns the httpServer. May be null if one was not started.     
*/    public static SimpleHttpServer getHttpServer() {        return Heritrix.httpServer;    }    /**     * @throws IOException     * @return Returns the directory under which reside the WAR files     * we're to load into the servlet container.     */    public static File getWarsdir()    throws IOException {        return getSubDir("webapps");    }    /**     * Prepars for program shutdown. This method does it's best to prepare the     * program so that it can exit normally. It will kill the httpServer and     * terminate any running job.<br>     * It is advisible to wait a few (~1000) millisec after calling this method     * and before calling performHeritrixShutDown() to allow as many threads as     * possible to finish what they are doing.     */    public static void prepareHeritrixShutDown() {        // Stop and destroy all running Heritrix instances.        // Get array of the key set to avoid CCEs for case where call to        // destroy does a remove of an instance from Heritrix.instances.        final Object [] keys = Heritrix.instances.keySet().toArray();        for (int i = 0; i < keys.length; i++) {            ((Heritrix)Heritrix.instances.get(keys[i])).destroy();        }                try {            deregisterJndi(getJndiContainerName());        } catch (NameNotFoundException e) {            // We were probably unbound already. Ignore.            logger.log(Level.WARNING, "deregistration of jndi", e);        } catch (Exception e) {            e.printStackTrace();        }                if(Heritrix.httpServer != null) {            // Shut down the web access.            try {                Heritrix.httpServer.stopServer();            } catch (InterruptedException e) {                // Generally this can be ignored, but we'll print a stack trace                // just in case.                e.printStackTrace();            } finally {                Heritrix.httpServer = null;            }        }    }    /**     * Exit program. 
* Recommended that prepareHeritrixShutDown() be invoked
     * prior to this method.
     */
    public static void performHeritrixShutDown() {
        performHeritrixShutDown(0);
    }

    /**
     * Exit program with the passed exit code. Recommended that
     * prepareHeritrixShutDown() be invoked prior to this method.
     *
     * @param exitCode Code to pass System.exit.
     */
    public static void performHeritrixShutDown(int exitCode) {
        System.exit(exitCode);
    }

    /**
     * Shutdown all running heritrix instances and the JVM.
     * Assumes stop has already been called.
     * The work is done on a separate thread started by this method.
     *
     * @param exitCode Exit code to pass system exit.
     */
    public static void shutdown(final int exitCode) {
        getShutdownThread(true, exitCode, "Heritrix shutdown").start();
    }

    /**
     * Build, but do not start, a daemon thread that runs
     * prepareHeritrixShutDown() and then, optionally, exits the JVM.
     *
     * @param sysexit True if the thread is to call
     * performHeritrixShutDown(exitCode) once preparation is done.
     * @param exitCode Exit code used when <code>sysexit</code> is true.
     * @param name Name to give the thread.
     * @return The unstarted shutdown thread.
     */
    protected static Thread getShutdownThread(final boolean sysexit,
            final int exitCode, final String name) {
        final Thread shutdownThread = new Thread(name) {
            public void run() {
                Heritrix.prepareHeritrixShutDown();
                if (!sysexit) {
                    return;
                }
                Heritrix.performHeritrixShutDown(exitCode);
            }
        };
        shutdownThread.setDaemon(true);
        return shutdownThread;
    }

    /**
     * Shutdown all running heritrix instances and the JVM using an exit
     * code of zero.
     */
    public static void shutdown() {
        shutdown(0);
    }

    /**
     * Register Heritrix with JNDI, JMX, and with the static hashtable of all
     * Heritrix instances known to this JVM.
     *
     * If launched from cmdline, register Heritrix MBean if an agent to register
     * ourselves with. Usually this method will only have effect if we're
     * running in a 1.5.0 JDK and command line options such as
     * '-Dcom.sun.management.jmxremote.port=8082
     * -Dcom.sun.management.jmxremote.authenticate=false
     * -Dcom.sun.management.jmxremote.ssl=false' are supplied.
* See <a href="http://java.sun.com/j2se/1.5.0/docs/guide/management/agent.html">Monitoring     * and Management Using JMX</a>     * for more on the command line options and how to connect to the     * Heritrix bean using the JDK 1.5.0 jconsole tool.  We register currently     * with first server we find (TODO: Make configurable).     *      * <p>If we register successfully with a JMX agent, then part of the     * registration will include our registering ourselves with JNDI.     *      * <p>Finally, add the heritrix instance to the hashtable of all the     * Heritrix instances floating in the current VM.  This latter registeration     * happens whether or no there is a JMX agent to register with.  This is     * a list we keep out of convenience so its easy iterating over all     *  all instances calling stop when main application is going down.     *      * @param h Instance of heritrix to register.     * @param name Name to use for this Heritrix instance.     * @param jmxregister True if we are to register this instance with JMX.     * @throws NullPointerException     * @throws MalformedObjectNameException     * @throws NotCompliantMBeanException      * @throws MBeanRegistrationException      * @throws InstanceAlreadyExistsException      */    protected static void registerHeritrix(final Heritrix h,            final String name, final boolean jmxregister)    throws MalformedObjectNameException, InstanceAlreadyExistsException,    MBeanRegistrationException, NotCompliantMBeanException {        MBeanServer server = getMBeanServer();        if (server != null) {            // Are we to manage the jmx registration?  Or is it being done for            // us by an external process: e.g. This instance was created by            // MBeanAgent.            if (jmxregister) {                ObjectName objName = (name == null || name.length() <= 0)?                    
getJmxObjectName(): getJmxObjectName(name);                registerMBean(server, h, objName);            }        } else {            // JMX ain't available. Put this instance into the list of Heritrix            // instances so findable by the UI (Normally this is done in the            // JMX postRegister routine below).  When no JMX, can only have            // one instance of Heritrix so no need to do the deregisteration.            Heritrix.instances.put(h.getNoJmxName(), h);        }    }        protected static void unregisterHeritrix(final Heritrix h)    throws InstanceNotFoundException, MBeanRegistration

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -