⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 heritrix.java

📁 这是个爬虫和lucene相结合最好了
💻 JAVA
📖 第 1 页 / 共 5 页
字号:
        CredentialStore cs = (CredentialStore)job.getSettingsHandler().            getOrder().getAttribute(CredentialStore.ATTR_NAME);        for (Iterator i = cs.iterator(null); i.hasNext();) {            ((Credential)i.next()).setCredentialDomain(null, ROOTURI);        }        h.getJobHandler().startCrawler();        StringBuffer buffer = new StringBuffer();        buffer.append("Heritrix " + Heritrix.getVersion() +                " selftest started.");        buffer.append("\nSelftest first crawls " + selfTestUrl +            " and then runs an analysis.");        buffer.append("\nResult of analysis printed to " +            getHeritrixOut() + " when done.");        buffer.append("\nSelftest job directory for logs and arcs:\n" +            job.getDirectory().getAbsolutePath());        return buffer.toString();    }    /**     * Launch the crawler without a web UI and run the passed crawl only.     *      * Specialized version of {@link #launch()}.     *     * @param crawlOrderFile The crawl order to crawl.     * @throws InitializationException     * @throws InvalidAttributeValueException     * @return Status string.     */    protected String doOneCrawl(String crawlOrderFile)    throws InitializationException, InvalidAttributeValueException {        return doOneCrawl(crawlOrderFile, null);    }        /**     * Launch the crawler without a web UI and run passed crawl only.     *      * Specialized version of {@link #launch()}.     *     * @param crawlOrderFile The crawl order to crawl.     * @param listener Register this crawl status listener before starting     * crawl (You can use this listener to notice end-of-crawl).     * @throws InitializationException     * @throws InvalidAttributeValueException     * @return Status string.     
*/    protected String doOneCrawl(String crawlOrderFile,        CrawlStatusListener listener)    throws InitializationException, InvalidAttributeValueException {        XMLSettingsHandler handler =            new XMLSettingsHandler(new File(crawlOrderFile));        handler.initialize();        CrawlController controller = new CrawlController();        controller.initialize(handler);        if (listener != null) {            controller.addCrawlStatusListener(listener);        }        controller.requestCrawlStart();        return "Crawl started using " + crawlOrderFile + ".";    }        /**     * Launch the crawler for a web UI.     *     * Crawler hangs around waiting on jobs.     *     * @exception Exception     * @return A status string describing how the launch went.     * @throws Exception     */    public String launch() throws Exception {        return launch(null, false);    }    /**     * Launch the crawler for a web UI.     *     * Crawler hangs around waiting on jobs.     *      * @param crawlOrderFile File to crawl.  May be null.     * @param runMode Whether crawler should be set to run mode.     *     * @exception Exception     * @return A status string describing how the launch went.     */    public String launch(String crawlOrderFile, boolean runMode)    throws Exception {        String status = null;        if (crawlOrderFile != null) {            addCrawlJob(crawlOrderFile, "Autolaunched", "", "");            if(runMode) {                this.jobHandler.startCrawler();                status = "Job being crawled: " + crawlOrderFile;            } else {                status = "Crawl job ready and pending: " + crawlOrderFile;            }        } else if(runMode) {            // The use case is that jobs are to be run on a schedule and that            // if the crawler is in run mode, then the scheduled job will be            // run at appropriate time.  Otherwise, not.            
this.jobHandler.startCrawler();            status = "Crawler set to run mode.";        }        return status;    }        /**     * Start up the embedded Jetty webserver instance.     * This is done when we're run from the command-line.     * @param port Port number to use for web UI.     * @param adminLoginPassword Compound of login and password.     * @throws Exception     * @return Status on webserver startup.     * @deprecated  Use startEmbeddedWebserver(hosts, port, adminLoginPassword)     */    protected static String startEmbeddedWebserver(final int port,        final boolean lho, final String adminLoginPassword)    throws Exception {        ArrayList<String> hosts = new ArrayList<String>();        if (lho) {            hosts.add("127.0.0.1");        }        return startEmbeddedWebserver(hosts, port, adminLoginPassword);    }        /**     * Parses a list of host names.     *      * <p>If the given string is <code>/</code>, then an empty     * collection is returned.  This indicates that all available network     * interfaces should be used.     *      * <p>Otherwise, the string must contain a comma-separated list of      * IP addresses or host names.  The parsed list is then returned.     *      * @param hosts  the string to parse     * @return  the parsed collection of hosts      */    private static Collection<String> parseHosts(String hosts) {        hosts = hosts.trim();        if (hosts.equals("/")) {            return new ArrayList<String>(1);        }        String[] hostArray = hosts.split(",");        for (int i = 0; i < hostArray.length; i++) {            hostArray[i] = hostArray[i].trim();        }        return Arrays.asList(hostArray);    }        /**     * Start up the embedded Jetty webserver instance.     * This is done when we're run from the command-line.     
*      * @param hosts  a list of IP addresses or hostnames to bind to, or an     *               empty collection to bind to all available network      *               interfaces     * @param port Port number to use for web UI.     * @param adminLoginPassword Compound of login and password.     * @throws Exception     * @return Status on webserver startup.     */    protected static String startEmbeddedWebserver(Collection<String> hosts,         int port, String adminLoginPassword)     throws Exception {        adminUsername = adminLoginPassword.            substring(0, adminLoginPassword.indexOf(":"));        adminPassword = adminLoginPassword.            substring(adminLoginPassword.indexOf(":") + 1);        Heritrix.httpServer = new SimpleHttpServer("admin",            Heritrix.adminContext, hosts, port, false);                final String DOTWAR = ".war";        final String SELFTEST = "selftest";                // Look for additional WAR files beyond 'selftest' and 'admin'.        File[] wars = getWarsdir().listFiles();        for(int i = 0; i < wars.length; i++) {            if(wars[i].isFile()) {                final String warName = wars[i].getName();                final String warNameNC = warName.toLowerCase();                if(warNameNC.endsWith(DOTWAR) &&                        !warNameNC.equals(ADMIN + DOTWAR) &&                        !warNameNC.equals(SELFTEST + DOTWAR)) {                    int dot = warName.indexOf('.');                    Heritrix.httpServer.addWebapp(warName.substring(0, dot),                            null, true);                }            }        }                // Name of passed 'realm' must match what is in configured in web.xml.        // We'll use ROLE for 'realm' and 'role'.        
final String ROLE = ADMIN;        Heritrix.httpServer.setAuthentication(ROLE, Heritrix.adminContext,            adminUsername, adminPassword, ROLE);        Heritrix.httpServer.startServer();        StringBuffer buffer = new StringBuffer();        buffer.append("Heritrix " + Heritrix.getVersion() + " is running.");        for (String host: httpServer.getHosts()) {            buffer.append("\nWeb console is at: http://");            buffer.append(host).append(':').append(port);        }        buffer.append("\nWeb console login and password: " +            adminUsername + "/" + adminPassword);        return buffer.toString();    }        /**     * Replace existing administrator login info with new info.     *      * @param newUsername new administrator login username     * @param newPassword new administrator login password     */    public static void resetAuthentication(String newUsername,            String newPassword) {        Heritrix.httpServer.resetAuthentication(ADMIN, adminUsername,                newUsername, newPassword);        adminUsername = newUsername;        adminPassword = newPassword;         logger.info("administrative login changed to "                +newUsername+":"+newPassword);    }    protected static CrawlJob createCrawlJob(CrawlJobHandler handler,            File crawlOrderFile, String name)    throws InvalidAttributeValueException {        XMLSettingsHandler settings = new XMLSettingsHandler(crawlOrderFile);        settings.initialize();        return new CrawlJob(handler.getNextJobUID(), name, settings,            new CrawlJobErrorHandler(Level.SEVERE),            CrawlJob.PRIORITY_HIGH,            crawlOrderFile.getAbsoluteFile().getParentFile());    }        /**     * This method is called when we have an order file to hand that we want     * to base a job on.  It leaves the order file in place and just starts up     * a job that uses all the order points to for locations for logs, etc.     
* @param orderPathOrUrl Path to an order file or to a seeds file.     * @param name Name to use for this job.     * @param description      * @param seeds      * @return A status string.     * @throws IOException      * @throws FatalConfigurationException      */    public String addCrawlJob(String orderPathOrUrl, String name,            String description, String seeds)    throws IOException, FatalConfigurationException {        if (!UURI.hasScheme(orderPathOrUrl)) {            // Assume its a file path.            return addCrawlJob(new File(orderPathOrUrl), name, description,                    seeds);        }        // Otherwise, must be an URL.        URL url = new URL(orderPathOrUrl);        // Handle http and file only for now (Tried to handle JarUrlConnection        // but too awkward undoing jar stream.  Rather just look for URLs that        // end in '.jar').        String result = null;        URLConnection connection = url.openConnection();        if (connection instanceof HttpURLConnection) {            result = addCrawlJob(url, (HttpURLConnection)connection, name,                description, seeds);        } else if (connection instanceof FileURLConnection) {            result = addCrawlJob(new File(url.getPath()), name, description,                seeds);        } else {            throw new UnsupportedOperationException("No support for "                + connection);        }        return result;    }        protected String addCrawlJob(final URL url,            final HttpURLConnection connection,            final String name, final String description, final String seeds)    throws IOException, FatalConfigurationException {        // Look see if its a jar file.  If it is undo it.        boolean isJar = url.getPath() != null &&            url.getPath().toLowerCase().endsWith(JAR_SUFFIX);        // If http url connection, bring down the resource local.        File localFile = File.createTempFile(Heritrix.class.getName(),           isJar? 
JAR_SUFFIX: null, TMPDIR);        connection.connect();        String result = null;        try {            IoUtils.readFullyToFile(connection.getInputStream(), localFile);            result = addCrawlJob(localFile, name, description, seeds);        } catch (IOException ioe) {            // Cleanup if an Exception.            localFile.delete();            localFile = null;        } finally {             connection.disconnect();             // If its a jar file, then we made a job based on the jar contents.             // Its no longer needed.  Remove it.  If not a jar file, then leave             // the file around because the job depends on it.             if (isJar && localFile != null && localFile.exists()) {                 localFile.delete();             }        }        return result;    }        protected String addCrawlJob(final File order, final String name,            final String description, final String seeds)    throws FatalConfigurationException, IOException {        CrawlJob addedJob = null;        if (this.jobHandler == null) {            throw new NullPointerException("Heritrix jobhandler is null.");        }        try {            if (order.getName().toLowerCase().endsWith(JAR_SUFFIX)) {                return addCrawlJobBasedonJar(order, name, description, seeds);            }            addedJob = this.jobHandler.                addJob(createCrawlJob(this.jobHandler, order, name));        } catch (InvalidAttributeValueException e) {            FatalConfigurationException fce = new FatalConfigurationException(                "Converted InvalidAttributeValueException on " +                order.getAbsolutePath() + ": " + e.getMessage());            fce.setStackTrace(e.getStackTrace());        }        return addedJob != null? addedJob.getUID(): null;    }        /**     * Undo jar file and use as basis for a new job.     * @param jarFile Pointer to file that holds jar.     * @param name Name to use for new job.     
* @param description      * @param seeds      * @return Message.     * @throws IOException     * @throws FatalConfigurationException

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -