
📄 crawljobhandler.java

📁 A crawler combined with Lucene
💻 JAVA
📖 Page 1 of 4
     * written to disk.
     */
    public void discardNewJob() {
        FileUtils.deleteDir(new File(newJob.getSettingsDirectory()));
    }

    /**
     * Get the handler's 'new job'
     * @return the handler's 'new job'
     */
    public CrawlJob getNewJob() {
        return newJob;
    }

    /**
     * Is the crawler accepting crawl jobs to run?
     * @return True if the next available CrawlJob will be crawled. False otherwise.
     */
    public boolean isRunning() {
        return running;
    }

    /**
     * Is a crawl job being crawled?
     * @return True if a job is actually being crawled (even if it is paused).
     *         False if no job is being crawled.
     */
    public boolean isCrawling() {
        return this.currentJob != null;
    }

    /**
     * Allow jobs to be crawled.
     */
    public void startCrawler() {
        running = true;
        if (pendingCrawlJobs.size() > 0 && isCrawling() == false) {
            // Ok, can just start the next job
            startNextJob();
        }
    }

    /**
     * Stop future jobs from being crawled.
     *
     * This action will not affect the current job.
     */
    public void stopCrawler() {
        running = false;
    }

    /**
     * Start next crawl job.
     *
     * If a job is already running this method will do nothing.
     */
    protected final void startNextJob() {
        synchronized (this) {
            if (startingNextJob != null) {
                try {
                    startingNextJob.join();
                } catch (InterruptedException e) {
                    e.printStackTrace();
                    return;
                }
            }
            startingNextJob = new Thread(new Runnable() {
                public void run() {
                    startNextJobInternal();
                }
            }, "StartNextJob");
            startingNextJob.start();
        }
    }

    protected void startNextJobInternal() {
        if (pendingCrawlJobs.size() == 0 || isCrawling()) {
            // No job ready or already crawling.
            return;
        }
        this.currentJob = (CrawlJob)pendingCrawlJobs.first();
        assert pendingCrawlJobs.contains(currentJob) :
            "pendingCrawlJobs is in an illegal state";
        pendingCrawlJobs.remove(currentJob);
        try {
            this.currentJob.setupForCrawlStart();
            // This is ugly but needed so I can clear the currentJob
            // reference in the crawlEnding and update the list of completed
            // jobs.  Also, crawlEnded can startup next job.
            this.currentJob.getController().addCrawlStatusListener(this);
            // now, actually start
            this.currentJob.getController().requestCrawlStart();
        } catch (InitializationException e) {
            loadJob(getStateJobFile(this.currentJob.getDirectory()));
            this.currentJob = null;
            startNextJobInternal(); // Load the next job if there is one.
        }
    }

    /**
     * Forward a 'kick' update to current job if any.
     */
    public void kickUpdate() {
        if (this.currentJob != null) {
            this.currentJob.kickUpdate();
        }
    }

    /**
     * Loads options from a file. Typically these are a list of available
     * modules that can be plugged into some part of the configuration.
     * For example Processors, Frontiers, Filters etc. Leading and trailing
     * spaces are trimmed from each line.
     *
     * <p>Options are loaded from the CLASSPATH.
     * @param file the name of the option file (without path!)
     * @return The option file with each option line as a separate entry in the
     *         ArrayList.
     * @throws IOException when there is trouble reading the file.
     */
    public static ArrayList<String> loadOptions(String file)
    throws IOException {
        ArrayList<String> ret = new ArrayList<String>();
        Enumeration resources =
            CrawlJob.class.getClassLoader().getResources("modules/" + file);
        boolean noFileFound = true;
        while (resources.hasMoreElements()) {
            InputStream is = ((URL) resources.nextElement()).openStream();
            noFileFound = false;
            String line = null;
            BufferedReader bf =
                new BufferedReader(new InputStreamReader(is), 8192);
            try {
                while ((line = bf.readLine()) != null) {
                    line = line.trim();
                    if (line.indexOf('#') < 0 && line.length() > 0) {
                        // Looks like a valid line.
                        ret.add(line);
                    }
                }
            } finally {
                bf.close();
            }
        }
        if (noFileFound) {
            throw new IOException("Failed to get " + file + " from the " +
                " CLASSPATH");
        }
        return ret;
    }

    /**
     * Returns a URIFrontierMarker for the current, paused, job. If there is no
     * current job or it is not paused, null will be returned.
     *
     * @param regexpr
     *            A regular expression that each URI must match in order to be
     *            considered 'within' the marker.
     * @param inCacheOnly
     *            Limit marker scope to 'cached' URIs.
     * @return a URIFrontierMarker for the current job.
     * @see #getPendingURIsList(FrontierMarker, int, boolean)
     * @see org.archive.crawler.framework.Frontier#getInitialMarker(String,
     *      boolean)
     * @see org.archive.crawler.framework.FrontierMarker
     */
    public FrontierMarker getInitialMarker(String regexpr,
            boolean inCacheOnly) {
        return (this.currentJob != null)?
                this.currentJob.getInitialMarker(regexpr, inCacheOnly): null;
    }

    /**
     * Returns the frontier's URI list based on the provided marker. This method
     * will return null if there is no current job or if the current job is
     * not paused. Only when there is a paused current job will this method
     * return a URI list.
     *
     * @param marker
     *            URIFrontier marker
     * @param numberOfMatches
     *            maximum number of matches to return
     * @param verbose
     *            should detailed info be provided on each URI?
     * @return the frontier's URI list based on the provided marker
     * @throws InvalidFrontierMarkerException
     *             When marker is inconsistent with the current state of the
     *             frontier.
     * @see #getInitialMarker(String, boolean)
     * @see org.archive.crawler.framework.FrontierMarker
     */
    public ArrayList getPendingURIsList(FrontierMarker marker,
            int numberOfMatches, boolean verbose)
    throws InvalidFrontierMarkerException {
        return (this.currentJob != null)?
           this.currentJob.getPendingURIsList(marker, numberOfMatches, verbose):
           null;
    }

    /**
     * Delete any URI from the frontier of the current (paused) job that matches
     * the specified regular expression. If the current job is not paused (or
     * there is no current job) nothing will be done.
     * @param regexpr Regular expression to delete URIs by.
     * @return the number of URIs deleted
     */
    public long deleteURIsFromPending(String regexpr) {
        return (this.currentJob != null)?
                this.currentJob.deleteURIsFromPending(regexpr): 0;
    }

    public String importUris(String file, String style, String force) {
        return importUris(file, style, "true".equals(force));
    }

    /**
     * @param fileOrUrl Name of file w/ seeds.
     * @param style What style of seeds -- crawl log (<code>crawlLog</code>
     * style) or recovery journal (<code>recoveryJournal</code> style), or
     * seeds file style (Pass <code>default</code> style).
     * @param forceRevisit Should we revisit even if seen before?
     * @return A display string that has a count of all added.
     */
    public String importUris(final String fileOrUrl, final String style,
            final boolean forceRevisit) {
        return (this.currentJob != null)?
            this.currentJob.importUris(fileOrUrl, style, forceRevisit): null;
    }

    protected int importUris(InputStream is, String style,
            boolean forceRevisit) {
        return (this.currentJob != null)?
                this.currentJob.importUris(is, style, forceRevisit): 0;
    }

    /**
     * Schedule a uri.
     * @param uri Uri to schedule.
     * @param forceFetch Should it be forcefetched.
     * @param isSeed True if seed.
     * @throws URIException
     */
    public void importUri(final String uri, final boolean forceFetch,
            final boolean isSeed)
    throws URIException {
        importUri(uri, forceFetch, isSeed, true);
    }

    /**
     * Schedule a uri.
     * @param str String that can be: 1. a UURI, 2. a snippet of the
     * crawl.log line, or 3. a snippet from recover log.  See
     * {@link #importUris(InputStream, String, boolean)} for how it subparses
     * the lines from crawl.log and recover.log.
     * @param forceFetch Should it be forcefetched.
     * @param isSeed True if seed.
     * @param isFlush If true, flush the frontier IF it implements
     * flushing.
     * @throws URIException
     */
    public void importUri(final String str, final boolean forceFetch,
            final boolean isSeed, final boolean isFlush)
    throws URIException {
        if (this.currentJob != null) {
            this.currentJob.importUri(str, forceFetch, isSeed, isFlush);
        }
    }

    /**
     * If it's a HostQueuesFrontier, it needs to be flushed for the queued.
     */
    protected void doFlush() {
        if (this.currentJob != null) {
            this.currentJob.flush();
        }
    }

    public void stop() {
        if (isCrawling()) {
            deleteJob(getCurrentJob().getUID());
        }
    }

    public void requestCrawlStop() {
        if (this.currentJob != null) {
            this.currentJob.stopCrawling();
        }
    }

    /**
     * Ensure order file with new name/desc is written.
     * See '[ 1066573 ] sometimes job based-on other job uses older job name'.
     * @param newJob Newly created job.
     * @param metaname Metaname for new job.
     * @param description Description for new job.
     * @return <code>newJob</code>
     */
    public static CrawlJob ensureNewJobWritten(CrawlJob newJob, String metaname,
            String description) {
        XMLSettingsHandler settingsHandler = newJob.getSettingsHandler();
        CrawlerSettings orderfile = settingsHandler.getSettingsObject(null);
        orderfile.setName(metaname);
        orderfile.setDescription(description);
        settingsHandler.writeSettingsObject(orderfile);
        return newJob;
    }

    public void crawlStarted(String message) {
        // TODO Auto-generated method stub
    }

    public void crawlEnding(String sExitMessage) {
        loadJob(getStateJobFile(this.currentJob.getDirectory()));
        currentJob = null;
        synchronized (this) {
            // If the GUI terminated the job then it is waiting for this event.
            notifyAll();
        }
    }

    public void crawlEnded(String sExitMessage) {
        if (this.running) {
            startNextJob();
        }
    }

    public void crawlPausing(String statusMessage) {
        // TODO Auto-generated method stub
    }

    public void crawlPaused(String statusMessage) {
        // TODO Auto-generated method stub
    }

    public void crawlResuming(String statusMessage) {
        // TODO Auto-generated method stub
    }

    public void crawlCheckpoint(File checkpointDir) throws Exception {
        // TODO Auto-generated method stub
    }
}
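
The listing above exposes a small job-control surface (startCrawler, stopCrawler, isCrawling, kickUpdate, importUri). Below is a minimal usage sketch built only from those visible methods. The import path (org.archive.crawler.admin) and the assumption that a ready-made handler instance is supplied by the surrounding application are not shown in this excerpt; in Heritrix the handler is normally created by the admin console rather than constructed by hand.

// A minimal sketch of driving the handler's job-control API, using only
// methods visible in this listing. The package name and the source of the
// 'handler' instance are assumptions.
import org.archive.crawler.admin.CrawlJobHandler;

public class HandlerDemo {
    // 'handler' is assumed to be supplied by the surrounding application.
    static void runPendingJobs(CrawlJobHandler handler) throws Exception {
        handler.startCrawler();        // allow queued jobs to start crawling
        if (handler.isCrawling()) {
            // Schedule one extra seed URI into the running job.
            handler.importUri("http://example.com/", false, true);
            handler.kickUpdate();      // forward a 'kick' to the current job
        }
        handler.stopCrawler();         // current job finishes; no new jobs start
    }
}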
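loadOptions is the one static, self-contained entry point on this page, so it can be exercised directly. A hedged sketch follows; the enclosing class is assumed to be CrawlJobHandler per the file name, and the option file name "Processor.options" is a hypothetical example, not taken from this excerpt.

// Sketch of calling the static loadOptions method shown above. loadOptions
// resolves "modules/<file>" on the CLASSPATH, trims each line, and skips
// blank lines and lines containing '#'.
import java.io.IOException;
import java.util.ArrayList;

public class OptionsDemo {
    public static void main(String[] args) throws IOException {
        // "Processor.options" is a hypothetical file name for illustration.
        ArrayList<String> modules = CrawlJobHandler.loadOptions("Processor.options");
        for (String module : modules) {
            System.out.println(module);
        }
    }
}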
