📄 crawljobhandler.java
字号:
* written to disk. */
    public void discardNewJob() {
        FileUtils.deleteDir(new File(newJob.getSettingsDirectory()));
    }

    /**
     * Get the handler's 'new job'.
     *
     * @return the handler's 'new job'
     */
    public CrawlJob getNewJob() {
        return newJob;
    }

    /**
     * Is the crawler accepting crawl jobs to run?
     *
     * @return True if the next available CrawlJob will be crawled. False
     * otherwise.
     */
    public boolean isRunning() {
        return running;
    }

    /**
     * Is a crawl job being crawled?
     *
     * @return True if a job is actually being crawled (even if it is paused).
     * False if no job is being crawled.
     */
    public boolean isCrawling() {
        return this.currentJob != null;
    }

    /**
     * Allow jobs to be crawled.
     *
     * If there is at least one pending job and nothing is being crawled, the
     * next job is started right away.
     */
    public void startCrawler() {
        running = true;
        if (pendingCrawlJobs.size() > 0 && !isCrawling()) {
            // Ok, can just start the next job
            startNextJob();
        }
    }

    /**
     * Stop future jobs from being crawled.
     *
     * This action will not affect the current job.
     */
    public void stopCrawler() {
        running = false;
    }

    /**
     * Start next crawl job.
     *
     * If a job is already running this method will do nothing.
     *
     * <p>The actual work is done by {@link #startNextJobInternal()} on a
     * freshly created thread named "StartNextJob". If a previous starter
     * thread exists, this method first waits for it to finish. If the wait
     * is interrupted, the interrupt status is restored and no new starter
     * thread is created.
     */
    protected final void startNextJob() {
        synchronized (this) {
            if (startingNextJob != null) {
                try {
                    startingNextJob.join();
                } catch (InterruptedException e) {
                    e.printStackTrace();
                    // Restore the interrupt status so that callers further up
                    // the stack can still observe the interruption.
                    Thread.currentThread().interrupt();
                    return;
                }
            }
            startingNextJob = new Thread(new Runnable() {
                public void run() {
                    startNextJobInternal();
                }
            }, "StartNextJob");
            startingNextJob.start();
        }
    }

    /**
     * Takes the first pending job, makes it the current job and requests
     * that its crawl be started.
     *
     * Does nothing if there are no pending jobs or a job is already being
     * crawled. If setting up the job fails with an
     * {@link InitializationException}, the current job is cleared and the
     * next pending job (if any) is tried.
     */
    protected void startNextJobInternal() {
        if (pendingCrawlJobs.size() == 0 || isCrawling()) {
            // No job ready or already crawling.
            return;
        }
        this.currentJob = (CrawlJob) pendingCrawlJobs.first();
        assert pendingCrawlJobs.contains(currentJob)
            : "pendingCrawlJobs is in an illegal state";
        pendingCrawlJobs.remove(currentJob);
        try {
            this.currentJob.setupForCrawlStart();
            // This is ugly but needed so I can clear the currentJob
            // reference in the crawlEnding and update the list of completed
            // jobs. Also, crawlEnded can startup next job.
            this.currentJob.getController().addCrawlStatusListener(this);
            // now, actually start
            this.currentJob.getController().requestCrawlStart();
        } catch (InitializationException e) {
            // Setup failed: hand the job's state file to loadJob, drop the
            // job as current and move on.
            loadJob(getStateJobFile(this.currentJob.getDirectory()));
            this.currentJob = null;
            startNextJobInternal(); // Load the next job if there is one.
        }
    }

    /**
     * Forward a 'kick' update to current job if any.
     */
    public void kickUpdate() {
        if (this.currentJob != null) {
            this.currentJob.kickUpdate();
        }
    }

    /**
     * Loads options from a file. Typically these are a list of available
     * modules that can be plugged into some part of the configuration.
     * For examples Processors, Frontiers, Filters etc. Leading and trailing
     * spaces are trimmed from each line. Empty lines and lines containing a
     * '#' character are skipped.
     *
     * <p>Options are loaded from the CLASSPATH, from every resource named
     * <code>modules/&lt;file&gt;</code> that the class loader can find.
     *
     * @param file the name of the option file (without path!)
     * @return The option file with each option line as a separate entry in the
     * ArrayList.
     * @throws IOException when there is trouble reading the file, or when no
     * such file is found on the CLASSPATH.
     */
    public static ArrayList<String> loadOptions(String file)
    throws IOException {
        ArrayList<String> ret = new ArrayList<String>();
        Enumeration<URL> resources =
            CrawlJob.class.getClassLoader().getResources("modules/" + file);

        boolean noFileFound = true;
        while (resources.hasMoreElements()) {
            InputStream is = resources.nextElement().openStream();
            noFileFound = false;

            String line = null;
            BufferedReader bf =
                new BufferedReader(new InputStreamReader(is), 8192);
            try {
                while ((line = bf.readLine()) != null) {
                    line = line.trim();
                    if (line.indexOf('#') < 0 && line.length() > 0) {
                        // Looks like a valid line.
                        ret.add(line);
                    }
                }
            } finally {
                // Closing the reader also closes the underlying stream.
                bf.close();
            }
        }

        if (noFileFound) {
            throw new IOException("Failed to get " + file + " from the " +
                " CLASSPATH");
        }

        return ret;
    }

    /**
     * Returns a URIFrontierMarker for the current, paused, job. If there is no
     * current job or it is not paused null will be returned.
     *
     * @param regexpr
     *            A regular expression that each URI must match in order to be
     *            considered 'within' the marker.
     * @param inCacheOnly
     *            Limit marker scope to 'cached' URIs.
     * @return a URIFrontierMarker for the current job.
     * @see #getPendingURIsList(FrontierMarker, int, boolean)
     * @see org.archive.crawler.framework.Frontier#getInitialMarker(String,
     *      boolean)
     * @see org.archive.crawler.framework.FrontierMarker
     */
    public FrontierMarker getInitialMarker(String regexpr,
            boolean inCacheOnly) {
        return (this.currentJob != null)?
            this.currentJob.getInitialMarker(regexpr, inCacheOnly): null;
    }

    /**
     * Returns the frontiers URI list based on the provided marker. This method
     * will return null if there is no current job or if the current job is
     * not paused. Only when there is a paused current job will this method
     * return a URI list.
     *
     * @param marker
     *            URIFrontier marker
     * @param numberOfMatches
     *            maximum number of matches to return
     * @param verbose
     *            should detailed info be provided on each URI?
     * @return the frontiers URI list based on the provided marker
     * @throws InvalidFrontierMarkerException
     *             When marker is inconsistent with the current state of the
     *             frontier.
     * @see #getInitialMarker(String, boolean)
     * @see org.archive.crawler.framework.FrontierMarker
     */
    public ArrayList getPendingURIsList(FrontierMarker marker,
            int numberOfMatches, boolean verbose)
    throws InvalidFrontierMarkerException {
        return (this.currentJob != null)?
            this.currentJob.getPendingURIsList(marker, numberOfMatches,
                verbose): null;
    }

    /**
     * Delete any URI from the frontier of the current (paused) job that match
     * the specified regular expression. If the current job is not paused (or
     * there is no current job) nothing will be done.
     *
     * @param regexpr Regular expression to delete URIs by.
     * @return the number of URIs deleted
     */
    public long deleteURIsFromPending(String regexpr) {
        return (this.currentJob != null)?
            this.currentJob.deleteURIsFromPending(regexpr): 0;
    }

    /**
     * Convenience overload of {@link #importUris(String, String, boolean)}
     * that takes the force flag as a string.
     *
     * @param file Name of file w/ seeds.
     * @param style What style of seeds.
     * @param force Treated as true only if it equals the string
     * <code>"true"</code>; anything else (including null) means false.
     * @return A display string that has a count of all added.
     */
    public String importUris(String file, String style, String force) {
        return importUris(file, style, "true".equals(force));
    }

    /**
     * Imports URIs into the current job. Returns null if there is no
     * current job.
     *
     * @param fileOrUrl Name of file w/ seeds.
     * @param style What style of seeds -- crawl log (<code>crawlLog</code>
     * style) or recovery journal (<code>recoveryJournal</code> style), or
     * seeds file style (Pass <code>default</code> style).
     * @param forceRevisit Should we revisit even if seen before?
     * @return A display string that has a count of all added.
     */
    public String importUris(final String fileOrUrl, final String style,
            final boolean forceRevisit) {
        return (this.currentJob != null)?
            this.currentJob.importUris(fileOrUrl, style, forceRevisit): null;
    }

    /**
     * Imports URIs read from a stream into the current job.
     *
     * @param is Stream to read the URIs from.
     * @param style What style of seeds.
     * @param forceRevisit Should we revisit even if seen before?
     * @return Whatever the current job's import returns, or 0 if there is no
     * current job.
     */
    protected int importUris(InputStream is, String style,
            boolean forceRevisit) {
        return (this.currentJob != null)?
            this.currentJob.importUris(is, style, forceRevisit): 0;
    }

    /**
     * Schedule a uri. Same as calling
     * {@link #importUri(String, boolean, boolean, boolean)} with
     * <code>isFlush</code> set to true.
     *
     * @param uri Uri to schedule.
     * @param forceFetch Should it be forcefetched.
     * @param isSeed True if seed.
     * @throws URIException
     */
    public void importUri(final String uri, final boolean forceFetch,
            final boolean isSeed)
    throws URIException {
        importUri(uri, forceFetch, isSeed, true);
    }

    /**
     * Schedule a uri. Does nothing if there is no current job.
     *
     * @param str String that can be: 1. a UURI, 2. a snippet of the
     * crawl.log line, or 3. a snippet from recover log.  See
     * {@link #importUris(InputStream, String, boolean)} for how it subparses
     * the lines from crawl.log and recover.log.
     * @param forceFetch Should it be forcefetched.
     * @param isSeed True if seed.
     * @param isFlush If true, flush the frontier IF it implements
     * flushing.
     * @throws URIException
     */
    public void importUri(final String str, final boolean forceFetch,
            final boolean isSeed, final boolean isFlush)
    throws URIException {
        if (this.currentJob != null) {
            this.currentJob.importUri(str, forceFetch, isSeed, isFlush);
        }
    }

    /**
     * Flushes the current job, if there is one.
     *
     * Original note: if it's a HostQueuesFrontier, it needs to be flushed
     * for the queued.
     */
    protected void doFlush() {
        if (this.currentJob != null) {
            this.currentJob.flush();
        }
    }

    /**
     * If a job is being crawled, passes the current job's UID to
     * <code>deleteJob</code>. Does nothing otherwise.
     */
    public void stop() {
        if (isCrawling()) {
            deleteJob(getCurrentJob().getUID());
        }
    }

    /**
     * Asks the current job, if any, to stop crawling.
     */
    public void requestCrawlStop() {
        if (this.currentJob != null) {
            this.currentJob.stopCrawling();
        }
    }

    /**
     * Ensure order file with new name/desc is written.
     * See '[ 1066573 ] sometimes job based-on other job uses older job name'.
     *
     * @param newJob Newly created job.
     * @param metaname Metaname for new job.
     * @param description Description for new job.
     * @return <code>newJob</code>
     */
    public static CrawlJob ensureNewJobWritten(CrawlJob newJob,
            String metaname, String description) {
        XMLSettingsHandler settingsHandler = newJob.getSettingsHandler();
        CrawlerSettings orderfile = settingsHandler.getSettingsObject(null);
        orderfile.setName(metaname);
        orderfile.setDescription(description);
        settingsHandler.writeSettingsObject(orderfile);
        return newJob;
    }

    /**
     * Crawl status callback; intentionally does nothing.
     *
     * @param message Status message (unused).
     */
    public void crawlStarted(String message) {
        // Nothing to do.
    }

    /**
     * Crawl status callback: clears the current job and wakes up every
     * thread waiting on this handler.
     *
     * @param sExitMessage Exit message (unused).
     */
    public void crawlEnding(String sExitMessage) {
        // Work on a snapshot and guard against null, as every other method
        // in this class does, so a missing current job cannot abort the
        // callback before waiters are notified.
        CrawlJob endingJob = this.currentJob;
        if (endingJob != null) {
            loadJob(getStateJobFile(endingJob.getDirectory()));
        }
        currentJob = null;
        synchronized (this) {
            // If the GUI terminated the job then it is waiting for this event.
            notifyAll();
        }
    }

    /**
     * Crawl status callback: starts the next job if the handler is still
     * accepting jobs.
     *
     * @param sExitMessage Exit message (unused).
     */
    public void crawlEnded(String sExitMessage) {
        if (this.running) {
            startNextJob();
        }
    }

    /**
     * Crawl status callback; intentionally does nothing.
     *
     * @param statusMessage Status message (unused).
     */
    public void crawlPausing(String statusMessage) {
        // Nothing to do.
    }

    /**
     * Crawl status callback; intentionally does nothing.
     *
     * @param statusMessage Status message (unused).
     */
    public void crawlPaused(String statusMessage) {
        // Nothing to do.
    }

    /**
     * Crawl status callback; intentionally does nothing.
     *
     * @param statusMessage Status message (unused).
     */
    public void crawlResuming(String statusMessage) {
        // Nothing to do.
    }

    /**
     * Crawl status callback; intentionally does nothing.
     *
     * @param checkpointDir Checkpoint directory (unused).
     * @throws Exception Declared by the signature; never thrown here.
     */
    public void crawlCheckpoint(File checkpointDir) throws Exception {
        // Nothing to do.
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -