MemoryWorkloadManager.java
/*
* Encog Neural Network and Bot Library for Java v1.x
* http://www.heatonresearch.com/encog/
* http://code.google.com/p/encog-java/
*
* Copyright 2008, Heaton Research Inc., and individual contributors.
* See the copyright.txt in the distribution for a full listing of
* individual contributors.
*
* This is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1 of
* the License, or (at your option) any later version.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this software; if not, write to the Free
* Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
* 02110-1301 USA, or see the FSF site: http://www.fsf.org.
*/
package org.encog.bot.spider.workload.memory;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;
import org.encog.bot.spider.Spider;
import org.encog.bot.spider.workload.WorkloadError;
import org.encog.bot.spider.workload.WorkloadManager;
/**
* MemoryWorkloadManager: This class implements a workload manager that stores
* the list of URLs in memory. This workload manager only supports spidering
* against a single host. For multiple hosts, use the SQLWorkloadManager.
*/
public class MemoryWorkloadManager implements WorkloadManager {
/**
* How many seconds to wait for work.
*/
public static final int WAIT_FOR_WORK = 5;
/**
* The current workload, a map between URL and URLStatus objects.
*/
private final Map<URL, URLStatus> workload = new HashMap<URL, URLStatus>();
/**
* The URLs, already present in the workload, that are waiting to be
* processed.
*/
private final BlockingQueue<URL> waiting = new LinkedBlockingQueue<URL>();
/**
* How many URLs are currently being processed.
*/
private int workingCount = 0;
/**
* Because the MemoryWorkloadManager only supports a single host, the
* currentHost is set to the host of the first URL added.
*/
private String currentHost;
/**
* Add the specified URL to the workload.
*
* @param url
* The URL to be added.
* @param source
* The page that contains this URL.
* @param depth
* The depth of this URL.
* @return True if the URL was added, false otherwise.
*/
public boolean add(final URL url, final URL source, final int depth) {
if (!contains(url)) {
this.waiting.add(url);
setStatus(url, source, URLStatus.Status.WAITING, depth);
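// Lock onto the host of the first URL added; this manager only spiders a single host.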
if (this.currentHost == null) {
this.currentHost = url.getHost().toLowerCase();
}
return true;
}
return false;
}
/**
* Clear the workload.
*/
public void clear() {
this.workload.clear();
this.waiting.clear();
this.workingCount = 0;
}
/**
* Determine if the workload contains the specified URL.
*
* @param url
* The URL to check.
* @return True if the URL is contained by the workload.
*/
public boolean contains(final URL url) {
return this.workload.containsKey(url);
}
/**
* Convert the specified String to a URL. If the String is malformed, throw
* a WorkloadError.
*
* @param url
* A String to convert into a URL.
* @return The URL.
*/
public URL convertURL(final String url) {
try {
return new URL(url);
} catch (final MalformedURLException e) {
throw new WorkloadError(e);
}
}
/**
* Get the current host.
*
* @return The current host.
*/
public String getCurrentHost() {
return this.currentHost;
}
/**
* Get the depth of the specified URL.
*
* @param url
* The URL to get the depth of.
* @return The depth of the specified URL.
*/
public int getDepth(final URL url) {
final URLStatus s = this.workload.get(url);
assert s != null;
return s.getDepth();
}
/**
* Get the source page that contains the specified URL.
*
* @param url
* The URL to seek the source for.
* @return The source of the specified URL.
*/
public URL getSource(final URL url) {
final URLStatus s = this.workload.get(url);
if (s == null) {
return null;
}
return s.getSource();
}
/**
* Get a new URL to work on. Wait if there are no URLs currently available.
* Return null if done with the current host. The URL being returned will be
* marked as in progress.
*
* @return The next URL to work on, or null if no work became available.
*/
public URL getWork() {
URL url;
try {
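// Block for up to WAIT_FOR_WORK seconds; a null result means no URL became available.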
url = this.waiting.poll(WAIT_FOR_WORK, TimeUnit.SECONDS);
if (url != null) {
setStatus(url, null, URLStatus.Status.WORKING, -1);
this.workingCount++;
}
return url;
} catch (final InterruptedException e) {
return null;
}
}
/**
* Set up this workload manager for the specified spider. This method is not
* used by the MemoryWorkloadManager.
*
* @param spider
* The spider using this workload manager.
*/
public void init(final Spider spider) {
}
/**
* Mark the specified URL as error.
*
* @param url
* The URL that had an error.
*/
public void markError(final URL url) {
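// The URL is no longer in progress; remove any queued copy and record the error.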
this.workingCount--;
assert this.workingCount >= 0;
this.waiting.remove(url);
setStatus(url, null, URLStatus.Status.ERROR, -1);
}
/**
* Mark the specified URL as successfully processed.
*
* @param url
* The URL to mark as processed.
*/
public void markProcessed(final URL url) {
this.workingCount--;
assert this.workingCount >= 0;
this.waiting.remove(url);
setStatus(url, null, URLStatus.Status.PROCESSED, -1);
}
/**
* Move on to process the next host. This should only be called after
* getWork returns null. Because the MemoryWorkloadManager supports only a
* single host, this method simply returns null.
*
* @return The name of the next host.
*/
public String nextHost() {
return null;
}
/**
* Set up the workload so that it can be resumed from where the last spider
* left off. Not supported by the MemoryWorkloadManager.
*/
public void resume() {
throw new WorkloadError(
"Memory based workload managers can not resume.");
}
/**
* Set the source, status and depth for the specified URL.
*
* @param url
* The URL to set.
* @param source
* The source of this URL.
* @param status
* The status of this URL.
* @param depth
* The depth of this URL.
*/
private void setStatus(final URL url, final URL source,
final URLStatus.Status status, final int depth) {
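// Create a status record the first time a URL is seen; a null source or a depth of -1 leaves that field unchanged.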
URLStatus s = this.workload.get(url);
if (s == null) {
s = new URLStatus();
this.workload.put(url, s);
}
s.setStatus(status);
if (source != null) {
s.setSource(source);
}
if (depth != -1) {
s.setDepth(depth);
}
}
/**
* If there is currently no work available, then wait until a new URL has
* been added to the workload. Because the MemoryWorkloadManager uses a
* blocking queue, this method is not needed. It is implemented to support
* the interface.
*
* @param time
* The amount of time to wait.
* @param length
* The time unit being used.
*/
public void waitForWork(final int time, final TimeUnit length) {
}
/**
* Return true if there are no more workload units.
*
* @return Returns true if there are no more workload units.
*/
public boolean workloadEmpty() {
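// The workload is empty only when nothing is waiting and nothing is currently being worked on.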
if (!this.waiting.isEmpty()) {
return false;
}
return this.workingCount < 1;
}
}
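A minimal usage sketch of this class follows; it is not part of the original file. The seed URL and the download/parse step are placeholders, and everything else uses only the methods defined above: add() seeds the workload, getWork() blocks briefly for the next URL, markProcessed() and markError() settle each URL, and workloadEmpty() reports when nothing is waiting or in progress.

import java.net.URL;

import org.encog.bot.spider.workload.memory.MemoryWorkloadManager;

public class MemoryWorkloadSketch {
    public static void main(final String[] args) throws Exception {
        final MemoryWorkloadManager manager = new MemoryWorkloadManager();

        // Seed the workload; the host of this first URL becomes the only host spidered.
        manager.add(new URL("http://www.heatonresearch.com/"), null, 0);

        // getWork() returns null once no URL arrives within WAIT_FOR_WORK seconds.
        URL url;
        while ((url = manager.getWork()) != null) {
            try {
                // Placeholder: download the page here and call manager.add()
                // for each same-host link discovered on it.
                manager.markProcessed(url);
            } catch (final Exception e) {
                manager.markError(url);
            }
        }

        System.out.println("Workload empty: " + manager.workloadEmpty());
    }
}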