📄 webrobot.java
字号:
* @return true if this task can be added to the task list,
* false otherwise
*/
protected boolean taskAddAllowed(RobotTask task) {
    // a missing task can never be queued
    if (task == null) {
        log.info("Null task not allowed");
        return false;
    }
    // accept the task only if its URL may be visited and the task is not
    // already waiting in the todo list (checked in exactly this order)
    return isAllowed(task.getUrl()) && !todo.contains(task);
}
/**
 * Decides whether the robot may travel to the given URL.
 * The URL has to pass {@link #basicURLCheck(URL)} first, then the
 * URLCheck (only if one is set) and finally the robots.txt check.
 *
 * @param u the URL to test
 * @return true if traveling to this URL is allowed, false otherwise
 */
protected boolean isAllowed(URL u) {
    // the basic checks come first; their failure is logged by basicURLCheck
    if (!basicURLCheck(u)) {
        return false;
    }
    // an URLCheck is optional, it is only consulted if one was configured
    boolean rejectedByUrlCheck = (urlCheck != null) && !urlCheck.checkURL(u);
    if (rejectedByUrlCheck) {
        log.debug("not allowed by URLCheck:" + u);
        return false;
    }
    if (!robCheck.ok(u)) {
        log.debug("not allowed by robots.txt:" + u);
        return false;
    }
    return true;
}
/**
 * Decides whether the given document may be processed.
 * Processing is refused if the URLCheck (if one is set) rejects the
 * document's URL for processing, or if the DownloadRuleSet of the
 * HttpTool (if one is set) rejects the document's HTTP headers.
 *
 * @param doc the document to test
 * @return true if processing of this document is allowed, false otherwise
 */
protected boolean isProcessingAllowed(HttpDoc doc) {
    URL u = doc.getURL();
    boolean urlRejected = (urlCheck != null) && !urlCheck.checkURLForProcessing(u);
    if (urlRejected) {
        log.debug("processing not allowed by URLCheck:" + u);
        return false;
    }
    DownloadRuleSet rules = httpTool.getDownloadRuleSet();
    if (rules == null) {
        // without a rule set there is nothing left that could forbid it
        return true;
    }
    if (rules.processAllowed(doc.getHttpHeaders())) {
        return true;
    }
    log.debug("processing not allowed by DownloadRuleSet:" + u);
    return false;
}
/**
 * Basic URL allow check.
 * It is allowed to walk to a new URL if one of the following holds
 * (tested in this order): <ul>
 * <li>WalkToOtherHost is true. In this case there will be no additional
 *  tests.</li>
 * <li>The new URL is located below the start URL, i.e. host + path of
 *  the new URL start with startDir. E.g. if the start URL
 *  is http://localhost/test, the URL http://localhost/test/index.html
 *  is allowed, but http://localhost/ is not allowed.</li>
 * <li>AllowWholeHost is true and the new URL is located on the same host
 *  as the start URL (compared ignoring case).</li>
 * <li>FlexibleHostCheck is true and the host part of the current URL
 *  is equal to the host part of the start URL modulo the prefix "www."
 *  </li>
 * <li>AllowWholeDomain is true and the host of the new URL is the domain
 *  of the start host (see getDomain) or a sub-host of that domain.</li>
 * <li>The URL (host + path) starts with a string in the "AllowedURLs"
 *  list.</li>
 * </ul>
 * If no rule matches, the rejection is logged on debug level.
 *
 * @param currURL the URL to test
 * @return true if one of the rules above allows the URL, false otherwise
 */
protected boolean basicURLCheck(URL currURL) {
    String currURLStr = currURL.getHost() + currURL.getPath();
    String currHost = currURL.getHost().toLowerCase();
    String startHost = startURL.getHost().toLowerCase();
    // no more checks, if walkToOtherHosts is true
    if (walkToOtherHosts) {
        return true;
    }
    // new URL below start URL ?
    if (currURLStr.startsWith(startDir)) {
        return true;
    }
    // on the same host ?
    if (allowWholeHost && (currURL.getHost().equalsIgnoreCase(startURL.getHost()))) {
        return true;
    }
    // on the same host with flexible test (host name with and without "www.")
    if (flexibleHostCheck) {
        if (cutWWW(currHost).equalsIgnoreCase(cutWWW(startHost))) {
            return true;
        }
    }
    // allow whole domain ?
    if (allowWholeDomain) {
        String startDomain = getDomain(startHost);
        // Match on a label boundary only: a plain endsWith(startDomain)
        // would also accept foreign hosts like "evilexample.com" for the
        // domain "example.com".
        if (currHost.equals(startDomain) || currHost.endsWith("." + startDomain)) {
            return true;
        }
    }
    // in the list of allowed URLs ?
    for (int i = 0; i < allowedURLs.size(); i++) {
        String s = (String) allowedURLs.elementAt(i);
        if (currURLStr.startsWith(s)) {
            return true;
        }
    }
    log.debug("URL " + currURLStr + " not allowed");
    return false;
}
/**
 * Strips a leading "www." (in any letter case) from a hostname.
 *
 * @param hostname some hostname
 * @return the hostname without its first four characters if it starts
 * with "www.", otherwise the unchanged hostname
 */
private String cutWWW(String hostname) {
    boolean hasWwwPrefix = hostname.toLowerCase().startsWith("www.");
    // 4 == length of the prefix "www."
    return hasWwwPrefix ? hostname.substring(4) : hostname;
}
/**
 * Gets the domain part of a given host name by cutting off everything
 * up to and including the first "." (e.g. "www.example.com" gives
 * "example.com").
 *
 * @param hostname some hostname
 * @return the part after the first ".", or the unchanged hostname if it
 * contains no "." at all
 */
private String getDomain(String hostname) {
    int firstDot = hostname.indexOf('.');
    // a host name without any dot should not happen; return it as it is
    return (firstDot < 0) ? hostname : hostname.substring(firstDot + 1);
}
/**
 * Gets the exception handler of the robot.
 *
 * @return the RobotExceptionHandler currently stored in this robot
 */
public RobotExceptionHandler getExceptionHandler() {
    return this.exceptionHandler;
}
/**
 * Sets the exception handler of the robot.
 * A null argument is ignored, the current handler stays in place.
 *
 * @param newExceptionHandler the new exception handler
 */
public void setExceptionHandler(RobotExceptionHandler newExceptionHandler) {
    if (newExceptionHandler == null) {
        // keep the handler we already have
        return;
    }
    this.exceptionHandler = newExceptionHandler;
}
/**
 * Sets the start URL from its String form.
 * If the String is not a valid URL, the stack trace of the
 * MalformedURLException is printed and the start URL is not changed
 * by this method.
 *
 * @param startURL the start URL as String
 */
public void setStart(String startURL) {
    try {
        URL parsedStart = new URL(startURL);
        setStartURL(parsedStart);
    } catch (MalformedURLException malformed) {
        malformed.printStackTrace();
    }
}
/**
 * Gets the start URL as String.
 *
 * @return the external form of the start URL, or null if getStartURL()
 * returns null
 */
public String getStart() {
    URL current = getStartURL();
    return (current == null) ? null : current.toExternalForm();
}
/**
 * Finishes the helper objects of this robot.
 * Calls finish() on httpTool, robCheck and docManager, in this order;
 * each of them is skipped if it is null.
 */
public void finish() {
    if (null != httpTool) {
        httpTool.finish();
    }

    if (null != robCheck) {
        robCheck.finish();
    }

    if (null != docManager) {
        docManager.finish();
    }
}
/**
 * Prints one assignment line (" robot.name = name;") for every declared
 * field of WebRobot that is neither final nor static. Each line is
 * padded with blanks to at least 50 characters and followed by the
 * field's type name as a comment.
 *
 * @param args ignored; a warning is printed to System.err if any are given
 */
public static void main(String[] args) {
    if (args.length > 0) {
        System.err.println("Arguments will be ignored!");
    }
    Field[] declared = WebRobot.class.getDeclaredFields();
    for (int idx = 0; idx < declared.length; idx++) {
        Field field = declared[idx];
        int mods = field.getModifiers();
        // constants and class-level fields are not part of the listing
        if (Modifier.isFinal(mods) || Modifier.isStatic(mods)) {
            continue;
        }
        String fieldName = field.getName();
        StringBuffer line = new StringBuffer(60);
        line.append(" robot.").append(fieldName).append(" = ").append(fieldName).append(";");
        // pad so that the type comments line up
        while (line.length() < 50) {
            line.append(" ");
        }
        System.out.println(line.toString() + "// (" + field.getType().getName() + ")");
    }
}
/** default expected count of documents */
private static final int DEFAULT_EXPECTED_DOCUMENT_COUNT = 50000;
/** expected count of documents; starts with DEFAULT_EXPECTED_DOCUMENT_COUNT */
protected int expectedDocumentCount = DEFAULT_EXPECTED_DOCUMENT_COUNT;
/**
 * remember visited content here (md5, urlString).
 * Not initialized at its declaration; getContentVisitedURL and
 * setContentVisitedURL synchronize on this map when accessing it.
 */
protected HashMap content2UrlMap;
/** counter for pages that were found in cache */
long countCache = 0;
/** counter for pages retrieved by web */
long countWeb = 0;
/** counter for pages that didn't need a refresh */
long countNoRefresh = 0;
/** counter for refreshed pages (=cache+web) */
long countRefresh = 0;
/**
 * Checks if the content of a document was visited before and retrieves
 * the corresponding URL. The lookup key is the document's content MD5;
 * the lookup itself is synchronized on content2UrlMap.
 *
 * @param doc the document whose content is looked up
 * @return found url or null if not found
 */
public String getContentVisitedURL(HttpDoc doc) {
    Object md5Key = doc.getContentMD5();
    synchronized (content2UrlMap) {
        return (String) content2UrlMap.get(md5Key);
    }
}
/**
 * Makes an URL retrievable by its content by entering it in
 * content2UrlMap under the document's content MD5. The insertion is
 * synchronized on content2UrlMap.
 *
 * @param doc the document whose content MD5 is used as key
 * @param url the URL to store for this content
 */
public void setContentVisitedURL(HttpDoc doc, String url) {
    final Object md5Key = doc.getContentMD5();
    synchronized (content2UrlMap) {
        content2UrlMap.put(md5Key, url);
    }
}
/**
 * Creates a RobotTask for the given URL after the waste parameters
 * were removed from it (see removeWasteParameters).
 *
 * @param url the URL of the task
 * @param maxDepth passed on to the RobotTask constructor
 * @param startReferer passed on to the RobotTask constructor
 * @return the new RobotTask
 */
private final RobotTask createRobotTask(URL url, int maxDepth, String startReferer) {
    URL cleanedUrl = removeWasteParameters(url);
    return new RobotTask(cleanedUrl, maxDepth, startReferer);
}
/** only true if form-handlers are defined */
boolean hasFormHandlers = false;
/** list of wasteParameters (names of parameters that will be removed from URLs); starts as an empty Vector */
protected Vector wasteParameters = new Vector();
/**
 * Sets the list of wasteParameters (will be removed from URLs).
 * The passed Vector is stored as it is, no copy is made.
 *
 * @param wasteParameters a Vector of Strings; a URL parameter is removed
 * if it starts with one of these Strings followed by "="
 */
public void setWasteParameters(Vector wasteParameters) { this.wasteParameters = wasteParameters; }
/**
 * Gets the list of wasteParameters (will be removed from URLs).
 * The internal Vector itself is returned, not a copy.
 *
 * @return a Vector containing Strings
 */
public Vector getWasteParameters() { return wasteParameters; }
/**
 * Removes the wasteParameters (e.g. ID) from an URL.
 * The external form of the URL is filtered with
 * removeParametersFromString; only if that changed the String a new
 * URL object is built from it.
 *
 * @param url the URL to clean
 * @return a new URL without the waste parameters, or the passed URL
 * itself if nothing was removed or if the filtered String could not be
 * parsed as URL (in that case the stack trace is printed)
 */
public URL removeWasteParameters(URL url) {
    String urlString = url.toExternalForm();
    String newUrlString = removeParametersFromString(urlString, wasteParameters);
    // compare the contents, not the references: reference comparison
    // only worked as long as the filter returned the very same object
    if (urlString.equals(newUrlString)) {
        return url;
    }
    try {
        return new URL(newUrlString);
    } catch (MalformedURLException ex) {
        // best effort: keep working with the unfiltered URL
        ex.printStackTrace();
        return url;
    }
}
/**
 * Removes the passed parameters from the query part of an URL String.
 * The query part is everything between the first "?" and the following
 * "#" (or the end of the String). It is split at "&amp;" and every piece
 * that starts with one of the waste parameter names followed by "=" is
 * dropped. A part starting at "#" is appended again unchanged.
 *
 * @param urlString the URL as String
 * @param wasteParameters a Vector of Strings with the names of the
 * parameters to remove; null or empty means nothing is removed
 * @return the filtered String if at least one parameter was removed,
 * otherwise the passed urlString itself
 */
public static String removeParametersFromString(String urlString, Vector wasteParameters) {
    if (wasteParameters == null || wasteParameters.isEmpty()) {
        return urlString;
    }
    int queryStart = urlString.indexOf('?');
    if (queryStart <= 0) {
        // no "?" at all, or the String starts with it
        return urlString;
    }

    // split into query and the part starting at "#" (if there is one)
    int fragmentStart = urlString.indexOf('#', queryStart);
    String query;
    String fragment;
    if (fragmentStart >= 0) {
        query = urlString.substring(queryStart + 1, fragmentStart);
        fragment = urlString.substring(fragmentStart);
    } else {
        query = urlString.substring(queryStart + 1);
        fragment = null;
    }

    StringBuffer result = new StringBuffer(urlString.substring(0, queryStart));
    String separator = "?";
    boolean removedAny = false;
    StringTokenizer pieces = new StringTokenizer(query, "&");
    while (pieces.hasMoreTokens()) {
        String piece = pieces.nextToken();
        boolean waste = false;
        for (int i = 0; i < wasteParameters.size() && !waste; i++) {
            String name = (String) wasteParameters.elementAt(i);
            waste = piece.startsWith(name + "=");
        }
        if (waste) {
            removedAny = true;
            continue;
        }
        result.append(separator).append(piece);
        // the first kept piece follows "?", all others follow "&"
        separator = "&";
    }
    if (fragment != null) {
        result.append(fragment);
    }
    return removedAny ? result.toString() : urlString;
}
/** time of WebRobot start in milliseconds (System.currentTimeMillis() at field initialization) */
protected long startTime = System.currentTimeMillis();
/** number of allowed retries for document retrieval; 0 unless changed via setMaxRetries */
protected int maxRetries = 0;
/**
 * Sets the number of allowed retries for document retrieval.
 *
 * @param maxRetries the new number of allowed retries
 */
public void setMaxRetries(int maxRetries) {
    this.maxRetries = maxRetries;
}
/**
 * Gets the number of allowed retries for document retrieval.
 *
 * @return maxRetries
 */
public int getMaxRetries() {
    return this.maxRetries;
}
/**
 * Expiration age of documents in cache.
 * Documents older than expirationAge will be removed,
 * negative value means no limit. The initial value is -1 (no limit).
 * NOTE(review): the time unit is not visible in this part of the file - confirm.
 */
protected long expirationAge = -1;
/**
 * Sets the expiration age of documents in cache.
 * Documents older than expirationAge will be removed,
 * negative value means no limit.
 *
 * @param age the new expiration age
 */
public void setExpirationAge(long age) {
    this.expirationAge = age;
}
/**
 * Gets the expiration age of documents in cache.
 *
 * @return the current expiration age (negative value means no limit)
 */
public long getExpirationAge() {
    return this.expirationAge;
}
/**
 * Removes all parameters from an URL String, i.e. cuts it off at the
 * first "?".
 *
 * @param url the URL as String
 * @return the part before the first "?", or the unchanged String if it
 * contains no "?"
 */
private final static String removeParameters(String url) {
    int queryStart = url.indexOf('?');
    if (queryStart < 0) {
        return url;
    }
    return url.substring(0, queryStart);
}
/**
 * Reads a File completely to a byte array.
 * The array is sized by the file length at the time of the call. The
 * stream is read in a loop, because a single read() is allowed to
 * deliver less bytes than requested. If the end of the file is reached
 * early, only the bytes that were really read are returned.
 *
 * @param file the file to read
 * @return the content of the file
 * @throws IOException if the file can not be opened or read, or if it is
 * too large to fit into a byte array
 */
protected byte[] readFileToByteArray(File file) throws IOException
{
    long length = file.length();
    if (length > Integer.MAX_VALUE)
    {
        // the cast to int below would silently overflow
        throw new IOException("File too large to read into a byte array: " + file);
    }
    byte[] buffer = new byte[(int) length];
    FileInputStream in = null;
    try
    {
        in = new FileInputStream(file);
        int offset = 0;
        while (offset < buffer.length)
        {
            int count = in.read(buffer, offset, buffer.length - offset);
            if (count < 0)
            {
                // end of file before the expected length was reached
                break;
            }
            offset += count;
        }
        if (offset < buffer.length)
        {
            // do not hand out the unfilled (zero) rest of the buffer
            byte[] truncated = new byte[offset];
            System.arraycopy(buffer, 0, truncated, 0, offset);
            return truncated;
        }
        return buffer;
    }
    finally
    {
        if (in != null)
        {
            try
            {
                in.close();
            }
            catch (IOException ignored)
            {
                // closing is best effort; the data was already read
            }
        }
    }
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -