PSuckerThread.java
package ie.vnit.webcrawler;

import ie.vnit.threads.*;
import java.net.*;
import java.io.*;
import java.util.Vector;

/**
 * Worker thread: takes URLs off the shared queue, saves image and video
 * files to disk, and parses text pages for further links to crawl.
 */
public class PSuckerThread extends ControllableThread {

    public void process(Object o) {
        // The objects we are dealing with here are URLs.
        try {
            URL pageURL = (URL) o;

            // See if it is an image or video file worth saving.
            String filename = pageURL.getFile().toLowerCase();
            if (filename.endsWith(".jpg") ||
                filename.endsWith(".jpeg") ||
                filename.endsWith(".mpeg") ||
                filename.endsWith(".mpg") ||
                filename.endsWith(".avi") ||
                filename.endsWith(".wmv")) {
                filename = filename.replace('/', '-');
                filename = ((URLQueue) queue).getFilenamePrefix() +
                           pageURL.getHost() + filename;
                System.out.println("Saving to file " + filename);
                try {
                    SaveURL.writeURLtoFile(pageURL, filename);
                } catch (Exception e) {
                    System.out.println("Saving to file " + filename + " from URL " +
                                       pageURL.toString() + " failed due to a " + e.toString());
                }
                return;
            }

            // If it is neither a media file nor text, it is not interesting.
            String mimetype = pageURL.openConnection().getContentType();
            if (mimetype == null || !mimetype.startsWith("text")) return;

            String rawPage = SaveURL.getURL(pageURL);
            String smallPage = rawPage.toLowerCase().replaceAll("\\s", " ");

            // Treat the URL as an HTML page and try to extract links from it.
            Vector links = SaveURL.extractLinks(rawPage, smallPage);

            // Convert each link text to a URL and enqueue it.
            for (int n = 0; n < links.size(); n++) {
                try {
                    // URLs might be relative to the current page.
                    URL link = new URL(pageURL, (String) links.elementAt(n));
                    // If levels are not limited, keep everything at the same level;
                    // otherwise the linked page is one level deeper.
                    if (tc.getMaxLevel() == -1)
                        queue.push(link, level);
                    else
                        queue.push(link, level + 1);
                } catch (MalformedURLException e) {
                    // Ignore malformed URLs; the link extractor might have failed.
                }
            }
        } catch (Exception e) {
            // e.printStackTrace();
            // Processing this object failed, but we just ignore it here.
        }
    }
}
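The SaveURL helper used above is not part of this listing. The sketch below is only an assumption about what its three methods might look like: writeURLtoFile streaming the response bytes to disk, getURL reading the response into a String, and extractLinks scanning the lower-cased copy of the page (so matching is case-insensitive) while cutting the link text out of the raw page so its original case is preserved. The real project class may differ, e.g. by also handling src attributes or single-quoted values.

package ie.vnit.webcrawler;

import java.io.*;
import java.net.URL;
import java.util.Vector;

public class SaveURL {

    // Copy the raw bytes behind a URL into a local file.
    public static void writeURLtoFile(URL url, String filename) throws IOException {
        InputStream in = url.openStream();
        OutputStream out = new FileOutputStream(filename);
        try {
            byte[] buffer = new byte[4096];
            int read;
            while ((read = in.read(buffer)) != -1) {
                out.write(buffer, 0, read);
            }
        } finally {
            in.close();
            out.close();
        }
    }

    // Read the content behind a URL into a String.
    public static String getURL(URL url) throws IOException {
        BufferedReader reader = new BufferedReader(new InputStreamReader(url.openStream()));
        StringBuffer page = new StringBuffer();
        String line;
        while ((line = reader.readLine()) != null) {
            page.append(line).append('\n');
        }
        reader.close();
        return page.toString();
    }

    // Find href="..." attributes: search in the lower-cased copy, but take the
    // substring from the raw page so the extracted link keeps its original case.
    public static Vector extractLinks(String rawPage, String lowerPage) {
        Vector links = new Vector();
        int pos = 0;
        while ((pos = lowerPage.indexOf("href=\"", pos)) != -1) {
            int start = pos + 6;
            int end = lowerPage.indexOf('"', start);
            if (end == -1) break;
            links.addElement(rawPage.substring(start, end));
            pos = end + 1;
        }
        return links;
    }
}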