⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 psuckerthread.java

📁 Java Crawler with domain knowledge path
💻 JAVA
字号:
package ie.vnit.webcrawler;

import ie.vnit.threads.*;
import java.net.*;
import java.io.*;
import java.util.Vector;

public class PSuckerThread extends ControllableThread {
	public void process(Object o) {
		// The objects that we're dealing with here a strings for urls
		try {
			URL pageURL = (URL) o;
 
			// See if it's a jpeg, mpeg or avi

			String filename = pageURL.getFile().toLowerCase();
			if (filename.endsWith(".jpg") ||
				filename.endsWith(".jpeg")||
				filename.endsWith(".mpeg") ||
				filename.endsWith(".mpg") ||
				filename.endsWith(".avi") ||
				filename.endsWith(".wmv")) {
				filename = filename.replace('/', '-');
				filename = ((URLQueue) queue).getFilenamePrefix() +
					pageURL.getHost() + filename;
				System.out.println("Saving to file " + filename);
				try {
					SaveURL.writeURLtoFile(pageURL, filename);
				} catch (Exception e) {
					System.out.println("Saving to file " + filename + " from URL " + pageURL.toString() + " failed due to a " + e.toString());
				}
				return;
			}

			// If it's neither a jpg nor some text, it's not interesting.
 			String mimetype = pageURL.openConnection().getContentType();
            if (!mimetype.startsWith("text")) return;

			String rawPage = SaveURL.getURL(pageURL);
            String smallPage = rawPage.toLowerCase().replaceAll("\\s", " ");
			// treat the url a a html file and try to extract links
			Vector links = SaveURL.extractLinks(rawPage, smallPage);
			// Convert each link text to a url and enque
			for (int n = 0; n < links.size(); n++) {
				try {
					// urls might be relative to current page
					URL link = new URL(pageURL,
									   (String) links.elementAt(n));
					// If layers are not used, write everything into same layer
					if (tc.getMaxLevel() == -1)
						queue.push(link, level);
					else
						queue.push(link, level + 1);
				} catch (MalformedURLException e) {
					// Ignore malformed URLs, the link extractor might
					// have failed.
				}
			}
		} catch (Exception e) {
			// e.printStackTrace();
			// process of this object has failed, but we just ignore it here
		}
	}
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -