📄 crawler.java
字号:
import java.io.*;
import java.net.*;
import java.util.*;
import java.util.regex.*;
public class Crawler extends Thread {
// Per-crawler filter strings; each one defaults to ".+".
// NOTE(review): presumably regular expressions (the static entry points build ".*?...*?" patterns for them) — confirm where they are applied.
String memoryFilter=".+", diskFilter=".+", typeFilter=".+";
// Project-declared UrlMap instance; not initialized at declaration.
// NOTE(review): presumably the set of known/pending URLs — confirm against UrlMap.
UrlMap urlMap;
// urlIdx and pages start at 0; maxPages defaults to 99999999 (the same default main() uses).
// NOTE(review): presumably "index of the next URL", "page limit" and "pages fetched so far" — confirm.
int urlIdx = 0, maxPages=99999999, pages = 0;
// NOTE(review): looks like the base directory for downloaded content — confirm.
String root = "sites/";
/**
 * Entry point: reads a crawl configuration file and starts crawling.
 *
 * <p>The configuration file is {@code args[0]} when given, otherwise
 * {@code "Crawler.sites"}. Its text is searched for the {@code <proxy>},
 * {@code <maxPages>}, {@code <siteList>} and filter tags. When the file name
 * ends with {@code ".sites"}, each listed line is crawled on its own through
 * {@code siteCrawing} (using the part of the line before the first tab);
 * otherwise all listed lines are passed to {@code crawing} together with the
 * {@code <memoryFilter>}, {@code <diskFilter>} and {@code <typeFilter>} values.
 *
 * @param args optional; {@code args[0]} is the configuration file path
 * @throws IllegalArgumentException if the file has no {@code <siteList>} section
 * @throws NumberFormatException if {@code <maxPages>} is present but not an integer
 * @throws Exception anything thrown while reading the file or crawling
 */
public static void main(String[] args) throws Exception {
    String file = "Crawler.sites";
    if (args.length > 0) file = args[0];
    String xml = STR.file2text(file);
    setProxy(STR.innerText(xml, "<proxy>", "</proxy>"));

    int maxPages = 99999999;
    String maxPageStr = STR.innerText(xml, "<maxPages>", "</maxPages>");
    // Tag content may carry surrounding whitespace/newlines; parseInt rejects those,
    // so trim first and treat an empty tag like a missing one.
    if (maxPageStr != null && maxPageStr.trim().length() > 0) {
        maxPages = Integer.parseInt(maxPageStr.trim());
    }

    // innerText is null-checked elsewhere in this file, so a missing section is
    // reported explicitly instead of surfacing as a bare NullPointerException.
    String siteListText = STR.innerText(xml, "<siteList>", "</siteList>");
    if (siteListText == null) {
        throw new IllegalArgumentException("No <siteList> section found in " + file);
    }
    String[] sites = splitSiteLines(siteListText);

    String typeFilter = STR.innerText(xml, "<typeFilter>", "</typeFilter>");
    if (file.endsWith(".sites")) {
        for (int i = 0; i < sites.length; i++) {
            System.out.println("sites[i]="+sites[i]);
            // Only the text before the first tab is the site URL.
            String site = STR.head(sites[i], "\t");
            siteCrawing(site, typeFilter, maxPages);
        }
    } else {
        String memoryFilter = STR.innerText(xml, "<memoryFilter>", "</memoryFilter>");
        String diskFilter = STR.innerText(xml, "<diskFilter>", "</diskFilter>");
        crawing("url.txt", sites, memoryFilter, diskFilter, typeFilter, maxPages);
    }
}

/**
 * Splits the site-list text into one entry per non-blank line.
 * Accepts both LF and CRLF line endings so no entry keeps a trailing '\r'.
 */
private static String[] splitSiteLines(String siteListText) {
    String[] lines = siteListText.trim().split("\r?\n");
    int kept = 0;
    for (int i = 0; i < lines.length; i++) {
        if (lines[i].trim().length() > 0) lines[kept++] = lines[i];
    }
    String[] result = new String[kept];
    System.arraycopy(lines, 0, result, 0, kept);
    return result;
}
/**
 * Crawls a single site, using a filter derived from the site's own address
 * as both the memory filter and the disk filter.
 *
 * <p>The filter is {@code ".*?" + <site without scheme and leading "www."> + ".*?"}.
 * The site text is inserted as a quoted literal so that characters such as
 * {@code .}, {@code ?} or {@code +} in the URL are not interpreted as regex
 * operators. Both {@code http://} and {@code https://} prefixes are removed,
 * and {@code www.} is removed only when it is the leading host label.
 *
 * <p>The URL list file passed to {@code crawing} is {@code url.txt} under the
 * first path component of {@code UrlMap.url2path(site)}, joined with a backslash.
 * NOTE(review): the backslash separator looks Windows-specific — confirm before
 * running on other platforms.
 *
 * @param site       start URL of the site to crawl
 * @param typeFilter type filter forwarded unchanged to {@code crawing}; may be null
 * @param maxPages   page limit forwarded unchanged to {@code crawing}
 * @throws Exception anything thrown by {@code crawing}
 */
public static void siteCrawing(String site, String typeFilter, int maxPages) throws Exception {
    // Strip the scheme and a leading "www." (anchored, with a literal dot) so the
    // filter covers the site with or without "www" and under either scheme.
    String host = site.trim()
            .replaceFirst("(?i)^https?://", "")
            .replaceFirst("(?i)^www\\.", "");
    String pathFilter = ".*?" + Pattern.quote(host) + ".*?";
    String memoryFilter = pathFilter;
    String diskFilter = pathFilter;
    String[] urls = { site };
    System.out.println("siteCrawing : "+site);
    crawing(STR.head(UrlMap.url2path(site)+"/", "/")+"\\url.txt", urls, memoryFilter, diskFilter, typeFilter, maxPages);
}
public static void crawing(String urlFile, String[] urls, String memoryFilter, String diskFilter, String typeFilter, int maxPages) throws Exception {
if (memoryFilter == null) memoryFilter = ".+";
if (diskFilter == null) diskFilter = ".+";
if (typeFilter == null) typeFilter = ".+";
Crawler crawler = new Crawler(urlFile, memoryFilter, diskFilter, typeFilter, maxPages); //
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -