// spider.java
// (The "字号:" / "font size" label that followed was web-viewer residue, not source.)
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.net.Socket;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Date;
import java.util.Deque;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Minimal single-host web spider.
 *
 * <p>Starting from the URL given as the first command-line argument, it downloads a page
 * over a plain socket on port 80 using HTTP/1.0, stores the response body under
 * {@code E:} at the same path as on the server, extracts every double-quoted token that
 * ends in {@code .html}, and repeats for each link that has not been seen before.
 */
public class Spider {
    /** Host name parsed from the start URL; used for the socket and the Host header. */
    static String host = "";
    /** Server path of the start page; always begins with '/' after {@code analyseURL}. */
    static String address = "";
    /** Matches a double-quoted, space-free token that ends in ".html". */
    static Pattern p = Pattern.compile("\"[^\\ ]+\\.html\"");
    /** Every page path registered so far (start page plus each discovered link). */
    static List<String> allURL = new ArrayList<String>();
    /** Number of pages whose local file was newly created. */
    static int ok = 0;
    /** Number of pages whose local file already existed before the download. */
    static int fal = 0;

    // The four fields below are no longer used by the crawler (per-request state is now
    // local to fetchPage). They are retained only so that any existing package-level
    // reference to them keeps compiling.
    static Matcher m = p.matcher("");
    static Socket s = null;
    static PrintWriter out = null;
    static BufferedReader br = null;

    /** Local directory prefix under which mirrored pages are stored. */
    private static final String OUTPUT_ROOT = "E:";
    private static final int HTTP_PORT = 80;
    /**
     * ISO-8859-1 maps every byte to exactly one char and back, so reading the response and
     * writing the file with it keeps the page bytes intact whatever the page's real
     * encoding is (the platform default charset could corrupt them).
     */
    private static final Charset BYTE_PRESERVING = StandardCharsets.ISO_8859_1;

    /**
     * Entry point: crawls from {@code args[0]} and prints start/end time and counters.
     *
     * @param args {@code args[0]} is the start URL, e.g. {@code http://example.com/index.html}
     * @throws Exception if the URL is missing/unusable or any network or file operation fails
     */
    public static void main(String[] args) throws Exception {
        System.out.println("运行开始时间:"+new Date());
        analyseURL(args);
        // Register the start page so that a link pointing back to it is not fetched again.
        allURL.add(address);
        fetch(address);
        System.out.println("运行结束时间:"+new Date());
        System.out.println("文件创建总数"+ok);
        System.out.println("文件失败总数"+fal);
        System.out.println("共有文件"+allURL.size());
    }

    /**
     * Crawls {@code address} and, transitively, every new link found on the fetched pages.
     *
     * <p>Pages are visited depth-first in document order using an explicit stack, so the
     * crawl depth is not limited by the Java call stack.
     *
     * @param address server path of the first page to download
     * @throws IOException if connecting, reading the response, or writing a file fails
     */
    private static void fetch(String address) throws IOException {
        Deque<String> pending = new ArrayDeque<String>();
        pending.push(address);
        while (!pending.isEmpty()) {
            List<String> discovered = fetchPage(pending.pop());
            // Push in reverse so the first link on the page is the next one popped.
            for (int i = discovered.size() - 1; i >= 0; i--) {
                pending.push(discovered.get(i));
            }
        }
    }

    /**
     * Downloads one page, saves its body locally and collects the new links it contains.
     *
     * @param pageAddress server path of the page
     * @return links found on the page that were not registered in {@link #allURL} before
     * @throws IOException if connecting, reading the response, or writing the file fails
     */
    private static List<String> fetchPage(String pageAddress) throws IOException {
        List<String> discovered = new ArrayList<String>();
        // Closing the socket also closes the streams obtained from it.
        try (Socket socket = new Socket(host, HTTP_PORT)) {
            OutputStreamWriter request =
                    new OutputStreamWriter(socket.getOutputStream(), BYTE_PRESERVING);
            // HTTP requires CRLF line endings and no whitespace before the header colon.
            request.write("GET " + pageAddress + " HTTP/1.0\r\n");
            request.write("Host: " + host + "\r\n");
            request.write("Content-Type: text/html\r\n");
            request.write("\r\n");
            request.flush();

            BufferedReader response = new BufferedReader(
                    new InputStreamReader(socket.getInputStream(), BYTE_PRESERVING));

            File target = new File(OUTPUT_ROOT + localPathFor(pageAddress));
            File parent = target.getParentFile();
            if (parent != null) {
                parent.mkdirs();
            }
            if (target.createNewFile()) {
                ok++;
            } else {
                fal++;
            }

            // Overwrite rather than append: appending stored a second copy of the page
            // whenever the file already existed (e.g. on a re-run).
            try (OutputStreamWriter saved = new OutputStreamWriter(
                    new FileOutputStream(target, false), BYTE_PRESERVING)) {
                boolean inBody = false;
                for (String line = response.readLine(); line != null; line = response.readLine()) {
                    if (!inBody) {
                        // The header block ends at the first empty line, however long it is.
                        inBody = line.isEmpty();
                        continue;
                    }
                    saved.write(line);
                    saved.write("\n");
                    Matcher quoted = p.matcher(line);
                    while (quoted.find()) {
                        String token = quoted.group();
                        // Strip the surrounding double quotes matched by the pattern.
                        String resolved =
                                resolveLink(pageAddress, token.substring(1, token.length() - 1));
                        if (resolved != null) {
                            insertURL(discovered, resolved);
                        }
                    }
                }
            }
        }
        return discovered;
    }

    /**
     * Resolves a link found on a page to a normalized server path on the same host.
     *
     * <p>Links starting with '/' are taken as absolute paths; all others are resolved
     * against the directory of {@code baseAddress}. {@code "."} segments and empty segments
     * are dropped, and each {@code ".."} removes the preceding segment, never climbing
     * above the root. Package-private (rather than private) so it can be exercised directly.
     *
     * @param baseAddress server path of the page the link was found on
     * @param link        link text without the surrounding quotes
     * @return the normalized path starting with '/', or {@code null} if the link must be
     *         skipped: it is empty, has a query ('?'), contains ':' (scheme such as
     *         http:/https:), contains a backslash, starts with "//", or resolves to nothing
     */
    static String resolveLink(String baseAddress, String link) {
        if (link.isEmpty() || link.contains("?") || link.contains(":")
                || link.contains("\\") || link.startsWith("//")) {
            return null;
        }
        String path;
        if (link.startsWith("/")) {
            path = link;
        } else {
            int cut = baseAddress.lastIndexOf('/');
            String directory = cut < 0 ? "" : baseAddress.substring(0, cut);
            path = directory + "/" + link;
        }
        List<String> kept = new ArrayList<String>();
        for (String segment : path.split("/")) {
            if (segment.isEmpty() || segment.equals(".")) {
                continue;
            }
            if (segment.equals("..")) {
                if (!kept.isEmpty()) {
                    kept.remove(kept.size() - 1);
                }
                continue;
            }
            kept.add(segment);
        }
        if (kept.isEmpty()) {
            return null;
        }
        StringBuilder normalized = new StringBuilder();
        for (String segment : kept) {
            normalized.append('/').append(segment);
        }
        return normalized.toString();
    }

    /**
     * Maps a server path to the path used below the output root; a path ending in '/'
     * (a directory request such as the site root) is stored as {@code index.html} inside it.
     */
    private static String localPathFor(String pageAddress) {
        return pageAddress.endsWith("/") ? pageAddress + "index.html" : pageAddress;
    }

    /**
     * Registers {@code st} if it has not been seen before.
     *
     * @param myList per-page list that receives the link when it is new
     * @param st     normalized server path of the link
     */
    private static void insertURL(List<String> myList, String st) {
        if (!allURL.contains(st)) {
            myList.add(st);
            allURL.add(st);
        }
    }

    /**
     * Splits the start URL into {@link #host} and {@link #address}.
     *
     * <p>Everything after the first {@code "//"} (or the whole argument when there is none)
     * up to the next '/' is the host; the remainder, including that '/', is the address.
     * A URL without a path yields the address {@code "/"}.
     *
     * @param args command-line arguments; {@code args[0]} is the start URL
     * @throws IllegalArgumentException if no URL was given or it contains no host
     */
    private static void analyseURL(String[] args) {
        if (args == null || args.length == 0 || args[0].isEmpty()) {
            throw new IllegalArgumentException("usage: java Spider <url>");
        }
        String url = args[0];
        int schemeEnd = url.indexOf("//");
        String remainder = schemeEnd >= 0 ? url.substring(schemeEnd + 2) : url;
        int slash = remainder.indexOf('/');
        if (slash < 0) {
            host = remainder;
            address = "/";
        } else {
            host = remainder.substring(0, slash);
            address = remainder.substring(slash);
        }
        if (host.isEmpty()) {
            throw new IllegalArgumentException("no host in URL: " + url);
        }
    }
}
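// Web-viewer keyboard shortcuts (page residue, not part of the program):
//   Copy code:          Ctrl + C
//   Search code:        Ctrl + F
//   Full-screen mode:   F11
//   Toggle theme:       Ctrl + Shift + D
//   Show shortcuts:     ?
//   Increase font size: Ctrl + =
//   Decrease font size: Ctrl + -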