📄 parseitem.java
字号:
package com.blogool.crawl;
import java.util.*;
import java.util.regex.*;
import java.io.*;
import org.flytinge.HttpGet;
import com.blogool.crawl.GetRealCount.ExitThread;
import com.blogool.crawl.lib.*;
/**
 * Command-line entry point that walks the two-level category tree loaded from
 * {@code cats1.xml} and starts one {@link HttpGet} for every page of every
 * second-level category.
 */
public class ParseItem {

    /**
     * Loads the category tree, registers a shutdown hook, and starts a fetch for
     * each page of each second-level category.
     *
     * @param args command-line arguments; not used
     * @throws Exception any exception propagated by the loading or fetching calls
     */
    public static void main(String[] args) throws Exception {
        File catsFile = new File("d:/libox1/cats1.xml");
        File saveFile = new File("d:/libox1/cats2.xml");
        Cat root = Util.loadCat(catsFile);

        // Register the shutdown hook. It receives the tree and the output file;
        // NOTE(review): presumably it writes the tree to saveFile on exit - confirm
        // against GetRealCount.ExitThread.
        Runtime.getRuntime().addShutdownHook(new ExitThread(root, saveFile));

        for (Cat topLevel : root.getCats()) {
            for (Cat cat : topLevel.getCats()) {
                crawlCategory(cat);
            }
        }
    }

    /**
     * Ensures the category has an item list, then starts one fetch per page.
     *
     * @param cat second-level category whose pages are fetched
     * @throws Exception any exception propagated by the fetch calls
     */
    private static void crawlCategory(Cat cat) throws Exception {
        int pages = pageCount(cat.getSize());
        if (cat.getItems() == null) {
            cat.setItems(new ArrayList<Item>());
        }
        for (int pageIndex = 0; pageIndex < pages; pageIndex++) {
            // A fresh handler is created for every page, as in the original loop.
            ParseItemContentHandle handler = new ParseItemContentHandle(cat);
            HttpGet request = new HttpGet(pageUrl(cat.getUrl(), pageIndex), "</html>", handler);
            request.start();
        }
    }

    /**
     * Returns how many pages are needed for {@code size} entries at
     * {@link Util#PAGE_COUNT} entries per page: the quotient, plus one when
     * there is a remainder.
     */
    private static int pageCount(int size) {
        int pages = size / Util.PAGE_COUNT;
        if (size % Util.PAGE_COUNT != 0) {
            pages++;
        }
        return pages;
    }

    /**
     * Builds the URL for a zero-based page index: the first page uses the base
     * URL unchanged, later pages append {@code "_p"} plus the one-based page number.
     */
    private static String pageUrl(String baseUrl, int pageIndex) {
        if (pageIndex > 0) {
            return baseUrl + "_p" + (pageIndex + 1);
        }
        return baseUrl;
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -