📄 parseitem.java

📁 Light in the box 抓取程序。使用HttpClient

💻 JAVA

字号:

package com.blogool.crawl;

import java.util.*;
import java.util.regex.*;
import java.io.*;

import org.flytinge.HttpGet;

import com.blogool.crawl.GetRealCount.ExitThread;
import com.blogool.crawl.lib.*;

/**
 * Command-line entry point that walks a two-level category tree and starts
 * one {@link HttpGet} per result page of every second-level category.
 *
 * <p>The tree is read from {@link #CATS_INPUT_PATH}. An {@link ExitThread}
 * built from the tree and {@link #CATS_OUTPUT_PATH} is registered as a JVM
 * shutdown hook (presumably it writes the tree back to that file on exit;
 * confirm against {@code GetRealCount.ExitThread}).
 */
public class ParseItem {

	/** File the category tree is loaded from. */
	private static final String CATS_INPUT_PATH = "d:/libox1/cats1.xml";

	/** File handed to the shutdown hook together with the loaded tree. */
	private static final String CATS_OUTPUT_PATH = "d:/libox1/cats2.xml";

	/** Marker string passed to every {@link HttpGet} (presumably the end-of-document sentinel; confirm). */
	private static final String END_OF_PAGE_MARKER = "</html>";

	/**
	 * Loads the category tree, registers the shutdown hook and starts a page
	 * fetch for every result page of every second-level category.
	 *
	 * @param args ignored
	 * @throws Exception propagated unchanged from loading the tree or from
	 *         constructing/starting the fetchers
	 */
	public static void main(String[] args) throws Exception {
		File catsFile = new File(CATS_INPUT_PATH);
		File saveFile = new File(CATS_OUTPUT_PATH);
		Cat root = Util.loadCat(catsFile);

		// Register the shutdown hook before any fetch is started.
		ExitThread et = new ExitThread(root, saveFile);
		Runtime.getRuntime().addShutdownHook(et);

		for (Cat topLevel : childrenOf(root)) {
			for (Cat cat : childrenOf(topLevel)) {
				int pages = pageCount(cat.getSize(), Util.PAGE_COUNT);

				// Make sure the category has an item list before any handler is created for it.
				if (cat.getItems() == null) {
					cat.setItems(new ArrayList<Item>());
				}

				for (int pageIndex = 0; pageIndex < pages; pageIndex++) {
					String url = pageUrl(cat.getUrl(), pageIndex);
					ParseItemContentHandle pich = new ParseItemContentHandle(cat);

					HttpGet hg = new HttpGet(url, END_OF_PAGE_MARKER, pich);
					hg.start();
				}
			}
		}
	}

	/**
	 * Returns the sub-categories of {@code parent}, or an empty list when it
	 * reports none.
	 *
	 * <p>NOTE(review): assumes {@code getCats()} may return {@code null} for a
	 * category without children, mirroring the existing null check on
	 * {@code getItems()}; such a category is skipped instead of aborting the
	 * whole run with a {@code NullPointerException}.
	 */
	private static List<Cat> childrenOf(Cat parent) {
		List<Cat> children = parent.getCats();
		return children != null ? children : Collections.<Cat>emptyList();
	}

	/** Number of pages needed for {@code size} entries at {@code pageSize} per page (division rounded up). */
	private static int pageCount(int size, int pageSize) {
		int pages = size / pageSize;
		if (size % pageSize != 0) {
			pages++;
		}
		return pages;
	}

	/**
	 * URL of the page at zero-based {@code pageIndex}: the base URL itself for
	 * the first page, otherwise the base URL followed by {@code "_p"} and the
	 * one-based page number.
	 */
	private static String pageUrl(String baseUrl, int pageIndex) {
		if (pageIndex > 0) {
			return baseUrl + "_p" + (pageIndex + 1);
		}
		return baseUrl;
	}

}

💿 文件大小 1714 K

👤 上传用户 hushanlyn

📂 所属分类 Internet/网络编程

🏷️ 相关标签

#HttpClient #Light #box #the

⌨️ 快捷键说明

复制代码 Ctrl + C

搜索代码 Ctrl + F

全屏模式 F11

切换主题 Ctrl + Shift + D

显示快捷键 ?

增大字号 Ctrl + =

减小字号 Ctrl + -