⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 parseitemhtml.java

📁 Light in the box 抓取程序。 使用HttpClient
💻 JAVA
字号:
package com.blogool.crawl;

import java.io.*;
import java.util.*;
import java.util.regex.*;

import com.blogool.crawl.lib.*;

/**
 * Parses previously downloaded product pages and fills the corresponding
 * {@link Item} objects with the extracted fields (name, item code, prices,
 * minimum quantity, image URLs, detail sections and similar product codes).
 *
 * <p>The category tree is loaded from {@code d:/libox1/cats2.xml}, every item
 * in every second-level category is parsed, and the tree is written back to
 * {@code d:/libox1/cats3.xml}.
 */
public class ParseItemHtml {
	
	/**
	 * Walks the two-level category tree and parses each item's saved HTML.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		Cat root = Util.loadCat(new File("d:/libox1/cats2.xml"));
		
		for (int i = 0; i < root.getCats().size(); i ++) {
			Cat c = root.getCats().get(i);
			System.out.println("Parsing " + c.getCatName());
			for (int j = 0; j < c.getCats().size(); j ++) {
				Cat cat = c.getCats().get(j);
				
				List<Item> items = cat.getItems();
				// A category without an item list is skipped.
				if (items == null) continue;
				for (Item item : items) {
					parseItemHtml(item);
				}
			}
		}
		
		Util.saveCat(root, new File("d:/libox1/cats3.xml"));
		
	}
	
	/** Number of items for which {@link #parseItemHtml(Item)} failed. */
	public static int ERROR_COUNT = 0;
	
	/*
	 * Debug entry point for parsing a single item:
	 *
	public static void main(String[] args) {

		Item item = new Item();
		item.setUrl("_p4893.html");
		parseItemHtml(item);

		
		
	}*/
	
	/** Regex flags shared by most page patterns. */
	public static int FLAG = Pattern.MULTILINE | Pattern.CASE_INSENSITIVE | Pattern.UNIX_LINES;
	
	/** Group 1: page title, used as the product name. */
	public static Pattern P_ITEM_NAME = Pattern.compile("<title>(.+?)</title>", FLAG);
	/** Group 1: item code as printed on the page. */
	public static Pattern P_ITEM_CODE = Pattern.compile("<div class=\"h_6 text_666\">Item \\#(.+?)</div>", FLAG);
	/** Group 1: numeric id taken from a file name of the form {@code ...p<digits>.html}. */
	public static Pattern P_ITEM_CODE_FROM_FILE = Pattern.compile("p(\\d+)\\.html");
	
	/** Group 1: list price text. */
	public static Pattern P_LIST_PRICE = Pattern.compile("<div class=\"h_6 text_black\">List price:.+?<span class=\"style4\">(.+?)</span>", FLAG);
	/** Group 1: unit price text. */
	public static Pattern P_UNIT_PRICE = Pattern.compile("<div class=\"text12 text_red text_bold h_6\">Unit Price:<span class=\"style2\">&nbsp;(.+?) per unit</span></div>", FLAG);
	/** Group 2: minimum order quantity text (group 1 is an optional {@code &nbsp;}). */
	public static Pattern P_START_FROM_MIN = Pattern.compile("Starting from Min:(&nbsp;)?(.+?)<input", FLAG);
	
	/** Group 1: markup of the thumbnail strip that holds the product images. */
	public static Pattern P_IMAGES = Pattern.compile("<div class=\"part3_left\"><a href=\"javascript:changeMainImageSrc(.+?)</div>");
	/** Group 1: {@code src} of one image inside the thumbnail strip. */
	public static Pattern P_IMAGES_PART = Pattern.compile("<img src=\"(.+?)\".+?/>");
	
	
	/**
	 * Structured detail block. Groups 1-4 are the optional sections, in order:
	 * Notice, Package include, Description, Specification(s).
	 */
	public static Pattern P_DETAIL = Pattern.compile("<div id=\"info_b_1\">(<h\\d>\\s*Notice\\s*:?.*?</h\\d>.+?)?(<h\\d>\\s*Package include\\s*:?.*?</h\\d>.+?)?(<h\\d>\\s*Description\\s*:?.*?</h\\d>.+?)?(<h\\d>\\s*Specifications?\\s*:?.*?</h\\d>.+?)?</div>\\s*<div id=\"info_b_2\"", FLAG | Pattern.DOTALL);
	/** Fallback for unstructured detail blocks. Group 1: the raw detail markup. */
	public static Pattern P_DETAIL_1 = Pattern.compile("<div id=\"info_b_1\">(.+?)</div>.+?<div id=\"info_b_2\"", FLAG | Pattern.DOTALL);
	
	/** Group 1: markup of the list of similar products. */
	public static Pattern P_SIMILAR = Pattern.compile("<div class=\"part8\">\\s*<ul>(.+?)</ul>\\s*<div class=\"clear\"></div>\\s*</div>", FLAG | Pattern.DOTALL);
	/** Group 1: numeric id of one similar product, taken from its link. */
	public static Pattern P_SIMILAR_PART = Pattern.compile("<a href=\".+?_p(\\d+)\\.html\"><img");
	
	/**
	 * Left-pads an item code with zeros to a width of 8 characters.
	 *
	 * <p>The number of zeros is computed once, before padding starts. The
	 * earlier loop re-evaluated the remaining width against the growing
	 * string and therefore stopped about half way (e.g. "4893" became
	 * "004893" instead of "00004893").
	 *
	 * @param itemCode the raw code, may be {@code null}
	 * @return {@code null} for {@code null} input; the code unchanged if it
	 *         already has 8 or more characters; otherwise the zero-padded code
	 */
	public static String fixItemCode(String itemCode) {
		int itemCodeLength = 8;
		if (itemCode == null) return null;
		int padding = itemCodeLength - itemCode.length();
		if (padding <= 0) return itemCode;
		StringBuilder padded = new StringBuilder(itemCodeLength);
		for (int i = 0; i < padding; i ++) {
			padded.append('0');
		}
		return padded.append(itemCode).toString();
	}
	
	/**
	 * Extracts product data from the saved HTML page of {@code item} and
	 * stores it on the item.
	 *
	 * <p>The page is looked up under {@code d:/libox1/items} using the file
	 * name that {@code Util.getItemFileName} derives from the item URL. Any
	 * exception (including a page with no recognisable detail block) marks the
	 * item as invalid, is reported on standard output and increments
	 * {@link #ERROR_COUNT}; it is never propagated to the caller.
	 *
	 * @param item the item to populate; its URL must already be set
	 */
	public static void parseItemHtml(Item item) {
		try {
			String url = item.getUrl();
			String content = Util.getContentOrignal(new File("d:/libox1/items", Util.getItemFileName(url)));
			Matcher m = null;
			m = P_ITEM_NAME.matcher(content);
			if (m.find()) {
				item.setProductName(m.group(1));
			} 
			
			m = P_ITEM_CODE.matcher(content);
			if (m.find()) {
				item.setItemCode(m.group(1));
			} else {
				// No code on the page: derive it from the numeric id in the file name.
				String str = Util.getItemFileName(url);
				m = P_ITEM_CODE_FROM_FILE.matcher(str);
				if (m.find()) {
					item.setItemCode(fixItemCode(m.group(1)));
				}
			}
			m = P_LIST_PRICE.matcher(content);
			if (m.find()) {
				// Strip thousands separators.
				item.setListPrice(m.group(1).replaceAll(",", ""));
			}
			m = P_UNIT_PRICE.matcher(content);
			if (m.find()) {
				item.setUnitPrice(m.group(1).replaceAll(",", ""));
			}
			// Minimum purchase quantity; "0" when the page does not state one.
			m = P_START_FROM_MIN.matcher(content);
			if (m.find()) {
				item.setLimitNumber(m.group(2).trim());
			} else {
				item.setLimitNumber("0");
			}
			// Images
			m = P_IMAGES.matcher(content);
			if (m.find()) {
				// Collect every <img src> inside the thumbnail strip.
				String images = m.group(1);
				m = P_IMAGES_PART.matcher(images);
				List<String> list = new ArrayList<String>();
				while (m.find()) {
					list.add(m.group(1));
				}
				item.setImageUrls(list.toArray(new String[list.size()]));
			}
			
			m = P_DETAIL.matcher(content);
			if (m.find()) {
				// Sections that are absent from the page yield null groups.
				item.setNotice(m.group(1));
				item.setPackageinfo(m.group(2));
				item.setDescription(m.group(3));
				item.setSpecfication(m.group(4));
			} else {
				// Unstructured detail: store the whole block as the specification
				// and flag the item with content type 1.
				m = P_DETAIL_1.matcher(content);
				if (m.find()) {
					item.setSpecfication(m.group(1));
					item.setContentType(1);
				} else {
					throw new Exception("Detail is empty.");
				}
			}
			
			// Related products
			m = P_SIMILAR.matcher(content);
			if (m.find()) {
				String str = m.group(1);
				m = P_SIMILAR_PART.matcher(str);
				List<String> similarCodes = new ArrayList<String>();
				while (m.find()) {
					similarCodes.add(m.group(1));
				}
				// Leave the item untouched when no similar product link was found.
				if (!similarCodes.isEmpty()) {
					item.setSimilarCodes(similarCodes.toArray(new String[similarCodes.size()]));
				}
			}
			
		} catch (Exception e) {
			// The item code may still be unset at this point, so also report the
			// URL and the cause to make the failure traceable.
			System.out.println("ERROR parse:" + item.getItemCode() + " url=" + item.getUrl() + " cause=" + e);
			item.setValid(false);
			ERROR_COUNT ++;
		}
	}
	
	
	
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -