📄 parseitemhtml.java
字号:
package com.blogool.crawl;
import java.io.*;
import java.util.*;
import java.util.regex.*;
import com.blogool.crawl.lib.*;
/**
 * Extracts product data from stored item HTML pages using regular expressions
 * and writes the results back into the category tree.
 *
 * <p>{@link #main} loads the category tree from {@code d:/libox1/cats2.xml},
 * calls {@link #parseItemHtml(Item)} for every item of every second-level
 * category, and saves the tree to {@code d:/libox1/cats3.xml}.
 *
 * <p>All state is static; {@link #ERROR_COUNT} is updated without any
 * synchronization.
 */
public class ParseItemHtml {

    /** Category tree read by {@link #main}. */
    private static final String INPUT_CATS_FILE = "d:/libox1/cats2.xml";

    /** Category tree written by {@link #main} after parsing. */
    private static final String OUTPUT_CATS_FILE = "d:/libox1/cats3.xml";

    /** Directory holding the item HTML files. */
    private static final String ITEMS_DIR = "d:/libox1/items";

    /** Width that item codes derived from a file name are zero-padded to. */
    private static final int ITEM_CODE_LENGTH = 8;

    /** Number of items for which {@link #parseItemHtml(Item)} failed. */
    public static int ERROR_COUNT = 0;

    /** Flags shared by most page patterns. */
    public static final int FLAG = Pattern.MULTILINE | Pattern.CASE_INSENSITIVE | Pattern.UNIX_LINES;

    /** Group 1: content of the page's {@code <title>} element. */
    public static final Pattern P_ITEM_NAME = Pattern.compile("<title>(.+?)</title>", FLAG);

    /** Group 1: text following "Item #" in the item-code div. */
    public static final Pattern P_ITEM_CODE = Pattern.compile("<div class=\"h_6 text_666\">Item \\#(.+?)</div>", FLAG);

    /** Group 1: digits preceding ".html" in the item file name (fallback item code). */
    public static final Pattern P_ITEM_CODE_FROM_FILE = Pattern.compile("p(\\d+)\\.html");

    /** Group 1: list price text. */
    public static final Pattern P_LIST_PRICE = Pattern.compile("<div class=\"h_6 text_black\">List price:.+?<span class=\"style4\">(.+?)</span>", FLAG);

    /** Group 1: unit price text (the part before " per unit"). */
    public static final Pattern P_UNIT_PRICE = Pattern.compile("<div class=\"text12 text_red text_bold h_6\">Unit Price:<span class=\"style2\"> (.+?) per unit</span></div>", FLAG);

    /** Group 2: text between "Starting from Min:" and the next {@code <input}. */
    public static final Pattern P_START_FROM_MIN = Pattern.compile("Starting from Min:( )?(.+?)<input", FLAG);

    /** Group 1: markup of the image strip, scanned afterwards with {@link #P_IMAGES_PART}. */
    public static final Pattern P_IMAGES = Pattern.compile("<div class=\"part3_left\"><a href=\"javascript:changeMainImageSrc(.+?)</div>");

    /** Group 1: {@code src} attribute of one {@code <img>} tag. */
    public static final Pattern P_IMAGES_PART = Pattern.compile("<img src=\"(.+?)\".+?/>");

    /**
     * Structured detail block. Groups 1-4 are the optional "Notice",
     * "Package include", "Description" and "Specification(s)" sections, each
     * including its heading; a group is {@code null} when its section is absent.
     */
    public static final Pattern P_DETAIL = Pattern.compile("<div id=\"info_b_1\">(<h\\d>\\s*Notice\\s*:?.*?</h\\d>.+?)?(<h\\d>\\s*Package include\\s*:?.*?</h\\d>.+?)?(<h\\d>\\s*Description\\s*:?.*?</h\\d>.+?)?(<h\\d>\\s*Specifications?\\s*:?.*?</h\\d>.+?)?</div>\\s*<div id=\"info_b_2\"", FLAG | Pattern.DOTALL);

    /** Fallback detail block. Group 1: raw content of the {@code info_b_1} div. */
    public static final Pattern P_DETAIL_1 = Pattern.compile("<div id=\"info_b_1\">(.+?)</div>.+?<div id=\"info_b_2\"", FLAG | Pattern.DOTALL);

    /** Group 1: content of the {@code <ul>} inside the {@code part8} div. */
    public static final Pattern P_SIMILAR = Pattern.compile("<div class=\"part8\">\\s*<ul>(.+?)</ul>\\s*<div class=\"clear\"></div>\\s*</div>", FLAG | Pattern.DOTALL);

    /**
     * Group 1: numeric id in a similar product's link. The dot before
     * {@code html} is escaped so it only matches a literal '.', consistent with
     * {@link #P_ITEM_CODE_FROM_FILE}.
     */
    public static final Pattern P_SIMILAR_PART = Pattern.compile("<a href=\".+?_p(\\d+)\\.html\"><img");

    /**
     * Parses every item of every second-level category and saves the tree.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        Cat root = Util.loadCat(new File(INPUT_CATS_FILE));
        for (int i = 0; i < root.getCats().size(); i++) {
            Cat topCat = root.getCats().get(i);
            System.out.println("Parsing " + topCat.getCatName());
            // Guard against a category without children, mirroring the null
            // check on items below (assumes getCats() may return null - TODO confirm).
            if (topCat.getCats() == null) {
                continue;
            }
            for (int j = 0; j < topCat.getCats().size(); j++) {
                List<Item> items = topCat.getCats().get(j).getItems();
                if (items == null) {
                    continue;
                }
                for (Item item : items) {
                    parseItemHtml(item);
                }
            }
        }
        Util.saveCat(root, new File(OUTPUT_CATS_FILE));
    }

    /**
     * Left-pads an item code with zeros to 8 characters.
     *
     * <p>The number of zeros is computed once up front. The previous loop
     * re-evaluated its bound against the growing string and therefore added
     * only about half of the required zeros (e.g. "4893" became "004893").
     *
     * @param itemCode the code to pad; may be {@code null}
     * @return {@code null} for {@code null} input, the code unchanged when it
     *         already has 8 or more characters, otherwise the zero-padded code
     */
    public static String fixItemCode(String itemCode) {
        if (itemCode == null) {
            return null;
        }
        int missing = ITEM_CODE_LENGTH - itemCode.length();
        if (missing <= 0) {
            return itemCode;
        }
        StringBuilder padded = new StringBuilder(ITEM_CODE_LENGTH);
        for (int i = 0; i < missing; i++) {
            padded.append('0');
        }
        return padded.append(itemCode).toString();
    }

    /**
     * Reads the HTML file belonging to {@code item} and fills in the item's
     * product name, item code, list/unit price, minimum quantity, image URLs,
     * detail sections and similar-product codes.
     *
     * <p>Any exception (unreadable content, missing detail block, ...) is
     * caught here: the failure is printed, the item is marked invalid and
     * {@link #ERROR_COUNT} is incremented. Fields set before the failure keep
     * their values.
     *
     * <p>To debug a single page, pass an {@code Item} with only its URL set,
     * e.g. {@code "_p4893.html"}.
     *
     * @param item the item to populate; its URL selects the file to read
     */
    public static void parseItemHtml(Item item) {
        try {
            String url = item.getUrl();
            String fileName = Util.getItemFileName(url);
            String content = Util.getContentOrignal(new File(ITEMS_DIR, fileName));

            String productName = firstGroup(P_ITEM_NAME, content);
            if (productName != null) {
                item.setProductName(productName);
            }

            String itemCode = firstGroup(P_ITEM_CODE, content);
            if (itemCode != null) {
                item.setItemCode(itemCode);
            } else {
                // The page shows no item code: derive it from the file name.
                String codeFromFile = firstGroup(P_ITEM_CODE_FROM_FILE, fileName);
                if (codeFromFile != null) {
                    item.setItemCode(fixItemCode(codeFromFile));
                }
            }

            // Prices are stored without thousands separators.
            String listPrice = firstGroup(P_LIST_PRICE, content);
            if (listPrice != null) {
                item.setListPrice(listPrice.replace(",", ""));
            }
            String unitPrice = firstGroup(P_UNIT_PRICE, content);
            if (unitPrice != null) {
                item.setUnitPrice(unitPrice.replace(",", ""));
            }

            // Minimum purchase quantity; "0" when the page does not state one.
            Matcher min = P_START_FROM_MIN.matcher(content);
            item.setLimitNumber(min.find() ? min.group(2).trim() : "0");

            parseImages(item, content);
            parseDetail(item, content);
            parseSimilarCodes(item, content);
        } catch (Exception e) {
            // Report the URL and the cause as well: the item code may not have
            // been set yet when the failure happened.
            System.out.println("ERROR parse:" + item.getItemCode() + " (" + item.getUrl() + "): " + e);
            item.setValid(false);
            ERROR_COUNT++;
        }
    }

    /** Returns group 1 of the first match of {@code pattern} in {@code text}, or {@code null} if there is no match. */
    private static String firstGroup(Pattern pattern, String text) {
        Matcher m = pattern.matcher(text);
        return m.find() ? m.group(1) : null;
    }

    /** Collects the image URLs from the image strip; leaves the item untouched when the strip is not found. */
    private static void parseImages(Item item, String content) {
        String strip = firstGroup(P_IMAGES, content);
        if (strip == null) {
            return;
        }
        List<String> urls = new ArrayList<String>();
        Matcher m = P_IMAGES_PART.matcher(strip);
        while (m.find()) {
            urls.add(m.group(1));
        }
        item.setImageUrls(urls.toArray(new String[urls.size()]));
    }

    /**
     * Stores the detail sections. When the structured layout does not match,
     * the raw detail block is stored as the specification and the content type
     * is set to 1.
     *
     * @throws IllegalStateException if neither detail pattern matches
     */
    private static void parseDetail(Item item, String content) {
        Matcher m = P_DETAIL.matcher(content);
        if (m.find()) {
            item.setNotice(m.group(1));
            item.setPackageinfo(m.group(2));
            item.setDescription(m.group(3));
            item.setSpecfication(m.group(4));
            return;
        }
        m = P_DETAIL_1.matcher(content);
        if (!m.find()) {
            throw new IllegalStateException("Detail is empty.");
        }
        item.setSpecfication(m.group(1));
        item.setContentType(1);
    }

    /** Stores the ids of the similar (related) products; leaves the item untouched when none are found. */
    private static void parseSimilarCodes(Item item, String content) {
        String listHtml = firstGroup(P_SIMILAR, content);
        if (listHtml == null) {
            return;
        }
        List<String> codes = new ArrayList<String>();
        Matcher m = P_SIMILAR_PART.matcher(listHtml);
        while (m.find()) {
            codes.add(m.group(1));
        }
        if (!codes.isEmpty()) {
            item.setSimilarCodes(codes.toArray(new String[codes.size()]));
        }
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -