// parselib1.java
// (Code-viewer page residue: "Font size" control; not part of the program.)
package com.blogool.crawl;
import java.io.*;
import java.sql.Savepoint;
import java.util.*;
import java.util.regex.*;
import java.net.*;
import com.blogool.crawl.lib.Cat;
/**
 * Extracts a two-level category tree from a saved HTML page.
 *
 * <p>{@link #main(String[])} reads a fixed input page, finds the first-level
 * categories with {@link #LEVEL_ONE_CAT}, keeps only those listed in
 * {@link #VALID_CATS}, finds their second-level categories with
 * {@link #LEVEL_TWO_CAT}, writes one HTML link per second-level category to a
 * fixed output file, and finally hands the resulting {@code Cat} tree to
 * {@code Util.saveCat}.
 */
public class ParseLIB1 {

    /** First-level category names that are kept; every other name is skipped. */
    private static final Set<String> VALID_CATS = Collections.unmodifiableSet(
            new HashSet<String>(Arrays.asList(
                    "Video Game",
                    "Refurbished",
                    "MP4 Player",
                    "MP3 player",
                    "Auto",
                    "Electronics",
                    "Camera and Camcorder",
                    "Cell Phones",
                    "Computers",
                    "Watches")));

    /**
     * Matches one first-level category: group 1 is the category name, group 2
     * is the markup that follows it up to the next {@code list_style1} item.
     */
    private static final Pattern LEVEL_ONE_CAT = Pattern.compile("style=\"clear:both;\"><a id=\"category_nav\" href=\".+?\">(.+?)</a><span style=\"float:right;margin:2px 5px 0 0;cursor:hand;\"(.+?)<li class=\"list_style1\"");

    /**
     * Matches one second-level category: group 1 is the link URL, group 2 the
     * category name, group 3 the number in parentheses after the name.
     */
    private static final Pattern LEVEL_TWO_CAT = Pattern.compile("<li\\sstyle=\"clear:both;display:.*?;\"\\sid=\".+?\"\\svisible=\".+?\"><a\\sclass=\"\"\\shref=\"(.+?)\">(.+?)\\((\\d+)\\)</a></li>");

    /**
     * Parses the saved page, writes the second-level category links and saves
     * the category tree.
     *
     * @param args ignored
     * @throws Exception if the link file cannot be written, a category count
     *         does not fit in an {@code int}, or {@code Util.saveCat} fails
     */
    public static void main(String[] args) throws Exception {
        String content = getContent(new File("d:/abcd.htm"));
        File outputSubCats = new File("D:/libox1/pages.html");

        Cat root = new Cat();
        root.setParent(null);
        List<Cat> rootSubCat = new ArrayList<Cat>();
        root.setCats(rootSubCat);

        // NOTE(review): FileWriter uses the platform default charset here.
        BufferedWriter bw = new BufferedWriter(new FileWriter(outputSubCats));
        try {
            Matcher mLoc = LEVEL_ONE_CAT.matcher(content);
            while (mLoc.find()) {
                String catname = mLoc.group(1);
                // Check the whitelist before building anything for this category.
                if (!VALID_CATS.contains(catname)) {
                    continue;
                }

                Cat c = new Cat();
                c.setParent(root);
                c.setCatName(catname);
                rootSubCat.add(c);
                List<Cat> subSubCat = new ArrayList<Cat>();
                c.setCats(subSubCat);
                System.out.println("cat:" + c.getCatName());

                // NOTE(review): this strips every space from the fragment, yet
                // LEVEL_TWO_CAT requires whitespace (\s) between attributes.
                // Possibly the literal was originally "&nbsp;" and got mangled;
                // confirm against the real input before changing it.
                String contentSub = mLoc.group(2).replaceAll(" ", "");
                collectSubCategories(c, subSubCat, contentSub, bw);
            }
        } finally {
            // Close the writer even when parsing or writing fails part-way.
            bw.close();
        }
        Util.saveCat(root, new File("d:/libox1/cats.xml"));
    }

    /**
     * Finds every second-level category in {@code html}, appends it to
     * {@code children} with {@code parent} as its parent, and writes one
     * {@code <a>} line per category to {@code out}.
     */
    private static void collectSubCategories(Cat parent, List<Cat> children,
            String html, BufferedWriter out) throws IOException {
        Matcher mLtc = LEVEL_TWO_CAT.matcher(html);
        while (mLtc.find()) {
            // Parse the count first so a malformed entry is never half-added.
            int size = Integer.parseInt(mLtc.group(3));
            String url = mLtc.group(1);
            String name = mLtc.group(2);

            Cat ch = new Cat();
            children.add(ch);
            ch.setCatName(name);
            ch.setUrl(url);
            ch.setParent(parent);
            ch.setSize(size);

            // The URL-encoded category name is used as the link text.
            String fileName = URLEncoder.encode(name, "utf-8");
            StringBuilder sb = new StringBuilder();
            sb.append("<a href='").append(url).append("'>").append(fileName).append("</a>");
            out.write(sb.toString());
            out.newLine();
            System.out.println("\t\t\t\t" + url);
        }
    }

    /**
     * Reads a text file and returns its lines concatenated without line
     * terminators.
     *
     * <p>Reading is best-effort: on an I/O error the stack trace is printed
     * and whatever was read so far (possibly the empty string) is returned.
     * The file is decoded with the platform default charset.
     *
     * @param f the file to read
     * @return the file content with all line terminators removed
     */
    public static String getContent(File f) {
        System.out.println("获取文件" + f.getPath() + "内容.");
        StringBuilder sb = new StringBuilder();
        BufferedReader br = null;
        try {
            // NOTE(review): FileReader uses the platform default charset here.
            br = new BufferedReader(new FileReader(f));
            String line;
            // readLine() strips terminators, so the result is a single line.
            while ((line = br.readLine()) != null) {
                sb.append(line);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (br != null) {
                try {
                    br.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return sb.toString();
    }
}
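/*
 * Code-viewer page residue (keyboard shortcut help); not part of the program:
 *   Copy code:           Ctrl + C
 *   Search code:         Ctrl + F
 *   Full-screen mode:    F11
 *   Toggle theme:        Ctrl + Shift + D
 *   Show shortcuts:      ?
 *   Increase font size:  Ctrl + =
 *   Decrease font size:  Ctrl + -
 */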