📄 extract163moblie.java
字号:
package com.luceneheritrixbook.extractor.com163;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.util.Date;
import org.htmlparser.NodeFilter;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.util.NodeList;
import com.luceneheritrixbook.extractor.Extractor;
import com.luceneheritrixbook.util.StringUtils;
/**
 * Extractor for a single product page: pulls the title, the attribute table
 * and the product images out of the parsed HTML and writes them to one text
 * file per title cell.
 *
 * <p>The output file is created under {@code getOutputPath()} and named after
 * the space-separated title tokens joined with {@code "-"} plus the current
 * time in milliseconds. Its content is, in order: the page URL, the first two
 * title tokens, one {@code label:value} line per attribute row, and for every
 * image a separator line followed by the name of the copied image file.
 *
 * <p>{@code NEWLINE}, {@code SEPARATOR}, {@code HASH_ALGORITHM},
 * {@code getProp}, {@code copyImage} and {@code traverse} are defined outside
 * this class (presumably in {@link Extractor} - confirm there).
 */
public class Extract163Moblie extends Extractor {

    /** Regex whose group 1 captures the label text of an attribute-name cell. */
    private static final String MATCH_STRING1 = "<td width=\"31%\" align=\"left\" bgcolor=\"#F1F1F1\" class=\"fB\">(.*)</td>";

    /**
     * Runs the three extraction passes (title, attributes, images) over the
     * current parser and writes the result file.
     *
     * <p>Failures inside a pass are printed and end that pass only; the
     * following passes still run. If no title cell is found, no output file
     * exists and the attribute and image passes are skipped. The writer is
     * always closed before the method returns.
     */
    public void extract() {
        BufferedWriter bw = null;

        NodeFilter attributes_filter = new AndFilter(
                new TagNameFilter("td"),
                new HasAttributeFilter("width", "31%"));
        NodeFilter title_filter = new AndFilter(
                new TagNameFilter("td"),
                new AndFilter(
                        new HasAttributeFilter("class", "f14px fB cWhite"),
                        new HasAttributeFilter("width", "141")));
        NodeFilter image_filter = new AndFilter(
                new TagNameFilter("td"),
                new HasAttributeFilter("width", "33%"));

        try {
            // Pass 1: title cells. Each one starts a new output file.
            try {
                NodeList title_nodes = this.getParser().parse(title_filter);
                for (int i = 0; i < title_nodes.size(); i++) {
                    TableColumn title_node = (TableColumn) title_nodes.elementAt(i);
                    String[] names = title_node.getChildrenHTML().split(" ");

                    StringBuffer title = new StringBuffer();
                    for (int k = 0; k < names.length; k++) {
                        title.append(names[k]).append("-");
                    }
                    // The timestamp keeps file names unique across runs.
                    title.append((new Date()).getTime());
                    // '/' would otherwise be read as a directory separator.
                    String title_str = title.toString().replace('/', '_');

                    // Release the previous file before opening the next one,
                    // and forget it so a failed open cannot cause a second close.
                    closeWriter(bw);
                    bw = null;
                    bw = new BufferedWriter(new FileWriter(new File(
                            this.getOutputPath() + title_str)));

                    // Rebuild the page URL from the part of the input path
                    // that follows "mirror".
                    // NOTE(review): if "mirror" is absent, indexOf returns -1
                    // and the substring starts at offset 5 - confirm inputs
                    // always live under a "mirror" directory.
                    int startPos = getInuputFilePath().indexOf("mirror") + 6;
                    String url_seg = getInuputFilePath().substring(startPos);
                    url_seg = url_seg.replace('\\', '/');
                    // Presumably url_seg begins with '/', which supplies the
                    // second slash of "http://" - verify against real paths.
                    String url = "http:/" + url_seg;

                    bw.write(url + NEWLINE);
                    bw.write(names[0] + NEWLINE);
                    // A title without a space has only one token; write an
                    // empty second line so the header keeps the same three
                    // lines the normal path produces.
                    bw.write((names.length > 1 ? names[1] : "") + NEWLINE);
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
            this.getParser().reset();

            if (bw == null) {
                // No output file was opened, so there is nothing to append
                // attributes to and no record that could reference an image.
                return;
            }

            // Pass 2: attribute rows, written as "label:value".
            try {
                NodeList attributes_nodes = this.getParser().parse(attributes_filter);
                for (int i = 0; i < attributes_nodes.size(); i++) {
                    TableColumn node = (TableColumn) attributes_nodes.elementAt(i);
                    if (!"31%".equals(node.getAttribute("width"))) {
                        continue;
                    }
                    TableColumn nodeExt = valueCellAfter(node);
                    if (nodeExt == null) {
                        // Malformed row: skip it rather than lose the
                        // remaining attributes.
                        continue;
                    }
                    String result = getProp(MATCH_STRING1, node.toHtml(), 1);
                    bw.write(StringUtils.trim(result) + ":"
                            + StringUtils.trim(nodeExt.getChildrenHTML())
                            + NEWLINE);
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
            this.getParser().reset();

            // Pass 3: images. Each is copied under a hashed name and listed
            // after a separator line.
            try {
                NodeList image_nodes = this.getParser().parse(image_filter);
                for (int i = 0; i < image_nodes.size(); i++) {
                    TableColumn image_node = (TableColumn) image_nodes.elementAt(i);
                    String image_url = getProp("<img src=\"(.*)\" />",
                            image_node.toHtml(), 1);
                    // getProp's no-match result is not visible here; treat
                    // both null and empty as "this cell has no image".
                    if (image_url == null || image_url.length() == 0) {
                        continue;
                    }
                    String fileType = image_url.substring(
                            image_url.lastIndexOf(".") + 1);
                    String new_image_file = StringUtils.encodePassword(
                            image_url, HASH_ALGORITHM) + "." + fileType;
                    copyImage(image_url, new_image_file);
                    bw.write(SEPARATOR + NEWLINE);
                    bw.write(new_image_file + NEWLINE);
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        } finally {
            closeWriter(bw);
        }
    }

    /**
     * Returns the cell two siblings after {@code label_cell}, or {@code null}
     * when that sibling is missing or is not a table cell.
     */
    private static TableColumn valueCellAfter(TableColumn label_cell) {
        if (label_cell.getNextSibling() == null) {
            return null;
        }
        Object candidate = label_cell.getNextSibling().getNextSibling();
        return (candidate instanceof TableColumn) ? (TableColumn) candidate : null;
    }

    /** Closes {@code writer} if it is non-null, printing any failure. */
    private static void closeWriter(BufferedWriter writer) {
        if (writer == null) {
            return;
        }
        try {
            writer.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Manual test entry point: runs the extractor over one hard-coded local
     * HTML file and writes the result to {@code C:\}.
     */
    public static void main(String[] args) throws Exception {
        Extract163Moblie ex = new Extract163Moblie();
        ex.setOutputPath("C:\\");
        traverse(
                ex,
                new File(
                        "C:\\Documents and Settings\\qz\\Desktop\\pconline\\0EXW.html"));
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -