// File: extractpconlinemoblie.java
// Font size: (code-viewer UI label captured with the page; not part of the program)
package com.luceneheritrixbook.extractor.pconline.mobile;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Date;
import org.htmlparser.NodeFilter;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.NotFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.util.NodeList;
import com.luceneheritrixbook.extractor.Extractor;
import com.luceneheritrixbook.util.StringUtils;
/**
 * Extractor for product pages of the pconline "mobile" section (per the
 * package name). It runs three filter passes over the page currently loaded
 * into the inherited parser and writes the result to a text file in the
 * configured output directory.
 *
 * <p>The generated file contains, in order: the page URL, up to the first two
 * space-separated tokens of the title cell, one {@code name:value} line per
 * attribute cell, and, for every matching image, a separator line followed by
 * the generated image file name.
 */
public class ExtractPconlineMoblie extends Extractor {

    /** Path separators and characters reserved in file names; replaced with '_' in generated names. */
    private static final String UNSAFE_FILE_NAME_CHARS = "[\\\\/:*?\"<>|]";

    /**
     * Extracts title, attribute and image information from the loaded page.
     *
     * <p>Failures inside a pass are printed and do not abort the following
     * passes. If no title cell matches, no output file exists and the
     * attribute and image passes are skipped. The writer is always closed,
     * even when a pass or a parser reset fails.
     */
    public void extract() {
        BufferedWriter bw = null;
        try {
            // Title cell: <td class="hi"> that has no width attribute.
            NodeFilter titleFilter = new AndFilter(new TagNameFilter("td"),
                    new AndFilter(new HasAttributeFilter("class", "hi"),
                            new NotFilter(new HasAttributeFilter("width"))));
            try {
                NodeList titleNodes = this.getParser().parse(titleFilter);
                for (int i = 0; i < titleNodes.size(); i++) {
                    TableColumn node = (TableColumn) titleNodes.elementAt(i);
                    // Split the cell's inner HTML on single spaces.
                    String[] names = node.getChildrenHTML().split(" ");

                    // Close the file of a previous title node before opening the
                    // next one; otherwise it would leak and never be flushed.
                    closeWriter(bw);
                    bw = null;
                    bw = openOutputFile(names);

                    String url = buildPageUrl();
                    System.out.println(url);
                    bw.write(url + NEWLINE);
                    // Write at most the first two tokens; a title with fewer
                    // tokens must not raise ArrayIndexOutOfBoundsException.
                    for (int k = 0; k < names.length && k < 2; k++) {
                        bw.write(names[k] + NEWLINE);
                    }
                }
            } catch (Exception e) {
                e.printStackTrace();
            }

            if (bw == null) {
                // Without a title there is no output file to append to.
                System.err.println("No output file could be opened for "
                        + getInuputFilePath() + "; skipping attributes and images.");
                return;
            }

            // The parser has to be reset before it can be run with another filter.
            this.getParser().reset();
            writeAttributes(bw);

            this.getParser().reset();
            writeImages(bw);
        } finally {
            closeWriter(bw);
        }
    }

    /**
     * Opens the output file for one title. The file name is the title tokens,
     * each followed by "-", then the current time in milliseconds and ".txt".
     * Characters that are path separators or reserved in file names are
     * replaced with '_' because the tokens come from crawled HTML.
     *
     * @param names tokens of the title cell
     * @return a writer on the newly created file
     * @throws IOException if the file cannot be opened for writing
     */
    private BufferedWriter openOutputFile(String[] names) throws IOException {
        StringBuffer title = new StringBuffer();
        for (int k = 0; k < names.length; k++) {
            title.append(names[k]).append("-");
        }
        String safeTitle = title.toString().replaceAll(UNSAFE_FILE_NAME_CHARS, "_");
        String fileName = safeTitle + (new Date()).getTime() + ".txt";
        return new BufferedWriter(new FileWriter(new File(this.getOutputPath() + fileName)));
    }

    /**
     * Rebuilds the page URL from the input file path: everything after the
     * first occurrence of "mirror", with backslashes turned into slashes,
     * prefixed with "http:/".
     *
     * <p>NOTE(review): if the path does not contain "mirror", indexOf returns
     * -1 and the substring starts at index 5, which yields a meaningless URL.
     * This is left unchanged; confirm the desired fallback with the caller.
     *
     * @return the reconstructed URL
     */
    private String buildPageUrl() {
        String path = getInuputFilePath();
        int startPos = path.indexOf("mirror") + 6;
        String urlSeg = path.substring(startPos).replaceAll("\\\\", "/");
        return "http:/" + urlSeg;
    }

    /**
     * Writes one "name:value" line per attribute cell (td with WIDTH=198).
     * The value is taken from the second next sibling of the name cell.
     * Any failure is printed and ends this pass.
     *
     * @param bw writer on the output file; must not be null
     */
    private void writeAttributes(BufferedWriter bw) {
        NodeFilter attributesFilter = new AndFilter(new TagNameFilter("td"),
                new HasAttributeFilter("WIDTH", "198"));
        try {
            NodeList attributeNodes = this.getParser().parse(attributesFilter);
            for (int i = 0; i < attributeNodes.size(); i++) {
                TableColumn node = (TableColumn) attributeNodes.elementAt(i);
                // Attribute name: the text between <B> and </B>.
                String result = getProp(
                        "<TD CLASS=btd WIDTH=198 BGCOLOR=\"#FCFCFC\"><B>(.*)</B></TD>",
                        node.toHtml(), 1);
                // The name still contains markup (e.g. a link): take the text
                // in front of the closing </a> instead.
                if (result.indexOf("<") != -1) {
                    result = getProp(
                            "<TD CLASS=btd WIDTH=198 BGCOLOR=\"#FCFCFC\"(.*)>(.*)</a></B></TD>",
                            node.toHtml(), 2);
                }
                // Attribute value: the cell two siblings after the name cell.
                TableColumn valueNode = (TableColumn) node.getNextSibling()
                        .getNextSibling();
                bw.write(StringUtils.trim(result) + ":"
                        + StringUtils.trim(valueNode.getChildrenHTML()));
                bw.newLine();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Copies every 200x150 image of the page under a hashed file name and
     * records that name, preceded by a separator line, in the output file.
     * Any failure is printed and ends this pass.
     *
     * @param bw writer on the output file; must not be null
     */
    private void writeImages(BufferedWriter bw) {
        NodeFilter imageFilter = new AndFilter(new TagNameFilter("IMG"),
                new AndFilter(new HasAttributeFilter("WIDTH", "200"),
                        new HasAttributeFilter("HEIGHT", "150")));
        try {
            NodeList imageNodes = this.getParser().parse(imageFilter);
            for (int i = 0; i < imageNodes.size(); i++) {
                ImageTag node = (ImageTag) imageNodes.elementAt(i);
                String imageUrl = node.getAttribute("SRC");
                // File type: everything after the last '.' of the SRC value.
                String fileType = imageUrl.substring(imageUrl.lastIndexOf(".") + 1);
                // The new name is derived from the unmodified SRC value.
                String newImageFile = StringUtils.encodePassword(imageUrl, HASH_ALGORITHM)
                        + "." + fileType;
                // '+' in the SRC is replaced by a space before the copy
                // (presumably to match the file name on disk; verify against copyImage).
                imageUrl = StringUtils.replace(imageUrl, "+", " ");
                copyImage(imageUrl, newImageFile);
                bw.write(SEPARATOR + NEWLINE);
                bw.write(newImageFile + NEWLINE);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Closes the writer if it is open; a failure to close is printed and
     * otherwise ignored.
     *
     * @param writer writer to close, may be null
     */
    private void closeWriter(BufferedWriter writer) {
        if (writer == null) {
            return;
        }
        try {
            writer.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Manual smoke test: runs the extractor on one hard-coded local page and
     * writes the result to C:\.
     *
     * @param agrs unused
     * @throws Exception whatever the traversal or extraction propagates
     */
    public static void main(String[] agrs) throws Exception {
        ExtractPconlineMoblie ex = new ExtractPconlineMoblie();
        ex.setOutputPath("C:\\");
        traverse(ex, new File("C:\\Documents and Settings\\qz\\Desktop\\pconline\\p08317.html"));
    }
}
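/*
 * Code-viewer keyboard shortcuts (captured with the page; not part of the program):
 *   Copy code:           Ctrl + C
 *   Search code:         Ctrl + F
 *   Full-screen mode:    F11
 *   Toggle theme:        Ctrl + Shift + D
 *   Show shortcuts:      ?
 *   Increase font size:  Ctrl + =
 *   Decrease font size:  Ctrl + -
 */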