⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 extract163moblie.java

📁 搜索引擎
💻 JAVA
字号:
package com.luceneheritrixbook.extractor.com163;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.util.Date;

import org.htmlparser.NodeFilter;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.util.NodeList;

import com.luceneheritrixbook.extractor.Extractor;
import com.luceneheritrixbook.util.StringUtils;

/**
 * Extractor that pulls product data out of a saved HTML page and writes it to
 * a text file under the configured output path.
 *
 * NOTE(review): the package and class names suggest the input is a mirrored
 * 163.com mobile-phone product page; nothing in this class verifies that.
 *
 * The parser, the output path, the input file path, {@code NEWLINE},
 * {@code SEPARATOR}, {@code HASH_ALGORITHM}, {@code getProp} and
 * {@code copyImage} are all inherited from {@link Extractor}.
 */
public class Extract163Moblie extends Extractor {

	/** Regex whose first group captures the attribute name held in a 31%-wide cell. */
	private static final String MATCH_STRING1 = "<td width=\"31%\" align=\"left\" bgcolor=\"#F1F1F1\" class=\"fB\">(.*)</td>";

	/** Regex whose first group captures the image source inside a 33%-wide cell. */
	private static final String IMAGE_PATTERN = "<img src=\"(.*)\"  />";

	/**
	 * Runs the three extraction phases against the current parser input:
	 * <ol>
	 * <li>title cells: create the output file and write the page URL followed
	 * by the first two space-separated title tokens;</li>
	 * <li>attribute cells: append one {@code name:value} line per cell;</li>
	 * <li>image cells: copy each image under a hashed file name and append
	 * {@code SEPARATOR} plus that file name.</li>
	 * </ol>
	 * A failure inside one phase is reported with a stack trace and does not
	 * stop the following phases. If no output file could be opened in phase 1
	 * the method returns without running phases 2 and 3, because there is
	 * nothing to write their results to.
	 */
	public void extract() {
		BufferedWriter bw = null;
		try {
			bw = writeTitles();
			this.getParser().reset();
			if (bw == null) {
				// No title cell matched (or the file could not be created):
				// skip the remaining phases instead of failing on a null writer.
				return;
			}
			writeAttributes(bw);
			this.getParser().reset();
			writeImages(bw);
		} finally {
			// Always flush and release the file, even if reset() throws.
			closeQuietly(bw);
		}
	}

	/**
	 * Phase 1: opens one output file per matching title cell and writes the
	 * record header (URL, first title token, second title token) into it.
	 *
	 * @return the writer for the last file opened, still open so the later
	 *         phases can append to it, or {@code null} if none was opened
	 */
	private BufferedWriter writeTitles() {
		NodeFilter title_filter = new AndFilter(new TagNameFilter("td"),
				new AndFilter(
						new HasAttributeFilter("class", "f14px fB cWhite"),
						new HasAttributeFilter("width", "141")));
		BufferedWriter writer = null;
		try {
			NodeList title_nodes = this.getParser().parse(title_filter);
			for (int i = 0; i < title_nodes.size(); i++) {
				TableColumn title_node = (TableColumn) title_nodes
						.elementAt(i);
				String[] names = title_node.getChildrenHTML().split(" ");

				// File name: tokens joined by "-" plus a timestamp to keep it
				// unique; "/" would otherwise be read as a path separator.
				StringBuffer title = new StringBuffer();
				for (int k = 0; k < names.length; k++) {
					title.append(names[k]).append("-");
				}
				title.append((new Date()).getTime());
				String title_str = title.toString().replaceAll("/", "_");

				// Close the previous file before replacing the reference;
				// otherwise its buffered content is never flushed.
				closeQuietly(writer);
				writer = null;
				writer = new BufferedWriter(new FileWriter(new File(this
						.getOutputPath()
						+ title_str)));

				// Rebuild the page URL from the part of the local path that
				// follows "mirror", converting backslashes to slashes.
				// NOTE(review): if the path does not contain "mirror",
				// indexOf returns -1 and the URL starts at offset 5 — confirm
				// all callers pass paths below a "mirror" directory.
				int startPos = getInuputFilePath().indexOf("mirror") + 6;
				String url_seg = getInuputFilePath().substring(startPos);
				url_seg = url_seg.replaceAll("\\\\", "/");
				String url = "http:/" + url_seg;

				writer.write(url + NEWLINE);
				// A title with fewer than two tokens yields empty lines so the
				// header always occupies the same number of lines.
				writer.write(tokenAt(names, 0) + NEWLINE);
				writer.write(tokenAt(names, 1) + NEWLINE);
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
		return writer;
	}

	/**
	 * Phase 2: writes one {@code name:value} line for every 31%-wide cell. The
	 * name is captured by {@link #MATCH_STRING1}; the value is the inner HTML
	 * of the node two siblings after the name cell.
	 *
	 * @param bw open writer to append to; must not be {@code null}
	 */
	private void writeAttributes(BufferedWriter bw) {
		NodeFilter attributes_filter = new AndFilter(new TagNameFilter("td"),
				new HasAttributeFilter("width", "31%"));
		try {
			NodeList attributes_nodes = this.getParser().parse(
					attributes_filter);
			for (int i = 0; i < attributes_nodes.size(); i++) {
				TableColumn node = (TableColumn) attributes_nodes.elementAt(i);
				if (!"31%".equals(node.getAttribute("width"))) {
					continue;
				}
				// Skip rows without a value cell rather than aborting the
				// whole phase on a null sibling or a failed cast.
				if (node.getNextSibling() == null) {
					continue;
				}
				Object valueCell = node.getNextSibling().getNextSibling();
				if (!(valueCell instanceof TableColumn)) {
					continue;
				}
				TableColumn nodeExt = (TableColumn) valueCell;

				String result = getProp(MATCH_STRING1, node.toHtml(), 1);
				bw.write(StringUtils.trim(result) + ":"
						+ StringUtils.trim(nodeExt.getChildrenHTML())
						+ NEWLINE);
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Phase 3: for every 33%-wide cell, extracts the image source, copies the
	 * image under a name derived from the hashed source plus its extension,
	 * and appends {@code SEPARATOR} and the new file name to the output.
	 *
	 * @param bw open writer to append to; must not be {@code null}
	 */
	private void writeImages(BufferedWriter bw) {
		NodeFilter image_filter = new AndFilter(new TagNameFilter("td"),
				new HasAttributeFilter("width", "33%"));
		try {
			NodeList image_nodes = this.getParser().parse(image_filter);
			for (int i = 0; i < image_nodes.size(); i++) {
				TableColumn image_node = (TableColumn) image_nodes
						.elementAt(i);
				String image_url = getProp(IMAGE_PATTERN, image_node
						.toHtml(), 1);
				if (image_url == null) {
					// Cell without a recognisable <img>; move on to the next.
					continue;
				}
				String fileType = image_url.substring(image_url
						.lastIndexOf(".") + 1);
				String new_image_file = StringUtils.encodePassword(image_url,
						HASH_ALGORITHM)
						+ "." + fileType;
				copyImage(image_url, new_image_file);
				bw.write(SEPARATOR + NEWLINE);
				bw.write(new_image_file + NEWLINE);
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Returns the token at {@code index}, or an empty string when the array
	 * is too short.
	 */
	private static String tokenAt(String[] tokens, int index) {
		return index < tokens.length ? tokens[index] : "";
	}

	/** Closes the writer if it is open, reporting (not propagating) failures. */
	private static void closeQuietly(BufferedWriter writer) {
		if (writer == null) {
			return;
		}
		try {
			writer.close();
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Manual test entry point: runs the extractor over one hard-coded local
	 * HTML file and writes the result to {@code C:\}.
	 */
	public static void main(String[] args) throws Exception {

		Extract163Moblie ex = new Extract163Moblie();
		ex.setOutputPath("C:\\");
		traverse(
				ex,
				new File(
						"C:\\Documents and Settings\\qz\\Desktop\\pconline\\0EXW.html"));
	}
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -