// Mobile163Extractor.java
//
// From the project "A search engine, hope it is useful to everyone"
// (original title: 一个搜索引擎,希望对大家有用) - Java code, 102 lines.
package my.extractor;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.extractor.Extractor;
import org.archive.crawler.extractor.ExtractorHTML;
import org.archive.crawler.extractor.Link;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;
import org.archive.io.ReplayCharSequence;
import org.archive.util.HttpRecorder;
import org.archive.util.TextUtils;

/**
 * Link extractor for one specific page of mobile.163.com.
 *
 * <p>Only the page at {@link #INDEX_URL} is processed; every other URI is
 * ignored. The page body is scanned line by line, and each line that ends
 * with <code>.html"</code> and contains a <code>url:</code> marker
 * contributes one outlink: the text between the marker and the trailing
 * quote, prefixed with {@link #SITE_ROOT}.
 */
public class Mobile163Extractor extends Extractor {

	/** The only URL whose content this extractor inspects. */
	private static final String INDEX_URL =
			"http://mobile.163.com/0011/product/0011000B/special/l/left.html";

	/** Prefix prepended to every path found in the page. */
	private static final String SITE_ROOT = "http://mobile.163.com";

	/** Marker that immediately precedes the path on a candidate line. */
	private static final String URL_MARKER = "url:";

	/** Ending that identifies a candidate line (path followed by a quote). */
	private static final String LINE_SUFFIX = ".html\"";

	protected boolean ignoreUnexpectedHTML = true;

	private static final Logger logger = Logger
			.getLogger(Mobile163Extractor.class.getName());

	/**
	 * Creates the extractor with a default description.
	 *
	 * @param name name of this processor
	 */
	public Mobile163Extractor(String name) {
		this(name, "Mobile163 extractor. Extracts links from HTML documents");
	}

	/**
	 * Creates the extractor.
	 *
	 * @param name name of this processor
	 * @param description human-readable description of this processor
	 */
	public Mobile163Extractor(String name, String description) {
		super(name, description);
	}

	/**
	 * Extracts outlinks from the recorded content of <code>curi</code> when
	 * it is the {@link #INDEX_URL} page; does nothing for any other URI.
	 *
	 * <p>Failures are recorded on the URI and/or logged; they are not
	 * propagated to the caller.
	 *
	 * @param curi the URI whose recorded content is examined
	 */
	protected void extract(CrawlURI curi) {
		if (!INDEX_URL.equals(curi.toString())) {
			return;
		}

		ReplayCharSequence cs = null;
		try {
			HttpRecorder hr = curi.getHttpRecorder();
			if (hr == null) {
				throw new IOException("Why is recorder null here?");
			}
			cs = hr.getReplayCharSequence();
		} catch (IOException e) {
			curi.addLocalizedError(this.getName(), e,
					"Failed get of replay char sequence " + curi.toString()
							+ " " + e.getMessage());
			logger.log(Level.SEVERE,
					"Failed get of replay char sequence in "
							+ Thread.currentThread().getName(), e);
		}

		if (cs == null) {
			return;
		}

		// A ReplayCharSequence is open from here on; the finally block
		// makes sure it is closed whatever happens during extraction.
		try {
			extractLinks(curi, cs.toString());
		} catch (IOException e) {
			logger.log(Level.WARNING, "Failed reading content of " + curi, e);
		} catch (RuntimeException e) {
			// Keep the original best-effort behaviour: a failure on this
			// page is reported but never propagated.
			logger.log(Level.WARNING, "Failed extracting links from " + curi,
					e);
		} finally {
			try {
				cs.close();
			} catch (IOException ioe) {
				logger.log(Level.WARNING,
						"Failed close of ReplayCharSequence.", ioe);
			}
		}
	}

	/**
	 * Scans <code>content</code> line by line and adds one navigation link
	 * to <code>curi</code> for every line that yields a URL.
	 *
	 * @param curi the URI the links are attached to
	 * @param content the full page text
	 * @throws IOException if reading a line fails
	 */
	private void extractLinks(CrawlURI curi, String content)
			throws IOException {
		BufferedReader reader = new BufferedReader(new StringReader(content));
		for (String line = reader.readLine(); line != null; line = reader
				.readLine()) {
			String fullUrl = toFullUrl(line);
			if (fullUrl == null) {
				continue;
			}
			addLinkFromString(curi, fullUrl, "", Link.NAVLINK_HOP);
			if (logger.isLoggable(Level.FINE)) {
				logger.fine("Extracted link " + fullUrl);
			}
		}
	}

	/**
	 * Builds the absolute URL encoded in one line of the page.
	 *
	 * @param line a single line of page text
	 * @return {@link #SITE_ROOT} followed by the text between the
	 *         <code>url:</code> marker and the final character of the line,
	 *         or <code>null</code> when the line does not end with
	 *         <code>.html"</code> or has no <code>url:</code> marker
	 */
	private static String toFullUrl(String line) {
		if (!line.endsWith(LINE_SUFFIX)) {
			return null;
		}
		int marker = line.indexOf(URL_MARKER);
		if (marker < 0) {
			// Without this check a missing marker would silently produce a
			// bogus URL cut from the fourth character of the line.
			return null;
		}
		// Drop the trailing quote that closes the path.
		return SITE_ROOT
				+ line.substring(marker + URL_MARKER.length(),
						line.length() - 1);
	}

	/**
	 * Adds <code>uri</code> as a link of <code>curi</code>, resolved against
	 * its base. A malformed URI is reported to the controller when one is
	 * available, otherwise logged.
	 *
	 * @param curi the URI the link is attached to
	 * @param uri the link target
	 * @param context context string stored with the link
	 * @param hopType hop type character for the link
	 */
	private void addLinkFromString(CrawlURI curi, String uri,
			CharSequence context, char hopType) {
		try {
			curi.createAndAddLinkRelativeToBase(uri, context.toString(),
					hopType);
		} catch (URIException e) {
			if (getController() != null) {
				getController().logUriError(e, curi.getUURI(), uri);
			} else {
				logger.info("Failed createAndAddLinkRelativeToBase " + curi
						+ ", " + uri + ", " + context + ", " + hopType + ": "
						+ e);
			}
		}
	}

}

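// Keyboard shortcuts (from the code viewer this file was copied from):
//   Copy code:          Ctrl + C
//   Search code:        Ctrl + F
//   Full-screen mode:   F11
//   Increase font size: Ctrl + =
//   Decrease font size: Ctrl + -
//   Show shortcuts:     ?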