mobile163extractor.java
来自「一个搜索引擎,希望对大家有用」· Java 代码 · 共 102 行
JAVA
102 行
package my.extractor;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.extractor.Extractor;
import org.archive.crawler.extractor.ExtractorHTML;
import org.archive.crawler.extractor.Link;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;
import org.archive.io.ReplayCharSequence;
import org.archive.util.HttpRecorder;
import org.archive.util.TextUtils;
/**
 * Link extractor for one specific mobile.163.com page.
 *
 * <p>Only the URI equal to {@link #TARGET_URL} is processed; every other URI
 * is ignored. The recorded response body is scanned line by line, and each
 * line that ends with <code>.html"</code> and contains a <code>url:</code>
 * marker yields one outlink, built by prefixing the text after the marker
 * with {@link #SITE_PREFIX}.
 */
public class Mobile163Extractor extends Extractor {

    /** Not read anywhere in this class; kept for subclasses that may rely on it. */
    protected boolean ignoreUnexpectedHTML = true;

    private static final Logger logger = Logger.getLogger(Mobile163Extractor.class
            .getName());

    /** The only URI whose content this extractor inspects. */
    private static final String TARGET_URL =
            "http://mobile.163.com/0011/product/0011000B/special/l/left.html";

    /** Prefix prepended to every path found in the page. */
    private static final String SITE_PREFIX = "http://mobile.163.com";

    /** Marker that precedes the path on a candidate line. */
    private static final String URL_MARKER = "url:";

    /** Suffix a line must have to be considered a candidate. */
    private static final String LINE_SUFFIX = ".html\"";

    /**
     * Creates the extractor with a default description.
     *
     * @param name name of this processor
     */
    public Mobile163Extractor(String name) {
        this(name, "Mobile163 extractor. Extracts links from HTML documents");
    }

    /**
     * Creates the extractor.
     *
     * @param name name of this processor
     * @param description description of this processor
     */
    public Mobile163Extractor(String name, String description) {
        super(name, description);
    }

    /**
     * Extracts outlinks from the recorded content of {@code curi} when it is
     * the target page; does nothing for any other URI.
     *
     * @param curi the URI being processed
     */
    protected void extract(CrawlURI curi) {
        if (!TARGET_URL.equals(curi.toString())) {
            return;
        }

        ReplayCharSequence cs = null;
        try {
            HttpRecorder hr = curi.getHttpRecorder();
            if (hr == null) {
                throw new IOException("Why is recorder null here?");
            }
            cs = hr.getReplayCharSequence();
        } catch (IOException e) {
            curi.addLocalizedError(this.getName(), e,
                    "Failed get of replay char sequence " + curi.toString()
                            + " " + e.getMessage());
            logger.log(Level.SEVERE,
                    "Failed get of replay char sequence in "
                            + Thread.currentThread().getName(), e);
        }
        if (cs == null) {
            return;
        }

        // The replay sequence is open from here on; make sure it is closed
        // no matter how link extraction ends.
        try {
            extractLinks(curi, cs.toString());
        } finally {
            try {
                cs.close();
            } catch (IOException ioe) {
                logger.log(Level.WARNING,
                        "Failed close of ReplayCharSequence.", ioe);
            }
        }
    }

    /**
     * Scans {@code content} line by line and adds one navigation link to
     * {@code curi} for every line accepted by {@link #toFullUrl(String)}.
     * Extraction is best-effort: any failure is logged and stops the scan
     * without propagating to the caller.
     */
    private void extractLinks(CrawlURI curi, String content) {
        BufferedReader reader = new BufferedReader(new StringReader(content));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                String fullUrl = toFullUrl(line);
                if (fullUrl == null) {
                    continue;
                }
                addLinkFromString(curi, fullUrl, "", Link.NAVLINK_HOP);
                if (logger.isLoggable(Level.FINE)) {
                    logger.fine("Extracted link " + fullUrl);
                }
            }
        } catch (Exception e) {
            // Deliberately broad: a malformed page must not abort processing
            // of this URI, but the failure should be visible in the log.
            logger.log(Level.WARNING, "Failed extracting links from " + curi, e);
        } finally {
            try {
                reader.close();
            } catch (IOException ignored) {
                // Reader wraps an in-memory string; nothing to recover here.
            }
        }
    }

    /**
     * Builds the absolute URL for one line of the page.
     *
     * @param line a single line of page content
     * @return {@link #SITE_PREFIX} followed by the text between the
     *         <code>url:</code> marker and the line's final character, or
     *         {@code null} when the line does not end with <code>.html"</code>
     *         or carries no <code>url:</code> marker
     */
    private static String toFullUrl(String line) {
        if (!line.endsWith(LINE_SUFFIX)) {
            return null;
        }
        int marker = line.indexOf(URL_MARKER);
        if (marker < 0) {
            // Without the marker there is no path to extract; skipping avoids
            // queueing a link built from an arbitrary slice of the line.
            return null;
        }
        // Drop the last character of the line (the closing quote of the suffix).
        return SITE_PREFIX
                + line.substring(marker + URL_MARKER.length(), line.length() - 1);
    }

    /**
     * Adds {@code uri} to {@code curi} as a link resolved against the base URI.
     * A URI that cannot be created is reported to the controller when one is
     * available, otherwise logged at INFO level.
     */
    private void addLinkFromString(CrawlURI curi, String uri,
            CharSequence context, char hopType) {
        try {
            curi.createAndAddLinkRelativeToBase(uri, context.toString(),
                    hopType);
        } catch (URIException e) {
            if (getController() != null) {
                getController().logUriError(e, curi.getUURI(), uri);
            } else {
                logger.info("Failed createAndAddLinkRelativeToBase " + curi
                        + ", " + uri + ", " + context + ", " + hopType + ": "
                        + e);
            }
        }
    }
}
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?