📄 htmlparser.java
字号:
package issa.webspider;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import au.id.jericho.lib.html.Source;
import au.id.jericho.lib.html.StartTag;
import au.id.jericho.lib.html.Tag;
/**
 * Extracts the hyperlinks ({@code <a href="...">}) of a single HTML page.
 *
 * <p>The page is identified by {@link #baseUrl}; every {@code href} found in
 * the page is resolved against that URL so callers always receive absolute
 * URLs.
 */
public class HtmlParser {
    /** Address of the page to parse; relative links are resolved against it. */
    URL baseUrl;

    /**
     * Creates a parser for the page at the given address.
     *
     * @param url address of the page whose links will be extracted
     */
    public HtmlParser(URL url) {
        this.baseUrl = url;
    }

    /**
     * Collects the targets of all {@code <a>} start tags of the page, in
     * document order.
     *
     * <p>Tags without an {@code href}, links that cannot be turned into a
     * valid URL, and links whose scheme is neither http/https nor the scheme
     * of the base URL (for example {@code mailto:}) are skipped individually;
     * one bad link never discards the others.
     *
     * @return absolute URLs of the links found; never {@code null}, possibly empty
     * @throws IOException if building the {@code Source} from the base URL fails
     */
    public List<URL> getLinks() throws IOException {
        Source source = new Source(baseUrl);
        List<?> anchors = source.findAllStartTags(Tag.A);
        List<URL> links = new LinkedList<URL>();
        for (Object anchor : anchors) {
            String href = ((StartTag) anchor).getAttributeValue("href");
            URL link = resolveLink(href);
            if (link != null) {
                links.add(link);
            }
        }
        return links;
    }

    /**
     * Resolves one {@code href} value against the base URL.
     *
     * <p>Resolution is delegated to {@link URL#URL(URL, String)} instead of
     * string concatenation, so absolute, host-relative ({@code /x}),
     * path-relative ({@code x}, {@code ../x}), protocol-relative
     * ({@code //host/x}), query-only and fragment-only references are all
     * handled, including a base URL whose path is empty.
     *
     * @param href raw attribute value, may be {@code null}
     * @return the absolute URL, or {@code null} if the link is absent,
     *         malformed or uses a scheme this parser does not follow
     */
    private URL resolveLink(String href) {
        if (href == null) {
            return null;
        }
        URL resolved;
        try {
            resolved = new URL(baseUrl, href);
        } catch (java.net.MalformedURLException ignored) {
            // Deliberately skipped: hrefs such as "javascript:..." have no URL
            // handler and are not pages that can be fetched.
            return null;
        }
        return isFollowable(resolved) ? resolved : null;
    }

    /**
     * Tells whether a resolved link uses a scheme worth returning: http,
     * https, or the same scheme as the base URL.
     */
    private boolean isFollowable(URL link) {
        String protocol = link.getProtocol();
        return "http".equalsIgnoreCase(protocol)
                || "https".equalsIgnoreCase(protocol)
                || protocol.equalsIgnoreCase(baseUrl.getProtocol());
    }

    /** Empty placeholder; currently does nothing and is not called by this class. */
    private void writeToFile() {
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -