📄 webparser.java
字号:
package issa.webspider.demo;
import org.apache.log4j.Logger;
import org.apache.log4j.xml.DOMConfigurator;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLConnection;
import java.util.Iterator;
import java.util.List;
import au.id.jericho.lib.html.Segment;
import au.id.jericho.lib.html.Source;
import au.id.jericho.lib.html.StartTag;
import au.id.jericho.lib.html.Tag;
/**
 * Small web-spider demo: downloads the page behind a single URL, scans it for
 * anchor tags and stores the page content under a local base directory using
 * the layout {@code <path>/<host>/<url path>}.
 */
public class WebParser {
    /**
     * Logger for this class
     */
    private static final Logger logger = Logger.getLogger(WebParser.class);

    /** Number of chars copied per read/write round trip in {@link #writeToFile(Reader)}. */
    private static final int COPY_BUFFER_SIZE = 8192;

    /** Page to download. */
    URL url;

    /** Local base directory under which the downloaded page is stored. */
    String path;

    /**
     * @param url  page to download
     * @param path local base directory for the downloaded content
     */
    public WebParser(URL url, String path) {
        this.url = url;
        this.path = path;
    }

    /**
     * Command-line entry point.
     *
     * @param args {@code args[0]} is the URL to fetch, {@code args[1]} is the
     *             local base directory the page is written to
     */
    public static void main(String[] args) {
        // Both arguments are required; the original only checked for the
        // first one and then crashed on args[1].
        if (args.length < 2) {
            System.err.println("Usage: WebParser <url> <output-directory>");
            return;
        }
        DOMConfigurator.configure("log4j.xml");
        try {
            URL u = new URL(args[0]);
            WebParser wp = new WebParser(u, args[1]);
            URLConnection uc = wp.url.openConnection();
            Class[] types = { String.class, Reader.class, InputStream.class };
            Object o = uc.getContent(types);
            // getContent(Class[]) yields null when none of the requested
            // types is available, so test for null before touching o.
            if (o == null) {
                System.out
                        .println("None of the requested types were available.");
            } else {
                System.out.println("I got a " + o.getClass().getName());
                if (o instanceof String) {
                    System.out.println("String");
                    System.out.println(o);
                } else if (o instanceof Reader) {
                    System.out.println("Reader");
                    wp.process((Reader) o);
                } else if (o instanceof InputStream) {
                    InputStream in = new BufferedInputStream((InputStream) o);
                    // NOTE(review): the stream is decoded with the platform
                    // default charset and re-encoded the same way by
                    // FileWriter; pages in another encoding may be altered.
                    wp.process(new InputStreamReader(in));
                } else {
                    System.out
                            .println("Error: unexpected type " + o.getClass());
                }
            }
            System.out.println("Content Type " + uc.getContentType());
        } catch (MalformedURLException ex) {
            System.err.println(args[0] + " is not a parseable URL");
        } catch (IOException e) {
            System.err.println("I/O error while processing " + args[0] + ": " + e);
            logger.error("I/O error while processing " + args[0], e);
        }
    }

    /**
     * Runs the link scan and then saves the content supplied by {@code r};
     * the reader is always closed afterwards.
     *
     * @param r reader positioned at the start of the page content
     * @throws IOException if the link scan or the file write fails
     */
    private void process(Reader r) throws IOException {
        try {
            getLinks();
            writeToFile(r);
        } finally {
            try {
                r.close();
            } catch (IOException e) {
                // A failed close must not hide an earlier, more useful error.
                logger.warn("Could not close content reader for " + url, e);
            }
        }
    }

    /**
     * Parses the page at {@link #url} (this opens the URL again through the
     * HTML parser) and, when debug logging is enabled, logs the resolved
     * target of every anchor start tag.
     *
     * @throws IOException if the page cannot be read by the parser
     */
    private void getLinks() throws IOException {
        Source source = new Source(url);
        List anchors = source.findAllStartTags(Tag.A);
        if (!logger.isDebugEnabled()) {
            return;
        }
        for (Iterator i = anchors.iterator(); i.hasNext();) {
            StartTag tag = (StartTag) i.next();
            logger.debug(getFullURL(tag.getAttributeValue("href")));
        }
    }

    /**
     * Turns a link found on the page into an absolute URL string.
     *
     * @param url link text as found in an {@code href}; may be {@code null}
     * @return {@code url} unchanged when it is {@code null} or already starts
     *         with {@code http://} / {@code https://}; otherwise the link
     *         resolved against the page URL; {@code null} when the page URL
     *         cannot be converted to a URI
     */
    private String getFullURL(String url) {
        if (url == null) {
            return null;
        }
        // A leading http:// or https:// marks an absolute URL: return as is.
        if (url.regionMatches(true, 0, "http://", 0, 7)
                || url.regionMatches(true, 0, "https://", 0, 8)) {
            return url;
        }
        try {
            URI parentUri = this.url.toURI();
            String port = "";
            if (parentUri.getPort() != -1) {
                port = ":" + parentUri.getPort();
            }
            String base = parentUri.getScheme() + "://" + parentUri.getHost()
                    + port;
            // A link starting with "/" goes directly after the host.
            if (url.startsWith("/")) {
                return base + url;
            }
            // Otherwise it is appended to the directory part of the page path.
            // The page path may be empty (e.g. http://host), in which case
            // there is no "/" to cut at and the directory part is empty.
            String parentPath = parentUri.getPath();
            int lastSlash = parentPath == null ? -1 : parentPath
                    .lastIndexOf('/');
            String dir = lastSlash < 0 ? "" : parentPath.substring(0,
                    lastSlash);
            return base + dir + "/" + url;
        } catch (URISyntaxException e) {
            System.err.println(e);
            return null;
        }
    }

    /**
     * Copies everything readable from {@code r} into the local file derived
     * from {@link #url}, creating missing parent directories first. The
     * reader is not closed by this method.
     *
     * @param r source of the page content
     * @throws IOException if the local path cannot be derived, the target
     *                     directory cannot be created, or reading/writing
     *                     fails
     */
    private void writeToFile(Reader r) throws IOException {
        String stringFile = makeLocalPath();
        if (stringFile == null) {
            throw new IOException("Cannot derive a local file name from "
                    + url);
        }
        String stringPath = splitPath(stringFile);
        File file = new File(stringFile);
        if (stringPath.length() > 0) {
            File dir = new File(stringPath);
            if (!dir.exists()) {
                System.out.println(dir + " is not exists!");
                // Re-check after a failed mkdirs(): report only when the
                // directory really is missing.
                if (!dir.mkdirs() && !dir.isDirectory()) {
                    throw new IOException("Cannot create directory " + dir);
                }
            }
        }
        if (!file.exists()) {
            System.out.println(file + " is not exists!");
            file.createNewFile();
        }
        FileWriter fw = new FileWriter(file);
        try {
            char[] buffer = new char[COPY_BUFFER_SIZE];
            int count;
            while ((count = r.read(buffer)) != -1) {
                fw.write(buffer, 0, count);
            }
            fw.flush();
        } finally {
            // Release the file handle even when the copy fails midway.
            fw.close();
        }
    }

    /**
     * Builds the local file name for {@link #url}: base path, then host, then
     * the URL path, with all slashes converted to the platform separator.
     *
     * @return the local file name, or {@code null} when the URL cannot be
     *         converted to a URI
     */
    private String makeLocalPath() {
        URI parentUri;
        try {
            parentUri = url.toURI();
        } catch (URISyntaxException e) {
            logger.error("Cannot convert " + url + " to a URI", e);
            return null;
        }
        String host = parentUri.getHost();
        String urlPath = parentUri.getPath();
        StringBuilder sb = new StringBuilder();
        sb.append(path);
        sb.append("\\");
        if (host != null) {
            sb.append(host);
        }
        // An empty path (http://host) is treated like "/" so the page lands
        // in <host>/noname.html instead of a plain file named after the host,
        // which would later block creating the host directory.
        if (urlPath == null || urlPath.length() == 0) {
            sb.append('/');
        } else {
            sb.append(urlPath);
        }
        return replaceSlash(sb.toString());
    }

    /**
     * Converts every {@code /} and {@code \} to the platform file separator
     * and appends {@code noname.html} when the result names a directory
     * (i.e. ends with a separator).
     *
     * @param url path-like string to convert
     * @return the converted path
     */
    private String replaceSlash(String url) {
        String converted = url.replace('/', File.separatorChar).replace('\\',
                File.separatorChar);
        if (converted.endsWith(File.separator)) {
            return converted + "noname.html";
        }
        return converted;
    }

    /**
     * Returns the directory part of a local path, including the trailing
     * separator; the empty string when the path contains no separator.
     *
     * @param url local path using the platform separator
     * @return everything up to and including the last separator
     */
    private String splitPath(String url) {
        int slash = url.lastIndexOf(File.separator);
        return url.substring(0, slash + 1);
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -