📄 webparser.java

📁 基于JAVA的网络蜘蛛系统,使用JAVA实现抓取网络资源的网络蜘蛛。通过一个入口网址来扫描整个互联网的网址
💻 JAVA
字号:
package issa.webspider.demo;

import au.id.jericho.lib.html.Segment;
import au.id.jericho.lib.html.Source;
import au.id.jericho.lib.html.StartTag;
import au.id.jericho.lib.html.Tag;

import java.io.BufferedInputStream;
import java.io.Closeable;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.Reader;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLConnection;
import java.util.Iterator;
import java.util.List;

import org.apache.log4j.Logger;
import org.apache.log4j.xml.DOMConfigurator;

/**
 * Command-line demo that downloads a single page, logs the absolute form of
 * every anchor link found in it and stores the page below a local base
 * directory, mirroring the host name and path of the URL.
 */
public class WebParser {
	/**
	 * Logger for this class
	 */
	private static final Logger logger = Logger.getLogger(WebParser.class);

	/** Size of the buffer used when copying page content to disk. */
	private static final int BUFFER_SIZE = 8192;

	/** Address of the page to download. */
	URL url;

	/** Base directory below which the downloaded page is stored. */
	String path;

	/**
	 * @param url  address of the page to download
	 * @param path base directory below which the page is stored
	 */
	public WebParser(URL url, String path) {
		this.url = url;
		this.path = path;
	}

	/**
	 * Downloads the page named by the first argument and stores it below the
	 * directory named by the second argument.
	 *
	 * @param args {@code args[0]} is the URL to fetch, {@code args[1]} the
	 *             output base directory
	 */
	public static void main(String[] args) {
		// Both arguments are required; args[1] is read below.
		if (args.length < 2) {
			System.err.println("Usage: WebParser <url> <output directory>");
			return;
		}

		DOMConfigurator.configure("log4j.xml");
		try {
			URL u = new URL(args[0]);
			WebParser wp = new WebParser(u, args[1]);
			URLConnection uc = wp.url.openConnection();

			Class[] types = { String.class, Reader.class, InputStream.class };
			Object o = uc.getContent(types);

			// The null test must come first: getContent returns null when
			// none of the requested types can be produced.
			if (o == null) {
				System.out
						.println("None of the requested types were available.");
			} else {
				System.out.println("I got a " + o.getClass().getName());

				if (o instanceof String) {
					System.out.println("String");
					System.out.println(o);
				} else if (o instanceof Reader) {
					System.out.println("Reader");
					Reader r = (Reader) o;
					try {
						wp.getLinks();
						wp.writeToFile(r);
					} finally {
						closeQuietly(r);
					}
				} else if (o instanceof InputStream) {
					InputStream in = (InputStream) o;
					try {
						wp.getLinks();
						// Raw byte copy: avoids re-encoding the page through
						// the platform default charset.
						wp.writeToFile(in);
					} finally {
						closeQuietly(in);
					}
				} else {
					System.out
							.println("Error: unexpected type " + o.getClass());
				}
			}

			System.out.println("Content Type " + uc.getContentType());

		} catch (MalformedURLException ex) {
			System.err.println(args[0] + " is not a parseable URL");
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Parses the page at {@link #url} and logs, at debug level, the absolute
	 * address of every anchor start tag whose link can be resolved.
	 *
	 * @throws IOException if the page cannot be read
	 */
	private void getLinks() throws IOException {
		Source source = new Source(url);
		List anchors = source.findAllStartTags(Tag.A);
		for (Iterator i = anchors.iterator(); i.hasNext();) {
			StartTag tag = (StartTag) i.next();
			String link = getFullURL(tag.getAttributeValue("href"));
			if (link != null && logger.isDebugEnabled()) {
				logger.debug(link);
			}
		}
	}

	/**
	 * Resolves a link against {@link #url}.
	 *
	 * @param url link text as found in the page; may be absolute, relative
	 *            to the host or relative to the current page
	 * @return the absolute address, or {@code null} when {@code url} is
	 *         {@code null} or cannot be resolved to a URL
	 */
	private String getFullURL(String url) {
		if (url == null) {
			return null;
		}
		try {
			// URL(context, spec) returns absolute specs as they are and
			// resolves relative ones against the context, including an
			// empty base path and ".." segments.
			return new URL(this.url, url).toString();
		} catch (MalformedURLException e) {
			logger.debug("Skipping unresolvable link " + url, e);
			return null;
		}
	}

	/**
	 * Copies character content to the local file derived from {@link #url}.
	 *
	 * @param r source of the page content; not closed by this method
	 * @throws IOException if the target cannot be prepared or the copy fails
	 */
	private void writeToFile(Reader r) throws IOException {
		FileWriter fw = new FileWriter(prepareLocalFile());
		try {
			char[] buffer = new char[BUFFER_SIZE];
			int n;
			while ((n = r.read(buffer)) != -1) {
				fw.write(buffer, 0, n);
			}
			fw.flush();
		} finally {
			fw.close();
		}
	}

	/**
	 * Copies binary content unchanged to the local file derived from
	 * {@link #url}.
	 *
	 * @param in source of the page content; not closed by this method
	 * @throws IOException if the target cannot be prepared or the copy fails
	 */
	private void writeToFile(InputStream in) throws IOException {
		OutputStream out = new FileOutputStream(prepareLocalFile());
		try {
			byte[] buffer = new byte[BUFFER_SIZE];
			int n;
			while ((n = in.read(buffer)) != -1) {
				out.write(buffer, 0, n);
			}
			out.flush();
		} finally {
			out.close();
		}
	}

	/**
	 * Determines the local target file and creates its parent directories.
	 *
	 * @return the file the page content should be written to
	 * @throws IOException if the target lies outside the base directory or
	 *                     its parent directory cannot be created
	 */
	private File prepareLocalFile() throws IOException {
		String stringFile = makeLocalPath();
		File file = new File(stringFile);
		File dir = new File(splitPath(stringFile));

		// The URL path may contain ".." segments; refuse anything that would
		// end up outside of the base directory.
		File base = new File(path).getCanonicalFile();
		if (!isInside(base, file.getCanonicalFile())) {
			throw new IOException("Refusing to write outside of " + base
					+ ": " + file);
		}

		if (!dir.exists()) {
			System.out.println(dir + " is not exists!");
			// mkdirs() also returns false when another process created the
			// directory in the meantime, hence the second test.
			if (!dir.mkdirs() && !dir.isDirectory()) {
				throw new IOException("Cannot create directory " + dir);
			}
		}
		if (!file.exists()) {
			// The file itself is created when the writer/stream is opened.
			System.out.println(file + " is not exists!");
		}
		return file;
	}

	/**
	 * Tells whether {@code target} has {@code base} among its ancestors.
	 * Both arguments are expected to be canonical files.
	 */
	private static boolean isInside(File base, File target) {
		for (File dir = target.getParentFile(); dir != null; dir = dir
				.getParentFile()) {
			if (dir.equals(base)) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Builds the local file name for {@link #url}: base directory, host name
	 * and URL path joined with the platform separator.
	 *
	 * @return the local file name
	 * @throws IOException if {@link #url} cannot be converted to a URI
	 */
	private String makeLocalPath() throws IOException {
		URI parentUri;
		try {
			parentUri = url.toURI();
		} catch (URISyntaxException e) {
			IOException ioe = new IOException(
					"Cannot derive a local path from " + url);
			ioe.initCause(e);
			throw ioe;
		}

		// An empty path addresses the root document; mapping it to "/" makes
		// it end up as <host>/noname.html instead of a file named <host>.
		String urlPath = parentUri.getPath();
		if (urlPath == null || urlPath.length() == 0) {
			urlPath = "/";
		}

		StringBuilder sb = new StringBuilder();
		sb.append(path);
		sb.append("\\");
		sb.append(parentUri.getHost());
		sb.append(urlPath);
		return replaceSlash(sb.toString());
	}

	/**
	 * Replaces every forward slash and backslash with the platform separator
	 * and appends {@code noname.html} when the result names a directory.
	 *
	 * @param url path that may mix both kinds of slashes
	 * @return the path using only the platform separator
	 */
	private String replaceSlash(String url) {
		String local = url.replace('/', File.separatorChar).replace('\\',
				File.separatorChar);
		if (local.endsWith(File.separator)) {
			return local + "noname.html";
		}
		return local;
	}

	/**
	 * Returns the directory part of a local path, including the trailing
	 * separator.
	 *
	 * @param url local path using the platform separator
	 * @return everything up to and including the last separator
	 */
	private String splitPath(String url) {
		int slash = url.lastIndexOf(File.separator);
		return url.substring(0, slash + 1);
	}

	/**
	 * Closes the downloaded content; a failure is logged instead of thrown
	 * so that it cannot hide an earlier exception.
	 */
	private static void closeQuietly(Closeable c) {
		try {
			c.close();
		} catch (IOException e) {
			logger.warn("Failed to close downloaded content", e);
		}
	}

}
💿 文件大小 508 K
👤 上传用户 Numb_pqc
📂 所属分类 Java编程
🏷️ 相关标签

#JAVA #网络 #网址
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -