MulThreadSpiderMainclass.java
package cs;
import java.util.*;
import java.io.*;
import java.net.*;
import javax.servlet.RequestDispatcher;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import javax.swing.text.html.parser.*;
import javax.swing.text.html.*;
import javax.swing.text.*;
import vo.UrlQueueNode;
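/**
 * Multi-threaded web spider: starting from a seed site it crawls linked
 * pages up to a configurable site count and depth, matches the configured
 * keywords, and forwards the result queue to a display JSP.
 */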
public class MulThreadSpiderMainclass {
/* number of pages already parsed */
private int UrlParsed = 0;
/* number of pages currently waiting to be parsed */
private int currentQueueNum = 1;
/* number of sites where a keyword was found */
private int sitesFound = 0;
/* number of sites already searched */
private int sitesSearched = 0;
/* number of worker threads */
int thread = 40;
/* upper limit on the number of sites to search */
private int siteLimit;
/* upper limit on the search depth */
private int depthLimit;
/* keyword list */
private String keywordList[];
/* IP / domain list */
private String ipDomainList[];
/* number of URLs collected so far (starts at 1 to count the seed URL) */
public int UrlGeted = 1;
/* site where the search starts */
private String startSite;
/* flag marking whether the search should stop */
private boolean stopSearch = false;
/* queue of URL nodes waiting to be processed */
private Vector WaiteUrlQueue;
/* queue of nodes in which a keyword was found */
private Vector ReasultUrlQueue;
private HttpServletRequest request;
private HttpServletResponse response;
private static final String path = "/jsp/displaysearch1.jsp";
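// Progress counters below are updated by the worker threads through the
// add* methods; note that the increments themselves are not synchronized.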
public int getUrlParsed(){
return UrlParsed;
}
public int addUrlParsed(){
return UrlParsed++;
}
public int getCurrentQueueNum(){
return currentQueueNum;
}
public void addCurrentQueueNum(){
currentQueueNum++;
}
public int getSitesSearched(){
return sitesSearched;
}
public void addSitesSearched(){
sitesSearched++;
}
public int getUrlGeted() {
return UrlGeted;
}
public void addSitesFound(){
sitesFound++;
}
public int getSitesFound(){
return sitesFound;
}
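/** Forwards the accumulated result queue to the display JSP. */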
private void dispatcher() {
request.setAttribute("SearchResult", ReasultUrlQueue);
RequestDispatcher rd = request.getRequestDispatcher(path);
try {
rd.forward(request, response);
} catch (Exception e) {
e.printStackTrace();
}
}
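/** Returns the current size of the result queue. */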
private int getLen() {
synchronized (ReasultUrlQueue) {
return ReasultUrlQueue.size();
}
}
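/**
 * @param astartsite seed site to start crawling from
 * @param akeywordlist keywords to search for (stored upper-cased)
 * @param aipdomainlist allowed domain suffixes (stored upper-cased)
 * @param asitelimit maximum number of sites to visit
 * @param adepthlimit maximum crawl depth
 */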
public MulThreadSpiderMainclass(String astartsite, String[] akeywordlist,
String[] aipdomainlist, int asitelimit, int adepthlimit, HttpServletRequest request, HttpServletResponse response) {
this.request = request;
this.response = response;
WaiteUrlQueue = new Vector(100, 5);
ReasultUrlQueue = new Vector(100, 5);
startSite = fixHref(astartsite);
keywordList = new String[akeywordlist.length];
for (int i = 0; i < akeywordlist.length; i++)
keywordList[i] = akeywordlist[i].toUpperCase(); // convert all keywords to upper case
ipDomainList = new String[aipdomainlist.length];
for (int i = 0; i < aipdomainlist.length; i++)
ipDomainList[i] = aipdomainlist[i].toUpperCase(); // convert all domains to upper case
siteLimit = asitelimit; // maximum number of sites that may be visited
depthLimit = adepthlimit; // maximum crawl depth
}
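/**
 * Normalizes the start site into a proper URL, seeds the waiting queue,
 * starts the worker threads, and keeps crawling until the site limit is
 * reached and the results have been dispatched.
 */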
public void SpiderStart() {
String urllc = startSite.toLowerCase();
UrlQueueNode newNode;
if (!urllc.startsWith("http://") && !urllc.startsWith("ftp://")
&& !urllc.startsWith("www.")) {
startSite = "file:///" + startSite; // note: a file URL needs three slashes
} else if (urllc.startsWith("www.")) { // http:// missing?
startSite = "http://" + startSite; // prepend the http:// scheme
}
startSite = startSite.replace('\\', '/'); // fix backslashes in sloppy URLs
try {
URL url = new URL(startSite);
newNode = new UrlQueueNode(url);
newNode.setDepthLevel(0);
WaiteUrlQueue.add(0, newNode);
} catch (MalformedURLException ex) {
System.out.println(" Bad URL encountered : " + startSite + "\n\n");
return; // without a valid seed URL there is nothing to crawl
}
SpiderThread threads[] = new SpiderThread[thread];
for (int i = 0; i < threads.length; i++) {
threads[i] = new SpiderThread(this, ReasultUrlQueue, WaiteUrlQueue,
keywordList, ipDomainList, depthLimit, siteLimit, i);
Thread t = new Thread(threads[i]);
t.start();
}
// Drive the crawl from this thread too: keep parsing the most recently
// discovered URL until the site limit is reached.
while (UrlGeted < siteLimit) {
searchURL((UrlQueueNode) WaiteUrlQueue.get(UrlGeted - 1));
}
// Wait until enough pages have been parsed (threshold hard-coded at 80),
// then forward the results to the display JSP.
while (UrlParsed < 80) {
try {
Thread.sleep(100); // sleep briefly instead of busy-spinning
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
break;
}
}
dispatcher();
}
public void addReasult(UrlQueueNode reasult) {
ReasultUrlQueue.add(reasult);
}
public boolean urlHasBeenGeted(UrlQueueNode reslovingNode) {
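// Linear scan over the URLs collected so far; this relies on
// UrlQueueNode.equals accepting the string form produced by toString1().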
for (int i = 0; i < UrlGeted; i++) {
if (((UrlQueueNode) WaiteUrlQueue.get(i)).equals(reslovingNode
.toString1())) {
return true;
}
}
return false;
}
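/** Returns true when the node has reached the configured depth limit. */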
public boolean depthLimitExceeded(UrlQueueNode managing) {
return managing.getDepthLevel() >= depthLimit;
}
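/**
 * Fetches one page and runs it through the HTML parser, skipping
 * non-HTTP/file protocols, non-HTML extensions, and out-of-domain hosts.
 */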
public void searchURL(UrlQueueNode reslovingNode) {
if (depthLimitExceeded(reslovingNode))
return;
if (UrlGeted >= siteLimit)
return;
// Now examine the page itself.
try {
URL url = reslovingNode.getUrl(); // build the URL object for this node
String protocol = url.getProtocol(); // ask the URL for its protocol
if (!protocol.equalsIgnoreCase("http")
&& !protocol.equalsIgnoreCase("file")) {
System.out.println(" Skipping : " + reslovingNode.toString()
+ " not an http or file URL\n\n");
return;
}
String urlPath = url.getPath(); // renamed so it does not shadow the path field
int lastdot = urlPath.lastIndexOf("."); // check for a file extension
if (lastdot > 0) {
String extension = urlPath.substring(lastdot); // just the file extension
if (!extension.equalsIgnoreCase(".html")
&& !extension.equalsIgnoreCase(".htm"))
return; // skip everything but HTML files
}
if (!isDomainOk(url)) {
System.out.println(" Skipping : " + reslovingNode.toString()
+ " not in domain list\n\n");
return;
}
InputStream in = url.openStream(); // open an input stream on the URL
InputStreamReader isr = new InputStreamReader(in); // wrap the stream in a reader
MySpiderParserCallback cb = new MySpiderParserCallback(
reslovingNode); // callback that collects links and keywords
ParserDelegator pd = new ParserDelegator(); // create the delegator
pd.parse(isr, cb, true); // parse the HTML stream
isr.close(); // close the reader
} // end try
catch (MalformedURLException ex) {
System.out.println(" (1) Bad URL encountered : "
+ reslovingNode.toString() + "\n\n");
} catch (IOException e) {
System.out.println(" IOException, could not access site : "
+ e.getMessage() + "\n\n");
}
}
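/** Checks whether the URL's top-level domain appears in the allowed list. */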
private boolean isDomainOk(URL url) {
if (url.getProtocol().equals("file"))
return true; // file protocol always ok
String host = url.getHost();
int lastdot = host.lastIndexOf(".");
if (lastdot <= 0)
return true;
String domain = host.substring(lastdot); // just the .com / .edu part
if (ipDomainList.length == 0)
return true;
for (int i = 0; i < ipDomainList.length; i++) {
if (ipDomainList[i].equalsIgnoreCase("<any>"))
return true;
if (ipDomainList[i].equalsIgnoreCase(domain))
return true;
}
return false;
}
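/**
 * Normalizes an href: converts backslashes to forward slashes and appends
 * a missing trailing slash to directory-style references.
 */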
public static String fixHref(String href) {
String newhref = href.replace('\\', '/'); // fix sloppy web references
int lastdot = newhref.lastIndexOf('.');
int lastslash = newhref.lastIndexOf('/');
if (lastslash > lastdot) {
if (newhref.charAt(newhref.length() - 1) != '/')
newhref = newhref + "/"; // add on missing /
}
return newhref;
}
/**
* Inner class used to handle HTML parser callbacks
*/
class MySpiderParserCallback extends HTMLEditorKit.ParserCallback {
/** url node being parsed */
private UrlQueueNode node;
/** contents of last text element */
private String lastText = "";
/**
* Creates a new instance of MySpiderParserCallback
*
* @param Queuenode
* URL queue node that is being parsed
*/
public MySpiderParserCallback(UrlQueueNode Queuenode) {
node = Queuenode;
}
/**
* handle HTML tags that don't have a start and end tag
*
* @param t
* HTML tag
* @param a
* HTML attributes
* @param pos
* Position within file
*/
public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos) {
if (t.equals(HTML.Tag.BASE)) {
Object value = a.getAttribute(HTML.Attribute.HREF);
if (value != null)
node.setBase(fixHref(value.toString()));
}
}
/**
* take care of start tags
*
* @param t
* HTML tag
* @param a
* HTML attributes
* @param pos
* Position within file
*/
public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
if (UrlGeted < siteLimit) {
if (t.equals(HTML.Tag.A)) {
Object value = a.getAttribute(HTML.Attribute.HREF);
if (value != null) {
node.addLinks(1);
String href = value.toString();
href = fixHref(href);
try {
URL referencedURL = new URL(node.getBase(), href);
UrlQueueNode newQueueNode = new UrlQueueNode(
referencedURL);
if (urlHasBeenGeted(newQueueNode)) {
System.out.println("!!!该URL已经在页面中!!!");
return;
}
newQueueNode
.setDepthLevel(node.getDepthLevel() + 1);
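// Append the new node to the shared queue and wake any workers
// blocked waiting for URLs to process.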
synchronized (WaiteUrlQueue) {
WaiteUrlQueue.add(UrlGeted, newQueueNode);
UrlGeted++;
WaiteUrlQueue.notifyAll();
}
} catch (MalformedURLException e) {
System.out
.println(" (main2) Bad URL encountered : "
+ href + "\n\n");
return;
}
}
}
}
}
}
}
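/*
 * A minimal usage sketch (hypothetical values; assumes a servlet context and
 * the companion SpiderThread and UrlQueueNode classes from this package):
 *
 * MulThreadSpiderMainclass spider = new MulThreadSpiderMainclass(
 *         "www.example.com",           // seed site (hypothetical)
 *         new String[] { "JAVA" },     // keywords to match
 *         new String[] { "<any>" },    // allow every domain
 *         100,                         // site limit
 *         3,                           // depth limit
 *         request, response);          // current servlet objects
 * spider.SpiderStart();                // blocks until results are forwarded
 */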