MulThreadSpiderMainclass.java
package cs;
import java.util.*;
import java.io.*;
import java.net.*;
import javax.servlet.RequestDispatcher;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import javax.swing.text.html.parser.*;
import javax.swing.text.html.*;
import javax.swing.text.*;
import vo.UrlQueueNode;
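/**
 * Multi-threaded web spider: starting from a seed site it crawls linked
 * pages up to a configurable site count and depth, matches the configured
 * keywords, and forwards the result queue to a display JSP.
 */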
public class MulThreadSpiderMainclass {
/* number of pages already parsed */
private int UrlParsed = 0;
/* number of pages currently waiting to be parsed */
private int currentQueueNum = 1;
/* number of sites where a keyword was found */
private int sitesFound = 0;
/* number of sites already searched */
private int sitesSearched = 0;
/* number of worker threads */
int thread = 40;
/* upper limit on the number of sites to search */
private int siteLimit;
/* upper limit on the search depth */
private int depthLimit;
/* keyword list */
private String keywordList[];
/* IP / domain list */
private String ipDomainList[];
/* number of URLs collected so far (starts at 1 to count the seed URL) */
public int UrlGeted = 1;
/* site where the search starts */
private String startSite;
/* flag marking whether the search should stop */
private boolean stopSearch = false;
/* queue of URL nodes waiting to be processed */
private Vector WaiteUrlQueue;
/* queue of nodes in which a keyword was found */
private Vector ReasultUrlQueue;
private HttpServletRequest request;
private HttpServletResponse response;
private static final String path = "/jsp/displaysearch1.jsp";
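// Progress counters below are updated by the worker threads through the
// add* methods; note that the increments themselves are not synchronized.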
public int getUrlParsed(){
return UrlParsed;
}
public int addUrlParsed(){
return UrlParsed++;
}
public int getCurrentQueueNum(){
return currentQueueNum;
}
public void addCurrentQueueNum(){
currentQueueNum++;
}
public int getSitesSearched(){
return sitesSearched;
}
public void addSitesSearched(){
sitesSearched++;
}
public int getUrlGeted() {
return UrlGeted;
}
public void addSitesFound(){
sitesFound++;
}
public int getSitesFound(){
return sitesFound;
}
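/** Forwards the accumulated result queue to the display JSP. */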
private void dispatcher() {
request.setAttribute("SearchResult", ReasultUrlQueue);
RequestDispatcher rd = request.getRequestDispatcher(path);
try {
rd.forward(request, response);
} catch (Exception e) {
e.printStackTrace();
}
}
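/** Returns the current size of the result queue. */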
private int getLen() {
synchronized (ReasultUrlQueue) {
return ReasultUrlQueue.size();
}
}
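/**
 * @param astartsite seed site to start crawling from
 * @param akeywordlist keywords to search for (stored upper-cased)
 * @param aipdomainlist allowed domain suffixes (stored upper-cased)
 * @param asitelimit maximum number of sites to visit
 * @param adepthlimit maximum crawl depth
 */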
public MulThreadSpiderMainclass(String astartsite, String[] akeywordlist,
String[] aipdomainlist, int asitelimit, int adepthlimit, HttpServletRequest request, HttpServletResponse response) {
this.request = request;
this.response = response;
WaiteUrlQueue = new Vector(100, 5);
ReasultUrlQueue = new Vector(100, 5);
startSite = fixHref(astartsite);
keywordList = new String[akeywordlist.length];
for (int i = 0; i < akeywordlist.length; i++)
keywordList[i] = akeywordlist[i].toUpperCase(); // convert all keywords to upper case
ipDomainList = new String[aipdomainlist.length];
for (int i = 0; i < aipdomainlist.length; i++)
ipDomainList[i] = aipdomainlist[i].toUpperCase(); // convert all domains to upper case
siteLimit = asitelimit; // maximum number of sites that may be visited
depthLimit = adepthlimit; // maximum crawl depth
}
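/**
 * Normalizes the start site into a proper URL, seeds the waiting queue,
 * starts the worker threads, and keeps crawling until the site limit is
 * reached and the results have been dispatched.
 */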
public void SpiderStart() {
String urllc = startSite.toLowerCase();
UrlQueueNode newNode;
if (!urllc.startsWith("http://") && !urllc.startsWith("ftp://")
&& !urllc.startsWith("www.")) {
startSite = "file:///" + startSite; // note: a file URL needs three slashes
} else if (urllc.startsWith("www.")) { // http:// missing?
startSite = "http://" + startSite; // prepend the http:// scheme
}
startSite = startSite.replace('\\', '/'); // fix backslashes in sloppy URLs
try {
URL url = new URL(startSite);
newNode = new UrlQueueNode(url);
newNode.setDepthLevel(0);
WaiteUrlQueue.add(0, newNode);
} catch (MalformedURLException ex) {
System.out.println(" Bad URL encountered : " + startSite + "\n\n");
return; // without a valid seed URL there is nothing to crawl
}
SpiderThread threads[] = new SpiderThread[thread];
for (int i = 0; i < threads.length; i++) {
threads[i] = new SpiderThread(this, ReasultUrlQueue, WaiteUrlQueue,
keywordList, ipDomainList, depthLimit, siteLimit, i);
Thread t = new Thread(threads[i]);
t.start();
}
// Drive the crawl from this thread too: keep parsing the most recently
// discovered URL until the site limit is reached.
while (UrlGeted < siteLimit) {
searchURL((UrlQueueNode) WaiteUrlQueue.get(UrlGeted - 1));
}
// Wait until enough pages have been parsed (threshold hard-coded at 80),
// then forward the results to the display JSP.
while (UrlParsed < 80) {
try {
Thread.sleep(100); // sleep briefly instead of busy-spinning
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
break;
}
}
dispatcher();
}
public void addReasult(UrlQueueNode reasult) {
ReasultUrlQueue.add(reasult);
}
public boolean urlHasBeenGeted(UrlQueueNode reslovingNode) {
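// Linear scan over the URLs collected so far; this relies on
// UrlQueueNode.equals accepting the string form produced by toString1().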
for (int i = 0; i < UrlGeted; i++) {
if (((UrlQueueNode) WaiteUrlQueue.get(i)).equals(reslovingNode
.toString1())) {
return true;
}
}
return false;
}
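/** Returns true when the node has reached the configured depth limit. */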
public boolean depthLimitExceeded(UrlQueueNode managing) {
return managing.getDepthLevel() >= depthLimit;
}
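/**
 * Fetches one page and runs it through the HTML parser, skipping
 * non-HTTP/file protocols, non-HTML extensions, and out-of-domain hosts.
 */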
public void searchURL(UrlQueueNode reslovingNode) {
if (depthLimitExceeded(reslovingNode))
return;
if (UrlGeted >= siteLimit)
return;
// Now examine the page itself.
try {
URL url = reslovingNode.getUrl(); // build the URL object for this node
String protocol = url.getProtocol(); // ask the URL for its protocol
if (!protocol.equalsIgnoreCase("http")
&& !protocol.equalsIgnoreCase("file")) {
System.out.println(" Skipping : " + reslovingNode.toString()
+ " not an http or file URL\n\n");
return;
}
String urlPath = url.getPath(); // renamed so it does not shadow the path field
int lastdot = urlPath.lastIndexOf("."); // check for a file extension
if (lastdot > 0) {
String extension = urlPath.substring(lastdot); // just the file extension
if (!extension.equalsIgnoreCase(".html")
&& !extension.equalsIgnoreCase(".htm"))
return; // skip everything but HTML files
}
if (!isDomainOk(url)) {
System.out.println(" Skipping : " + reslovingNode.toString()
+ " not in domain list\n\n");
return;
}
InputStream in = url.openStream(); // open an input stream on the URL
InputStreamReader isr = new InputStreamReader(in); // wrap the stream in a reader
MySpiderParserCallback cb = new MySpiderParserCallback(
reslovingNode); // callback that collects links and keywords
ParserDelegator pd = new ParserDelegator(); // create the delegator
pd.parse(isr, cb, true); // parse the HTML stream
isr.close(); // close the reader
} // end try
catch (MalformedURLException ex) {
System.out.println(" (1) Bad URL encountered : "
+ reslovingNode.toString() + "\n\n");
} catch (IOException e) {
System.out.println(" IOException, could not access site : "
+ e.getMessage() + "\n\n");
}
}
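/** Checks whether the URL's top-level domain appears in the allowed list. */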
private boolean isDomainOk(URL url) {
if (url.getProtocol().equals("file"))
return true; // file protocol always ok
String host = url.getHost();
int lastdot = host.lastIndexOf(".");
if (lastdot <= 0)
return true;
String domain = host.substring(lastdot); // just the .com / .edu part
if (ipDomainList.length == 0)
return true;
for (int i = 0; i < ipDomainList.length; i++) {
if (ipDomainList[i].equalsIgnoreCase("<any>"))
return true;
if (ipDomainList[i].equalsIgnoreCase(domain))
return true;
}
return false;
}
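/**
 * Normalizes an href: converts backslashes to forward slashes and appends
 * a missing trailing slash to directory-style references.
 */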
public static String fixHref(String href) {
String newhref = href.replace('\\', '/'); // fix sloppy web references
int lastdot = newhref.lastIndexOf('.');
int lastslash = newhref.lastIndexOf('/');
if (lastslash > lastdot) {
if (newhref.charAt(newhref.length() - 1) != '/')
newhref = newhref + "/"; // add on missing /
}
return newhref;
}
/**
* Inner class used to handle HTML parser callbacks
*/
class MySpiderParserCallback extends HTMLEditorKit.ParserCallback {
/** url node being parsed */
private UrlQueueNode node;
/** contents of last text element */
private String lastText = "";
/**
* Creates a new instance of MySpiderParserCallback
*
* @param Queuenode
* URL queue node that is being parsed
*/
public MySpiderParserCallback(UrlQueueNode Queuenode) {
node = Queuenode;
}
/**
* handle HTML tags that don't have a start and end tag
*
* @param t
* HTML tag
* @param a
* HTML attributes
* @param pos
* Position within file
*/
public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos) {
if (t.equals(HTML.Tag.BASE)) {
Object value = a.getAttribute(HTML.Attribute.HREF);
if (value != null)
node.setBase(fixHref(value.toString()));
}
}
/**
* take care of start tags
*
* @param t
* HTML tag
* @param a
* HTML attributes
* @param pos
* Position within file
*/
public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
if (UrlGeted < siteLimit) {
if (t.equals(HTML.Tag.A)) {
Object value = a.getAttribute(HTML.Attribute.HREF);
if (value != null) {
node.addLinks(1);
String href = value.toString();
href = fixHref(href);
try {
URL referencedURL = new URL(node.getBase(), href);
UrlQueueNode newQueueNode = new UrlQueueNode(
referencedURL);
if (urlHasBeenGeted(newQueueNode)) {
System.out.println("!!!该URL已经在页面中!!!");
return;
}
newQueueNode
.setDepthLevel(node.getDepthLevel() + 1);
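// Append the new node to the shared queue and wake any workers
// blocked waiting for URLs to process.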
synchronized (WaiteUrlQueue) {
WaiteUrlQueue.add(UrlGeted, newQueueNode);
UrlGeted++;
WaiteUrlQueue.notifyAll();
}
} catch (MalformedURLException e) {
System.out
.println(" (main2) Bad URL encountered : "
+ href + "\n\n");
return;
}
}
}
}
}
}
}
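/*
 * A minimal usage sketch (hypothetical values; assumes a servlet context and
 * the companion SpiderThread and UrlQueueNode classes from this package):
 *
 * MulThreadSpiderMainclass spider = new MulThreadSpiderMainclass(
 *         "www.example.com",           // seed site (hypothetical)
 *         new String[] { "JAVA" },     // keywords to match
 *         new String[] { "<any>" },    // allow every domain
 *         100,                         // site limit
 *         3,                           // depth limit
 *         request, response);          // current servlet objects
 * spider.SpiderStart();                // blocks until results are forwarded
 */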