// SpiderThread.java
package cs;
import java.io.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.security.*;
import java.util.*;
import javax.servlet.RequestDispatcher;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;
import vo.UrlQueueNode;
/**
 * Worker thread for the multi-threaded web spider.
 *
 * <p>Each SpiderThread pulls URLs from the shared wait queue (coordinated through
 * {@code MulThreadSpiderMainclass} counters), fetches and parses the page with the
 * Swing HTML parser, scans the text for keywords, and appends matching pages to the
 * shared result queue. Both queues are shared across threads and are guarded by
 * synchronizing on the queue object itself.
 */
public class SpiderThread implements Runnable {

    /** Shared coordinator holding the global counters (URLs fetched/parsed, sites found). */
    MulThreadSpiderMainclass ms;

    /** URLs waiting to be crawled; shared across workers, guarded by synchronizing on it. */
    private List<UrlQueueNode> WaiteUrlQueue;

    /** Pages that matched a keyword; shared across workers, guarded by synchronizing on it. */
    private List<UrlQueueNode> ReasultUrlQueue;

    /** Keywords to look for in page text. NOTE(review): handleText upper-cases the page
     *  text but not these entries — assumes the caller supplies them upper-cased; verify. */
    private String keywordList[];

    /* FIX: the three fields below were declared `static` but assigned per instance in the
     * constructor, so every new worker clobbered the shared values. They are only read
     * through instance methods, so they are now plain instance fields. */

    /** Allowed top-level domains (e.g. ".com"); the entry "&lt;any&gt;" accepts everything. */
    private String ipDomainList[];

    /** Maximum crawl depth per URL. */
    private int depthLimit;

    /** Upper bound on the number of sites to crawl. */
    private int siteLimit;

    /** Worker number, used only in log output. */
    private int number;

    /**
     * Creates a spider worker.
     *
     * @param ms             shared coordinator with the global crawl counters
     * @param ReasultUrlQueue shared queue receiving pages that matched a keyword
     * @param WaiteUrlQueue   shared queue of URLs waiting to be crawled
     * @param keywordList     keywords to search for in page text
     * @param ipDomainList    allowed domains ("&lt;any&gt;" matches all)
     * @param depthLimit      maximum crawl depth
     * @param siteLimit       maximum number of sites to crawl
     * @param number          worker number for log output
     */
    public SpiderThread(MulThreadSpiderMainclass ms, List<UrlQueueNode> ReasultUrlQueue,
            List<UrlQueueNode> WaiteUrlQueue, String keywordList[], String ipDomainList[],
            int depthLimit, int siteLimit, int number) {
        this.ReasultUrlQueue = ReasultUrlQueue;
        this.ms = ms;
        this.WaiteUrlQueue = WaiteUrlQueue;
        this.keywordList = keywordList;
        this.ipDomainList = ipDomainList;
        this.depthLimit = depthLimit;
        this.siteLimit = siteLimit;
        this.number = number;
    }

    /**
     * Main worker loop: wait for an unparsed URL to appear in the wait queue, claim it,
     * crawl it, and record it in the result queue when it matched a keyword. Exits when
     * the parsed-site count reaches {@code siteLimit} or the thread is interrupted.
     */
    public void run() {
        while (ms.getUrlParsed() < siteLimit) {
            UrlQueueNode reslovingNode;
            synchronized (WaiteUrlQueue) {
                // Block until the fetcher has queued more URLs than have been parsed.
                while (ms.getUrlParsed() == ms.getUrlGeted()) {
                    if (ms.getUrlParsed() == siteLimit) {
                        System.out.println("Thread ending");
                        return;
                    }
                    try {
                        WaiteUrlQueue.wait();
                    } catch (InterruptedException ex) {
                        // FIX: previously swallowed, which made an interrupted thread
                        // spin re-entering wait(). Restore the interrupt flag and stop.
                        Thread.currentThread().interrupt();
                        return;
                    }
                }
                System.out.println(number + ".......操作......");
                // addUrlParsed() atomically claims the next queue index for this worker.
                reslovingNode = (UrlQueueNode) WaiteUrlQueue.get(ms.addUrlParsed());
            }
            searchWeb(reslovingNode);
            synchronized (ReasultUrlQueue) {
                if (reslovingNode.isMatch()) {
                    ReasultUrlQueue.add(reslovingNode);
                }
            }
            System.out.println(reslovingNode.toString() + " " + reslovingNode.isMatch());
            System.out.println(reslovingNode.getText());
            System.out.println("有" + ms.getSitesFound() + "个站点发现了该关键字" + " " + " 查找了" + ms.getSitesSearched() + "个站点");
        }
    }

    /**
     * Crawls a single URL: skips it when already visited, too deep, over the site limit,
     * not http/file, not an html/htm resource, or outside the allowed domains; otherwise
     * opens the stream and parses it, letting the callback scan the text for keywords.
     *
     * @param reslovingNode the queue node whose URL is to be crawled
     */
    public void searchWeb(UrlQueueNode reslovingNode) {
        if (urlHasBeenVisited(reslovingNode)) { // already processed?
            System.out.println("该页面已经背查找过了!!!");
            return;
        }
        if (depthLimitExceeded(reslovingNode))
            return;
        if (ms.getSitesSearched() >= siteLimit)
            return;
        System.out.println("Searching :" + reslovingNode.toString() + " \n");
        ms.addSitesSearched();
        //
        // Now examine the resource itself.
        //
        try {
            URL url = reslovingNode.getUrl();
            String protocol = url.getProtocol();
            if (!protocol.equalsIgnoreCase("http") && !protocol.equalsIgnoreCase("file")) {
                System.out.println(" Skipping : " + reslovingNode.toString() + " not a http site\n\n");
                return;
            }
            String path = url.getPath();
            int lastdot = path.lastIndexOf("."); // check for a file extension
            if (lastdot > 0) {
                String extension = path.substring(lastdot);
                if (!extension.equalsIgnoreCase(".html") && !extension.equalsIgnoreCase(".htm"))
                    return; // skip everything but html files
            }
            if (!isDomainOk(url)) {
                System.out.println(" Skipping : " + reslovingNode.toString() + " not in domain list\n\n");
                return;
            }
            InputStream in = url.openStream();
            InputStreamReader isr = new InputStreamReader(in);
            try {
                MySpiderParserCallback cb = new MySpiderParserCallback(reslovingNode);
                ParserDelegator pd = new ParserDelegator();
                pd.parse(isr, cb, true); // parse the stream; callback scans for keywords
            } finally {
                // FIX: close in finally so a parse failure cannot leak the connection.
                isr.close();
            }
        } catch (MalformedURLException ex) {
            System.out.println(" (1) Bad URL encountered : " + reslovingNode.toString() + "\n\n");
        } catch (IOException e) {
            System.out.println(" IOException, could not access site : " + e.getMessage() + "\n\n");
        }
        return;
    }

    /**
     * Returns true when the URL's host domain suffix is acceptable: file: URLs and hosts
     * without a dot always pass; otherwise the suffix (e.g. ".com") must appear in
     * {@code ipDomainList}, or the list must contain "&lt;any&gt;" or be empty.
     */
    private boolean isDomainOk(URL url) {
        if (url.getProtocol().equals("file"))
            return true; // file protocol is always acceptable
        String host = url.getHost();
        int lastdot = host.lastIndexOf(".");
        if (lastdot <= 0)
            return true;
        String domain = host.substring(lastdot); // just the ".com"/".edu" part
        if (ipDomainList.length == 0)
            return true; // no restriction configured
        for (int i = 0; i < ipDomainList.length; i++) {
            if (ipDomainList[i].equalsIgnoreCase("<any>"))
                return true;
            if (ipDomainList[i].equalsIgnoreCase(domain))
                return true;
        }
        return false;
    }

    /**
     * Returns true when the node's depth has reached the configured depth limit.
     */
    public boolean depthLimitExceeded(UrlQueueNode managing) {
        return managing.getDepthLevel() >= depthLimit;
    }

    /**
     * Returns true when the node appears among the already-parsed prefix of the queue.
     *
     * <p>NOTE(review): this compares a UrlQueueNode against a String via
     * {@code equals(reslovingNode.toString1())} — only correct if UrlQueueNode.equals
     * accepts a String; verify against UrlQueueNode. The {@code -5} window is
     * unexplained in the original code — confirm its intent.
     */
    public boolean urlHasBeenVisited(UrlQueueNode reslovingNode) {
        for (int i = 0; i < ms.getUrlParsed() - 5; i++) {
            if (((UrlQueueNode) WaiteUrlQueue.get(i)).equals(reslovingNode.toString1())) {
                return true;
            }
        }
        return false;
    }

    /**
     * Returns true when the node is already queued anywhere in the wait queue.
     *
     * <p>NOTE(review): same node-vs-String equals caveat as urlHasBeenVisited.
     */
    public boolean urlHasBeenInsert(UrlQueueNode reslovingNode) {
        for (int i = 0; i < ms.getCurrentQueueNum(); i++) {
            if (((UrlQueueNode) WaiteUrlQueue.get(i)).equals(reslovingNode.toString1())) {
                return true;
            }
        }
        return false;
    }

    /**
     * Normalizes a sloppy href: converts backslashes to forward slashes, and appends a
     * trailing slash when the last path segment has no file extension.
     *
     * @param href the raw href text
     * @return the normalized href
     */
    public static String fixHref(String href) {
        String newhref = href.replace('\\', '/'); // fix sloppy web references
        int lastdot = newhref.lastIndexOf('.');
        int lastslash = newhref.lastIndexOf('/');
        if (lastslash > lastdot) {
            if (newhref.charAt(newhref.length() - 1) != '/')
                newhref = newhref + "/"; // add on the missing /
        }
        return newhref;
    }

    /**
     * Inner class used to handle HTML parser callbacks: records the page title and scans
     * text runs for the configured keywords.
     */
    class MySpiderParserCallback extends HTMLEditorKit.ParserCallback {
        /** URL node whose page is being parsed. */
        private UrlQueueNode node;
        /** Contents of the last text element seen (used to capture the title). */
        private String lastText = "";

        /**
         * Creates a callback bound to the queue node being parsed.
         *
         * @param Queuenode the node receiving title/match/text results
         */
        public MySpiderParserCallback(UrlQueueNode Queuenode) {
            node = Queuenode;
        }

        /**
         * Handles start tags: resets the text buffer when a TITLE tag opens so the
         * following text run is captured as the title.
         *
         * @param t   HTML tag
         * @param a   HTML attributes
         * @param pos position within the file
         */
        public void handleStartTag(HTML.Tag t,
                MutableAttributeSet a,
                int pos) {
            if (t.equals(HTML.Tag.TITLE)) {
                lastText = "";
                return;
            }
        }

        /**
         * Handles end tags: when a TITLE tag closes, stores the buffered text as the
         * node's title.
         *
         * @param t   HTML tag
         * @param pos position within the file
         */
        public void handleEndTag(HTML.Tag t,
                int pos) {
            if (t.equals(HTML.Tag.TITLE) && lastText != null) {
                node.setTitle(lastText.trim());
            }
        }

        /**
         * Handles text between tags: checks it against the keyword list and, on the
         * first match, stores an excerpt and marks the node as a match.
         *
         * <p>NOTE(review): the page text is upper-cased before indexOf but the keywords
         * are not — assumes keywords are supplied upper-cased; verify against caller.
         *
         * @param data text between tags
         * @param pos  position of the text within the web page
         */
        public void handleText(char[] data, int pos) {
            int index;
            lastText = new String(data);
            node.addChars(lastText.length());
            String text = lastText.toUpperCase();
            for (int i = 0; i < keywordList.length; i++) {
                if ((index = text.indexOf(keywordList[i])) >= 0) {
                    if (!node.isMatch()) {
                        if (lastText.length() >= 100) {
                            // Store an excerpt starting at the match, capped at
                            // 99 characters (substring end index 99 is exclusive).
                            String temp;
                            temp = lastText.substring(index);
                            if (temp.length() > 100) {
                                node.setText(temp.substring(0, 99));
                            } else {
                                node.setText(temp);
                            }
                        } else {
                            node.setText(lastText);
                        }
                        ms.addSitesFound();
                    }
                    node.setMatch(keywordList[i]);
                    return; // first matching keyword wins for this text run
                }
            }
        }
    }
}