📄 spidergui.java

📁 一个简单的Java爬虫
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页
package crawler;


import   java.awt.*;  
import   java.net.*;   
import   java.io.*;    
import   java.util.*;
import   java.lang.*;
 

class   node{   
	private   Object   data;   
    private   node   next;   
    private   node   prev;   
    public   node(Object   o){   
    data   =   o;   
    prev   =   next   =   null;   
    }   
    public   String   toString(){   
        if(next!=null)
    	    return   data.toString()   +   "   "+   next.toString();   
        return   data.toString();   
    }   
    public   node   getNext(){
	    return   next;
	}   
    public   void   setNext(node   n){
    	next   =   n;
    }   
    public   node   getPrev(){
    	return   prev;
    }   
    public   void   setPrev(node   n){
    	prev   =   n;
    }   
    public   Object   getData(){
    	return   data;
    }   
}









/**
 * A minimal first-in/first-out chain of {@code node} cells: new items are
 * appended after {@code tail} and removed from {@code head}.
 */
class linkedlist {
    node head;
    node tail;

    /** Creates an empty chain. */
    public linkedlist() {
        head = null;
        tail = null;
    }

    /** @return the head cell's string form, or "Empty   list" when there are no cells */
    public String toString() {
        return (head != null) ? head.toString() : "Empty   list";
    }

    /**
     * Appends an item at the tail end.
     *
     * @param o the item to store
     */
    public void insert(Object o) {
        node fresh = new node(o);
        if (tail != null) {
            tail.setNext(fresh);
            tail = fresh;
            return;
        }
        head = fresh;
        tail = fresh;
    }

    /**
     * Linear search using {@code o.equals(item)}.
     *
     * @param o the value to look for
     * @return {@code true} if some stored item is equal to {@code o}
     */
    public boolean contains(Object o) {
        node cursor = head;
        while (cursor != null) {
            if (o.equals(cursor.getData())) {
                return true;
            }
            cursor = cursor.getNext();
        }
        return false;
    }

    /**
     * Removes and returns the item at the head.
     *
     * @return the removed item, or {@code null} when the chain is empty
     */
    public Object pop() {
        if (head == null) {
            return null;
        }
        node first = head;
        head = first.getNext();
        if (head == null) {
            // the last cell was removed, so the tail reference must be dropped too
            tail = null;
        }
        return first.getData();
    }

    /** @return {@code true} when no cell is stored */
    public boolean isEmpty() {
        return head == null;
    }
}
  
  






/**
 * Circular doubly linked ring of {@code node} cells addressed through its
 * last cell ({@code tail}); {@code tail.getNext()} is therefore the first
 * cell. A cursor ({@code ptr}) plus a flag ({@code stop}) support one pass
 * over the ring via {@link #reset()} and {@link #get()}.
 */
class list {
    protected node tail;
    protected node ptr;
    private boolean stop;

    /** Creates an empty ring. */
    public list() {
        tail = null;
        ptr = null;
        stop = false;
    }

    /** @return {@code true} when the ring holds no cell */
    public boolean isEmpty() {
        return tail == null;
    }

    /** Rewinds the cursor so that the next {@link #get()} yields the first cell again. */
    public void reset() {
        ptr = tail;
        stop = false;
    }

    /** @return all items from first to last separated by three spaces, or "Empty   list" */
    public String toString() {
        if (tail == null) {
            return "Empty   list";
        }
        StringBuilder out = new StringBuilder();
        node cur = tail.getNext();
        while (cur != tail) {
            out.append(cur.getData().toString()).append("   ");
            cur = cur.getNext();
        }
        out.append(tail.getData().toString());
        return out.toString();
    }

    /**
     * Advances the cursor by one cell and returns that cell's item.
     *
     * The first time the cursor lands on the first cell its item is returned
     * and {@code stop} is set; the next time it lands there {@code null} is
     * returned instead.
     *
     * @return the next item, or {@code null} when the cursor is unset or the
     *         pass has wrapped around
     */
    public Object get() {
        if (ptr == null) {
            return null;
        }
        ptr = ptr.getNext();
        node first = tail.getNext();
        if (ptr != first) {
            return ptr.getData();
        }
        if (stop) {
            return null;
        }
        stop = true;
        return first.getData();
    }

    /**
     * Splices a new cell between the last and the first cell of the ring.
     *
     * @param o      the item to store
     * @param attail {@code true} to make the new cell the last cell,
     *               {@code false} to leave {@code tail} alone so the new cell
     *               becomes the first cell
     */
    public void insert(Object o, boolean attail) {
        node fresh = new node(o);
        if (tail == null) {
            // a single cell forms a ring with itself
            fresh.setNext(fresh);
            fresh.setPrev(fresh);
            tail = fresh;
            ptr = fresh;
            return;
        }
        node first = tail.getNext();
        fresh.setNext(first);
        fresh.setPrev(tail);
        first.setPrev(fresh);
        tail.setNext(fresh);
        if (attail) {
            tail = fresh;
        }
    }

    /**
     * Does nothing in this class; {@code stack} and {@code queue} override it.
     *
     * @param o ignored here
     */
    public void insert(Object o) {
    }

}








/**
 * A {@code list} whose single-argument insert places each new item at the
 * front of the ring.
 */
class stack extends list {
    /** Creates an empty stack. */
    public stack() {
        super();
    }

    /**
     * Adds an item as the first cell of the ring.
     *
     * @param o the item to store
     */
    public void insert(Object o) {
        this.insert(o, false);
    }
}



/**
 * A {@code list} used first-in/first-out: items are appended as the last
 * cell of the ring and removed from the first cell.
 */
class queue extends list {
    /** Creates an empty queue. */
    public queue() {
        super();
    }

    /**
     * Appends an item as the last cell of the ring.
     *
     * @param o the item to store
     */
    public void insert(Object o) {
        insert(o, true);
    }

    /**
     * Looks at the first item without removing it.
     *
     * @return the string form of the first item, or "" when the queue is empty
     */
    public String peek() {
        if (tail == null) {
            return "";
        }
        return tail.getNext().getData().toString();
    }

    /**
     * Removes and returns the first item.
     *
     * If the iteration cursor was resting on the removed cell it is moved to
     * the following cell. The new first cell's {@code prev} link is pointed
     * back at {@code tail} so the ring stays consistent in both directions
     * and the removed cell is no longer referenced by it.
     *
     * @return the removed item, or {@code null} when the queue is empty
     */
    public Object pop() {
        if (tail == null) {
            return null;
        }
        node first = tail.getNext();
        Object ret = first.getData();
        if (first == tail) {
            // the only cell is being removed
            tail = ptr = null;
        } else {
            node newFirst = first.getNext();
            if (ptr == first) {
                ptr = newFirst;
            }
            tail.setNext(newFirst);
            newFirst.setPrev(tail);
        }
        return ret;
    }
}
  
  






/**
 * A fixed-size chained hash set: 991 buckets, each a {@code linkedlist},
 * selected by the item's {@code hashCode()} modulo the bucket count.
 */
class hashtable {
    private Vector table;
    private int size;

    /** Creates the table and fills every slot with an empty bucket. */
    public hashtable() {
        size = 991;
        table = new Vector();
        int made = 0;
        while (made < size) {
            table.add(new linkedlist());
            made++;
        }
    }

    /** Returns the bucket that {@code o} hashes to. */
    private linkedlist bucketFor(Object o) {
        int slot = o.hashCode() % size;
        if (slot < 0) {
            // % keeps the sign of the hash code, so shift negatives into range
            slot += size;
        }
        return (linkedlist) table.get(slot);
    }

    /**
     * Stores an item in its bucket; no duplicate check is made.
     *
     * @param o the item to store
     */
    public void insert(Object o) {
        bucketFor(o).insert(o);
    }

    /**
     * @param o the value to look for
     * @return {@code true} if an equal item is stored in {@code o}'s bucket
     */
    public boolean contains(Object o) {
        return bucketFor(o).contains(o);
    }

    /** @return the string form of every non-empty bucket, each preceded by three spaces */
    public String toString() {
        StringBuilder out = new StringBuilder();
        for (int i = 0; i < size; i++) {
            linkedlist bucket = (linkedlist) table.get(i);
            if (bucket.isEmpty()) {
                continue;
            }
            out.append("   ").append(bucket.toString());
        }
        return out.toString();
    }
}





class   spider   implements   Runnable{   
	// URLs waiting to be processed; addSite appends to it, getNextSite pops from it.
	public   queue   todo;   
	// URLs that doNextSite parsed without an exception.
	public   stack   done;   
	// Human-readable messages for a bad start URL and for sites whose parsing threw.
	public   stack   errors;   
	// Human-readable messages for URLs that addSite decided not to queue.
	public   stack   omittions;   
	// Every URL addSite has seen, used to avoid handling the same URL twice.
	private   hashtable   allsites;   
	// String form of the entry most recently taken from todo (see getNextSite / getCurrent).
	private   String   last="";   
	// Upper bound on visitedsites checked by hasMore.
	int   maxsites;   
	// Incremented by getNextSite on every call.
	int   visitedsites;   
	// Set to 5000 by the constructor and changed via setTimer, whose comment calls it the
	// milliseconds to wait for each page; where it is consumed is not visible in this part of the file.
	int   TIMEOUT;   
	// Prefix a URL's string form must start with, otherwise addSite records it as foreign.
	String   base;   
	// Two-character file endings that addSite refuses to queue.
	String   []badEndings2   =   {"ps",   "gz"};   
	// Three-character file endings that addSite refuses to queue.
	String   []badEndings3   =   {"pdf",   "txt",   "zip",   "jpg",   "mpg",   "gif",   "mov",   "tut",   "req",   "abs",   "swf",   "tex",   "dvi",   "bin",   "exe",   "rpm"};   
	// Four-character file endings that addSite refuses to queue.
	String   []badEndings4   =   {"jpeg",   "mpeg"};   
  
	/**
	 * Sets up an idle crawler with the start URL queued.
	 *
	 * If {@code starturl} cannot be turned into a URL the exception is printed
	 * to standard output and a message is pushed onto {@code errors}; the
	 * crawler is still constructed, with an empty todo queue.
	 *
	 * NOTE(review): the start URL is queued directly and is not recorded in
	 * {@code allsites}, so addSite will not recognise it as already seen.
	 *
	 * @param starturl the first URL to visit
	 * @param max      the maximum number of sites to visit
	 * @param b        the prefix every followed URL must start with
	 */
	public spider(String starturl, int max, String b){
		TIMEOUT = 5000;
		base = b;
		maxsites = max;
		visitedsites = 0;
		allsites = new hashtable();
		todo = new queue();
		done = new stack();
		errors = new stack();
		omittions = new stack();
		try {
			URL start = new URL(starturl);
			todo.insert(start);
		} catch (Exception e) {
			System.out.println(e);
			errors.insert("bad   starting   url   " + starturl + ",   " + e.toString());
		}
	}
  
/*
 * Sets how many milliseconds to wait for each page.
 */
	public void setTimer(int amount){
		this.TIMEOUT = amount;
	}
  
/*
 * Rebuilds a URL from its protocol, host, port and file so that the '#'
 * anchor is dropped. If the rebuilt URL cannot be created, the original
 * URL is returned unchanged.
 */
	private URL stripRef(URL u){
		URL stripped;
		try {
			stripped = new URL(u.getProtocol(), u.getHost(), u.getPort(), u.getFile());
		} catch (Exception e) {
			stripped = u;
		}
		return stripped;
	}
  
/*
 * Adds a URL for future processing.
 *
 * The '#' anchor is stripped first. A URL that was already seen is ignored.
 * Otherwise it is remembered in allsites and then either queued in todo or,
 * if it does not start with base, does not start with "http"/"HTTP", or
 * ends in one of the bad file endings, recorded in omittions instead.
 */
	public void addSite(URL toadd){
		if (toadd.getRef() != null) {
			toadd = stripRef(toadd);
		}
		if (allsites.contains(toadd)) {
			return;
		}
		allsites.insert(toadd);

		String text = toadd.toString();
		if (!text.startsWith(base)) {
			omittions.insert("foreign   URL:   " + text);
			return;
		}
		if (!text.startsWith("http") && !text.startsWith("HTTP")) {
			omittions.insert("ignoring   URL:   " + text);
			return;
		}
		if (hasBadEnding(toadd.getFile())) {
			omittions.insert("ignoring   URL:   " + text);
			return;
		}

		todo.insert(toadd);
	}

/*
 * True if the file part ends in '.' followed by one of the bad endings.
 *
 * Ending lengths 2, 3 and 4 are tried in that order and only the first
 * length that has a '.' right in front of it is compared. Each position is
 * bounds-checked, so a short file part such as "" or "/" is simply reported
 * as not bad instead of raising StringIndexOutOfBoundsException.
 */
	private boolean hasBadEnding(String file){
		String[][] endingsByLength = { badEndings2, badEndings3, badEndings4 };
		int len = file.length();
		for (int k = 0; k < endingsByLength.length; k++) {
			int dot = len - (k + 2) - 1;
			if (dot < 0 || file.charAt(dot) != '.') {
				continue;
			}
			String ending = file.substring(dot + 1);
			String[] candidates = endingsByLength[k];
			for (int i = 0; i < candidates.length; i++) {
				if (ending.equalsIgnoreCase(candidates[i])) {
					return true;
				}
			}
			return false;
		}
		return false;
	}
  
/*
 * True if there are pending URLs and the maximum hasn't been reached.
 */
	public boolean hasMore(){
		return !(todo.isEmpty() || visitedsites >= maxsites);
	}
  
/*
 * Takes the next site off the todo queue, like an enumeration: each call
 * yields a new value. Also remembers the string form of that entry in
 * last and counts the call in visitedsites.
 */
	private URL getNextSite(){
		last = todo.peek();
		visitedsites++;
		Object next = todo.pop();
		return (URL) next;
	}
  
/*
 * Reports what is being worked on: the string form of the entry most
 * recently taken from the todo queue.
 */
	public String getCurrent(){
		return this.last;
	}
  
/*
 * Processes the next site: parses it and records it in done, or records a
 * message in errors if anything throws. Does nothing when no site is left.
 */
	public void doNextSite(){
		final URL current = getNextSite();
		if (current != null) {
			try {
				//System.err.println("Processing   #"+visitedsites+":   "+current);
				parse(current);
				done.insert(current);
			} catch (Exception e) {
				errors.insert("Bad   site:   " + current.toString() + ",   " + e.toString());
			}
		}
	}
  
	/** Keeps processing sites until hasMore() reports false. */
	public void run(){
		while (hasMore()) {
			doNextSite();
		}
	}
  
/*
 * Prints out the internal data structures: the completed sites followed by
 * the errors.
 */
	public String toString(){
		String completed = getCompleted();
		String failures = getErrors();
		return completed + failures;
	}
	
	/** Returns the contents of errors wrapped in a header and footer, or a "no errors" note. */
	private String getErrors(){
		return errors.isEmpty()
				? "No   errors   "
				: "Errors:   " + errors.toString() + "   End   of   errors   ";
	}
	
	/** Returns the contents of done wrapped in a header and footer. */
	private String getCompleted(){
		StringBuilder report = new StringBuilder("Completed   Sites:   ");
		report.append(done.toString());
		report.append("   End   of   completed   sites   ");
		return report.toString();
	}
  
/*   
*   Parses   a   web   page   at   (site)   and   adds   all   the   urls   it   sees   
*/   
	private   void   parse(URL   site)   throws   Exception{   
		String   source=getText(site);   
		String   title=getTitle(source);   
		if(title.indexOf("404")!=-1   ||   
				title.indexOf("Error")!=-1   ||   
				title.indexOf("Not   Found")!=-1){   
			throw   new   Exception   (("404,   Not   Found:   "+site));   
		}   
		int   loc,   beg;   
		boolean   hasLT=false;   
		boolean   hasSp=false;   
		boolean   hasF=false;   
		boolean   hasR=false;   
		boolean   hasA=false;   
		boolean   hasM=false;   
		boolean   hasE=false;   
		for(loc=0;loc<source.length();loc++){   
			char   c   =   source.charAt(loc);   //source=getText(site),charAt返回source中字符的值
			if(!hasLT){   
				hasLT   =   (c=='<');   
			}   
//search   for   "<a   "      //以<a href开头
			else   if(hasLT   &&   !hasA   &&   !hasF){   //(c=='<')为true
				if(c=='a'   ||   c=='A')   //<后紧跟a
					hasA=true;   
				else   if(c=='f'   ||   c=='F')//<后紧跟f
					hasF=true;   
				else   hasLT=false;   
			}else   if(hasLT   &&   hasA   &&   !hasF   &&   !hasSp){   
				if(c==' ' || c=='\t' || c=='\n')
					hasSp=true;   
				else   hasLT   =   hasA   =   false;   
			}
12 下一页
💿 文件大小 21 K
👤 上传用户 zdh103
📂 所属分类 Java编程
🏷️ 相关标签

#Java
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -