⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 spidergui.java

📁 一个简单的Java爬虫
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
  
// search for "<frame " (a frame tag such as <frame src="../...">; "src" is short for "source")
			else   if(hasLT   &&   hasF   &&   !hasA   &&   !hasR){   
			    if(c=='r'   ||   c=='R')
					hasR=true;   
				else   hasLT   =   hasF   =   false;   
			}else   if(hasLT   &&   hasF   &&   hasR   &&   !hasA){   
				 if(c=='a'   ||   c=='A')
					 hasA=true;   
				 else   hasLT   =   hasF   =   hasR   =   false;   
			 }else   if(hasLT   &&   hasF   &&   hasR   &&   hasA   &&   !hasM){   
				  if(c=='m'   ||   c=='M')
					  hasM=true;   
				  else   hasLT   =   hasF   =   hasR   =   hasA   =   false;   
			  }else   if(hasLT   &&   hasF   &&   hasR   &&   hasA   &&   hasM   &&   !hasE){   
				   if(c=='e'   ||   c=='E')
					   hasE=true;   
				   else   hasLT   =   hasF   =   hasR   =   hasA   =   hasM   =   false;   
			   }else   if(hasLT   &&   hasF   &&   hasR   &&   hasA   &&   hasM   &&   hasE   &&   !hasSp){   
				   if(c==' ' || c=='\t' || c=='\n')
						hasSp=true;   
				   
					else   hasLT   =   hasF   =   hasR   =   hasA   =   hasM   =   hasE   =   false;   
				 }   
  
//found   "<frame   "   
			else   if(hasLT   &&   hasF   &&   hasR   &&   hasA   &&   hasM   &&   hasE   &&   hasSp){   
				hasLT   =   hasF   =   hasR   =   hasA   =   hasM   =   hasE   =   hasSp   =   false;   
				beg   =   loc;   
				loc   =   source.indexOf(">",   loc);   
				if(loc==-1){   
					errors.insert("malformed   frame   at   "+site.toString());   
					loc   =   beg;   
					}   
				else{   
					try{   
						parseFrame(site,   source.substring(beg,   loc));   
					}catch(Exception   e){   
						errors.insert("while   parsing   "+site.toString()+",   error   parsing   frame:   "+e.toString());   
					 }   
				}   
			}   
  
//found   "<a   "   
			else   if(hasLT   &&   hasA   &&   hasSp   &&   !hasF){   
				hasLT   =   hasA   =   hasSp   =   false;   
				beg   =   loc;   
				loc   =   source.indexOf(">",   loc);   
				if(loc==-1){   
					errors.insert("malformed   linked   at   "+site.toString());   
					loc   =   beg;   
				}   
				else{   
					try{   
						parseLink(site,   source.substring(beg,   loc));   
					}catch(Exception   e){   
						errors.insert("while   parsing   "+site.toString()+",   error   parsing   link:   "+e.toString());   
					 }   
				}   
			}   
		}   
	}   


/*   
*   parses   a   frame   
*/   
	private   void   parseFrame(URL   at_page,   String   s)   throws   Exception{   
		int   beg=s.indexOf("src");   
		if(beg==-1)
			beg=s.indexOf("SRC");   
		if(beg==-1)
			return;//doesn′t   have   a   src,   ignore   
		beg   =   s.indexOf("=",   beg);   
		if(beg==-1)
			throw   new   Exception("while   parsing   "+at_page.toString()+",   bad   frame,   missing   ′=′   after   src:   "+s);   
		int   start   =   beg;   
		for(;beg<s.length();beg++){   
			if(s.charAt(beg)=='′')
				break;   
			if(s.charAt(beg)=='"')
				break;   
		}   
		int   end=beg+1;   
		for(;end<s.length();end++){   
			if(s.charAt(beg)==s.charAt(end))
				break;   
		}   
		beg++;   
		if(beg>=end){//missing   quotes...   just   take   the   first   token   after   "src="   
			for(beg=start+1;beg<s.length() && (s.charAt(beg)==' ');beg++){}   
			for(end=beg+1;end<s.length()   &&   (s.charAt(beg)!=' ')   &&   (s.charAt(beg)!='>');end++){}   
		}   
  
		if(beg>=end){   
			errors.insert("while   parsing   "+at_page.toString()+",   bad   frame:   "+s);   
			return;   
		}   
  
		String   linkto=s.substring(beg,end);   
		if(linkto.startsWith("mailto:")||linkto.startsWith("Mailto:"))return;   
		if(linkto.startsWith("javascript:")||linkto.startsWith("javascript:"))return;   
		if(linkto.startsWith("news:")||linkto.startsWith("javascript:"))return;   
		try{   
			addSite(new   URL(at_page,   linkto));   
			return;   
		}catch(Exception   e1){}   
		try{   
				addSite(new   URL(linkto));   
				return;   
		}catch(Exception   e2){}   
		try{   
			URL   cp   =   new   URL(at_page.toString()+"/index.html");   
			System.out.println("attemping   to   use   "+cp);   
			addSite(new   URL(cp,   linkto));   
			return;   
		}catch(Exception   e3){}   
		errors.insert("while   parsing   "+at_page.toString()+",   bad   frame:   "+linkto+",   formed   from:   "+s);   
	}   
  
/*   
*   given   a   link   at   a   URL,   will   parse   it   and   add   it   to   the   list   of   sites   to   do   
*/   
	private   void   parseLink(URL   at_page,   String   s)   throws   Exception{   
//System.out.println("parsing   link   "+s);   
		int   beg=s.indexOf("href");   
		if(beg==-1)beg=s.indexOf("HREF");   
		if(beg==-1)return;//doesn′t   have   a   href,   must   be   an   anchor   
		beg   =   s.indexOf("=",   beg);   
		if(beg==-1)throw   new   Exception("while   parsing   "+at_page.toString()+",   bad   link,   missing   ′=′   after   href:   "+s);   
		int   start   =   beg;   
		for(;beg<s.length();beg++){   
			if(s.charAt(beg)=='′')break;   
			if(s.charAt(beg)=='"')break;   
		}   
		int   end=beg+1;   

		for(;end<s.length();end++){   
			if(s.charAt(beg)==s.charAt(end))break;   
		}   
		beg++;   
		if(beg>=end){//missing   quotes...   just   take   the   first   token   after   "href="   
			for(beg=start+1;beg<s.length() && (s.charAt(beg)==' ');beg++){}   
			for(end=beg+1;end<s.length()   &&   (s.charAt(beg)!=' ')   &&   (s.charAt(beg)!='>');end++){}   
		}   
  
		if(beg>=end){   
			errors.insert("while   parsing   "+at_page.toString()+",   bad   href:   "+s);   
			return;   
		}   
  
		String   linkto=s.substring(beg,end);   
		if(linkto.startsWith("mailto:")||linkto.startsWith("Mailto:"))
			return;   
		if(linkto.startsWith("javascript:")||linkto.startsWith("javascript:"))
			return;   
		if(linkto.startsWith("news:")||linkto.startsWith("javascript:"))
			return;   
  
		try{   
			addSite(new   URL(at_page,   linkto));   
			return;   
		}catch(Exception   e1){}   
		try{   
			addSite(new   URL(linkto));   
			return;   
		}catch(Exception   e2){}   
		try{   
			addSite(new   URL(new   URL(at_page.toString()+"/index.html"),   linkto));   
			return;   
		}catch(Exception   e3){}   
		errors.insert("while   parsing   "+at_page.toString()+",   bad   link:   "+linkto+",   formed   from:   "+s);   
	}   
  
/*   
*   gets   the   title   of   a   web   page   with   content   s   
*/   
	private   String   getTitle(String   s){   
		try{   
			int   beg=s.indexOf("<title>");   //出现头的位置,区分大小写
			if(beg==-1)
				beg=s.indexOf("<TITLE>");   
			int   end=s.indexOf("</title>");  //出现尾的位置 
			if(end==-1)
				end=s.indexOf("</TITLE>");   
			return   s.substring(beg,end);   //返回一个新的字符串,它是此字符串的一个子字符串
		}catch(Exception   e){
			return   "";  //an empty string
		 }   
	}   
  
/*   
*   gets   the   text   of   a   web   page,   times   out   after   10s   
*/   
	private   String   getText(URL   site)   throws   Exception {   
		urlReader   u   =   new   urlReader(site);   
		Thread   t   =   new   Thread(u);   
		t.setDaemon(true);    //Marks this thread as either a daemon thread or a user thread
		t.start();   
		t.join(TIMEOUT);    //Waits at most millis milliseconds for this thread to die. A timeout of 0 means to wait forever
		String   ret   =   u.poll();   
		if(ret==null){   
			throw   new   Exception("connection   timed   out");   
		}else   if(ret.equals("Not   html")){   
			throw   new   Exception("Not   an   HTML   document");   
		}   
		return   ret;   
	}   
  
/*   
*   returns   how   many   sites   have   been   visited   so   far   
*/   
	public   int   Visited(){
		return   visitedsites;
	}   
}   









class   urlReader   implements   Runnable{   
	URL   site;   

	String   s;   
	public   urlReader(URL   u){   
		site   =   u;   
		s=null;   
	}   
	public   void   run(){   
		try{   
			String   ret=new   String();   
			URLConnection   u   =   site.openConnection();    
			String   type   =   u.getContentType();  //Returns the value of the content-type header field
			                                         //返回指定的头字段的值
			if(type.indexOf("text")==-1   &&     
					type.indexOf("txt")==-1   &&     
					type.indexOf("HTM")==-1   &&     
					type.indexOf("htm")==-1){   
				//System.err.println("bad   content   type   "+type+"   at   site   "+site);   
				System.out.println("bad   content   type   "+type+"   at   site   "+site);   
				ret   =   "Not   html";   
				return;   
			}   
			InputStream   in   =   u.getInputStream();   
			BufferedInputStream   bufIn   =   new   BufferedInputStream(in);   
			int   data;   
			while(true){   
				data   =   bufIn.read();  //从此输入流中读取下一个数据字节。
				                         //返回一个 0 到 255 范围内的 int 字节值,如果结束返回-1 
//				System.out.println(data);
				//   Check   for   EOF   
				if   (data   ==   -1)   
					break;   
				else   ret+=   (   (char)   data);   //强制转换
				}   
			s   =   ret;   
		}catch(Exception   e){
			s=null;
		 }   
	}   
	public   String   poll(){
		return   s;
	}   
}   
  










public   class   spidergui   extends   Frame{   
  
	private   spider   s;   
	private   Color   txtColor;   
	private   Color   errColor;   
	private   Color   topColor;   
	private   Color   numColor;   
	private   Color   curColor;   
  
	public   spidergui(spider   spi,   String   title){   
		super(title);   
		curColor   =   new   Color(40,   40,   200);   
		txtColor   =   new   Color(0,   0,   0);   
		errColor   =   new   Color(255,   0,   0);   
	    topColor   =   new   Color(40,   40,   100);   
	    numColor   =   new   Color(50,   150,   50);   
	    s=spi;   
	    setBounds(0,   0,   800,   600);
	    show();   
	    toFront();   
	    repaint();   
	   }   
	public   void   endShow(){   
		System.out.println(s);   
		hide();   
		dispose();   
	}   
	public   void   paint(Graphics   g){   
		super.paint(g);   
		s.todo.reset();   
		s.done.reset();   
		s.errors.reset();   
		s.omittions.reset();   
		String   txt;   
		Object   o;   
		g.setColor(curColor);   
		g.setFont(new   Font("arial",   Font.PLAIN,   18));   
		String   cur   =   s.getCurrent();   
		if(cur.length()>80)g.drawString(   
				cur.substring(0,   40)+   
				"   .   .   .   "+   
				cur.substring(cur.length()-30,   cur.length()),   
				50,   50);   
		else   g.drawString(cur,   50,   50);   
  
		g.setColor(numColor);   
		g.setFont(new   Font("arial",   Font.BOLD,   24));   
		g.drawString(Integer.toString(s.Visited()),   350,   80);   
  
		g.setFont(new   Font("arial",   Font.PLAIN,   14));   
		g.setColor(topColor);   
		g.drawString("To   Do:",   100,   80);   

		g.drawString("Completed:",   500,   80);   
		g.drawString("Ignored:",   500,   250);   
		g.drawString("Errors:",   100,   420);   
  
		g.setColor(txtColor);   
		g.setFont(new   Font("arial",   Font.PLAIN,   12));   
		for(int   i=0;i<23   &&   (o=s.todo.get())!=null;i++){   
			txt   =   Integer.toString(i+1)   +   ":   "+o.toString();   
			if(txt.length()>65)g.drawString(   
					txt.substring(0,   38)   +   
					"   .   .   .   "   +   
					txt.substring(txt.length()-18,   txt.length()),   
					20,   100+13*i);   
			else   g.drawString(txt,   20,   100+13*i);   
		}   
		for(int   i=0;i<10   &&   (o=s.done.get())!=null;i++){   
			txt   =   Integer.toString(i+1)   +   ":   "+o.toString();   
			if(txt.length()>60)
				g.drawString(txt.substring(0,   57)+"...",   400,   100+13*i);   
			else   g.drawString(txt,   400,   100+13*i);   
		}   
		for(int   i=0;i<10   &&   (o=s.omittions.get())!=null;i++){   
			txt   =   Integer.toString(i+1)   +   ":   "+o.toString();   
			if(txt.length()>60)g.drawString(txt.substring(0,   57)+"...",   400,   270+13*i);   
			else   g.drawString(txt,   400,   270+13*i);   
		}   
		g.setColor(errColor);   
		for(int   i=0;i<10   &&   (o=s.errors.get())!=null;i++){   
			txt   =   Integer.toString(i+1)   +   ":   "+o.toString();   
			g.drawString(txt,   20,   440+13*i);   
		}   
	}   

	
	public   void   run(){   
		repaint();   
		while(s.hasMore()){   
			repaint();   
			s.doNextSite();   
		}   
		repaint();   
	}   


	public   static   void   main(String   []args){   
		int   max   =   5;   
		String   site="";   
		String   base="";   
		int   time=0;   
		System.out.println(args.length);
		for(int   i=0;i<args.length;i++){   
			if(args[i].startsWith("-max=")){   //此方法在字符串的开始处进行比较,确定它是否与此当前实例匹配
				max=Integer.parseInt(args[i].substring(5,args[i].length()));   
			}   
			else   if(args[i].startsWith("-time=")){   
				time=Integer.parseInt(args[i].substring(6,args[i].length()));   
			}   
			else   if(args[i].startsWith("-init=")){   
				site=args[i].substring(6,args[i].length());   
			}   
			else   if(args[i].startsWith("-base=")){   
				base=args[i].substring(6,args[i].length());   
			}   
			else   if(args[i].startsWith("-help")||args[i].startsWith("-?")){   
				System.out.println("additional   command   line   switches:");   
				System.out.println("-max=N   :   to   limit   to   N   sites,   default   5");   
				System.out.println("-init=URL   :   to   set   the   initial   site,   REQUIRED");   
				System.out.println("-base=URL   :   only   follow   url′s   that   start   with   this");   
				System.out.println("   default   (matches   all   URLs)");   
				System.out.println("-time=N   :   how   many   millisec   to   wait   for   each   page");   
				System.out.println("   default   5000   (5   seconds)");   
				System.exit(0);   
			}   
			else   System.err.println("unrecognized   switch:   "+args[i]+",   continuing");   
		}   
		if(site==""){   
			System.err.println("No   initial   site   parameter!");   
			System.err.println("Use   -init=<site>   switch   to   set,   or   -help   for   more   info.");   
			System.exit(1);   
		}   
  
		spider   spi=new   spider(site,   max,   base);   
  
		if(time>0)
			spi.setTimer(time);   
  
		spidergui   s   =   new   spidergui(spi,   "Spider:   "+site);   
		s.run();   
		System.out.println(spi);   
	}   
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -