📄 spidergui.java
字号:
package crawler;
import java.awt.*;
import java.net.*;
import java.io.*;
import java.util.*;
import java.lang.*;
class node{
private Object data;
private node next;
private node prev;
public node(Object o){
data = o;
prev = next = null;
}
public String toString(){
if(next!=null)
return data.toString() + " "+ next.toString();
return data.toString();
}
public node getNext(){
return next;
}
public void setNext(node n){
next = n;
}
public node getPrev(){
return prev;
}
public void setPrev(node n){
prev = n;
}
public Object getData(){
return data;
}
}
/**
 * Minimal linked list with insertion at the tail and removal at the head.
 * Used by {@code hashtable} as the per-bucket chain.
 */
class linkedlist {
    node head;
    node tail;

    /** Creates an empty list. */
    public linkedlist() {
        tail = head = null;
    }

    /**
     * Renders the elements in insertion order, separated by single spaces.
     * The walk is iterative so that a long chain cannot overflow the stack.
     *
     * @return {@code "Empty list"} when there are no elements, otherwise the
     *         space-separated elements ({@code null} elements print as "null")
     */
    public String toString() {
        if (head == null) {
            return "Empty list";
        }
        StringBuilder text = new StringBuilder();
        for (node n = head; n != null; n = n.getNext()) {
            if (n != head) {
                text.append(' ');
            }
            text.append(String.valueOf(n.getData()));
        }
        return text.toString();
    }

    /**
     * Appends an element at the tail.
     *
     * @param o the element to append
     */
    public void insert(Object o) {
        node nn = new node(o);
        if (tail == null) {
            head = tail = nn;
        } else {
            // Keep the backward link consistent with the forward one.
            nn.setPrev(tail);
            tail.setNext(nn);
            tail = nn;
        }
    }

    /**
     * Tests membership using {@code equals}.
     *
     * @param o the element to look for; {@code null} matches a stored
     *          {@code null} instead of throwing
     * @return {@code true} if an equal element is stored
     */
    public boolean contains(Object o) {
        for (node n = head; n != null; n = n.getNext()) {
            Object stored = n.getData();
            if (o == null ? stored == null : o.equals(stored)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Removes and returns the element at the head.
     *
     * @return the removed element, or {@code null} when the list is empty
     */
    public Object pop() {
        if (head == null) {
            return null;
        }
        Object ret = head.getData();
        head = head.getNext();
        if (head == null) {
            tail = null;
        } else {
            // Drop the link to the removed cell so it is no longer reachable.
            head.setPrev(null);
        }
        return ret;
    }

    /** @return {@code true} when the list holds no elements */
    public boolean isEmpty() {
        return head == null;
    }
}
/**
 * Circular doubly linked list that is addressed through its last cell:
 * {@code tail.getNext()} is the first element. A cursor ({@code ptr}) and a
 * flag ({@code stop}) support a single pass over the elements via
 * {@link #get()}; call {@link #reset()} before each pass.
 */
class list {
    protected node tail;
    protected node ptr;
    private boolean stop;

    /** Creates an empty list with the cursor unset. */
    public list() {
        tail = null;
        ptr = null;
        stop = false;
    }

    /** @return {@code true} when no element is stored */
    public boolean isEmpty() {
        return tail == null;
    }

    /** Rewinds the cursor so the next {@link #get()} yields the first element. */
    public void reset() {
        ptr = tail;
        stop = false;
    }

    /**
     * Renders the elements from first to last, separated by single spaces.
     *
     * @return {@code "Empty list"} when empty, otherwise the joined elements
     */
    public String toString() {
        if (tail == null) {
            return "Empty list";
        }
        StringBuilder joined = new StringBuilder();
        node cell = tail.getNext();
        while (cell != tail) {
            joined.append(cell.getData().toString()).append(" ");
            cell = cell.getNext();
        }
        joined.append(tail.getData().toString());
        return joined.toString();
    }

    /**
     * Advances the cursor by one cell and returns that cell's element.
     * The first time the cursor lands on the first cell its element is
     * returned; the next time it lands there {@code null} is returned to
     * signal that the pass is over.
     *
     * @return the next element, or {@code null} when the cursor is unset or
     *         the pass has wrapped around
     */
    public Object get() {
        if (ptr == null) {
            return null;
        }
        ptr = ptr.getNext();
        node first = tail.getNext();
        if (ptr != first) {
            return ptr.getData();
        }
        if (stop) {
            return null;
        }
        stop = true;
        return first.getData();
    }

    /**
     * Inserts an element between the current last and first cells.
     *
     * @param o      the element to store
     * @param attail {@code true} to make the new cell the last element,
     *               {@code false} to make it the first element
     */
    public void insert(Object o, boolean attail) {
        node fresh = new node(o);
        if (tail == null) {
            // A single cell forms a ring with itself.
            fresh.setNext(fresh);
            fresh.setPrev(fresh);
            tail = fresh;
            ptr = fresh;
            return;
        }
        // Both modes splice the cell into the same gap; they differ only in
        // whether the tail reference moves onto the new cell.
        node first = tail.getNext();
        fresh.setNext(first);
        fresh.setPrev(tail);
        tail.setNext(fresh);
        first.setPrev(fresh);
        if (attail) {
            tail = fresh;
        }
    }

    /**
     * Does nothing here; {@code stack} and {@code queue} override it to pick
     * an insertion end.
     *
     * @param o ignored
     */
    public void insert(Object o) {
    }
}
/**
 * A {@code list} whose one-argument insert places the new element first, so
 * a pass over the list yields the most recently inserted element first.
 */
class stack extends list {
    /** Creates an empty stack. */
    public stack() {
        super();
    }

    /**
     * Inserts an element at the front.
     *
     * @param o the element to store
     */
    public void insert(Object o) {
        final boolean atTail = false;
        this.insert(o, atTail);
    }
}
/**
 * A {@code list} used first-in, first-out: elements are inserted at the
 * tail and removed from the front.
 */
class queue extends list {
    /** Creates an empty queue. */
    public queue() {
        super();
    }

    /**
     * Appends an element at the tail.
     *
     * @param o the element to store
     */
    public void insert(Object o) {
        insert(o, true);
    }

    /**
     * Describes the front element without removing it.
     *
     * @return the front element's {@code toString()}, or {@code ""} when the
     *         queue is empty
     */
    public String peek() {
        if (tail == null) {
            return "";
        }
        return tail.getNext().getData().toString();
    }

    /**
     * Removes and returns the front element.
     *
     * @return the removed element, or {@code null} when the queue is empty
     */
    public Object pop() {
        if (tail == null) {
            return null;
        }
        node front = tail.getNext();
        Object ret = front.getData();
        if (front == tail) {
            // Removing the only cell empties the ring.
            tail = ptr = null;
        } else {
            // Keep the cursor off the cell that is about to disappear.
            if (front == ptr) {
                ptr = ptr.getNext();
            }
            node newFront = front.getNext();
            tail.setNext(newFront);
            // Repair the backward link too; otherwise the new front keeps
            // pointing at the removed cell and the ring is inconsistent.
            newFront.setPrev(tail);
        }
        return ret;
    }
}
/**
 * Fixed-size hash set with separate chaining: 991 buckets, each a
 * {@code linkedlist}, selected by the element's {@code hashCode()}.
 * Duplicates are not filtered on insert.
 */
class hashtable {
    private Vector table;
    private int size;

    /** Creates the table and fills every slot with an empty chain. */
    public hashtable() {
        size = 991;
        table = new Vector();
        int created = 0;
        while (created < size) {
            table.add(new linkedlist());
            created++;
        }
    }

    /**
     * Maps an element to its chain.
     *
     * @param o the element whose bucket is wanted
     * @return the chain for the element's hash code
     */
    private linkedlist bucketFor(Object o) {
        int slot = o.hashCode() % size;
        // The remainder is negative for negative hash codes; shift it back
        // into the valid index range.
        if (slot < 0) {
            slot += size;
        }
        return (linkedlist) table.get(slot);
    }

    /**
     * Adds an element to its chain.
     *
     * @param o the element to store
     */
    public void insert(Object o) {
        linkedlist chain = bucketFor(o);
        chain.insert(o);
    }

    /**
     * Tests membership.
     *
     * @param o the element to look for
     * @return {@code true} if the element's chain contains an equal element
     */
    public boolean contains(Object o) {
        return bucketFor(o).contains(o);
    }

    /**
     * Renders every non-empty chain, each preceded by a single space.
     *
     * @return the concatenated chains, or {@code ""} when nothing is stored
     */
    public String toString() {
        StringBuilder out = new StringBuilder();
        for (int slot = 0; slot < size; slot++) {
            linkedlist chain = (linkedlist) table.get(slot);
            if (chain.isEmpty()) {
                continue;
            }
            out.append(" ").append(chain.toString());
        }
        return out.toString();
    }
}
class spider implements Runnable{
// URLs waiting to be processed: the constructor queues the start URL,
// addSite appends accepted URLs and getNextSite removes from the front.
public queue todo;
// URLs for which doNextSite finished parse() without an exception.
public stack done;
// Error messages: a bad start URL or a site whose processing threw.
public stack errors;
// Messages for URLs that addSite decided to skip (foreign, non-http or
// carrying one of the bad endings below).
public stack omittions;
// Every URL addSite has already handled, so each is considered only once.
private hashtable allsites;
// String form of the queue front at the last getNextSite call; exposed
// through getCurrent.
private String last="";
// hasMore reports false once visitedsites reaches this limit.
int maxsites;
// Incremented on every getNextSite call.
int visitedsites;
// Set to 5000 by the constructor and changed by setTimer; the comment on
// setTimer calls it the milliseconds to wait per page.
// NOTE(review): the code that reads it is not visible here - confirm.
int TIMEOUT;
// addSite skips any URL whose string form does not start with this prefix.
String base;
// File endings of length 2, 3 and 4 that addSite refuses to queue
// (compared case-insensitively).
String []badEndings2 = {"ps", "gz"};
String []badEndings3 = {"pdf", "txt", "zip", "jpg", "mpg", "gif", "mov", "tut", "req", "abs", "swf", "tex", "dvi", "bin", "exe", "rpm"};
String []badEndings4 = {"jpeg", "mpeg"};
/**
 * Prepares a crawl and queues the start page.
 *
 * @param starturl the page to begin with; if it is not a valid URL the
 *                 problem is printed, recorded in {@code errors} and the
 *                 queue stays empty
 * @param max      the maximum number of sites to visit
 * @param b        the prefix a URL must start with to be followed
 */
public spider(String starturl, int max, String b){
    TIMEOUT = 5000;
    base = b;
    allsites = new hashtable();
    todo = new queue();
    done = new stack();
    errors = new stack();
    omittions = new stack();
    try{
        URL u = new URL(starturl);
        // Register the start page as already seen, in the same anchor-free
        // form addSite uses; otherwise the first link back to the start
        // page would queue it a second time.
        // Note: hashing a java.net.URL may resolve its host name.
        allsites.insert(u.getRef() != null ? stripRef(u) : u);
        todo.insert(u);
    }catch(Exception e){
        System.out.println(e);
        errors.insert("bad starting url "+starturl+", "+e.toString());
    }
    maxsites = max;
    visitedsites = 0;
}
/*
* how many millisec to wait for each page
*/
/**
 * Replaces the current TIMEOUT value.
 *
 * @param amount the new value, in milliseconds
 */
public void setTimer(int amount){
    this.TIMEOUT = amount;
}
/*
* strips the '#' anchor (fragment) off a url
*/
/**
 * Rebuilds a URL from its protocol, host, port and file, which leaves the
 * anchor out.
 *
 * @param u the URL to copy
 * @return the anchor-free copy, or {@code u} itself if the copy could not
 *         be built
 */
private URL stripRef(URL u){
    URL result;
    try {
        result = new URL(u.getProtocol(), u.getHost(), u.getPort(), u.getFile());
    } catch (Exception failure) {
        // Fall back to the original rather than losing the link.
        result = u;
    }
    return result;
}
/*
* adds a url for future processing
*/
/**
 * Considers a URL for crawling. The anchor is stripped first; a URL that
 * was seen before is ignored silently. A new URL is remembered and then
 * either queued in {@code todo} or reported in {@code omittions} when it
 * lies outside {@code base}, is not http, or ends in one of the bad
 * endings.
 *
 * @param toadd the URL found on a page; must not be {@code null}
 */
public void addSite(URL toadd){
    if (null != toadd.getRef()) {
        toadd = stripRef(toadd);
    }
    if (allsites.contains(toadd)) {
        return;
    }
    allsites.insert(toadd);

    String address = toadd.toString();
    if (!address.startsWith(base)) {
        omittions.insert("foreign URL: " + address);
        return;
    }
    if (!address.startsWith("http") && !address.startsWith("HTTP")) {
        omittions.insert("ignoring URL: " + address);
        return;
    }

    String file = toadd.getFile();
    String[][] endingsByLength = {badEndings2, badEndings3, badEndings4};
    for (int k = 0; k < endingsByLength.length; k++) {
        // Position where the dot must sit for an ending of length k + 2.
        // It is negative for short paths such as "" or "/", which used to
        // make charAt throw and abort parsing of the referring page.
        int dot = file.length() - (k + 2) - 1;
        if (dot < 0 || file.charAt(dot) != '.') {
            continue;
        }
        String ending = file.substring(dot + 1);
        String[] banned = endingsByLength[k];
        for (int i = 0; i < banned.length; i++) {
            if (ending.equalsIgnoreCase(banned[i])) {
                omittions.insert("ignoring URL: " + address);
                return;
            }
        }
        // Only the shortest ending that has a dot in place is tested.
        break;
    }
    todo.insert(toadd);
}
/*
* true if there are pending urls and the maximum hasn't been reached
*/
/**
 * Tells whether the crawl can continue.
 *
 * @return {@code true} while the queue is non-empty and fewer than
 *         {@code maxsites} sites have been taken from it
 */
public boolean hasMore(){
    if (todo.isEmpty()) {
        return false;
    }
    return visitedsites < maxsites;
}
/*
* returns the next site, works like enumeration, will return new values each time
*/
/**
 * Takes the front URL off the queue, remembers its text for
 * {@code getCurrent} and counts the visit.
 *
 * @return the dequeued URL, or {@code null} when the queue was empty
 */
private URL getNextSite(){
    // The text has to be captured before the element leaves the queue.
    this.last = todo.peek();
    final Object front = todo.pop();
    visitedsites += 1;
    return (URL) front;
}
/*
* Just to see what we are doing now...
*/
/**
 * @return the text of the URL most recently taken from the queue, or
 *         {@code ""} before the first one
 */
public String getCurrent(){
    return this.last;
}
/*
* process the next site
*/
/**
 * Dequeues one URL and parses it. A successful parse records the URL in
 * {@code done}; any exception is recorded in {@code errors} instead.
 * Does nothing further when the queue was empty.
 */
public void doNextSite(){
    final URL site = getNextSite();
    if (site != null) {
        try {
            parse(site);
            done.insert(site);
        } catch (Exception failure) {
            String report = "Bad site: " + site.toString() + ", " + failure.toString();
            errors.insert(report);
        }
    }
}
/**
 * Processes sites one after another until {@code hasMore} reports that the
 * crawl is finished.
 */
public void run(){
    while (true) {
        if (!hasMore()) {
            break;
        }
        doNextSite();
    }
}
/*
* to print out the internal data structures
*/
/**
 * @return the completed-sites report followed by the error report
 */
public String toString(){
    final String completed = getCompleted();
    final String failures = getErrors();
    return completed + failures;
}
/**
 * @return a fixed "no errors" text when nothing was recorded, otherwise
 *         the recorded errors wrapped in a header and footer
 */
private String getErrors(){
    return errors.isEmpty()
        ? "No errors "
        : "Errors: " + errors.toString() + " End of errors ";
}
/**
 * @return the contents of {@code done} wrapped in a header and footer
 */
private String getCompleted(){
    final String sites = done.toString();
    return "Completed Sites: " + sites + " End of completed sites ";
}
/*
* Parses a web page at (site) and adds all the urls it sees
*/
private void parse(URL site) throws Exception{
String source=getText(site);
String title=getTitle(source);
if(title.indexOf("404")!=-1 ||
title.indexOf("Error")!=-1 ||
title.indexOf("Not Found")!=-1){
throw new Exception (("404, Not Found: "+site));
}
int loc, beg;
boolean hasLT=false;
boolean hasSp=false;
boolean hasF=false;
boolean hasR=false;
boolean hasA=false;
boolean hasM=false;
boolean hasE=false;
for(loc=0;loc<source.length();loc++){
char c = source.charAt(loc); //source=getText(site),charAt返回source中字符的值
if(!hasLT){
hasLT = (c=='<');
}
//search for "<a " //以<a href开头
else if(hasLT && !hasA && !hasF){ //(c=='<')为true
if(c=='a' || c=='A') //<后紧跟a
hasA=true;
else if(c=='f' || c=='F')//<后紧跟f
hasF=true;
else hasLT=false;
}else if(hasLT && hasA && !hasF && !hasSp){
if(c==' ' || c=='\t' || c=='\n')
hasSp=true;
else hasLT = hasA = false;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -