📄 spidergui.java
字号:
//search for "<frame " //frame帧,<frame src="../ //src是source的缩写
else if(hasLT && hasF && !hasA && !hasR){
if(c=='r' || c=='R')
hasR=true;
else hasLT = hasF = false;
}else if(hasLT && hasF && hasR && !hasA){
if(c=='a' || c=='A')
hasA=true;
else hasLT = hasF = hasR = false;
}else if(hasLT && hasF && hasR && hasA && !hasM){
if(c=='m' || c=='M')
hasM=true;
else hasLT = hasF = hasR = hasA = false;
}else if(hasLT && hasF && hasR && hasA && hasM && !hasE){
if(c=='e' || c=='E')
hasE=true;
else hasLT = hasF = hasR = hasA = hasM = false;
}else if(hasLT && hasF && hasR && hasA && hasM && hasE && !hasSp){
if(c==' ' || c=='\t' || c=='\n')
hasSp=true;
else hasLT = hasF = hasR = hasA = hasM = hasE = false;
}
//found "<frame "
else if(hasLT && hasF && hasR && hasA && hasM && hasE && hasSp){
hasLT = hasF = hasR = hasA = hasM = hasE = hasSp = false;
beg = loc;
loc = source.indexOf(">", loc);
if(loc==-1){
errors.insert("malformed frame at "+site.toString());
loc = beg;
}
else{
try{
parseFrame(site, source.substring(beg, loc));
}catch(Exception e){
errors.insert("while parsing "+site.toString()+", error parsing frame: "+e.toString());
}
}
}
//found "<a "
else if(hasLT && hasA && hasSp && !hasF){
hasLT = hasA = hasSp = false;
beg = loc;
loc = source.indexOf(">", loc);
if(loc==-1){
errors.insert("malformed linked at "+site.toString());
loc = beg;
}
else{
try{
parseLink(site, source.substring(beg, loc));
}catch(Exception e){
errors.insert("while parsing "+site.toString()+", error parsing link: "+e.toString());
}
}
}
}
}
/**
 * Parses the text of a frame tag and queues the document named by its
 * {@code src} attribute.
 *
 * <p>The attribute name is matched in any letter case. Its value is read
 * directly after the {@code '='}: either a single- or double-quoted string
 * (an unterminated quote takes the rest of the tag) or a bare token that ends
 * at the first whitespace or {@code '>'}. Values using the {@code mailto:},
 * {@code javascript:} or {@code news:} schemes are ignored. The value is then
 * resolved against {@code at_page}, as an absolute URL, and finally against
 * {@code at_page + "/index.html"}; the first form that {@code addSite}
 * accepts wins. If none works, an entry is added to {@code errors}.
 *
 * @param at_page page on which the tag was found; base for relative links
 * @param s       text of the tag to examine
 * @throws Exception if a {@code src} attribute is present but no {@code '='}
 *                   follows it
 */
private void parseFrame(URL at_page, String s) throws Exception{
    // Locate the attribute name regardless of case ("src", "SRC", "Src", ...).
    int attr = -1;
    for(int i = 0; i + 3 <= s.length(); i++){
        if(s.regionMatches(true, i, "src", 0, 3)){
            attr = i;
            break;
        }
    }
    if(attr == -1){
        return; // doesn't have a src, ignore
    }
    int eq = s.indexOf('=', attr);
    if(eq == -1){
        throw new Exception("while parsing "+at_page.toString()+", bad frame, missing '=' after src: "+s);
    }

    // The value starts at the first non-whitespace character after '='.
    int beg = eq + 1;
    while(beg < s.length() && Character.isWhitespace(s.charAt(beg))){
        beg++;
    }
    int end;
    if(beg < s.length() && (s.charAt(beg) == '"' || s.charAt(beg) == '\'')){
        // Quoted value: runs up to the matching quote, or to the end of the
        // tag when the closing quote is missing.
        char quote = s.charAt(beg);
        beg++;
        end = s.indexOf(quote, beg);
        if(end == -1){
            end = s.length();
        }
    }else{
        // Missing quotes: take the first token after "src=".
        end = beg;
        while(end < s.length()
                && !Character.isWhitespace(s.charAt(end))
                && s.charAt(end) != '>'){
            end++;
        }
    }
    if(beg >= end){
        errors.insert("while parsing "+at_page.toString()+", bad frame: "+s);
        return;
    }
    String linkto = s.substring(beg, end);

    // Schemes that never lead to a crawlable page; compared case-insensitively.
    String[] skippedSchemes = {"mailto:", "javascript:", "news:"};
    for(int i = 0; i < skippedSchemes.length; i++){
        String scheme = skippedSchemes[i];
        if(linkto.regionMatches(true, 0, scheme, 0, scheme.length())){
            return;
        }
    }

    // Each attempt is best-effort: a failure simply falls through to the next
    // interpretation, and only the final failure is reported.
    try{
        addSite(new URL(at_page, linkto));
        return;
    }catch(Exception ignored){
        // not resolvable relative to the page; try as an absolute URL
    }
    try{
        addSite(new URL(linkto));
        return;
    }catch(Exception ignored){
        // not an absolute URL either; try treating the page as a directory
    }
    try{
        URL cp = new URL(at_page.toString()+"/index.html");
        System.out.println("attemping to use "+cp);
        addSite(new URL(cp, linkto));
        return;
    }catch(Exception ignored){
        // every interpretation failed; reported below
    }
    errors.insert("while parsing "+at_page.toString()+", bad frame: "+linkto+", formed from: "+s);
}
/**
 * Parses the text of an anchor tag and queues the document named by its
 * {@code href} attribute.
 *
 * <p>The attribute name is matched in any letter case; a tag without one is
 * treated as a plain anchor and ignored. The value is read directly after the
 * {@code '='}: either a single- or double-quoted string (an unterminated
 * quote takes the rest of the tag) or a bare token that ends at the first
 * whitespace or {@code '>'}. Values using the {@code mailto:},
 * {@code javascript:} or {@code news:} schemes are ignored. The value is then
 * resolved against {@code at_page}, as an absolute URL, and finally against
 * {@code at_page + "/index.html"}; the first form that {@code addSite}
 * accepts wins. If none works, an entry is added to {@code errors}.
 *
 * @param at_page page on which the tag was found; base for relative links
 * @param s       text of the tag to examine
 * @throws Exception if an {@code href} attribute is present but no
 *                   {@code '='} follows it
 */
private void parseLink(URL at_page, String s) throws Exception{
    // Locate the attribute name regardless of case ("href", "HREF", "Href", ...).
    int attr = -1;
    for(int i = 0; i + 4 <= s.length(); i++){
        if(s.regionMatches(true, i, "href", 0, 4)){
            attr = i;
            break;
        }
    }
    if(attr == -1){
        return; // doesn't have a href, must be an anchor
    }
    int eq = s.indexOf('=', attr);
    if(eq == -1){
        throw new Exception("while parsing "+at_page.toString()+", bad link, missing '=' after href: "+s);
    }

    // The value starts at the first non-whitespace character after '='.
    int beg = eq + 1;
    while(beg < s.length() && Character.isWhitespace(s.charAt(beg))){
        beg++;
    }
    int end;
    if(beg < s.length() && (s.charAt(beg) == '"' || s.charAt(beg) == '\'')){
        // Quoted value: runs up to the matching quote, or to the end of the
        // tag when the closing quote is missing.
        char quote = s.charAt(beg);
        beg++;
        end = s.indexOf(quote, beg);
        if(end == -1){
            end = s.length();
        }
    }else{
        // Missing quotes: take the first token after "href=".
        end = beg;
        while(end < s.length()
                && !Character.isWhitespace(s.charAt(end))
                && s.charAt(end) != '>'){
            end++;
        }
    }
    if(beg >= end){
        errors.insert("while parsing "+at_page.toString()+", bad href: "+s);
        return;
    }
    String linkto = s.substring(beg, end);

    // Schemes that never lead to a crawlable page; compared case-insensitively.
    String[] skippedSchemes = {"mailto:", "javascript:", "news:"};
    for(int i = 0; i < skippedSchemes.length; i++){
        String scheme = skippedSchemes[i];
        if(linkto.regionMatches(true, 0, scheme, 0, scheme.length())){
            return;
        }
    }

    // Each attempt is best-effort: a failure simply falls through to the next
    // interpretation, and only the final failure is reported.
    try{
        addSite(new URL(at_page, linkto));
        return;
    }catch(Exception ignored){
        // not resolvable relative to the page; try as an absolute URL
    }
    try{
        addSite(new URL(linkto));
        return;
    }catch(Exception ignored){
        // not an absolute URL either; try treating the page as a directory
    }
    try{
        addSite(new URL(new URL(at_page.toString()+"/index.html"), linkto));
        return;
    }catch(Exception ignored){
        // every interpretation failed; reported below
    }
    errors.insert("while parsing "+at_page.toString()+", bad link: "+linkto+", formed from: "+s);
}
/**
 * Extracts the title of a web page.
 *
 * <p>Returns the text between the first {@code <title>} tag and the next
 * {@code </title>} tag that follows it. Both tags are matched in any letter
 * case, and the tags themselves are not part of the result.
 *
 * @param s content of the page
 * @return the title text, or an empty string when {@code s} is null or does
 *         not contain a complete title element
 */
private String getTitle(String s){
    if(s == null){
        return "";
    }
    final String openTag = "<title>";
    int open = findIgnoreCase(s, openTag, 0);
    if(open == -1){
        return "";
    }
    // Skip past the opening tag so that it is not returned as part of the title.
    int textStart = open + openTag.length();
    int close = findIgnoreCase(s, "</title>", textStart);
    if(close == -1){
        return "";
    }
    return s.substring(textStart, close);
}

/**
 * Case-insensitive variant of {@link String#indexOf(String, int)}.
 *
 * @return index of the first occurrence of {@code needle} in {@code text} at
 *         or after {@code from}, or -1 when there is none
 */
private static int findIgnoreCase(String text, String needle, int from){
    int last = text.length() - needle.length();
    for(int i = Math.max(from, 0); i <= last; i++){
        if(text.regionMatches(true, i, needle, 0, needle.length())){
            return i;
        }
    }
    return -1;
}
/**
 * Downloads the text of a page on a background thread and waits for it.
 *
 * <p>A {@code urlReader} is started on a daemon thread and this method waits
 * at most {@code TIMEOUT} milliseconds for that thread to finish (a timeout of
 * 0 would wait forever) before collecting whatever result the reader has.
 *
 * @param site address of the page to download
 * @return the text the reader produced
 * @throws Exception with message "connection timed out" when the reader has
 *                   no result, or "Not an HTML document" when the reader
 *                   reported the marker value "Not html"; also if the wait
 *                   itself is interrupted
 */
private String getText(URL site) throws Exception {
    urlReader reader = new urlReader(site);
    Thread worker = new Thread(reader);
    // Daemon, so a download that never completes cannot keep the JVM alive.
    worker.setDaemon(true);
    worker.start();
    worker.join(TIMEOUT);

    String page = reader.poll();
    if(page == null){
        throw new Exception("connection timed out");
    }
    if(page.equals("Not html")){
        throw new Exception("Not an HTML document");
    }
    return page;
}
/**
 * Reports how many sites have been visited so far.
 *
 * @return the current value of the {@code visitedsites} counter
 */
public int Visited(){
    final int visitedSoFar = visitedsites;
    return visitedSoFar;
}
}
/**
 * Downloads the content of a single URL so that it can be run on its own
 * thread; the outcome is collected afterwards with {@link #poll()}.
 *
 * <p>The outcome is one of: the downloaded text, the marker string
 * {@code "Not html"} when the reported content type is not textual, or
 * {@code null} when nothing has been produced (not run yet, still running,
 * unknown content type, or the download failed).
 */
class urlReader implements Runnable{
    /** Marker reported by {@link #poll()} for a non-textual content type. */
    private static final String NOT_HTML = "Not html";

    URL site;
    // volatile: written by the thread executing run() and read by whichever
    // thread calls poll(), possibly while run() is still in progress.
    volatile String s;

    /**
     * @param u address of the document to download
     */
    public urlReader(URL u){
        site = u;
        s=null;
    }

    /**
     * Opens the connection, checks the content type and, if it looks textual,
     * reads the whole body. Every byte is widened to one char, i.e. no
     * character-set decoding is performed.
     */
    public void run(){
        InputStream in = null;
        try{
            URLConnection u = site.openConnection();
            // Value of the content-type header field; null when it is not known.
            String type = u.getContentType();
            if(type == null){
                s = null;
                return;
            }
            if(type.indexOf("text")==-1 &&
                    type.indexOf("txt")==-1 &&
                    type.indexOf("HTM")==-1 &&
                    type.indexOf("htm")==-1){
                System.out.println("bad content type "+type+" at site "+site);
                // Publish the marker so the caller can tell "not a page" apart
                // from "no result".
                s = NOT_HTML;
                return;
            }
            in = new BufferedInputStream(u.getInputStream());
            StringBuilder text = new StringBuilder();
            int data;
            // read() returns a byte value in 0..255, or -1 at end of stream
            while((data = in.read()) != -1){
                text.append((char) data);
            }
            s = text.toString();
        }catch(Exception e){
            s=null;
        }finally{
            if(in != null){
                try{
                    in.close();
                }catch(Exception ignored){
                    // nothing useful can be done if closing fails
                }
            }
        }
    }

    /**
     * @return the downloaded text, {@code "Not html"} for a non-textual
     *         content type, or {@code null} when there is no result
     */
    public String poll(){
        return s;
    }
}
/**
 * AWT window that displays the progress of a {@code spider} crawl and drives
 * it from {@link #run()}.
 *
 * <p>Each repaint draws the spider's current site, the visited-site counter,
 * and the leading entries of the spider's {@code todo}, {@code done},
 * {@code omittions} and {@code errors} lists.
 */
public class spidergui extends Frame{
    // Fonts are immutable, so they are created once instead of on every paint.
    private static final Font CURRENT_FONT = new Font("arial", Font.PLAIN, 18);
    private static final Font COUNT_FONT = new Font("arial", Font.BOLD, 24);
    private static final Font HEADING_FONT = new Font("arial", Font.PLAIN, 14);
    private static final Font LIST_FONT = new Font("arial", Font.PLAIN, 12);

    private final spider s;
    private final Color txtColor;
    private final Color errColor;
    private final Color topColor;
    private final Color numColor;
    private final Color curColor;

    /**
     * Creates the window, sizes it to 800x600 and shows it immediately.
     *
     * @param spi   spider whose state is displayed
     * @param title window title
     */
    public spidergui(spider spi, String title){
        super(title);
        curColor = new Color(40, 40, 200);
        txtColor = new Color(0, 0, 0);
        errColor = new Color(255, 0, 0);
        topColor = new Color(40, 40, 100);
        numColor = new Color(50, 150, 50);
        s=spi;
        setBounds(0, 0, 800, 600);
        setVisible(true);
        toFront();
        repaint();
    }

    /** Prints the spider to standard output, then hides and disposes the window. */
    public void endShow(){
        System.out.println(s);
        setVisible(false);
        dispose();
    }

    /**
     * Draws the current site, the visited counter and the four lists.
     * Long entries are abbreviated so that they stay inside their column.
     */
    public void paint(Graphics g){
        super.paint(g);
        // NOTE(review): reset() presumably rewinds each list so that get()
        // starts from its first entry - confirm against the list class.
        s.todo.reset();
        s.done.reset();
        s.errors.reset();
        s.omittions.reset();
        String txt;
        Object o;

        g.setColor(curColor);
        g.setFont(CURRENT_FONT);
        g.drawString(elideMiddle(s.getCurrent(), 80, 40, 30), 50, 50);

        g.setColor(numColor);
        g.setFont(COUNT_FONT);
        g.drawString(Integer.toString(s.Visited()), 350, 80);

        g.setFont(HEADING_FONT);
        g.setColor(topColor);
        g.drawString("To Do:", 100, 80);
        g.drawString("Completed:", 500, 80);
        g.drawString("Ignored:", 500, 250);
        g.drawString("Errors:", 100, 420);

        g.setColor(txtColor);
        g.setFont(LIST_FONT);
        for(int i=0;i<23 && (o=s.todo.get())!=null;i++){
            txt = Integer.toString(i+1) + ": "+o.toString();
            g.drawString(elideMiddle(txt, 65, 38, 18), 20, 100+13*i);
        }
        for(int i=0;i<10 && (o=s.done.get())!=null;i++){
            txt = Integer.toString(i+1) + ": "+o.toString();
            g.drawString(elideEnd(txt), 400, 100+13*i);
        }
        for(int i=0;i<10 && (o=s.omittions.get())!=null;i++){
            txt = Integer.toString(i+1) + ": "+o.toString();
            g.drawString(elideEnd(txt), 400, 270+13*i);
        }

        g.setColor(errColor);
        for(int i=0;i<10 && (o=s.errors.get())!=null;i++){
            txt = Integer.toString(i+1) + ": "+o.toString();
            g.drawString(txt, 20, 440+13*i);
        }
    }

    /**
     * Processes sites until the spider reports that none are left, requesting
     * a repaint before each one and once more at the end.
     */
    public void run(){
        repaint();
        while(s.hasMore()){
            repaint();
            s.doNextSite();
        }
        repaint();
    }

    /**
     * Command-line entry point. Recognised switches: {@code -max=N},
     * {@code -time=N}, {@code -init=URL} (required), {@code -base=URL} and
     * {@code -help} / {@code -?}. Unrecognised switches are reported and
     * skipped.
     */
    public static void main(String []args){
        int max = 5;
        String site="";
        String base="";
        int time=0;
        System.out.println(args.length);
        for(int i=0;i<args.length;i++){
            if(args[i].startsWith("-max=")){
                max=parseNumber(args[i], 5);
            }
            else if(args[i].startsWith("-time=")){
                time=parseNumber(args[i], 6);
            }
            else if(args[i].startsWith("-init=")){
                site=args[i].substring(6);
            }
            else if(args[i].startsWith("-base=")){
                base=args[i].substring(6);
            }
            else if(args[i].startsWith("-help")||args[i].startsWith("-?")){
                System.out.println("additional command line switches:");
                System.out.println("-max=N : to limit to N sites, default 5");
                System.out.println("-init=URL : to set the initial site, REQUIRED");
                System.out.println("-base=URL : only follow url's that start with this");
                System.out.println(" default (matches all URLs)");
                System.out.println("-time=N : how many millisec to wait for each page");
                System.out.println(" default 5000 (5 seconds)");
                System.exit(0);
            }
            else System.err.println("unrecognized switch: "+args[i]+", continuing");
        }
        // Compare by content; == on strings only tests object identity.
        if(site.length()==0){
            System.err.println("No initial site parameter!");
            System.err.println("Use -init=<site> switch to set, or -help for more info.");
            System.exit(1);
        }
        spider spi=new spider(site, max, base);
        if(time>0)
            spi.setTimer(time);
        spidergui s = new spidergui(spi, "Spider: "+site);
        s.run();
        System.out.println(spi);
    }

    /**
     * Parses the integer that follows the first {@code prefixLength}
     * characters of a switch; on a malformed number, reports it and exits
     * with status 1 instead of failing with a stack trace.
     */
    private static int parseNumber(String arg, int prefixLength){
        try{
            return Integer.parseInt(arg.substring(prefixLength));
        }catch(NumberFormatException e){
            System.err.println("invalid number in switch: "+arg);
            System.err.println("Use -help for more info.");
            System.exit(1);
            return 0; // not reached; required by the compiler
        }
    }

    /**
     * Shortens text longer than {@code max} characters to its first
     * {@code head} and last {@code tail} characters joined by " . . . ".
     */
    private static String elideMiddle(String text, int max, int head, int tail){
        if(text.length() <= max){
            return text;
        }
        return text.substring(0, head) + " . . . " + text.substring(text.length()-tail);
    }

    /** Shortens text longer than 60 characters to its first 57 plus "...". */
    private static String elideEnd(String text){
        if(text.length() <= 60){
            return text;
        }
        return text.substring(0, 57) + "...";
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -