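/* File: client.c  (this line and the "font size:" label after it are code-viewer page residue, not program text) */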
找到了域名和端口号后,开始逐个字符取出 页面文件链接 */ webPage[i++]=*pBuff++; break; } if(strlen(webPage)>=PAGELINE-1) webPage[PAGELINE-1]='\0'; } } }else{ memcpy(buff,Buff,MAXLINE-1); return MAP_PAGE_EOF; } loop++;fprintf(stdout,"loop=[%d]\n",loop); } return 0;}/* DFS 1->2 3->4 SCO 下测试迪归超过1000多次就会出现core dump 而且次数和函数的局部变量的占用内存有关,栈空间 容易用没了,因此尽量少迪归次数,将迪归次数用 while循环多次调用解决,参见main()*/int DFSParseWebPage(char *nextPage,char *buff){ char *host=WEBSITE; char webPage[PAGELINE],DNShost[100],VisitWebPage[PAGELINE]; char *pos=buff,*pStr,*pBuff,*posOK,*posHTTP; int i=0,HostPortFlag=FALSE,port=80,ret; int LoopFlag=TRUE,k; char Buff[MAXLINE],*NextPage=NULL; char CharOK;#ifdef __DEBUG__ char tmp[200];#endif memset(Buff,0x00,sizeof(Buff)); memcpy(Buff,buff,MAXLINE); Buff[MAXLINE]='\0'; NextPage = nextPage; webPage[0]=0; DNShost[0]=0; pStr = buff; while(pos) { LoopFlag = TRUE; for(k=0;k<HERF_COUNT;k++) { if(pos=strstr(pStr,SetHref[k])) break; }#ifdef __DEBUG__fprintf(stdout,">>>>>>>>>>>>>>>line:%d(%s)\n",__LINE__,pos);#endif if(pos) { i = 0; pBuff=pos+strlen(SetHref[k]); pStr=pos+strlen(SetHref[k]);#ifdef __DEBUG__memcpy(tmp,pStr,100);fprintf(stdout,"line:%d(%s)",__LINE__,tmp);#endif if((posHTTP=strstr(pBuff,"HTTP://"))|| (posHTTP=strstr(pBuff,"http://"))) pBuff = posHTTP+7;#ifdef __DEBUG__fprintf(stderr,"LINE:%d pBuff=[%s]\n",__LINE__,pBuff);#endif memset(webPage,0x00,sizeof(webPage)); while(LoopFlag) {#ifdef __DEBUG__fprintf(stderr,"LINE:%d webPage=[%s]\n",__LINE__,webPage);#endif /* 如果还没找到域名www.PangoLin.com:9000 */ if(HostPortFlag==FALSE) { switch(*pBuff) { case '/': case '"': case '\'': case '>': case ' ':#ifdef __DEBUG__fprintf(stderr,".................begin..............................\n");fprintf(stderr,"webPage=[%s]\n",webPage);fprintf(stderr,".................end................................\n");#endif posOK=strstr(webPage,":"); host=webPage; if(!posOK) { /* http://www.pangolin.com:80 如果是这样的情况,作如下处理 */ port=80; *(host+strlen(webPage))='\0'; }else{ *(host+(posOK-webPage))='\0'; 
port=atoi(webPage+(posOK-webPage+1)); } /* http://:80/page.html当前服务器上的目录下的页面 */ memset(DNShost,0x00,sizeof(DNShost)); strcpy(DNShost,host); if(posOK) strcpy(CurDNShost,host); CurDNSport = port;#ifdef __DEBUG__fprintf(stderr,".................begin..............................\n");fprintf(stderr,"Host:[%s] port:[%d]\n",DNShost,port);fprintf(stderr,".................end................................\n");#endif /* 域名[www.PangoLin.com]和 端口号[9000]分离完毕 */ HostPortFlag = TRUE; /* 开始寻找页面文件以及cgi参数 ./cgi-bin/NextPage.html?a&b NextPage=pBuff; */ i = 0; break; } }else if(HostPortFlag){ switch(*pBuff) { case ' ': case '>': case '<': case '"': case '\'': /* case '?': */ /* 页面链接找到,退出循环 */ LoopFlag=FALSE; HostPortFlag = FALSE; /* 保留非结束字符,否则pBuff就会被切断了 */ CharOK= NextPage[i];#ifdef __DEBUG__fprintf(stderr,">>>NextPage:[%s]\n",NextPage);fprintf(stderr,">>>CharOK:[%c]\n",CharOK);#endif /* http://www.baidu.com/copyright.html?488438843> 去掉这个'>' */ NextPage[i]='\0';#ifdef __DEBUG__fprintf(stdout,"Begin search next page...\n http://%s:%d%s\n",CurDNShost,CurDNSport,NextPage);#endif /* 如果有图片等非html,asp,php,jsp等图像 文件,不搜索之,太大了 */ if(strlen(NextPage)&& !strstr(NextPage,".html")&& !strstr(NextPage,".htm")&& !strstr(NextPage,".asp")&& !strstr(NextPage,".php")&& !strstr(NextPage,".jsp")) {fprintf(stdout,"abanded pic:.gif .img .mpeg .mpg .wmx .rm \n"); NextPage[i] = CharOK; break; } /* 开始搜索下个网站的网页 */ fprintf(stdout,"Begin search next page...\n http://%s:%d%s\n",CurDNShost,port,NextPage);#ifdef __DEBUG__fprintf(stdout,"CurHost:[%s]Host:[%s] port:[%d]\n",CurDNShost,DNShost,port);#endif /* 检查此网页是否已经被搜索到,搜索到则放弃搜索 */ memset(VisitWebPage,0x00,sizeof(VisitWebPage)); sprintf(VisitWebPage,"http://%s:%d%s",CurDNShost,port,NextPage); ret=CheckVisitWebPage(&VisitWeb, VisitWebPage); if(ret!=VISIT_BOOL) { NextPage[i] = CharOK;fprintf(stdout,"%s is visited!info{%d}1-visited 0-comfailed!\n",VisitWebPage,ret); break; } ret=StartSearchWebPage(CurDNShost,port, NextPage,Buff); if(ret<0) { /* 还原原来的字符(1) */ 
NextPage[i] = CharOK;#ifdef __DEBUG__fprintf(stderr,">>>NextPage[i]:[%c]\n",NextPage[i]);#endiffprintf(stdout,"abanded page http://%s:%d.....\n",DNShost,port); /* 记下这个网站连接不通,再次遇到这个网站就不要连接了 */ SetVisitedWebPage(NextPage,FALSE); break; } SetVisitedWebPage(NextPage,TRUE);fprintf(stdout,"Searched page(%ld) http://%s:%d...\n",searchCount,DNShost,port); SetWebPageLog(VisitWeb.WebPage[searchCount],PAGELINK);//fprintf(LogPage,"\nPageidx(%ld)->%s\n",searchCount,VisitWeb.WebPage[searchCount]); /* 如果搜索的网页数量到了DEEPLEN,则推出,同时记下最后搜索的 到的网页链接信息http://host:port/nextpage */ if(searchCount>DEEPLEN) { strcpy(CurWebPage,NextPage); CurDNSport=port; return 0; } DFSParseWebPage(NextPage,Buff); /* 还原原来的字符(2) */ NextPage[i] = CharOK;#ifdef __DEBUG__fprintf(stderr,">>>NextPage[i]:[%c]\n",NextPage[i]);#endif break; default: /* 找到了域名和端口号后,开始逐个字符取出 页面文件链接 */ NextPage[i++]=*pBuff++; break; } } webPage[i++]=*pBuff++; } }else break; loop++;fprintf(stdout,"loop=[%d]\n",loop);#ifdef __DEBUG__memcpy(tmp,pos,100);fprintf(stdout,"line:%d pos=(%s)",__LINE__,tmp);#endif } return 0;}int StartSearchWebPage(char *host,int port,char *defaultPage,char *buff){ int clientfd; rio_t rio; char Buff[100]; memset(buff,0x00,sizeof(buff)); clientfd = Open_clientfd(host,port); if(clientfd<0) { fprintf(stdout,"connect %s:%d failed!(%d)\n",host,port,clientfd); return clientfd; } fprintf(stdout,"connect %s:%d successfully!\n",host,port); Rio_readinitb(&rio,clientfd); /* DFSTraverse,actually index.html was read by Web Server,it's a default page bisides index.asp/index.htm/index.html/index.jsp */ memset(Buff,0x00,sizeof(Buff)); sprintf(buff,"GET /%s HTTP/1.1\r\n",defaultPage); sprintf(Buff,"Host:%s\r\n",host); strcat(buff,Buff); strcat(buff,"User-Agent: (Unix;Windows XP;zh-CN;rv:1.8) Release/20060501 PangoLin/1.0\r\n"); strcat(buff,"Accept: image/png,*/*;q=0.5\r\n"); strcat(buff,"Accept-Language: zh-cn,zh;q=0.5\r\n"); strcat(buff,"Accept-Encoding: gzip,deflate\r\n"); strcat(buff,"Accept-Charset: 
gb2312,utf-8;q=0.7,*;q=0.7\r\n"); strcat(buff,"Keep-Alive: 300\r\n"); strcat(buff,"Connection: keep-alive\r\n"); sprintf(Buff,"Referer: http://%s:%d/%s\r\n",host,port,defaultPage); strcat(buff,Buff); /* '\r\n' is very important for the tail of the send buff */ strcat(buff,"\r\n"); /* Request to HTTP:// Web Server */ Rio_writen(clientfd,buff,strlen(buff)); if(Rio_readnb(&rio,buff,MAXLINE-1)<0) { fprintf(stdout,"Time out for Recv data!\n",__LINE__); Close(clientfd); return -1; } fprintf(stdout,"Recv data OK!\n");#ifdef __DEBUG__ Fputs(buff,stdout);#endif SetWebPageLog(buff,PAGEBUFF); /* Analys the page returned by buff and Fetch the new HTTP:// in the buff DFSParseWebPage(...) */ return 0;}/* return FALSE :站点网页访问过但是连接不通 return TRUE :站点网页访问过并且成功 return VISIT_BOOL: :站点网页没有访问过 return VISIT_ROBOTS: 站点没有访问过,初次访问先访问 robots.txt: http://news.baidu.com/robots.txt User-agent: Baiduspider Disallow: /ns http://www.baidu.com//robots.txt User-agent: Baiduspider Disallow: /baidu User-agent: * Disallow: /shifen/dqzd.html http://www.google.com/robots.txt User-agent: * Allow: /searchhistory/ Disallow: /search Disallow: /groups Disallow: /images*/int CheckVisitWebPage(VISITWEB *VisitWeb,char *VisitWebPage){ long i; char *pos=NULL,CharOK='\0';#ifdef __DEBUG__fprintf(stdout,"VisitWebPage=[%s]\n",VisitWebPage);#endif for(i=0;i<DEEPLEN;i++) {#ifdef __DEBUG__fprintf(stdout,"VisitWeb->WebPage[%ld]=[%s]\n",i,VisitWeb->WebPage[i]);#endif if(!strcmp(VisitWeb->WebPage[i],VisitWebPage)) { if(VisitWeb->Visit[i]==FALSE) return FALSE; else return TRUE; }else{ pos=strstr(VisitWebPage+7,"/"); if(pos) { CharOK = VisitWebPage[pos-VisitWebPage]; VisitWebPage[pos-VisitWebPage]='\0'; } if(strstr(VisitWeb->WebPage[i],VisitWebPage)) { if(pos) VisitWebPage[pos-VisitWebPage]=CharOK; return VISIT_ROBOTS; } if(pos) VisitWebPage[pos-VisitWebPage]=CharOK; } } return VISIT_BOOL;}/* Flag = FALSE 站点连接失败 TRUE 站点访问成功*/void SetVisitedWebPage(char *NextPage,int Flag){ searchCount = searchCount+1; 
VisitWeb.Visit[searchCount]=Flag; sprintf(VisitWeb.WebPage[searchCount], "http://%s:%d%s",CurDNShost,CurDNSport,NextPage); return ;}void SetWebPageLog(char *WebPage,int PageFlag){ LogPage = fopen("./log/LogPage","a+"); if(LogPage==(FILE *)NULL) { fprintf(stdout,"LogPage open error\n"); exit(0); } if(PageFlag==PAGELINK) fprintf(LogPage,"\nPageidx(%ld)->%s\n",searchCount,WebPage); if(PageFlag==PAGEBUFF) Fputs(WebPage,LogPage); fflush(LogPage); fclose(LogPage); return ;}RegistRobotsDir(){ return 0;}/* 根据返回的buff提取出来href链接标示,写入单链表中 */typedef struct{ Boolean VisitFlag; char WebPage[PAGELINE];}HTMLnode;int ParserHTML(char *buff,HTMLnode *link){ Boolean VisitFlag; char WebPage[PAGELINE]; return 0;}
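/*
 * Code-viewer page residue (not part of client.c) -- keyboard shortcuts:
 *   Copy code:           Ctrl + C
 *   Search code:         Ctrl + F
 *   Full-screen mode:    F11
 *   Toggle theme:        Ctrl + Shift + D
 *   Show shortcuts:      ?
 *   Increase font size:  Ctrl + =
 *   Decrease font size:  Ctrl + -
 */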