📄 client.c
字号:
/*
 $ PangoLin/1.0 (tiny server from CMU )
 $ Write By William Beijing
 $ 2006-05-01 - 2006-05-07
*/
/*
 SCO OpenServer5.0.6 config:
   /etc/resolv.conf
       nameserver 192.168.0.1
   /etc/rc
       route add default 192.168.0.1
   /etc/rc2.d/
       vi S99route
       route add default 192.168.0.1

 Example invocations:
   pangolin www.baidu.com 80
   pangolin www.sina.com.cn 80
*/
#include "csapp.h"

/*
 william:
 MAXLINE is a very important variable. When I set it to 1024*1024 the DFS
 algorithm overflowed the stack, so 8192 is used for now. The tiny web
 server written by the head of the CS department at CMU also uses 8192,
 although a typical page is at least 200kb (200*1024). A program segment
 under UNIX cannot exceed 64k (the original note writes this as 63356).

 NOTE(review): MAXLINE itself is not defined in this file (the local
 #define below is commented out); it is presumably supplied by csapp.h.
*/
typedef int Boolean;

//#define MAXLINE 8192 /* length of the returned page data */
#define PAGELINE 1024 /* length of a page link */
#define WEBSITE "www.PangoLin.com"
#define TINY_FLAG
//#define DFS_SEARCH /* depth-first search */
#define BFS_SEARCH /* breadth-first search */
//#define __DEBUG__
#define TRUE 1
#define FALSE 0
#define VISIT_BOOL 100
#define MAP_PAGE_EOF 100
#define VISIT_ROBOTS 200 /* site has not been visited yet; visit it first on first access */
#define PAGELINK 0
#define PAGEBUFF 1

static int loop;
static long searchCount;
static FILE *LogPage;

/* Cache describing the current web page */
static char CurDNShost[80]; /* whose domain name would be longer than 80 characters? */
static int CurDNSport;
static char CurWebPage[PAGELINE];

/* Marker strings used to recognize links inside a page */
#define HERF_COUNT 5
static char SetHref[HERF_COUNT][15]={
    "href=",
    "HREF=",
    "location=",
    "src=http://",
    "http://"
};

#ifdef DFS_SEARCH
    #define DEEPLEN 5 /* the function recurses 5 times */
#else
    #define DEEPLEN 8192 /* a page must not be searched twice within 8192 pages */
#endif

typedef struct{
    int Visit[DEEPLEN];                 /* visited flag */
    char WebPage[DEEPLEN][PAGELINE];    /* page link */
}VISITWEB;
static VISITWEB VisitWeb;

void SetVisitedWebPage(char *NextPage,int Flag);
int BFSParseWebPage(char *NextPage,char *buff);
int DFSParseWebPage(char *nextPage,char *buff);
int CheckVisitWebPage(VISITWEB *VisitWeb,char *VisitWebPage);
void SetWebPageLog(char *WebPage,int PageFlag);

/*
 Program entry point.

 Usage: <program> <host> <port> [page]
   argv[1]  host name of the first site to crawl (must fit in CurDNShost)
   argv[2]  TCP port, parsed with atoi()
   argv[3]  optional start page; only used when exactly four arguments
            are given (must fit in CurWebPage)

 Fetches the start page through StartSearchWebPage() and then repeatedly
 calls the DFS or BFS parser (selected at compile time by DFS_SEARCH /
 BFS_SEARCH) until the parser signals the end or the page counter says
 to stop. The process always terminates through exit(): status 1 for a
 usage error or an over-long argument, status 0 otherwise.
*/
int main(int argc,char **argv)
{
    int port,ret=0;
    char *host;
    char buff[MAXLINE];
    size_t len;

    if(argc <3)
    {
        fprintf(stderr,"usage: %s <host> <port>\n",argv[0]);
        exit(1);    /* a usage error is a failure, not a success */
    }

    /*
     get index.html of the first web site
     Fetch the default page index.html of the initial site; the
     content of that default page is handed back to the caller.
    */
    memset(CurDNShost,0x00,sizeof(CurDNShost));
    host = argv[1];
    port = atoi(argv[2]);

    memset(&VisitWeb,0x00,sizeof(VisitWeb));
    memset(CurWebPage,0x00,sizeof(CurWebPage));

    /* The host comes straight from the command line: refuse anything
       that does not fit instead of overflowing the static buffer. */
    len = strlen(host);
    if(len >= sizeof(CurDNShost))
    {
        fprintf(stderr,"%s: host name too long (max %d characters)\n",
            argv[0],(int)sizeof(CurDNShost)-1);
        exit(1);
    }
    memcpy(CurDNShost,host,len+1);
    CurDNSport = port;

    if(argc==4)
    {
        /* Same bound check for the optional start page. */
        len = strlen(argv[3]);
        if(len >= sizeof(CurWebPage))
        {
            fprintf(stderr,"%s: page path too long (max %d characters)\n",
                argv[0],(int)sizeof(CurWebPage)-1);
            exit(1);
        }
        memcpy(CurWebPage,argv[3],len+1);
    }

    memset(buff,0x00,sizeof(buff));

    /* The result of the first fetch is deliberately ignored, as in the
       original code (its sys_error("web site connect time out\n") call
       was commented out). NOTE(review): StartSearchWebPage is not
       declared in the visible part of this file - confirm its prototype
       and return convention. */
    (void)StartSearchWebPage(host,port,CurWebPage,buff);

    /*
     Crawl the pages with the depth-first algorithm
     Deep First Search
    */
    loop = 0;
    searchCount = 0;

#ifdef DFS_SEARCH
    while(1)
    {
        DFSParseWebPage(CurWebPage,buff);
        if(searchCount == 0)
            break;
        if(searchCount>DEEPLEN)
        {
            memset(&VisitWeb,0x00,sizeof(VisitWeb));
            searchCount = 0;
            loop = 0;
            break; /*for test*/
        }
    }
#endif

#ifdef BFS_SEARCH
    while(1)
    {
        /*
         After index.html has been searched, keep one of the links found
         in it and start the breadth-first search of the next page from
         that link, exam:
           index.html:
             news.baidu.com/index.html
             mp3.baidu.com/index.html
             guoxue.baidu.com/index.html
           ->BFS next Page
         The value of CurWebPage is obtained from buff,
         e.g. http://www.baidu.com/index.html
        */
#ifdef __DEBUG__
        fprintf(stdout,"=>BFS page %s buff %s\n",CurWebPage,buff);
#endif
        fprintf(stdout,"=>BFS page %s \n",CurWebPage);
        ret=BFSParseWebPage(CurWebPage,buff);
        if(ret==MAP_PAGE_EOF)
            break;
        if(searchCount == 0)
            break;
        /* NOTE(review): VisitWeb holds DEEPLEN entries, so '>=' may be
           the intended bound - confirm against the code that advances
           searchCount. */
        if(searchCount>DEEPLEN)
        {
            memset(&VisitWeb,0x00,sizeof(VisitWeb));
            searchCount = 0;
            loop = 0;
            break; /*for test*/
        }
    }
#endif

    fprintf(stdout,"\t\tTotal finded Page %ld\n",searchCount);
    fprintf(stdout,"\t\tPangoLin V1.0\n");
    exit(0);
}

/*
 Search Web Link by DFS Algorithm

 Test helper: ignores the nextPage argument it is given and always
 requests the fixed page /cdk-doc/cdk.1.html from 192.9.200.100:9000.
 Returns whatever StartSearchWebPage() returns.
*/
int DFSParseWebPageTest(char *nextPage,char *buff)
{
    int port=9000;
    char *host="192.9.200.100";

    nextPage="/cdk-doc/cdk.1.html";
    return StartSearchWebPage(host,port,nextPage,buff);
}

/*
 Search web pages with the breadth-first algorithm
 webPage : directory/index.html
*/
int BFSParseWebPage(char *webPage,char *buff)
{
    char VisitWebPage[PAGELINE];
    char *pos=NULL,*pBuff=NULL,*posOK=NULL,*posHTTP=NULL;
    int i=0,HostPortFlag=FALSE,port=80,ret;
    int LoopFlag=TRUE,k;
    char Buff[MAXLINE];
    char CharOK;
    int SiteFlag=FALSE,DirFlag=FALSE,PageFlag=FALSE;
#ifdef __DEBUG__
    char tmp[200];
#endif
    /*
     Re-initialize webPage, then obtain a new http:// link string from buff
    */
    memset(webPage,0x00,PAGELINE);
    memset(Buff,0x00,sizeof(Buff));
    memcpy(Buff,buff,MAXLINE);
Buff[MAXLINE]='\0'; pBuff = buff; while(pBuff) {#ifdef __DEBUG__fprintf(stdout,">>>>>>>>>>>>>>>line:%d,pBuff=(%s)\n",__LINE__,pBuff);#endif LoopFlag = TRUE; for(k=0;k<HERF_COUNT;k++) { if(pos=strstr(pBuff,SetHref[k])) break; }#ifdef __DEBUG__fprintf(stdout,"<<<<<<<<<<<<<<<line:%d(%s)\n",__LINE__,pos);#endif if(pos) { i = 0; SiteFlag=FALSE; DirFlag=FALSE; PageFlag=FALSE; pBuff=pos+strlen(SetHref[k]);#ifdef __DEBUG__fprintf(stdout,">>>>>>>>>>>>>>>line:%d,pBuff=(%s)\n",__LINE__,pBuff);#endif/* href="http://www.website.com ..."> href="directory/"> href="page.html">*/ posOK=strstr(pBuff,">"); if(posOK) { memset(VisitWebPage,0x00,sizeof(VisitWebPage)); memcpy(VisitWebPage,pBuff,posOK-pBuff); }else continue; if((posHTTP=strstr(VisitWebPage,"http://"))|| (posHTTP=strstr(VisitWebPage,"HTTP://"))) { pBuff = pBuff+(posHTTP-VisitWebPage)+7; SiteFlag = TRUE;#ifdef __DEBUG__fprintf(stdout,">>>>>>>>>>>>>>>line:%d,pBuff=(%s)\n",__LINE__,pBuff);#endif }else if(strstr(VisitWebPage+(strlen(VisitWebPage)-5),"/")) DirFlag = TRUE; else PageFlag = TRUE;#ifdef __DEBUG__fprintf(stdout,">>>>>>>>>>>>>>>line:%d,VistPage=(%s)SiteFlag=%d DirFlag=%d PageFlag=%d\n", __LINE__,VisitWebPage,SiteFlag,DirFlag,PageFlag);#endif while(LoopFlag) {#ifdef __DEBUG__fprintf(stderr,"LINE:%d webPage=[%s]\n",__LINE__,webPage);#endif /* 如果还没找到域名www.PangoLin.com:9000 */ if(HostPortFlag==FALSE) { switch(*pBuff) { case '/': case '"': case '\'': case '>': case ' ':#ifdef __DEBUG__fprintf(stderr,".................begin..............................\n");fprintf(stderr,"webPage=[%s]\n",webPage);fprintf(stderr,".................end................................\n");#endif /* www.baidu.com:8081/ www.baidu.com/ */ posOK=strstr(webPage,":"); if(!posOK) { /* http://www.pangolin.com:80 如果是这样的情况,作如下处理 */ port=80; CurDNSport = 80; /* directory/ */ if(SiteFlag) { memset(CurDNShost,0x00,sizeof(CurDNShost)); strcpy(CurDNShost,webPage); CurDNShost[strlen(webPage)]='\0'; }fprintf(stderr,">>>>>>>CurDNShost=[%s][:80]\n",CurDNShost); 
/* "index.asp 去掉["] */ *pBuff++; }else{ memset(CurDNShost,0x00,sizeof(CurDNShost)); memcpy(CurDNShost,webPage,(posOK-webPage));fprintf(stderr,"<<<<<<<CurDNShost=[%s][%s]\n",CurDNShost,webPage+(posOK-webPage)); port=atoi(webPage+(posOK-webPage+1)); } /* :80/cdg-bin/page.html当前服务器上的目录下的页面 由CurDNShost,CurDNSport 提供 */ CurDNSport = port; /* 域名[www.PangoLin.com]和 端口号[9000]分离完毕 */ HostPortFlag = TRUE; /* 开始寻找页面文件以及cgi参数 ./cgi-bin/webPage.html?a&b cgi-bin/webPage.html?a&b 此时清除webPage信息,Host & port 已经获得存入静态变量中 */ memset(webPage,'\0',PAGELINE); i = 0; break; default: /* 取得链接字符串http://www.baodu.com:80/ */ webPage[i++]=*pBuff++; break; } }else if(HostPortFlag){ switch(*pBuff) { case ' ': case '>': case '<': case '"': case '\'': case '?': if(*pBuff=='?') { if(strstr(pBuff,"http://")|| strstr(pBuff,"HTTP://")) break; else{ /* 页面链接找到,退出循环 */ LoopFlag=TRUE; HostPortFlag = TRUE; continue; } } LoopFlag=FALSE; HostPortFlag = FALSE; /* http://www.baidu.com/copyright.html?488438843> 去掉这个'>' */ webPage[i]='\0'; /* 如果有图片等非html,asp,php,jsp等图像 文件,不搜索之,太大了 */ if(PageFlag&& !strstr(webPage,".html")&& !strstr(webPage,".htm")&& !strstr(webPage,".asp")&& !strstr(webPage,".php")&& !strstr(webPage,".jsp")&& !strstr(webPage,"?")) { fprintf(stdout,"abanded page(gif .img .mpeg .mpg .wmx .rm)...\n http://%s:%d%s\n",CurDNShost,port,webPage); break; } /* 开始搜索下个网站的网页 */ fprintf(stdout,"Begin search next page...\n http://%s:%d/%s\n",CurDNShost,CurDNSport,webPage); /* 检查此网页是否已经被搜索到,搜索到则放弃搜索 */ memset(VisitWebPage,0x00,sizeof(VisitWebPage)); sprintf(VisitWebPage,"http://%s:%d%s",CurDNShost,port,webPage); ret=CheckVisitWebPage(&VisitWeb, VisitWebPage); if(ret!=VISIT_BOOL) {fprintf(stdout,"%s is visited!info{%d}1-visited 0-comfailed!\n",VisitWebPage,ret); break; } ret=StartSearchWebPage(CurDNShost,CurDNSport, webPage,Buff); if(ret<0) {#ifdef __DEBUG__fprintf(stderr,">>>webPage[i]:[%c]\n",webPage[i]);#endiffprintf(stdout,"abanded page http://%s:%d.....\n",CurDNShost,port); /* 记下这个网站连接不通,再次遇到这个网站就不要连接了 */ 
SetVisitedWebPage(webPage,FALSE); break; } SetVisitedWebPage(webPage,TRUE);fprintf(stdout,"Searched page(%ld) http://%s:%d...\n",searchCount,CurDNShost,port); SetWebPageLog(VisitWeb.WebPage[searchCount],PAGELINK); break; default: /*
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -