⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 client.c

📁 小蚂蚁的程序,很有名的哦,能和爬行算法的效果比较一下,
💻 C
📖 第 1 页 / 共 2 页
字号:
/*
 $ PangoLin/1.0 (tiny server from CMU )
 $ Write By William Beijing
 $ 2006-05-01 - 2006-05-07
*/
/*
	SCO OpenServer5.0.6 config:
		/etc/resolv.conf
		nameserver 192.168.0.1
		/etc/rc
		route add default 192.168.0.1
		/etc/rc2.d/
		vi S99route
		route add default 192.168.0.1

	pangolin www.baidu.com 80
	pangolin www.sina.com.cn 80
*/
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "csapp.h"

/*
	william
	MAXLINE is a very important value. When I set it to 1024*1024 the DFS
	algorithm overflowed the stack, so 8192 is used for now. The tiny web
	server written by the head of the CS department at CMU also uses 8192,
	but an ordinary page is usually at least 200 KB (200*1024); a program
	segment under UNIX cannot exceed 64k, that is 63356 [sic].
*/
typedef int Boolean;

//#define MAXLINE   8192	/* length of the returned page data (value comes from csapp.h) */
#define PAGELINE    1024	/* maximum length of a page link */
#define WEBSITE   "www.PangoLin.com"
#define TINY_FLAG
//#define DFS_SEARCH		/* depth-first search */
#define BFS_SEARCH		/* breadth-first search */
//#define __DEBUG__

#define TRUE   1
#define FALSE  0

#define VISIT_BOOL    100
#define MAP_PAGE_EOF  100
#define VISIT_ROBOTS  200 	/* site not visited yet: visit it first on first access */

#define PAGELINK	0
#define PAGEBUFF	1

static int	loop;
static long	searchCount;
static FILE 	*LogPage;

/*
	Cache describing the current page
*/
static char 	CurDNShost[80];	/* original author's note: whose domain name would exceed 80 characters? */
static int 	CurDNSport;
static char 	CurWebPage[PAGELINE];

/*
	Markers used to recognise links inside a page
*/
#define HERF_COUNT	5
static char SetHref[HERF_COUNT][15]={
	"href=",
	"HREF=",
	"location=",
	"src=http://",
	"http://"
};

#ifdef DFS_SEARCH
	#define DEEPLEN	   5		/* recurse 5 levels */
#else
	#define DEEPLEN	   8192		/* a page must not be searched again within 8192 pages */
#endif

typedef struct{
	int Visit[DEEPLEN];		/* visited marker */
	char WebPage[DEEPLEN][PAGELINE];	/* page link */
}VISITWEB;

static VISITWEB	VisitWeb;

void SetVisitedWebPage(char *NextPage,int Flag);
int BFSParseWebPage(char *NextPage,char *buff);
int DFSParseWebPage(char *nextPage,char *buff);
int CheckVisitWebPage(VISITWEB *VisitWeb,char *VisitWebPage);
void SetWebPageLog(char *WebPage,int PageFlag);

/*
	Copy the command-line string src into the fixed-size buffer dst.
	Returns 0 on success. Returns -1, after reporting on stderr, when src
	plus its terminating NUL does not fit into dstSize bytes; dst is left
	untouched in that case.
*/
static int CopyArgument(char *dst,size_t dstSize,const char *src,const char *what)
{
	size_t len = strlen(src);

	if(len >= dstSize)
	{
		fprintf(stderr,"%s is too long (%lu characters, limit %lu)\n",
			what,(unsigned long)len,(unsigned long)(dstSize-1));
		return -1;
	}
	memcpy(dst,src,len+1);
	return 0;
}

/*
	Parse text as a decimal TCP port number.
	Returns 0 and stores the value in *port when text is a complete number
	in the range 1..65535, otherwise returns -1 and leaves *port unchanged.
*/
static int ParsePort(const char *text,int *port)
{
	char *end = NULL;
	long value;

	errno = 0;
	value = strtol(text,&end,10);
	if(end == text || *end != '\0' || errno == ERANGE)
		return -1;
	if(value < 1 || value > 65535)
		return -1;
	*port = (int)value;
	return 0;
}

/*
	Entry point: pangolin <host> <port> [page]

	Records host/port (and the optional start page, accepted only when
	exactly three arguments are given) in the Cur* globals, fetches the
	first page through StartSearchWebPage and then repeatedly calls the
	DFS or BFS parser selected at compile time until it reports the end
	of the page map, searchCount is 0, or searchCount exceeds DEEPLEN.

	Exits with EXIT_FAILURE on bad arguments, 0 otherwise.

	NOTE(review): StartSearchWebPage has no prototype in the visible part
	of this file; it is presumably declared in csapp.h or defined further
	down - confirm, since C99 and later reject implicit declarations.
*/
int main(int argc,char **argv)
{
	int port=0,ret=0;
	char *host;
	char buff[MAXLINE];

	if(argc <3)
	{
		fprintf(stderr,"usage: %s <host> <port>\n",argv[0]);
		/* a usage error is a failure, so do not report success */
		exit(EXIT_FAILURE);
	}

	host = argv[1];
	if(ParsePort(argv[2],&port) != 0)
	{
		fprintf(stderr,"invalid port: %s\n",argv[2]);
		fprintf(stderr,"usage: %s <host> <port>\n",argv[0]);
		exit(EXIT_FAILURE);
	}

	/*
	  Get index.html of the first web site: fetch the default page of
	  the starting site; the page data is returned in buff.
	*/
	memset(CurDNShost,0x00,sizeof(CurDNShost));
	memset(&VisitWeb,0x00,sizeof(VisitWeb));
	memset(CurWebPage,0x00,sizeof(CurWebPage));

	/* argv strings are unbounded; refuse anything the static buffers cannot hold */
	if(CopyArgument(CurDNShost,sizeof(CurDNShost),host,"host") != 0)
		exit(EXIT_FAILURE);
	CurDNSport = port;
	if(argc==4 &&
	   CopyArgument(CurWebPage,sizeof(CurWebPage),argv[3],"page") != 0)
		exit(EXIT_FAILURE);

	memset(buff,0x00,sizeof(buff));
	if(StartSearchWebPage(host,port,CurWebPage,buff))
		;	/* non-zero result deliberately ignored; the original
			   sys_error("web site connect time out\n") report
			   was disabled by the author */

	/*
	   Crawl the pages (the original note here reads
	   "Deep First Search"; the algorithm actually used is chosen by
	   DFS_SEARCH / BFS_SEARCH above)
	*/
	loop = 0;
	searchCount = 0;

#ifdef DFS_SEARCH
	while(1)
	{
		DFSParseWebPage(CurWebPage,buff);
		if(searchCount == 0)
			break;
		if(searchCount>DEEPLEN)
		{
			memset(&VisitWeb,0x00,sizeof(VisitWeb));
			searchCount = 0;
			loop = 0;
			break;	/*for test*/
		}
	}
#endif

#ifdef BFS_SEARCH
	while(1)
	{
		/*
		 After index.html has been searched, keep one of its links
		 and continue the breadth-first search from that link, e.g.
		 index.html:
		   news.baidu.com/index.html
		   mp3.baidu.com/index.html
		   guoxue.baidu.com/index.html ->BFS next Page
		 The value of CurWebPage is taken from buff,
		 e.g. http://www.baidu.com/index.html
		*/
#ifdef __DEBUG__
fprintf(stdout,"=>BFS page %s buff %s\n",CurWebPage,buff);
#endif
fprintf(stdout,"=>BFS page %s \n",CurWebPage);
		ret=BFSParseWebPage(CurWebPage,buff);
		if(ret==MAP_PAGE_EOF)
			break;
		if(searchCount == 0)
			break;
		if(searchCount>DEEPLEN)
		{
			memset(&VisitWeb,0x00,sizeof(VisitWeb));
			searchCount = 0;
			loop = 0;
			break;	/*for test*/
		}
	}
#endif

	fprintf(stdout,"\t\tTotal finded Page %ld\n",searchCount);
	fprintf(stdout,"\t\tPangoLin V1.0\n");
	exit(0);
}

/*
	Search Web Link by DFS Algorithm (test variant).

	Ignores the nextPage argument: it always requests
	/cdk-doc/cdk.1.html from the hard-coded host 192.9.200.100:9000
	through StartSearchWebPage and returns that call's result.
	buff is passed through to StartSearchWebPage unchanged.
*/
int DFSParseWebPageTest(char *nextPage,char *buff)
{
	int     port=9000;
	char    *host="192.9.200.100";

	nextPage="/cdk-doc/cdk.1.html";
	return StartSearchWebPage(host,port,nextPage,buff);
}

/*
	Breadth-first search of web pages
	webPage : directory/index.html
*/
int BFSParseWebPage(char *webPage,char *buff)
{
	char	VisitWebPage[PAGELINE];
	char	*pos=NULL,*pBuff=NULL,*posOK=NULL,*posHTTP=NULL;
	int	i=0,HostPortFlag=FALSE,port=80,ret;
	int	LoopFlag=TRUE,k;
	/* one extra byte: the code below stores a terminator at Buff[MAXLINE] */
	char    Buff[MAXLINE+1];
	char	CharOK;
	int	SiteFlag=FALSE,DirFlag=FALSE,PageFlag=FALSE;
#ifdef __DEBUG__
		char	tmp[200];
#endif
	/*
	 Reset webPage, then extract a new http:// link string from buff
	*/
memset(webPage,0x00,PAGELINE);	memset(Buff,0x00,sizeof(Buff));	memcpy(Buff,buff,MAXLINE);	Buff[MAXLINE]='\0';	pBuff = buff;	while(pBuff)	{#ifdef __DEBUG__fprintf(stdout,">>>>>>>>>>>>>>>line:%d,pBuff=(%s)\n",__LINE__,pBuff);#endif		LoopFlag = TRUE;		for(k=0;k<HERF_COUNT;k++)		{		   if(pos=strstr(pBuff,SetHref[k]))			break;		}#ifdef __DEBUG__fprintf(stdout,"<<<<<<<<<<<<<<<line:%d(%s)\n",__LINE__,pos);#endif		if(pos)		{			i = 0;			SiteFlag=FALSE;			DirFlag=FALSE;			PageFlag=FALSE;			pBuff=pos+strlen(SetHref[k]);#ifdef __DEBUG__fprintf(stdout,">>>>>>>>>>>>>>>line:%d,pBuff=(%s)\n",__LINE__,pBuff);#endif/*	href="http://www.website.com ...">	href="directory/">	href="page.html">*/			posOK=strstr(pBuff,">");			if(posOK)			{			memset(VisitWebPage,0x00,sizeof(VisitWebPage));			memcpy(VisitWebPage,pBuff,posOK-pBuff);			}else				continue;			if((posHTTP=strstr(VisitWebPage,"http://"))||			   (posHTTP=strstr(VisitWebPage,"HTTP://")))			{				pBuff = pBuff+(posHTTP-VisitWebPage)+7;				SiteFlag = TRUE;#ifdef __DEBUG__fprintf(stdout,">>>>>>>>>>>>>>>line:%d,pBuff=(%s)\n",__LINE__,pBuff);#endif			}else if(strstr(VisitWebPage+(strlen(VisitWebPage)-5),"/"))				DirFlag = TRUE;			else				PageFlag = TRUE;#ifdef __DEBUG__fprintf(stdout,">>>>>>>>>>>>>>>line:%d,VistPage=(%s)SiteFlag=%d DirFlag=%d PageFlag=%d\n",			__LINE__,VisitWebPage,SiteFlag,DirFlag,PageFlag);#endif			while(LoopFlag)			{#ifdef __DEBUG__fprintf(stderr,"LINE:%d webPage=[%s]\n",__LINE__,webPage);#endif				/*				 如果还没找到域名www.PangoLin.com:9000				*/				if(HostPortFlag==FALSE)				{				switch(*pBuff)				{				case '/':				case '"':				case '\'':				case '>':				case ' ':#ifdef __DEBUG__fprintf(stderr,".................begin..............................\n");fprintf(stderr,"webPage=[%s]\n",webPage);fprintf(stderr,".................end................................\n");#endif		/*			www.baidu.com:8081/			www.baidu.com/		*/					posOK=strstr(webPage,":");					if(!posOK)					{		/*			http://www.pangolin.com:80			如果是这样的情况,作如下处理		*/						
port=80;						CurDNSport = 80;		/*			directory/		*/						if(SiteFlag)						{				memset(CurDNShost,0x00,sizeof(CurDNShost));						strcpy(CurDNShost,webPage);					CurDNShost[strlen(webPage)]='\0';						}fprintf(stderr,">>>>>>>CurDNShost=[%s][:80]\n",CurDNShost);		/*			"index.asp 去掉["]		*/						*pBuff++;					}else{				memset(CurDNShost,0x00,sizeof(CurDNShost));				memcpy(CurDNShost,webPage,(posOK-webPage));fprintf(stderr,"<<<<<<<CurDNShost=[%s][%s]\n",CurDNShost,webPage+(posOK-webPage));					port=atoi(webPage+(posOK-webPage+1));					}			/*			 :80/cdg-bin/page.html当前服务器上的目录下的页面			 由CurDNShost,CurDNSport 提供			*/					CurDNSport = port;					/*					  域名[www.PangoLin.com]和					  端口号[9000]分离完毕					*/					HostPortFlag = TRUE;					/*					 开始寻找页面文件以及cgi参数					 ./cgi-bin/webPage.html?a&b					 cgi-bin/webPage.html?a&b					 此时清除webPage信息,Host & port					 已经获得存入静态变量中					*/					memset(webPage,'\0',PAGELINE);					i = 0;					break;				default:					/*					 取得链接字符串http://www.baodu.com:80/					*/					webPage[i++]=*pBuff++;					break;				}				}else if(HostPortFlag){				switch(*pBuff)				{				case ' ':				case '>':				case '<':				case '"':				case '\'':				case '?': 					if(*pBuff=='?')					{						if(strstr(pBuff,"http://")||						   strstr(pBuff,"HTTP://"))							break;						else{						/*						 页面链接找到,退出循环						*/						LoopFlag=TRUE;						HostPortFlag = TRUE;						continue;						}					}					LoopFlag=FALSE;					HostPortFlag = FALSE;			/*			 http://www.baidu.com/copyright.html?488438843>			 去掉这个'>'			*/					webPage[i]='\0';					/*					 如果有图片等非html,asp,php,jsp等图像					 文件,不搜索之,太大了					*/					if(PageFlag&&					   !strstr(webPage,".html")&&					   !strstr(webPage,".htm")&&					   !strstr(webPage,".asp")&&					   !strstr(webPage,".php")&&					   !strstr(webPage,".jsp")&&					   !strstr(webPage,"?"))					{	fprintf(stdout,"abanded page(gif .img .mpeg .mpg .wmx .rm)...\n http://%s:%d%s\n",CurDNShost,port,webPage);					break;					}					/*					 开始搜索下个网站的网页					*/	fprintf(stdout,"Begin search next 
page...\n http://%s:%d/%s\n",CurDNShost,CurDNSport,webPage);			/*			 检查此网页是否已经被搜索到,搜索到则放弃搜索			*/			memset(VisitWebPage,0x00,sizeof(VisitWebPage));	sprintf(VisitWebPage,"http://%s:%d%s",CurDNShost,port,webPage);					ret=CheckVisitWebPage(&VisitWeb,							VisitWebPage);					if(ret!=VISIT_BOOL)					{fprintf(stdout,"%s is visited!info{%d}1-visited 0-comfailed!\n",VisitWebPage,ret);					break;					}				ret=StartSearchWebPage(CurDNShost,CurDNSport,								webPage,Buff);					if(ret<0)					{#ifdef __DEBUG__fprintf(stderr,">>>webPage[i]:[%c]\n",webPage[i]);#endiffprintf(stdout,"abanded page http://%s:%d.....\n",CurDNShost,port);		/*		 记下这个网站连接不通,再次遇到这个网站就不要连接了		*/					SetVisitedWebPage(webPage,FALSE);						break;					}					SetVisitedWebPage(webPage,TRUE);fprintf(stdout,"Searched page(%ld) http://%s:%d...\n",searchCount,CurDNShost,port);			SetWebPageLog(VisitWeb.WebPage[searchCount],PAGELINK);					break;				default:					/*

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -