📄 urlfunct.h

📁 网页抓取程序
💻 H
📖 第 1 页 / 共 2 页
字号:
上一页 12
		return 1;	}		for(i=0;HtmlExtensions[i][0]!=0;i++)	{		if(stricmp(rPage+strlen(rPage)-strlen(HtmlExtensions[i]),(char*)HtmlExtensions[i])==0)		{			Host->type = 1;        // Html file			return 1;		}	}		for(i=0;PlainTextExtension[i][0]!=0;i++)	{		if(stricmp(rPage+strlen(rPage)-strlen(PlainTextExtension[i]),(char*)PlainTextExtension[i])==0)		{			Host->type = 2;			return 1;		}	}		/*Support for custom extensions*/ /*TO TEST*/	for(i=0;CustomExtensions[i][0]!=0;i++)	{		if(stricmp(rPage+strlen(rPage)-strlen(CustomExtensions[i]),(char*)CustomExtensions[i])==0)		{			Host->type = 4;						return 1;		}	}		if(bArgs==1)		Host->type = 1;	else		Host->type = 3;            //discard it		return 1;}/* PortNumFromHostname* hostname -><-* hostname="www.auuuu.com:90" => hostname="www.auuuu.com"; return 90;*/unsigned int PortNumFromHostname(char* hostname){	unsigned int i;		for(i=0;i<strlen(hostname);i++)		if(hostname[i]==':')			break;				if(i!=strlen(hostname))		{			hostname[i]=0;			return (unsigned)atoi(hostname+i+1);		}				return PORT;}int GenerateURL(struct sHost Host,char* URL){	char port[5];	sprintf(port,"%d",Host.port);	strcpy(URL,"http://");	strcat(URL,Host.Host);	strcat(URL,":");	strcat(URL,port);	strcat(URL,Host.Page);		return 1;}/* ParseUrl* Url <- sHost* Url: "http://www.test.com/page.htm" ==>*	==> sHost.Url = Url &&  sHost.Host = "www.test.com" &&  sHost.Page = "page.htm"*/int ParseUrl(char* url,struct sHost* sh,struct sHost* currentHost){	char tUrl[MAXURLSIZE];	char BaseDir[MAXPAGESIZE];	unsigned int offset=0,i;	char* token1=NULL;	char* tmpPage;		if(url==NULL || sh==NULL)		return -1;		if(strlen(url)>MAXURLSIZE-1)		return -1;		if( strnicmp(url,"ftp://",6)==0    || 		strnicmp(url,"mailto:",7)==0   || 		strnicmp(url,"about:",6)==0    ||		strnicmp(url,"irc://",6)==0    ||		strnicmp(url,"news://",7)==0   ||		strnicmp(url,"https://",8)==0)    //protocols not supported		return -1;		memset(sh,0,sizeof(struct sHost));	memset(tUrl,0,MAXURLSIZE);		for(i=0;i<strlen(url);i++)	{		if(url[i]=='#')		{			url[i]=0;			break;		}	}		if(url[0]==0)		return -1;		if(strnicmp(url,"http://",7)==0)	{		if(strlen(url)==7)			return -1;		else			offset=7;	}		if(strncmp(url,"//",2)==0)	{		if(strlen(url)==7)			return -1;		else			offset=2;	}		strncpy(tUrl,url+offset,strlen(url)-offset);	tUrl[strlen(url)-offset]=0;		if(offset>0)	//url with prefix: "http://" || "//"	{		for(i=0;i<strlen(tUrl);i++)		{			if(tUrl[i]=='/' || tUrl[i]=='?')			{				token1=tUrl+i;				break;			}		}				if(token1>tUrl)		//is there a '/'?		{			strncpy(sh->Host,tUrl,token1-tUrl);		//yes: the host is the part of the string before '/' and the page the rest			strncpy(sh->Page,token1,MAXPAGESIZE-1);						if(strnicmp(sh->Page,"mailto:",7)==0)				return -1;		}		else										//no: the host is the url and the page is the index		{			strncpy(sh->Host,tUrl,MAXHOSTSIZE-1);			strcpy(sh->Page,"/");		}				sh->port = PortNumFromHostname(sh->Host);		strtrim(sh->Host,sh->Host);		        tmpPage = malloc(MAXPAGESIZE);		        strtrim(sh->Page,tmpPage);		        ReplaceStr(tmpPage,sh->Page,"&amp;","&");		        FREE(tmpPage);        /* currentHost has the same hostname and port and has an host_id */        if( currentHost && currentHost->host_id!=0 && strcmp( currentHost->Host, sh->Host ) == 0 && currentHost->port == sh->port )        {            /* yes: this page is from the same domain: use currentHost host_id */            sh->host_id = currentHost->host_id;        }				if(CheckPage(sh->Page)==-1)			return -1;		return PageType(sh);	}	else		//now we expect a relative url	{		if(strlen(url)>MAXPAGESIZE-1)			return -1;				if(currentHost==NULL)	//if we haven't a reference host we can't continue			return -1;				strncpy(sh->Host,currentHost->Host,MIN(MAXHOSTSIZE-1,strlen(currentHost->Host)));				if(tUrl[0]!='/')	//if the first char is not '/' we must consider the current directory			GetDir(currentHost->Page,BaseDir);		else			BaseDir[0]=0;				for(i=strlen(tUrl);i>0;i--)		//is there a '.' before last '/'?			if(tUrl[i]=='/')				break;			else if(tUrl[i]=='.')		//yes: this is a page Ex. "/sources.html"			{				if(strlen(BaseDir)+strlen(tUrl)>=MAXPAGESIZE)					return -1;								strcpy(sh->Page,BaseDir);				strcat(sh->Page,tUrl);				                //get the port from the current Host                sh->port = currentHost->port;								strtrim(sh->Host,sh->Host);				                tmpPage = malloc(MAXPAGESIZE);								strtrim(sh->Page,tmpPage);				                ReplaceStr(tmpPage,sh->Page,"&amp;","&");				                FREE(tmpPage);                /* currentHost has the same hostname and port and has an host_id */                if( currentHost && currentHost->host_id!=0 && strcmp( currentHost->Host, sh->Host ) == 0 && currentHost->port == sh->port )                {                    /* yes: this page is from the same domain: use currentHost host_id */                    sh->host_id = currentHost->host_id;                }								if(CheckPage(sh->Page)==-1)					return -1;				return PageType(sh);			}						//is there a '.' before last '/'? : no: if i==strlen(tUrl) this is a directory Ex. "sample/"			if((unsigned)i==strlen(tUrl)-1)			{				if(strlen(BaseDir)+strlen(tUrl)>=MAXPAGESIZE)					return -1;								strcpy(sh->Page,BaseDir);				strcat(sh->Page,tUrl);								//get the port from the current Host				sh->port = currentHost->port;								strtrim(sh->Host,sh->Host);								tmpPage = malloc(MAXPAGESIZE);								strtrim(sh->Page,tmpPage);								ReplaceStr(tmpPage,sh->Page,"&amp;","&");								FREE(tmpPage);                /* currentHost has the same hostname and port and has an host_id */                if( currentHost && currentHost->host_id!=0 && strcmp( currentHost->Host, sh->Host ) == 0 && currentHost->port == sh->port )                {                    /* yes: this page is from the same domain: use currentHost host_id */                    sh->host_id = currentHost->host_id;                }								if(CheckPage(sh->Page)==-1)					return -1;				return PageType(sh);			}			else		//in this case we have a page like: "dir1/something" we consider it a directory			{				if(strlen(BaseDir)+strlen(tUrl)+1>=MAXPAGESIZE)					return -1;								strcpy(sh->Page,BaseDir);				strcat(sh->Page,tUrl);								if(strchr(tUrl,'?')==NULL)		//if there is a '?' in the "directory" we consider it a page					strcat(sh->Page,"/");								//get the port from the current Host				sh->port = currentHost->port;								strtrim(sh->Host,sh->Host);								tmpPage = malloc(MAXPAGESIZE);								strtrim(sh->Page,tmpPage);								ReplaceStr(tmpPage,sh->Page,"&amp;","&");								FREE(tmpPage);                /* currentHost has the same hostname and port and has an host_id */                if( currentHost && currentHost->host_id!=0 && strcmp( currentHost->Host, sh->Host ) == 0 && currentHost->port == sh->port )                {                    /* yes: this page is from the same domain: use currentHost host_id */                    sh->host_id = currentHost->host_id;                }								if(CheckPage(sh->Page)==-1)					return -1;				return PageType(sh);			}	}	}/* GetHostId*  if the host exists in the table hostlist returns its id*  else returns 0*/int GetHostId(struct sHost host){	char* sqlQuery;	MYSQL_RES gRes;	MYSQL_RES** tmpRes=NULL;	MYSQL_ROW row;	unsigned int ret;	tmpRes=(MYSQL_RES**)malloc(sizeof(MYSQL_RES));		sqlQuery = malloc(MAXQUERYSIZE);		if(tmpRes==NULL || sqlQuery==NULL)		MemoryCorruptedHandler("GetHostId");	    snprintf_mysql_escaped_sql_statement(&gMysqlDB2,sqlQuery,MAXQUERYSIZE-1,"SELECT id FROM %s.hostlist WHERE hostname='%s' AND port = %d LIMIT 1", DB1, host.Host, host.port);		my_mysql_query_and_store_results(&gMysqlDB2, sqlQuery,tmpRes,&gRes,BLOCKINDEX);		FREE(sqlQuery);		row = mysql_fetch_row(&gRes);		if(row)		ret = atoi(row[0]);	else		ret = 0;		if(*tmpRes)	{		mysql_free_result(*tmpRes);	}		FREE(tmpRes);			return ret;}int pRelationships(struct sHost* links,struct sHost* linked,int level){	char* sqlQuery;	int host_id;	int linkedhost_id;		if(bTesting==1)		return 1;		if( level>0 && level<3 )	{        if( ((links? (*links) : IndexingHost ).host_id) == 0 )		    host_id			= GetHostId( (links? *links:IndexingHost) );        else            host_id = ((links? (*links) : IndexingHost ).host_id);        if( linked->host_id == 0 )		    linkedhost_id	= GetHostId( *linked );        else            linkedhost_id = linked->host_id; 		if( host_id==0 || linkedhost_id==0 )			return 0;		sqlQuery = malloc(MAXQUERYSIZE);        if(level==1)	//saves hostname only			snprintf_mysql_escaped_sql_statement(&gMysqlDB1,sqlQuery,MAXQUERYSIZE-1,"INSERT IGNORE INTO rels (host_id,linkedhost_id,page,linkedpage,textlink) VALUES(%d,%d,'/','/', '%s')",host_id,linkedhost_id , linked->Description );		else if(level==2)			snprintf_mysql_escaped_sql_statement(&gMysqlDB1,sqlQuery,MAXQUERYSIZE-1,"INSERT IGNORE INTO rels (host_id,linkedhost_id,page,linkedpage,textlink) VALUES(%d,%d,'%s','%s', '%s')",host_id,linkedhost_id,(links?links->Page:"/"),linked->Page , linked->Description );				my_mysql_query(&gMysqlDB1, sqlQuery,BLOCKDB1);		FREE(sqlQuery);				return 1;	}	else		return 0;}/* unencode* transform the gave unicoded string in an unencoded string*/void unencode(char *src, char *last, char *dest){	for(; src <= last; src++, dest++)	{		if(*src == '%') 		{			int code;			if(sscanf(src+1, "%2x", &code) != 1) 				code = '?';			*dest = (char)code;			src +=2;		}		else			*dest = *src;	}		*dest = '\0';		return;}#endif/*EOF*/
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -