⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 url.cpp

📁 Linux TSE 源代码! 保贵十分
💻 CPP
字号:
/* URL handling */#include <iostream>#include <string>#include <sys/socket.h>#include <netdb.h>#include "Tse.h"#include "Url.h"#include "Http.h"#include "Md5.h"#include "StrFun.h"/* Is X "."?  */#define DOTP(x) ((*(x) == '.') && (!*(x + 1)))/* Is X ".."?  */#define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))map<string,string> mapCacheHostLookup;extern vector<string> vsUnreachHost;pthread_mutex_t mutexCacheHost = PTHREAD_MUTEX_INITIALIZER;extern set<string> setVisitedUrlMD5;extern map<unsigned long,unsigned long> mapIpBlock;typedef map<string,string>::value_type valTypeCHL;struct scheme_data{	char *leading_string;	int default_port;	int enabled;};/* Supported schemes: */static struct scheme_data supported_schemes[] ={	{ "http://",  DEFAULT_HTTP_PORT,  1 },	{ "ftp://",   DEFAULT_FTP_PORT,   1 },	/* SCHEME_INVALID */	{ NULL,       -1,                 0 }};/* Returns the scheme type if the scheme is supported, or   SCHEME_INVALID if not.  */void CUrl::ParseScheme (const char *url){	int i;	for (i = 0; supported_schemes[i].leading_string; i++)		if (0 == strncasecmp (url, supported_schemes[i].leading_string,                          strlen (supported_schemes[i].leading_string))) {			if (supported_schemes[i].enabled){				this->m_eScheme = (enum url_scheme) i;				return;			}else{				this->m_eScheme = SCHEME_INVALID;				return;			}		}	this->m_eScheme = SCHEME_INVALID;	return;}/************************************************************************ *  Function name: ParseUrlEx *  Input argv: *  	-- strUrl: url *  Output argv: *  	-- *  Return:   	true: success   	false: fail *  Fucntion Description: break an URL into scheme, host, port and request. *  			result as member variants *  Be careful:	release the memory by the client************************************************************************/bool CUrl::ParseUrlEx(string strUrl){	char protocol[10];	char host[HOST_LEN];	char request[256];	int port = -1;	memset( protocol, 0, sizeof(protocol) );	memset( host, 0, sizeof(host) );	memset( request, 0, sizeof(request) );	this->ParseScheme(strUrl.c_str());	if( this->m_eScheme != SCHEME_HTTP ){		return false;	}	ParseUrlEx(strUrl.c_str(),			protocol, sizeof(protocol),			host, sizeof(host),			request, sizeof(request),			&port);	m_sUrl  = strUrl;	m_sHost = host;	m_sPath = request;	if( port > 0 ){		m_nPort = port;	}	return true;}/************************************************************************ *  Function name: ParseUrlEx *  Input argv: *  	-- url: host name *  	-- protocol: result protocol *  	-- lprotocol: protocol length *  	-- host: result host *  	-- lhost: host length *  	-- request: result request *  	-- lrequest: request length *  Output argv: *  	-- *  Return:   	true: success   	false: fail *  Fucntion Description: break an URL into scheme, host, port and request. *  			result as argvs *  Be careful:************************************************************************/void CUrl::ParseUrlEx(const char *url,		char *protocol, int lprotocol,		char *host, int lhost,		char *request, int lrequest,		int *port){	char *work,*ptr,*ptr2;	*protocol = *host = *request = 0;	*port = 80;	int len = strlen(url);	//pthread_mutex_lock(&mutexMemory);	work = new char[len + 1];	//pthread_mutex_unlock(&mutexMemory);	memset(work, 0, len+1);	strncpy(work, url, len);	// find protocol if any	ptr = strchr(work, ':');	if( ptr != NULL ){		*(ptr++) = 0;		strncpy( protocol, work, lprotocol );	} else {		strncpy( protocol, "HTTP", lprotocol );		ptr = work;	}	// skip past opening /'s	if( (*ptr=='/') && (*(ptr+1)=='/') )		ptr+=2;	// find host	ptr2 = ptr;	while( IsValidHostChar(*ptr2) && *ptr2 )		ptr2++;	*ptr2 = 0;	strncpy( host, ptr, lhost );	// find the request	int offset = ptr2 - work;	const char *pStr = url + offset;	strncpy( request, pStr, lrequest );	// find the port number, if any	ptr = strchr( host, ':' );	if( ptr != NULL ){		*ptr = 0;		*port = atoi(ptr+1);	}	//pthread_mutex_lock(&mutexMemory);	delete [] work;	//pthread_mutex_unlock(&mutexMemory);	work = NULL;}/* scheme://user:pass@host[:port]...  *                    ^               * We attempt to break down the URL into the components path, * params, query, and fragment.  They are ordered like this: * scheme://host[:port][/path][;params][?query][#fragment]  *//*bool CUrl::ParseUrl(string strUrl){	string::size_type idx;	this->ParseScheme(strUrl.c_str());		if( this->m_eScheme != SCHEME_HTTP )		return false;	// get host name	this->m_sHost = strUrl.substr(7);	idx = m_sHost.find('/');	if(idx != string::npos){		m_sHost = m_sHost.substr(0,idx);	}	this->m_sUrl = strUrl;	return true;}*/CUrl::CUrl(){	this->m_sUrl = ""; 	this->m_eScheme= SCHEME_INVALID;        	this->m_sHost = "";  	this->m_nPort = DEFAULT_HTTP_PORT;         	this->m_sPath = "";	/*	this->m_sParams = "";	this->m_sQuery = "";	this->m_sFragment = "";	this->m_sDir = "";	this->m_sFile = "";                this->m_sUser = "";	this->m_sPasswd = "";	*/}CUrl::~CUrl(){}/**************************************************************************** *  Function name: GetIpByHost *  Input argv: *  	-- host: host name *  Output argv: *  	-- *  Return:   	ip: sucess   	NULL: fail *  Function Description: get the ip address by host name *  Be careful: release the memory by the client****************************************************************************/char * CUrl::GetIpByHost(const char *host){	if( !host ){	// null pointer		return NULL;	}	if( !IsValidHost(host) ){	// invalid host		return NULL;	}	unsigned long inaddr = 0;	char *result = NULL;	int len = 0;	inaddr = (unsigned long)inet_addr( host );	//if ( (int)inaddr != -1){ 	if ( inaddr != INADDR_NONE){ // host is just ip		len = strlen(host);		//pthread_mutex_lock(&mutexMemory);		result = new char[len+1];		//pthread_mutex_unlock(&mutexMemory);		memset(result, 0, len+1);		memcpy(result, host, len);		return result;        } else {		//firt find from cache				map<string,string>::iterator it  = mapCacheHostLookup.find(host);		if( it != mapCacheHostLookup.end() ){	// find in host lookup cache			const char * strHostIp;			strHostIp = (*it).second.c_str();			inaddr = (unsigned long)inet_addr( strHostIp );			//if ( (int)inaddr != -1){ 			if ( inaddr != INADDR_NONE ){ 				len = strlen(strHostIp);				//pthread_mutex_lock(&mutexMemory);				result = new char[len+1];				//pthread_mutex_unlock(&mutexMemory);				memset( result, 0, len+1 );				memcpy( result, strHostIp, len );				//cout << ":)" ;								return result;        		}		}	}	// if still not find, then try by DNS server	struct hostent *hp;	/* Host entity */	hp = gethostbyname(host);	if(hp == NULL) { 		//cout << "gethostbyname() error in GetIpByHost: " << host << endl;		return NULL;	}	// cache host lookup        struct  in_addr in;	bcopy(*(hp->h_addr_list), (caddr_t)&in, hp->h_length);			char    abuf[INET_ADDRSTRLEN];        if( inet_ntop(AF_INET, (void *)&in,abuf, sizeof(abuf)) == NULL ){		cout << "inet_ntop() return error in GetIpByHost" << endl;		return NULL;	} else {		pthread_mutex_lock(&mutexCacheHost);		//if( mapCacheHostLookup.count(host) == 0){		if( mapCacheHostLookup.find(host) == mapCacheHostLookup.end() ){					//cout << endl << host << " and " << abuf << endl;			mapCacheHostLookup.insert( valTypeCHL ( host, abuf));		}		pthread_mutex_unlock(&mutexCacheHost);	}	// return result	len = strlen(abuf);	//pthread_mutex_lock(&mutexMemory);	result = new char[len + 1];	//pthread_mutex_unlock(&mutexMemory);	memset( result, 0, len+1 );	memcpy( result, abuf, len );	return result;}/********************************************************************************** *  Function name: IsValidHostChar *  Input argv: *  	-- ch: the character for testing *  Output argv: *  	--  *  Return:   	true: is valid   	false: is invalid *  Function Description: test the specified character valid *  			for a host name, i.e. A-Z or 0-9 or -.:**********************************************************************************/bool CUrl::IsValidHostChar(char ch){	return( isalpha(ch) || isdigit(ch)		|| ch=='-' || ch=='.' || ch==':' || ch=='_');}/********************************************************************************** *  Function name: IsValidHost *  Input argv: *  	-- ch: the character for testing *  Output argv: *  	--  *  Return:   	true: is valid   	false: is invalid *  Function Description: test the specified character valid *  			for a host name, i.e. A-Z or 0-9 or -.: *  Be careful:**********************************************************************************/bool CUrl::IsValidHost(const char *host){	if( !host ){		return false;	}	if( strlen(host) < 6 ){ // in case host like "www", "pku", etc.		return false;	}	char ch;	for(unsigned int i=0; i<strlen(host); i++){		ch = *(host++);		if( !IsValidHostChar(ch) ){			return false;		}	}	return true;}/********************************************************************************** *  Function name: IsVisitedUrl *  Input argv: *  	-- url: url *  Output argv: *  	--  *  Return:   	true: is visited   	false: not visited *  Function Description: test the url visited by the MD5 *  Be careful:**********************************************************************************/bool CUrl::IsVisitedUrl(const char *url){	if( !url ){		return true; // if be null, we think it have been visited	}	CMD5 iMD5;	iMD5.GenerateMD5( (unsigned char*)url, strlen(url) );	string strDigest = iMD5.ToString();	if( setVisitedUrlMD5.find(strDigest) != setVisitedUrlMD5.end() ) {		return true;	} else {		return false;	}}/********************************************************************************** *  Function name: IsValidIp *  Input argv: *  	-- ip: ip *  Output argv: *  	--  *  Return:   	true: inside the ip block   	false: outside the ip block *  Function Description: decide teh ip whether or not inside the ip block *  Be careful:**********************************************************************************/bool CUrl::IsValidIp(const char *ip){	if( ip == NULL ){		return false;	}	unsigned long inaddr = (unsigned long)inet_addr(ip);	if( inaddr == INADDR_NONE ){	// invalid ip		return false;	}	if( mapIpBlock.size() > 0 ){		map<unsigned long,unsigned long>::iterator pos;		for(pos=mapIpBlock.begin(); pos!=mapIpBlock.end(); ++pos){			unsigned long ret;			ret = inaddr & ~((*pos).second);			if( ret == (*pos).first ){	// inside				return true;			}		}		// outside		return false;	}	// if block range is not given, we think it inside also	return true;}/* * If it is, return true; otherwise false * not very precise */bool CUrl::IsForeignHost(string host){	if( host.empty() ) return true;	if( host.size() > HOST_LEN ) return true;	unsigned long inaddr = 0;	inaddr = (unsigned long)inet_addr( host.c_str() );	if ( inaddr != INADDR_NONE){ // host is just ip		return false;	}	string::size_type idx = host.rfind('.');	string tmp;	if( idx != string::npos ){		tmp = host.substr(idx+1);	}	CStrFun::Str2Lower( tmp, tmp.size() );	const char *home_host[] ={		"cn","com","net","org","info",		"biz","tv","cc", "hk", "tw"	};	int home_host_num = 10;	for(int i=0; i<home_host_num; i++){		if( tmp == home_host[i] )			return false;	}	return true;}		bool CUrl::IsImageUrl(string url){	if( url.empty() ) return false;	if( url.size() > HOST_LEN ) return false;	string::size_type idx = url.rfind('.');	string tmp;	if( idx != string::npos ){		tmp = url.substr(idx+1);	}	CStrFun::Str2Lower( tmp, tmp.size() );	const char *image_type[] ={		"gif","jpg","jpeg","png","bmp",		"tif","psd"	};	int image_type_num = 7;	for (int i=0; i<image_type_num; i++)	{		if( tmp == image_type[i] )			return true;	}	return false;}		

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -