📄 url.cpp

📁 此源码功能是捕获网页中的链接并进行分析
💻 CPP
字号:
/* URL handling */#include <iostream>#include <string>#include <sys/socket.h>#include <netdb.h>//#include "Tse.h"#include "Url.h"//#include "Http.h"//#include "Md5.h"#include "StrFun.h"/* Is X "."?  */#define DOTP(x) ((*(x) == '.') && (!*(x + 1)))/* Is X ".."?  */#define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))//map<string,string> mapCacheHostLookup;//extern vector<string> vsUnreachHost;//pthread_mutex_t mutexCacheHost = PTHREAD_MUTEX_INITIALIZER;//extern set<string> setVisitedUrlMD5;//extern map<unsigned long,unsigned long> mapIpBlock;//typedef map<string,string>::value_type valTypeCHL;struct scheme_data{	char *leading_string;	int default_port;	int enabled;};/* Supported schemes: */static struct scheme_data supported_schemes[] ={	{ "http://",  DEFAULT_HTTP_PORT,  1 },	{ "ftp://",   DEFAULT_FTP_PORT,   1 },	/* SCHEME_INVALID */	{ NULL,       -1,                 0 }};/* Returns the scheme type if the scheme is supported, or   SCHEME_INVALID if not.  */void CUrl::ParseScheme (const char *url){	int i;	for (i = 0; supported_schemes[i].leading_string; i++)		if (0 == strncasecmp (url, supported_schemes[i].leading_string,                          strlen (supported_schemes[i].leading_string))) {			if (supported_schemes[i].enabled){				this->m_eScheme = (enum url_scheme) i;				return;			}else{				this->m_eScheme = SCHEME_INVALID;				return;			}		}	this->m_eScheme = SCHEME_INVALID;	return;}/************************************************************************ *  Function name: ParseUrlEx *  Input argv: *  	-- strUrl: url *  Output argv: *  	-- *  Return:   	true: success   	false: fail *  Fucntion Description: break an URL into scheme, host, port and request. *  			result as member variants *  Be careful:	release the memory by the client************************************************************************/bool CUrl::ParseUrlEx(string strUrl){	char protocol[10];	char host[HOST_LEN];	char request[256];	int port = -1;	memset( protocol, 0, sizeof(protocol) );	memset( host, 0, sizeof(host) );	memset( request, 0, sizeof(request) );	this->ParseScheme(strUrl.c_str());	if( this->m_eScheme != SCHEME_HTTP ){		return false;	}	ParseUrlEx(strUrl.c_str(),			protocol, sizeof(protocol),			host, sizeof(host),			request, sizeof(request),			&port);	m_sUrl  = strUrl;	m_sHost = host;	m_sPath = request;	if( port > 0 ){		m_nPort = port;	}	return true;}/************************************************************************ *  Function name: ParseUrlEx *  Input argv: *  	-- url: host name *  	-- protocol: result protocol *  	-- lprotocol: protocol length *  	-- host: result host *  	-- lhost: host length *  	-- request: result request *  	-- lrequest: request length *  Output argv: *  	-- *  Return:   	true: success   	false: fail *  Fucntion Description: break an URL into scheme, host, port and request. *  			result as argvs *  Be careful:************************************************************************/void CUrl::ParseUrlEx(const char *url,		char *protocol, int lprotocol,		char *host, int lhost,		char *request, int lrequest,		int *port){	char *work,*ptr,*ptr2;	*protocol = *host = *request = 0;	*port = 80;	int len = strlen(url);	//pthread_mutex_lock(&mutexMemory);	work = new char[len + 1];	//pthread_mutex_unlock(&mutexMemory);	memset(work, 0, len+1);	strncpy(work, url, len);	// find protocol if any	ptr = strchr(work, ':');	if( ptr != NULL ){		*(ptr++) = 0;		strncpy( protocol, work, lprotocol );	} else {		strncpy( protocol, "HTTP", lprotocol );		ptr = work;	}	// skip past opening /'s	if( (*ptr=='/') && (*(ptr+1)=='/') )		ptr+=2;	// find host	ptr2 = ptr;	while( IsValidHostChar(*ptr2) && *ptr2 )		ptr2++;	*ptr2 = 0;	strncpy( host, ptr, lhost );	// find the request	int offset = ptr2 - work;	const char *pStr = url + offset;	strncpy( request, pStr, lrequest );	// find the port number, if any	ptr = strchr( host, ':' );	if( ptr != NULL ){		*ptr = 0;		*port = atoi(ptr+1);	}	//pthread_mutex_lock(&mutexMemory);	delete [] work;	//pthread_mutex_unlock(&mutexMemory);	work = NULL;}/* scheme://user:pass@host[:port]...  *                    ^               * We attempt to break down the URL into the components path, * params, query, and fragment.  They are ordered like this: * scheme://host[:port][/path][;params][?query][#fragment]  *//*bool CUrl::ParseUrl(string strUrl){	string::size_type idx;	this->ParseScheme(strUrl.c_str());		if( this->m_eScheme != SCHEME_HTTP )		return false;	// get host name	this->m_sHost = strUrl.substr(7);	idx = m_sHost.find('/');	if(idx != string::npos){		m_sHost = m_sHost.substr(0,idx);	}	this->m_sUrl = strUrl;	return true;}*/CUrl::CUrl(){	this->m_sUrl = ""; 	this->m_eScheme= SCHEME_INVALID;        	this->m_sHost = "";  	this->m_nPort = DEFAULT_HTTP_PORT;         	this->m_sPath = "";	/*	this->m_sParams = "";	this->m_sQuery = "";	this->m_sFragment = "";	this->m_sDir = "";	this->m_sFile = "";                this->m_sUser = "";	this->m_sPasswd = "";	*/}CUrl::~CUrl(){}/********************************************************************************** *  Function name: IsValidHostChar *  Input argv: *  	-- ch: the character for testing *  Output argv: *  	--  *  Return:   	true: is valid   	false: is invalid *  Function Description: test the specified character valid *  			for a host name, i.e. A-Z or 0-9 or -.:**********************************************************************************/bool CUrl::IsValidHostChar(char ch){	return( isalpha(ch) || isdigit(ch)		|| ch=='-' || ch=='.' || ch==':' || ch=='_');}/********************************************************************************** *  Function name: IsValidHost *  Input argv: *  	-- ch: the character for testing *  Output argv: *  	--  *  Return:   	true: is valid   	false: is invalid *  Function Description: test the specified character valid *  			for a host name, i.e. A-Z or 0-9 or -.: *  Be careful:**********************************************************************************/bool CUrl::IsValidHost(const char *host){	if( !host ){		return false;	}	if( strlen(host) < 6 ){ // in case host like "www", "pku", etc.		return false;	}	char ch;	for(unsigned int i=0; i<strlen(host); i++){		ch = *(host++);		if( !IsValidHostChar(ch) ){			return false;		}	}	return true;}bool CUrl::IsImageUrl(string url){	if( url.empty() ) return false;	if( url.size() > HOST_LEN ) return false;	string::size_type idx = url.rfind('.');	string tmp;	if( idx != string::npos ){		tmp = url.substr(idx+1);	}	CStrFun::Str2Lower( tmp, tmp.size() );	const char *image_type[] ={		"gif","jpg","jpeg","png","bmp",		"tif","psd"	};	int image_type_num = 7;	for (int i=0; i<image_type_num; i++)	{		if( tmp == image_type[i] )			return true;	}	return false;}
💿 文件大小 982 K
👤 上传用户 choatehou4
📂 所属分类其他
🏷️ 相关标签

#源码 #分 #页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -