📄 url.cpp
字号:
/* URL handling */#include <iostream>#include <string>#include <sys/socket.h>#include <netdb.h>//#include "Tse.h"#include "Url.h"//#include "Http.h"//#include "Md5.h"#include "StrFun.h"/* Is X "."? */#define DOTP(x) ((*(x) == '.') && (!*(x + 1)))/* Is X ".."? */#define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))//map<string,string> mapCacheHostLookup;//extern vector<string> vsUnreachHost;//pthread_mutex_t mutexCacheHost = PTHREAD_MUTEX_INITIALIZER;//extern set<string> setVisitedUrlMD5;//extern map<unsigned long,unsigned long> mapIpBlock;//typedef map<string,string>::value_type valTypeCHL;struct scheme_data{ char *leading_string; int default_port; int enabled;};/* Supported schemes: */static struct scheme_data supported_schemes[] ={ { "http://", DEFAULT_HTTP_PORT, 1 }, { "ftp://", DEFAULT_FTP_PORT, 1 }, /* SCHEME_INVALID */ { NULL, -1, 0 }};/* Returns the scheme type if the scheme is supported, or SCHEME_INVALID if not. */void CUrl::ParseScheme (const char *url){ int i; for (i = 0; supported_schemes[i].leading_string; i++) if (0 == strncasecmp (url, supported_schemes[i].leading_string, strlen (supported_schemes[i].leading_string))) { if (supported_schemes[i].enabled){ this->m_eScheme = (enum url_scheme) i; return; }else{ this->m_eScheme = SCHEME_INVALID; return; } } this->m_eScheme = SCHEME_INVALID; return;}/************************************************************************ * Function name: ParseUrlEx * Input argv: * -- strUrl: url * Output argv: * -- * Return: true: success false: fail * Fucntion Description: break an URL into scheme, host, port and request. * result as member variants * Be careful: release the memory by the client************************************************************************/bool CUrl::ParseUrlEx(string strUrl){ char protocol[10]; char host[HOST_LEN]; char request[256]; int port = -1; memset( protocol, 0, sizeof(protocol) ); memset( host, 0, sizeof(host) ); memset( request, 0, sizeof(request) ); this->ParseScheme(strUrl.c_str()); if( this->m_eScheme != SCHEME_HTTP ){ return false; } ParseUrlEx(strUrl.c_str(), protocol, sizeof(protocol), host, sizeof(host), request, sizeof(request), &port); m_sUrl = strUrl; m_sHost = host; m_sPath = request; if( port > 0 ){ m_nPort = port; } return true;}/************************************************************************ * Function name: ParseUrlEx * Input argv: * -- url: host name * -- protocol: result protocol * -- lprotocol: protocol length * -- host: result host * -- lhost: host length * -- request: result request * -- lrequest: request length * Output argv: * -- * Return: true: success false: fail * Fucntion Description: break an URL into scheme, host, port and request. * result as argvs * Be careful:************************************************************************/void CUrl::ParseUrlEx(const char *url, char *protocol, int lprotocol, char *host, int lhost, char *request, int lrequest, int *port){ char *work,*ptr,*ptr2; *protocol = *host = *request = 0; *port = 80; int len = strlen(url); //pthread_mutex_lock(&mutexMemory); work = new char[len + 1]; //pthread_mutex_unlock(&mutexMemory); memset(work, 0, len+1); strncpy(work, url, len); // find protocol if any ptr = strchr(work, ':'); if( ptr != NULL ){ *(ptr++) = 0; strncpy( protocol, work, lprotocol ); } else { strncpy( protocol, "HTTP", lprotocol ); ptr = work; } // skip past opening /'s if( (*ptr=='/') && (*(ptr+1)=='/') ) ptr+=2; // find host ptr2 = ptr; while( IsValidHostChar(*ptr2) && *ptr2 ) ptr2++; *ptr2 = 0; strncpy( host, ptr, lhost ); // find the request int offset = ptr2 - work; const char *pStr = url + offset; strncpy( request, pStr, lrequest ); // find the port number, if any ptr = strchr( host, ':' ); if( ptr != NULL ){ *ptr = 0; *port = atoi(ptr+1); } //pthread_mutex_lock(&mutexMemory); delete [] work; //pthread_mutex_unlock(&mutexMemory); work = NULL;}/* scheme://user:pass@host[:port]... * ^ * We attempt to break down the URL into the components path, * params, query, and fragment. They are ordered like this: * scheme://host[:port][/path][;params][?query][#fragment] *//*bool CUrl::ParseUrl(string strUrl){ string::size_type idx; this->ParseScheme(strUrl.c_str()); if( this->m_eScheme != SCHEME_HTTP ) return false; // get host name this->m_sHost = strUrl.substr(7); idx = m_sHost.find('/'); if(idx != string::npos){ m_sHost = m_sHost.substr(0,idx); } this->m_sUrl = strUrl; return true;}*/CUrl::CUrl(){ this->m_sUrl = ""; this->m_eScheme= SCHEME_INVALID; this->m_sHost = ""; this->m_nPort = DEFAULT_HTTP_PORT; this->m_sPath = ""; /* this->m_sParams = ""; this->m_sQuery = ""; this->m_sFragment = ""; this->m_sDir = ""; this->m_sFile = ""; this->m_sUser = ""; this->m_sPasswd = ""; */}CUrl::~CUrl(){}/********************************************************************************** * Function name: IsValidHostChar * Input argv: * -- ch: the character for testing * Output argv: * -- * Return: true: is valid false: is invalid * Function Description: test the specified character valid * for a host name, i.e. A-Z or 0-9 or -.:**********************************************************************************/bool CUrl::IsValidHostChar(char ch){ return( isalpha(ch) || isdigit(ch) || ch=='-' || ch=='.' || ch==':' || ch=='_');}/********************************************************************************** * Function name: IsValidHost * Input argv: * -- ch: the character for testing * Output argv: * -- * Return: true: is valid false: is invalid * Function Description: test the specified character valid * for a host name, i.e. A-Z or 0-9 or -.: * Be careful:**********************************************************************************/bool CUrl::IsValidHost(const char *host){ if( !host ){ return false; } if( strlen(host) < 6 ){ // in case host like "www", "pku", etc. return false; } char ch; for(unsigned int i=0; i<strlen(host); i++){ ch = *(host++); if( !IsValidHostChar(ch) ){ return false; } } return true;}bool CUrl::IsImageUrl(string url){ if( url.empty() ) return false; if( url.size() > HOST_LEN ) return false; string::size_type idx = url.rfind('.'); string tmp; if( idx != string::npos ){ tmp = url.substr(idx+1); } CStrFun::Str2Lower( tmp, tmp.size() ); const char *image_type[] ={ "gif","jpg","jpeg","png","bmp", "tif","psd" }; int image_type_num = 7; for (int i=0; i<image_type_num; i++) { if( tmp == image_type[i] ) return true; } return false;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -