⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 uri.cc

📁 著名的标准C++的html解析器
💻 CC
📖 第 1 页 / 共 2 页
字号:
#include "Uri.h"
#include "wincstring.h"
#include <strstream>
#include <cassert>
#include <cctype>
#include <cstdlib>
#include <cstring>
#include "tld.h"
//#define DEBUG
#include "debug.h"

using namespace std;
using namespace htmlcxx;

/** Structure to store various schemes and their default ports */
struct schemes_t {
    /** The name of the scheme */
    const char *name;
    /** The default port for the scheme */
    unsigned int default_port;
};

/* Some WWW schemes and their default ports; this is basically /etc/services */
/* This will become global when the protocol abstraction comes */
/* As the schemes are searched by a linear search, */
/* they are sorted by their expected frequency */
static schemes_t schemes[] =
{
    {"http",     Uri::URI_HTTP_DEFAULT_PORT},
    {"ftp",      Uri::URI_FTP_DEFAULT_PORT},
    {"https",    Uri::URI_HTTPS_DEFAULT_PORT},
    {"gopher",   Uri::URI_GOPHER_DEFAULT_PORT},
    {"ldap",     Uri::URI_LDAP_DEFAULT_PORT},
    {"nntp",     Uri::URI_NNTP_DEFAULT_PORT},
    {"snews",    Uri::URI_SNEWS_DEFAULT_PORT},
    {"imap",     Uri::URI_IMAP_DEFAULT_PORT},
    {"pop",      Uri::URI_POP_DEFAULT_PORT},
    {"sip",      Uri::URI_SIP_DEFAULT_PORT},
    {"rtsp",     Uri::URI_RTSP_DEFAULT_PORT},
    {"wais",     Uri::URI_WAIS_DEFAULT_PORT},
    {"z39.50r",  Uri::URI_WAIS_DEFAULT_PORT},
    {"z39.50s",  Uri::URI_WAIS_DEFAULT_PORT},
    {"prospero", Uri::URI_PROSPERO_DEFAULT_PORT},
    {"nfs",      Uri::URI_NFS_DEFAULT_PORT},
    {"tip",      Uri::URI_TIP_DEFAULT_PORT},
    {"acap",     Uri::URI_ACAP_DEFAULT_PORT},
    {"telnet",   Uri::URI_TELNET_DEFAULT_PORT},
    {"ssh",      Uri::URI_SSH_DEFAULT_PORT},
    { NULL, 0xFFFF }     /* sentinel terminating the table; never returned */
};

/**
 * Look up the default port of a scheme.
 *
 * The comparison is case-insensitive (strcasecmp) and walks the schemes
 * table linearly up to its NULL-named sentinel.
 *
 * @param scheme_str scheme name, may be NULL
 * @return the default port of the matching table entry, or 0 when
 *         scheme_str is NULL or no entry matches
 */
static unsigned int port_of_Scheme(const char *scheme_str)
{
    schemes_t *scheme;

    if (scheme_str) {
        for (scheme = schemes; scheme->name != NULL; ++scheme) {
            if (strcasecmp(scheme_str, scheme->name) == 0) {
                return scheme->default_port;
            }
        }
    }
    return 0;
}

/* We have a table that we can index by character and it tells us if the
 * character is one of the interesting delimiters.  Note that we even get
 * compares for NUL for free -- it's just another delimiter.
 */
#define T_COLON           0x01        /* ':' */
#define T_SLASH           0x02        /* '/' */
#define T_QUESTION        0x04        /* '?' */
#define T_HASH            0x08        /* '#' */
#define T_NUL             0x80        /* '\0' */

/* the uri_delims.h file is autogenerated by gen_uri_delims.c */
/* this file is automatically generated by gen_uri_delims, do not edit */
static const unsigned char uri_delims[256] = {
    T_NUL,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,T_HASH,0,0,0,0,
    0,0,0,0,0,0,0,T_SLASH,0,0,0,0,0,0,0,0,0,0,T_COLON,0,
    0,0,0,T_QUESTION,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
};

/* it works like this:
    if ((uri_delims[ch] & NOTEND_foobar) == 0) {
        then ch is NOT a delimiter for foobar, keep scanning
    }
   i.e. a non-zero masked value means ch terminates the component.
*/

/* Note that we optimize the scheme scanning here, we cheat and let the
 * compiler know that it doesn't have to do the & masking: with 0xff every
 * delimiter in the table (':', '/', '?', '#', NUL) ends the scheme scan.
 */
#define NOTEND_SCHEME     (0xff)
#define NOTEND_HOSTINFO   (T_SLASH | T_QUESTION | T_HASH | T_NUL)
#define NOTEND_PATH       (T_QUESTION | T_HASH | T_NUL)

static size_t wwwPrefixOffset(const std::string& hostname);

/** Construct an empty Uri: all strings empty, flags false, port 0. */
Uri::Uri()
    : mScheme(), mUser(), mPassword(), mHostname(), mPath(), mQuery(),
      mFragment(), mExistsQuery(false), mExistsFragment(false), mPort(0)
{
}

/**
 * Construct a Uri by parsing uri_str (see init()).
 *
 * Propagates the Exception thrown by init() for a malformed port.
 */
Uri::Uri(const string &uri_str)
    : mScheme(), mUser(), mPassword(), mHostname(), mPath(), mQuery(),
      mFragment(), mExistsQuery(false), mExistsFragment(false), mPort(0)
{
    init(uri_str);
}

/**
 * Parse uri_str and store its components in this object.
 *
 * - An empty string leaves the object untouched.
 * - If the string starts with '/', does not start with a letter, or has no
 *   "scheme://" prefix, the whole string is parsed as
 *   path[?query][#fragment].
 * - Otherwise the scheme is everything before "://"; the host information
 *   runs up to the first '/', '?', '#' or end of string.  Inside it, the
 *   last '@' separates "user[:password]" from "host[:port]", and the first
 *   ':' on each side splits user/password and host/port respectively.
 * - With no ':' after the host, mPort is left unchanged.  With a ':' followed
 *   by nothing, mPort becomes the scheme's default port (0 if unknown).
 *
 * Members are only assigned for components that are present; nothing is
 * cleared beforehand.
 *
 * @param uri_str the URI text to parse
 * @throws Exception when the text after the host's ':' is not fully
 *         consumed by strtol() as a base-10 number
 */
void Uri::init(const string &uri_str)
{
    DEBUGP("Parsing uri %s\n", uri_str.c_str());

    if (uri_str.empty()) return;

    /* All locals are declared up front without initializers (except uri)
     * so that the gotos below never jump over an initialization. */
    const char *uri = uri_str.c_str();
    const char *s;
    const char *s1;
    const char *hostinfo;
    char *endstr;

    /* We assume the processor has a branch predictor like most --
     * it assumes forward branches are untaken and backwards are taken.  That's
     * the reason for the gotos.  -djg
     */
    if (uri[0] == '/') {
deal_with_path:
        DEBUGP("Dealing with path\n");
        /* we expect uri to point to first character of path ... remember
         * that the path could be empty -- http://foobar?query for example
         */
        s = uri;
        while ((uri_delims[static_cast<unsigned char>(*s)] & NOTEND_PATH) == 0) {
            ++s;
        }
        if (s != uri) {
            mPath.assign(uri, s - uri);
            DEBUGP("Path is %s\n", mPath.c_str());
        }
        if (*s == 0) {
            return;
        }
        if (*s == '?') {
            ++s;
            /* everything after the first '#' in the query is the fragment */
            s1 = strchr(s, '#');
            if (s1) {
                mFragment.assign(s1 + 1);
                mExistsFragment = true;
                DEBUGP("Fragment is %s\n", mFragment.c_str());
                mQuery.assign(s, s1 - s);
                mExistsQuery = true;
                DEBUGP("Query is %s\n", mQuery.c_str());
            }
            else {
                mQuery.assign(s);
                mExistsQuery = true;
                DEBUGP("Query is %s\n", mQuery.c_str());
            }
            return;
        }
        /* otherwise it's a fragment */
        mFragment.assign(s + 1);
        mExistsFragment = true;
        DEBUGP("Fragment is %s\n", mFragment.c_str());
        return;
    }

    DEBUGP("Dealing with scheme\n");
    /* find the scheme: */
    /* isalpha() requires a value representable as unsigned char (or EOF);
     * passing a plain, possibly negative, char is undefined behaviour. */
    if (!isalpha(static_cast<unsigned char>(*uri))) goto deal_with_path;
    s = uri;
    while ((uri_delims[static_cast<unsigned char>(*s)] & NOTEND_SCHEME) == 0) {
        ++s;
    }
    /* scheme must be non-empty and followed by :// */
    if (s == uri || s[0] != ':' || s[1] != '/' || s[2] != '/') {
        goto deal_with_path;        /* backwards predicted taken! */
    }

    mScheme.assign(uri, s - uri);
    DEBUGP("Scheme is %s\n", mScheme.c_str());
    s += 3;

    DEBUGP("Finding hostinfo\n");
    hostinfo = s;
    DEBUGP("Hostinfo is %s\n", hostinfo);
    while ((uri_delims[static_cast<unsigned char>(*s)] & NOTEND_HOSTINFO) == 0) {
        ++s;
    }
    uri = s;        /* whatever follows hostinfo is start of uri */
//  mHostinfo.assign(hostinfo, uri - hostinfo);

    /* If there's a username:password@host:port, the @ we want is the last @...
     * too bad there's no memrchr()... For the C purists, note that hostinfo
     * is definitely not the first character of the original uri so therefore
     * &hostinfo[-1] < &hostinfo[0] ... and this loop is valid C.
     */
    do {
        --s;
    } while (s >= hostinfo && *s != '@');
    if (s < hostinfo) {
        /* again we want the common case to be fall through */
deal_with_host:
        DEBUGP("Dealing with host\n");
        /* We expect hostinfo to point to the first character of
         * the hostname.  If there's a port it is the first colon.
         */
        s = static_cast<const char *>(memchr(hostinfo, ':', uri - hostinfo));
        if (s == NULL) {
            /* we expect the common case to have no port */
            mHostname.assign(hostinfo, uri - hostinfo);
            DEBUGP("Hostname is %s\n", mHostname.c_str());
            goto deal_with_path;
        }
        mHostname.assign(hostinfo, s - hostinfo);
        DEBUGP("Hostname is %s\n", mHostname.c_str());
        ++s;
        if (uri != s) {
            mPortStr.assign(s, uri - s);
            /* NOTE(review): only trailing garbage is rejected; the numeric
             * value itself is not range-checked before being stored. */
            mPort = strtol(mPortStr.c_str(), &endstr, 10);
            if (*endstr == '\0') {
                goto deal_with_path;
            }
            /* Invalid characters after ':' found */
            DEBUGP("Throwing invalid url exception\n");
            throw Exception("Invalid character after ':'");
        }
        /* "host:" with an empty port: fall back to the scheme's default */
        this->mPort = port_of_Scheme(mScheme.c_str());
        goto deal_with_path;
    }

    /* first colon delimits username:password */
    s1 = static_cast<const char *>(memchr(hostinfo, ':', s - hostinfo));
    if (s1) {
        mUser.assign(hostinfo, s1 - hostinfo);
        ++s1;
        mPassword.assign(s1, s - s1);
    }
    else {
        mUser.assign(hostinfo, s - hostinfo);
    }
    /* the host part starts right after the '@' */
    hostinfo = s + 1;
    goto deal_with_host;
}

Uri::~Uri() {
}

/* Plain accessors and mutators: each getter returns a copy of the stored
 * member, each setter overwrites it with the given value. */

string Uri::scheme() const { return mScheme; }
void Uri::scheme(string scheme) {
    mScheme = scheme;
}

string Uri::user() const { return mUser; }
void Uri::user(string user) {
    mUser = user;
}

string Uri::password() const { return mPassword; }
void Uri::password(string password) {
    mPassword = password;
}

string Uri::hostname() const { return mHostname; }
void Uri::hostname(string hostname) {
    mHostname = hostname;
}

string Uri::path() const { return mPath; }
void Uri::path(string path) {
    mPath = path;
}

bool Uri::existsFragment() const { return mExistsFragment; }
void Uri::existsFragment(bool existsFragment) {
    mExistsFragment = existsFragment;
}

bool Uri::existsQuery() const { return mExistsQuery; }
void Uri::existsQuery(bool existsQuery) {
    mExistsQuery = existsQuery;
}

string Uri::query() const { return mQuery; }

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -