📄 utils.cc

📁 著名的标准C＋＋的html解析器
💻 CC
字号:
#include <algorithm>#include <cctype>#include <strstream>#include "Uri.h"#include "utils.h"using namespace std;namespace htmlcxx {	namespace HTML {		bool detect_utf8(const char *begin, int size)		{			const char *ptr;			const char *end = begin+size;			const char *signature = "";			char previous_byte = 0;			unsigned count_bad_utf = 0;			unsigned count_good_utf = 0;			if (!strncmp(begin, signature, 3)) return true;						for (ptr = begin; ptr != end; ++ptr)			{				if ((*ptr & 0xC0) == 0x80)				{					if ((previous_byte & 0xC0) == 0xC0)					{						count_good_utf ++;					}					else if ((previous_byte & 0x80) == 0x00)					{						count_bad_utf ++;					}				}				else if ((previous_byte & 0xC0) == 0xC0)				{					count_bad_utf ++;				}				previous_byte = *ptr;			}			return count_good_utf > count_bad_utf;		}		string single_blank(const string &str) {			unsigned int count = 0;			bool first_space = true;			const char *ptr = str.c_str();			string ret(str.length(), ' ');						// Skip space at beginning			while (isspace(*ptr)) ++ptr;						while (*ptr)			{				if (isspace(*ptr))				{					if (first_space)					{						first_space = false;						ret[count++] = ' ';					}				}				else				{					first_space = true;					ret[count++] = *ptr;				}								++ptr;			}			// Trim space at the end			string::size_type a;			a = ret.find_last_not_of(' ', count);			if (a != string::npos)				ret.erase(a+1);			else			{				a = 0;				ret.erase(a);			}			return ret;		}		string strip_comments(const string &str) {			string ret;			ret.reserve(str.size());			const char *ptr = str.c_str();			const char *end = ptr + str.length();			bool inside_comment = false;			while(1) {				if(!inside_comment) {					if(ptr  + 4 < end) {						if(*ptr == '<' && *(ptr+1) == '!' && *(ptr+2) =='-' && *(ptr + 3) == '-' && isspace(*(ptr + 4))) {							inside_comment = true;						}					}				} else {					if(ptr + 2 < end) {						if(*ptr == '-' && *(ptr+1) == '-' && *(ptr+2) == '>' ) {							inside_comment = false;							ptr += 3;						}					}				}				if(ptr == end) break;				if(!inside_comment) ret += *ptr;				ptr++;			}			ret.resize(ret.size());			return ret;		}		static struct {			char *str;			unsigned char chr;		} entities[] = {			/* 00 */			{ "quot", 34 },			{ "amp", 38 },			{ "lt", 60 },			{ "gt", 62 },			{ "nbsp", ' ' },			{ "iexcl", 161 },			{ "cent", 162 },			{ "pound", 163 },			{ "curren", 164 },			{ "yen", 165 },			/* 10 */			{ "brvbar", 166 },			{ "sect", 167 },			{ "uml", 168 },			{ "copy", 169 },			{ "ordf", 170 },			{ "laquo", 171 },			{ "not", 172 },			{ "shy", 173 },			{ "reg", 174 },			{ "macr", 175 },			/* 20 */			{ "deg", 176 },			{ "plusmn", 177 },			{ "sup2", 178 },			{ "sup3", 179 },			{ "acute", 180 },			{ "micro", 181 },			{ "para", 182 },			{ "middot", 183 },			{ "cedil", 184 },			{ "sup1", 185 },			/* 30 */			{ "ordm", 186 },			{ "raquo", 187 },			{ "frac14", 188 },			{ "frac12", 189 },			{ "frac34", 190 },			{ "iquest", 191 },			{ "Agrave", 192 },			{ "Aacute", 193 },			{ "Acirc", 194 },			{ "Atilde", 195 },			/* 40 */			{ "Auml", 196 },			{ "ring", 197 },			{ "AElig", 198 },			{ "Ccedil", 199 },			{ "Egrave", 200 },			{ "Eacute", 201 },			{ "Ecirc", 202 },			{ "Euml", 203 },			{ "Igrave", 204 },			{ "Iacute", 205 },			/* 50 */			{ "Icirc", 206 },			{ "Iuml", 207 },			{ "ETH", 208 },			{ "Ntilde", 209 },			{ "Ograve", 210 },			{ "Oacute", 211 },			{ "Ocirc", 212 },			{ "Otilde", 213 },			{ "Ouml", 214 },			{ "times", 215 },			/* 60 */			{ "Oslash", 216 },			{ "Ugrave", 217 },			{ "Uacute", 218 },			{ "Ucirc", 219 },			{ "Uuml", 220 },			{ "Yacute", 221 },			{ "THORN", 222 },			{ "szlig", 223 },			{ "agrave", 224 },			{ "aacute", 225 },			/* 70 */			{ "acirc", 226 },			{ "atilde", 227 },			{ "auml", 228 },			{ "aring", 229 },			{ "aelig", 230 },			{ "ccedil", 231 },			{ "egrave", 232 },			{ "eacute", 233 },			{ "ecirc", 234 },			{ "euml", 235 },			/* 80 */			{ "igrave", 236 },			{ "iacute", 237 },			{ "icirc", 238 },			{ "iuml", 239 },			{ "ieth", 240 },			{ "ntilde", 241 },			{ "ograve", 242 },			{ "oacute", 243 },			{ "ocirc", 244 },			{ "otilde", 245 },			/* 90 */			{ "ouml", 246 },			{ "divide", 247 },			{ "oslash", 248 },			{ "ugrave", 249 },			{ "uacute", 250 },			{ "ucirc", 251 },			{ "uuml", 252 },			{ "yacute", 253 },			{ "thorn", 254 },			{ "yuml", 255 },			/* 100 */			{ NULL, 0 },		};		string decode_entities(const string &str)		{			unsigned int count = 0;			const char *ptr = str.c_str();			const char *end;			string ret(str);			string entity;			ptr = strchr(ptr, '&');			if (ptr == NULL) return ret;			count += static_cast<unsigned int>(ptr - str.c_str());//			printf("url_init: %s\n", str.c_str());			while (*ptr)			{				if (*ptr == '&' && ((end = strchr(ptr, ';')) != NULL))				{					entity.assign(ptr + 1, end);//					printf("Entity: %d %s\n", entity.length(), entity.c_str());					if (!entity.empty() && entity[0] == '#')					{						entity.erase(0, 1);						int chr = atoi(entity.c_str());						if (chr > 0 && chr <= UCHAR_MAX)						{							ret[count++] = chr;						}						ptr = end + 1;					}					else					{						bool found = false;						for (int i = 0; entities[i].str != NULL; i++)						{							if (entity == entities[i].str)							{								found = true;								ret[count++] = entities[i].chr;								ptr = end + 1;								break;							}						}						if (!found)						{							ret[count++] = *ptr++;						}					}				}				else				{					ret[count++] = *ptr++;				}			}			ret.erase(count);//			printf("url_end: %s\n", ret.c_str());			return ret;		}		string get_attribute(const string& tag, const string& attr) {			string val;			string low_tag(tag);			string low_attr(attr);			transform(low_attr.begin(), low_attr.end(), low_attr.begin(), ::tolower);			transform(low_tag.begin(), low_tag.end(), low_tag.begin(), ::tolower);			string::size_type a;			a = low_tag.find(low_attr);			if (a == string::npos)				return val;			a += attr.length();			while (a < tag.length() && isspace(tag[a])) a++;			if (a == tag.length() || tag[a] != '=')				return val;			a++;			while (a < tag.length() && isspace(tag[a])) a++;			if (a == tag.length())				return val;			if (tag[a] == '"') {				string::size_type b = tag.find('"', a+1);				if (b == string::npos) return val;				val = tag.substr(a+1, b-a-1);			} else if (tag[a] == '\'') {				string::size_type b = tag.find('\'', a+1);				if (b == string::npos) return val;				val = tag.substr(a+1, b-a-1);			} else {				while (a < tag.length() && !isspace(tag[a]) && tag[a] != '>') {					val += tag[a++];				}			}			return val;		}		string normalize_slashs(const string &url)		{			const int NONE = 0;			const int LASTSLASH = 1;			const int LASTDOTSLASH = 2;			const int LASTDOTDOTSLASH = 3;			int state = NONE;			const char *question_dash;			const char *question;			const char *dash;			unsigned int count = 0;			const char *ptr = url.c_str();			string ret(url);			question = strchr(ptr, '?');			dash = strchr(ptr, '#');			if (question &&(!dash || question < dash)) question_dash = question;			else question_dash = dash;			if (question_dash == 0) question_dash = url.c_str() + url.length();			const char *problem;			const char *problem1 = strstr(ptr, "//");			const char *problem2 = strstr(ptr, "/.");			if (problem1 && (!problem2 || problem1 < problem2)) problem = problem1;			else problem = problem2;			if (problem && problem < question_dash)			{				ptr = problem;				count = static_cast<unsigned int>(ptr - url.c_str());				while (*ptr && ptr < question_dash)				{					switch (state)					{						case LASTSLASH:							if (*ptr == '/')							{								++ptr;								state = LASTSLASH;							}							else if (*ptr == '.')							{								++ptr;								state = LASTDOTSLASH;							}							else							{								ret[count++] = *ptr;								++ptr;								state = NONE;							}							break;						case LASTDOTSLASH:							if (*ptr == '/')							{								++ptr;								state = LASTSLASH;							}							else if (*ptr == '.')							{								++ptr;								state = LASTDOTDOTSLASH;							}							else							{								ret[count++] = '.';								ret[count++] = *ptr;								++ptr;								state = NONE;							}							break;						case LASTDOTDOTSLASH:							if (*ptr == '/')							{								const char *last_slash = ret.c_str() + count - 2;								while (last_slash >= ret.c_str() && *last_slash != '/')									--last_slash;								if (last_slash >= ret.c_str())									count = static_cast<unsigned int>(last_slash - ret.c_str() + 1);								++ptr;								state = LASTSLASH;							}							else							{								ret[count++] = '.';								ret[count++] = '.';								ret[count++] = *ptr;								++ptr;								state = NONE;							}							break;						default:							if (*ptr == '/')							{								ret[count++] = *ptr;								++ptr;								state = LASTSLASH;							}							else							{								ret[count++] = *ptr;								++ptr;								state = NONE;							}					}				}				if (question_dash)				{					while (*ptr)					{						ret[count++] = *ptr;						++ptr;					}				}				ret.erase(count);			}			return ret;		}		string convert_link(const string& relative, const Uri& root)		{			string url(relative);						url = HTML::decode_entities(url);			string::size_type a;			a = 0;			while ((a = url.find_first_of(" \r\n", a)) != string::npos)			{				switch (url[a])				{					case ' ':						url.replace(a, 1, "%20");						break;					case '\r':						url.erase(a, 1);						break;					case '\n':						url.erase(a, 1);						break;				}			}			Uri uri;			try			{				Uri rel(url);				uri = rel.absolute(root);				uri.path(normalize_slashs(uri.path()));			}			catch (Uri::Exception)			{				return string();			}			return uri.unparse(Uri::REMOVE_FRAGMENT);		}		string __serialize_gml(const tree<HTML::Node> &tr, tree<HTML::Node>::iterator it, tree<HTML::Node>::iterator end, unsigned int parent_id, unsigned int& label) {			using namespace std;			ostrstream ret;			tree<HTML::Node>::sibling_iterator sib = tr.begin(it);			while(sib != tr.end(it)) {				ret << "node [ id " << ++label << "\n label \"" << label << "\"\n]\n";				ret << "edge [ \n source " << parent_id << "\n target " << label << "\n]" << endl;				ret << __serialize_gml(tr, sib, end, label, label);				++sib;			}				ret << ends;			string str = ret.str();			ret.freeze(0);			return str;		}		string serialize_gml(const tree<HTML::Node> &tr) {			using namespace std;			tree<HTML::Node>::pre_order_iterator it = tr.begin();			tree<HTML::Node>::pre_order_iterator end = tr.end();			string ret;			ret += "graph [";			ret += "directed 1\n";			ret += "node [ id 0\n label \"0\"\n ]\n";			unsigned int label = 0;			ret += __serialize_gml(tr, it, end, 0, label);			ret += "]";			return ret;		}	}//namespace html}//namespace htmlcxx
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -