htmldocument.h

来自「k-means源码(K均值聚类算法源码)」· C头文件 代码 · 共 116 行

H
116
字号
#ifndef _HTMLDOCUMENT_H_
#define _HTMLDOCUMENT_H_

#include <cctype>
#include <cstring>
#include <fstream>
#include <ostream>
#include <sstream>
#include <string>

/**
 * @brief Names an HTML file and renders it as normalized plain text.
 *
 * The object stores only a file name. The file is opened and read each time
 * the document is inserted into an output stream with operator<<. The text
 * written is the file content with markup removed, a fixed set of
 * punctuation/symbol/digit characters blanked out, and letters lowercased.
 */
class HTMLDocument {
private:
	/// Path of the file that operator<< reads.
	std::string filename;

	/**
	 * @brief Replaces every non-overlapping occurrence of @p from in @p s with @p to.
	 *
	 * Scanning resumes after the inserted text, so occurrences of @p from that
	 * appear inside @p to are not replaced again.
	 *
	 * @param s    String edited in place.
	 * @param from Text to search for; an empty pattern leaves @p s unchanged.
	 * @param to   Replacement text.
	 * @return Reference to @p s.
	 */
	static std::string& replace_all(std::string& s, const std::string& from, const std::string& to)
	{
		// An empty pattern matches at every position and would never terminate.
		if (from.empty()) {
			return s;
		}

		std::string::size_type lookHere = 0;
		std::string::size_type foundHere;
		while ((foundHere = s.find(from, lookHere)) != std::string::npos) {
			s.replace(foundHere, from.size(), to);
			lookHere = foundHere + to.size();
		}
		return s;
	}

	/**
	 * @brief Tells whether @p c is one of the characters strip_tags blanks out.
	 *
	 * The set is the ASCII digits, the newline character and the listed
	 * punctuation/symbol characters. Carriage return and tab are not in the set.
	 */
	static bool is_blanked_char(char c)
	{
		static const char symbols[] = ",.;:'_~`-+=!?[]{}()<>/\\\"\n@#$%^&*|";

		if (c >= '0' && c <= '9') {
			return true;
		}
		// strchr also matches the terminating NUL, which must not count as a symbol.
		return c != '\0' && std::strchr(symbols, c) != NULL;
	}

	/**
	 * @brief Turns HTML text into lowercase words separated by spaces.
	 *
	 * Steps, in order:
	 *  1. every "<...>" span is replaced by a single space;
	 *  2. the entities "&lt;", "&gt;", "&amp;" and "&nbsp;" become a single space;
	 *  3. every character accepted by is_blanked_char becomes a space and every
	 *     other character is lowercased with std::tolower.
	 *
	 * A '<' with no closing '>' is not treated as a tag; step 3 turns it into a
	 * space like any other symbol.
	 *
	 * @param s Text edited in place.
	 * @return Reference to @p s.
	 */
	static std::string& strip_tags(std::string& s)
	{
		std::string::size_type leftPos = 0;
		while ((leftPos = s.find('<', leftPos)) != std::string::npos) {
			const std::string::size_type rightPos = s.find('>', leftPos + 1);
			if (rightPos == std::string::npos) {
				// No '>' remains anywhere after this point, so no further tag can
				// be closed; stop instead of retrying the same '<' forever.
				break;
			}
			s.replace(leftPos, rightPos - leftPos + 1, " ");
			// Continue after the space just written; nothing before it contains '<'.
			++leftPos;
		}

		// HTML special chars. These must be handled before the single-character
		// pass, which would otherwise break them apart at '&' and ';'.
		replace_all(s, "&lt;", " ");
		replace_all(s, "&gt;", " ");
		replace_all(s, "&amp;", " ");
		replace_all(s, "&nbsp;", " ");

		// Punctuation, symbols, digits and lowercasing in a single pass.
		for (std::string::size_type i = 0; i < s.size(); ++i) {
			if (is_blanked_char(s[i])) {
				s[i] = ' ';
			} else {
				// std::tolower requires a value representable as unsigned char.
				s[i] = static_cast<char>(std::tolower(static_cast<unsigned char>(s[i])));
			}
		}
		return s;
	}

public:
	/// Creates a document with an empty file name.
	HTMLDocument() {}

	/**
	 * @brief Creates a document that refers to the file @p filename_.
	 *
	 * The file is not opened here; it is read when the document is streamed.
	 */
	HTMLDocument(const std::string& filename_) : filename(filename_) {}

	/// Returns a mutable reference to the stored file name.
	std::string& get_filename()
	{
		return filename;
	}

	/// Returns the stored file name of a const document.
	const std::string& get_filename() const
	{
		return filename;
	}

	/**
	 * @brief Reads the document's file and writes its normalized text to @p os.
	 *
	 * The whole file is loaded into memory, passed through strip_tags and then
	 * inserted into @p os. If the file cannot be opened, an empty string is
	 * written.
	 *
	 * @param os Destination stream.
	 * @param d  Document whose file is read.
	 * @return @p os.
	 */
	friend std::ostream& operator << (std::ostream& os, const HTMLDocument& d)
	{
		std::ifstream in(d.filename.c_str());
		std::ostringstream ss;
		ss << in.rdbuf();
		std::string content = ss.str();
		os << HTMLDocument::strip_tags(content);
		return os;
	}
};

#endif /* _HTMLDOCUMENT_H_ */

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?