⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 htmlparse.cpp

📁 html文件编码转换与解析
💻 CPP
字号:
#include "filetobuf.h"
#include "htmlparse.h"

// NOTE(review): buf, bufparseflag, parseposition, endflag, text, tag, fanchor,
// ftext, IsBufEnd(), MakeLower(), RemoveTarget(), initread(), readbuf() and
// ReadEnd() are declared outside this file. The comments below describe them
// only as far as their use in this file shows. In particular, IsBufEnd(i)
// appears to advance the index it is given and to return false once the input
// is exhausted -- confirm against filetobuf.h.

namespace {

// Parse() stops scanning a buffer once parseposition reaches this value.
// NOTE(review): presumably the length of one buf[] slot -- confirm.
const int kParseLimit = 1024;

// True for the whitespace characters that may separate HTML attributes.
bool IsHtmlSpace(char c) {
	return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == '\v';
}

// True for characters that can be part of an attribute name.
bool IsAttrNameChar(char c) {
	return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
			|| (c >= '0' && c <= '9') || c == '-' || c == '_';
}

// True when `tag` starts with `name` and the name ends right there, i.e. it is
// followed by the end of the tag, whitespace or '/'. This keeps "/a" from
// matching "/abbr" or "/article".
bool HasTagName(const string &tag, const char *name) {
	const string wanted(name);
	if (tag.compare(0, wanted.size(), wanted) != 0) {
		return false;
	}
	if (tag.size() == wanted.size()) {
		return true;
	}
	const char next = tag[wanted.size()];
	return IsHtmlSpace(next) || next == '/';
}

// Returns the index of the first character of the href attribute value in
// `tag` (just past "href", optional whitespace, '=' and optional whitespace),
// or string::npos when the tag has no href attribute. The returned index may
// equal tag.size() when the value is empty.
string::size_type FindHrefValue(const string &tag) {
	string::size_type pos = tag.find("href");
	while (pos != string::npos) {
		// Reject matches inside longer names such as "data-href".
		const bool startsName = (pos == 0) || !IsAttrNameChar(tag[pos - 1]);
		string::size_type cur = pos + 4;
		while (cur < tag.size() && IsHtmlSpace(tag[cur])) {
			++cur;
		}
		// Requiring '=' here also rejects names such as "hreflang".
		if (startsName && cur < tag.size() && tag[cur] == '=') {
			++cur;
			while (cur < tag.size() && IsHtmlSpace(tag[cur])) {
				++cur;
			}
			return cur;
		}
		pos = tag.find("href", pos + 4);
	}
	return string::npos;
}

} // namespace

// Extracts the URL from the href attribute of an anchor tag and writes it,
// followed by a newline, to fanchor.
//
// tag: the text between '<' and '>' of an <a ...> tag. Only a lowercase
//      "href" is recognised.
// Returns true when an href attribute was found. When none is found an empty
// line is written instead, so that every anchor still contributes one URL line
// to the output, and false is returned.
//
// The value may be enclosed in double or single quotes; a missing closing
// quote means the rest of the tag is taken. An unquoted value ends at the
// first whitespace character.
bool GetAnchorURL(string &tag) {
	const string::size_type valueStart = FindHrefValue(tag);
	if (valueStart == string::npos) {
		fanchor << endl;
		return false;
	}

	// Everything after "href =" up to the end of the tag or the first NUL.
	string url = tag.substr(valueStart);
	const string::size_type nul = url.find('\0');
	if (nul != string::npos) {
		url.erase(nul);
	}

	// NOTE(review): RemoveTarget() is defined elsewhere; presumably it strips
	// a trailing target attribute -- confirm.
	string cleaned = RemoveTarget(url);

	string realurl;
	if (!cleaned.empty() && (cleaned[0] == '"' || cleaned[0] == '\'')) {
		const string::size_type closing = cleaned.find(cleaned[0], 1);
		if (closing == string::npos) {
			realurl = cleaned.substr(1);
		} else {
			realurl = cleaned.substr(1, closing - 1);
		}
	} else {
		string::size_type end = 0;
		while (end < cleaned.size() && !IsHtmlSpace(cleaned[end])) {
			++end;
		}
		realurl = cleaned.substr(0, end);
	}

	fanchor << realurl << endl;
	return true;
}

// Creates a parser with an empty URL and base URL.
HtmlParser::HtmlParser() {
}

// Creates a parser for the document at `input`; the same string is used as
// the base URL.
HtmlParser::HtmlParser(string input) {
	URL = input;
	SetBaseURL(URL);
}

// Nothing to release.
HtmlParser::~HtmlParser() {
}

// Turns a relative URL into an absolute one by appending it to the base URL.
// This is plain concatenation; "..", leading '/' and the like are not
// resolved.
string HtmlParser::GetAbusoluteURL(string input) {
	return baseURL + input;
}

// Sets the base URL of the document being parsed. Always returns true.
bool HtmlParser::SetBaseURL(string URL) {
	baseURL = URL;
	return true;
}

// Returns the document URL (used for testing).
string HtmlParser::GetURL() {
	return URL;
}

// Returns the base URL (used for testing).
string HtmlParser::GetBaseURL() {
	return baseURL;
}

// Parses the current buffer from parseposition onwards.
//
// Characters outside tags are handed to ParseText(); <title>, <a>, <script>
// and <style> elements are dispatched to their handlers; every other tag is
// skipped.
// Returns false when the end of the input is reached inside a tag or inside
// one of the handled elements, true once the scan position reaches
// kParseLimit.
bool HtmlParser::Parse() {
	while (parseposition < kParseLimit) {
		if (buf[bufparseflag][parseposition] != '<') {
			ParseText();
			parseposition++;
			continue;
		}

		switch (ParseTag()) {
		case title:
			if (!ParseTitle()) {
				return false;
			}
			break;
		case anchor:
			GetAnchorURL(tag);
			if (!ParseA()) {
				return false;
			}
			break;
		case script:
			if (!ParseScript()) {
				return false;
			}
			break;
		case style:
			if (!ParseStyle()) {
				return false;
			}
			break;
		case fileend:
			return false;
		default:
			break;
		}
	}
	return true;
}

// Reads the tag that starts at parseposition (which must be on '<'), stores
// its lowercased text without the angle brackets in `tag`, and classifies it.
//
// On success parseposition is left just past the closing '>'. For a comment
// whose first '>' is not preceded by "--", scanning continues up to the real
// "-->" terminator.
// Returns fileend, without updating parseposition, when the input ends before
// the tag is complete.
//
// NOTE(review): MakeLower() is applied to the whole tag, so attribute values
// such as the href passed on to GetAnchorURL() are lowercased as well.
tagtype HtmlParser::ParseTag() {
	tag.clear();
	int i = parseposition;
	// Step over the opening '<'.
	if (!IsBufEnd(i)) {
		return fileend;
	}

	while (buf[bufparseflag][i] != '>') {
		tag += buf[bufparseflag][i];
		if (!IsBufEnd(i)) {
			return fileend;
		}
	}
	// Step over the closing '>'; the result is deliberately ignored, as the
	// tag itself is complete.
	IsBufEnd(i);
	parseposition = i;

	MakeLower(tag);

	if (HasTagName(tag, "title")) {
		return title;
	}
	// An anchor is only reported when attributes follow the name.
	if (tag.size() >= 2 && tag[0] == 'a' && IsHtmlSpace(tag[1])) {
		return anchor;
	}
	if (HasTagName(tag, "/title")) {
		return titleend;
	}
	if (HasTagName(tag, "/a")) {
		return anchorend;
	}
	if (HasTagName(tag, "script")) {
		return script;
	}
	if (tag.compare(0, 3, "!--") == 0) {
		const string::size_type len = tag.size();
		const bool closed = (tag[len - 1] == '-') && (tag[len - 2] == '-');
		if (!closed) {
			// The '>' that ended the scan above was inside the comment body;
			// keep going until a '>' that directly follows "--".
			i = parseposition;
			// NOTE(review): kept from the original. If IsBufEnd() advances
			// the index, this skips the character right after that '>' --
			// confirm this is intended.
			if (!IsBufEnd(i)) {
				return fileend;
			}
			char prev = '\0';
			char prevPrev = '\0';
			while (!((buf[bufparseflag][i] == '>') && (prev == '-') && (prevPrev == '-'))) {
				prevPrev = prev;
				prev = buf[bufparseflag][i];
				if (!IsBufEnd(i)) {
					return fileend;
				}
			}
			parseposition = i + 1;
		}
		return comment;
	}
	if (HasTagName(tag, "style")) {
		return style;
	}
	if (HasTagName(tag, "/style")) {
		return styleend;
	}
	return other;
}

// Collects the text of a <title> element, starting just past the opening tag,
// and writes it as one line to ftext. Tags nested inside the title are
// dropped.
// Returns false, writing nothing, when the input ends before </title>.
bool HtmlParser::ParseTitle() {
	string titleText;
	for (;;) {
		int i = parseposition;
		while (buf[bufparseflag][i] != '<') {
			titleText += buf[bufparseflag][i];
			if (!IsBufEnd(i)) {
				return false;
			}
		}
		parseposition = i;

		const tagtype type = ParseTag();
		if (type == fileend) {
			// ParseTag() leaves parseposition on the '<' in this case, so
			// looping again would never make progress.
			return false;
		}
		if (type == titleend) {
			break;
		}
	}
	ftext << titleText << endl;
	return true;
}

// Collects the text of an <a> element, starting just past the opening tag,
// and writes it as one line to fanchor. Tags nested inside the anchor are
// dropped.
// Returns false, writing nothing, when the input ends before </a>.
bool HtmlParser::ParseA() {
	string anchorText;
	for (;;) {
		int i = parseposition;
		while (buf[bufparseflag][i] != '<') {
			anchorText += buf[bufparseflag][i];
			if (!IsBufEnd(i)) {
				return false;
			}
		}
		parseposition = i;

		const tagtype type = ParseTag();
		if (type == fileend) {
			// See ParseTitle(): retrying after fileend cannot make progress.
			return false;
		}
		if (type == anchorend) {
			break;
		}
	}
	fanchor << anchorText << endl;
	return true;
}

// Skips the body of a <script> element. Input is consumed until the exact
// character sequence "</script>" (compared case-insensitively) has been read;
// parseposition is then left just past it.
// Returns false when the input ends first.
bool HtmlParser::ParseScript() {
	static const char kClosing[] = "</script>";
	const int kWindow = 9; // strlen("</script>")

	// Ring buffer holding the last kWindow characters read, lowercased.
	// Zero-filled so that the comparison below never reads indeterminate
	// values before the window has filled up.
	char window[kWindow] = { 0 };
	int next = 0; // slot to overwrite next == oldest character in the window
	int i = parseposition;

	bool found = false;
	while (!found) {
		char c = buf[bufparseflag][i];
		if ((c >= 'A') && (c <= 'Z')) {
			c = static_cast<char>(c + ('a' - 'A'));
		}
		window[next] = c;
		next = (next + 1) % kWindow;

		found = true;
		for (int k = 0; k < kWindow; ++k) {
			if (window[(next + k) % kWindow] != kClosing[k]) {
				found = false;
				break;
			}
		}

		if (!IsBufEnd(i)) {
			return false;
		}
	}
	parseposition = i;
	return true;
}

// Skips the body of a <style> element, up to and including </style>.
// Returns false when the input ends first.
bool HtmlParser::ParseStyle() {
	for (;;) {
		int i = parseposition;
		while (buf[bufparseflag][i] != '<') {
			if (!IsBufEnd(i)) {
				return false;
			}
		}
		parseposition = i;

		const tagtype type = ParseTag();
		if (type == fileend) {
			// See ParseTitle(): retrying after fileend cannot make progress.
			return false;
		}
		if (type == styleend) {
			break;
		}
	}
	return true;
}

// Appends the character at parseposition to `text`, unless it is a newline,
// tab, vertical tab or carriage return. Does not advance parseposition.
// Always returns true.
bool HtmlParser::ParseText() {
	const char c = buf[bufparseflag][parseposition];
	const bool dropped = (c == '\n') || (c == '\t') || (c == '\v') || (c == '\r');
	if (!dropped) {
		text += c;
	}
	return true;
}

// Asks for an input HTML file and two output files, parses the input buffer
// by buffer, and writes anchors to the first output file and titles plus
// plain text to the second.
// Returns 1 when an output file cannot be opened, 0 otherwise.
int main() {
	char filename[100] = "";
	cout << "Please enter the file to be parsed:" << endl;
	// width() caps the extraction at sizeof - 1 characters, so an over-long
	// name cannot overflow the array. It resets after every extraction.
	cin.width(sizeof(filename));
	cin >> filename;
	initread(filename);

	cout << "Please enter the file of saving the anchor" << endl;
	char anchorfilename[100] = "";
	cin.width(sizeof(anchorfilename));
	cin >> anchorfilename;
	fanchor.open(anchorfilename, ios::out | ios::trunc);
	if (!fanchor) {
		cout << "Cannot open " << anchorfilename << endl;
		ReadEnd();
		return 1;
	}

	char textfilename[100] = "";
	cout << "Please enter the file of saving the text" << endl;
	cin.width(sizeof(textfilename));
	cin >> textfilename;
	ftext.open(textfilename, ios::out | ios::trunc);
	if (!ftext) {
		cout << "Cannot open " << textfilename << endl;
		ReadEnd();
		return 1;
	}

	string URL;
	HtmlParser hp(URL);
	do {
		endflag = readbuf();
		parseposition = 0;
		// NOTE(review): 5 is presumably the number of buf[] slots -- confirm
		// against filetobuf.h.
		bufparseflag = (bufparseflag + 1) % 5;
		hp.Parse();
	} while (!endflag);

	ftext << text;
	ReadEnd();
	return 0;
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -