📄 htmlparse.cpp
字号:
#include "filetobuf.h"
#include "htmlparse.h"

// NOTE: buf, bufparseflag, parseposition, endflag, text, fanchor, ftext,
// IsBufEnd(), MakeLower(), RemoveTarget(), initread(), readbuf() and ReadEnd()
// are declared outside this file. Judging from how this file uses it,
// IsBufEnd(i) advances `i` to the next input character and returns false once
// the input is exhausted -- TODO confirm against filetobuf.h.

// Returns true when `tag` begins with `prefix`.
static bool TagHasPrefix(const string &tag, const string &prefix)
{
    return tag.compare(0, prefix.size(), prefix) == 0;
}

// Extract the URL from the text of an <a href="..."> tag and write it, followed
// by a newline, to fanchor.
//
// tag: the tag text without the surrounding '<' and '>'.
// Returns true when a URL was written. Returns false (and writes nothing) when
// the tag contains no "href" or no '=' after it; the original code read past
// the end of the string in those cases.
bool GetAnchorURL(string &tag)
{
    const string::size_type hrefPos = tag.find("href");
    if (hrefPos == string::npos) {
        return false;
    }

    const string::size_type equalsPos = tag.find('=', hrefPos + 4);
    if (equalsPos == string::npos) {
        return false;
    }

    // Skip the blanks between '=' and the attribute value.
    string::size_type valuePos = equalsPos + 1;
    while (valuePos < tag.size() && tag[valuePos] == ' ') {
        ++valuePos;
    }

    // The value runs to the end of the tag (or to the first NUL, as before).
    string::size_type valueEnd = tag.find('\0', valuePos);
    if (valueEnd == string::npos) {
        valueEnd = tag.size();
    }
    string url = tag.substr(valuePos, valueEnd - valuePos);

    string tempurl = RemoveTarget(url);

    // Strip one level of matching single or double quotes. If the closing
    // quote is missing, everything after the opening quote is kept.
    string realurl;
    const char first = tempurl.empty() ? '\0' : tempurl[0];
    if (first == '"' || first == '\'') {
        const string::size_type closePos = tempurl.find(first, 1);
        if (closePos == string::npos) {
            realurl = tempurl.substr(1);
        } else {
            realurl = tempurl.substr(1, closePos - 1);
        }
    } else {
        realurl = tempurl;
    }

    fanchor << realurl << endl;
    return true;
}

// Construct a parser with no URL set.
HtmlParser::HtmlParser()
{
}

// Construct a parser for `input`; the same string is used as the base URL.
// (The original body also contained the statement `HtmlParser();`, which only
// built and discarded a temporary object, so it has been removed.)
HtmlParser::HtmlParser(string input)
{
    URL = input;
    SetBaseURL(URL);
}

// Destructor; nothing to release.
HtmlParser::~HtmlParser()
{
}

// Turn a relative URL into an absolute one by appending it to the base URL.
string HtmlParser::GetAbusoluteURL(string input)
{
    return baseURL + input;
}

// Set the base URL of the document being parsed. Always returns true.
bool HtmlParser::SetBaseURL(string URL)
{
    baseURL = URL;
    return true;
}

// Return the URL given at construction (used for testing).
string HtmlParser::GetURL()
{
    return URL;
}

// Return the base URL (used for testing).
string HtmlParser::GetBaseURL()
{
    return baseURL;
}

// Parse the current buffer starting at parseposition.
//
// Tags are dispatched to the matching Parse* handler; every other character is
// passed to ParseText(). Returns false when a handler fails or the end of the
// input is reached, true once the position reaches the parse limit.
bool HtmlParser::Parse()
{
    // Presumably the size of one buffer in filetobuf.h -- TODO confirm.
    const int kParseLimit = 1024;

    // '<' rather than '!=' so that a position past the limit cannot keep the
    // loop reading.
    while (parseposition < kParseLimit) {
        if (buf[bufparseflag][parseposition] != '<') {
            ParseText();
            parseposition++;
            continue;
        }

        const tagtype type = ParseTag();
        switch (type) {
        case title:
            if (!ParseTitle()) {
                return false;
            }
            break;
        case anchor:
            GetAnchorURL(tag);
            if (!ParseA()) {
                return false;
            }
            break;
        case script:
            if (!ParseScript()) {
                return false;
            }
            break;
        case style:
            if (!ParseStyle()) {
                return false;
            }
            break;
        case fileend:
            return false;
        default:
            break;
        }
    }
    return true;
}

// Read the tag that starts at parseposition (which must be on '<'), store its
// lower-cased text in `tag`, move parseposition past the closing '>' and
// classify the tag.
//
// Returns fileend when the input ends inside the tag.
tagtype HtmlParser::ParseTag()
{
    tag.clear();

    int i = parseposition;
    if (!IsBufEnd(i)) {          // step over '<'
        return fileend;
    }

    // Collect everything up to the closing '>'.
    while (buf[bufparseflag][i] != '>') {
        tag += buf[bufparseflag][i];
        if (!IsBufEnd(i)) {
            return fileend;
        }
    }
    IsBufEnd(i);                 // step over '>'
    parseposition = i;

    // NOTE(review): this lower-cases the whole tag, attribute values included,
    // so URLs later extracted from `tag` lose their original case.
    MakeLower(tag);

    if (TagHasPrefix(tag, "title")) {
        return title;
    }
    if (TagHasPrefix(tag, "a ")) {
        return anchor;
    }
    if (TagHasPrefix(tag, "/title")) {
        return titleend;
    }
    if (TagHasPrefix(tag, "/a")) {
        return anchorend;
    }
    if (TagHasPrefix(tag, "script")) {
        return script;
    }
    if (TagHasPrefix(tag, "!--")) {
        // The tag text stops at the first '>', which may sit inside the
        // comment. The comment is complete only if the text ends with "--".
        // (The original tested this with '&&', so a tag ending in "x-" was
        // wrongly taken as complete.)
        const string::size_type len = tag.size();
        const bool closed = (tag[len - 1] == '-') && (tag[len - 2] == '-');
        if (!closed) {
            i = parseposition;
            // NOTE(review): kept from the original -- this steps over the
            // character at parseposition without examining it; confirm that
            // this is intended.
            if (!IsBufEnd(i)) {
                return fileend;
            }

            // Scan for "-->", remembering only the last two characters seen.
            // (The original indexed into an empty wstring here.)
            char last = '\0';
            char beforeLast = '\0';
            while (!((buf[bufparseflag][i] == '>') && (last == '-') && (beforeLast == '-'))) {
                beforeLast = last;
                last = buf[bufparseflag][i];
                if (!IsBufEnd(i)) {
                    return fileend;
                }
            }
            parseposition = i + 1;
        }
        return comment;
    }
    if (TagHasPrefix(tag, "style")) {
        return style;
    }
    if (TagHasPrefix(tag, "/style")) {
        return styleend;
    }
    return other;
}

// Collect the text between <title> and </title> and write it, followed by a
// newline, to ftext. Tags found inside the title are skipped.
//
// Returns false when the input ends before </title> is seen.
bool HtmlParser::ParseTitle()
{
    string titleText;
    for (;;) {
        int i = parseposition;
        while (buf[bufparseflag][i] != '<') {
            titleText += buf[bufparseflag][i];
            if (!IsBufEnd(i)) {
                return false;
            }
        }
        parseposition = i;

        const tagtype type = ParseTag();
        if (type == titleend) {
            break;
        }
        if (type == fileend) {
            // Without this the loop would re-read the same '<' forever.
            return false;
        }
    }
    ftext << titleText << endl;
    return true;
}

// Collect the text between <a ...> and </a> and write it, followed by a
// newline, to fanchor. Tags found inside the anchor are skipped.
//
// Returns false when the input ends before the closing tag is seen.
bool HtmlParser::ParseA()
{
    string anchorText;
    for (;;) {
        int i = parseposition;
        while (buf[bufparseflag][i] != '<') {
            anchorText += buf[bufparseflag][i];
            if (!IsBufEnd(i)) {
                return false;
            }
        }
        parseposition = i;

        const tagtype type = ParseTag();
        if (type == anchorend) {
            break;
        }
        if (type == fileend) {
            // Without this the loop would re-read the same '<' forever.
            return false;
        }
    }
    fanchor << anchorText << endl;
    return true;
}

// Skip the body of a <script> element: advance until the nine characters
// "</script>" (matched case-insensitively for A-Z) have just been read.
//
// Returns false when the input ends first.
bool HtmlParser::ParseScript()
{
    static const char kClosingTag[] = "</script>";
    const int kWindow = 9;            // number of characters in kClosingTag

    // Ring buffer of the last kWindow characters, zero-filled so that no
    // uninitialised byte is ever compared.
    char window[9] = {};
    int next = 0;                     // slot that receives the next character

    int i = parseposition;
    bool closed = false;
    while (!closed) {
        char c = buf[bufparseflag][i];
        if ((c >= 'A') && (c <= 'Z')) {
            c += 32;                  // ASCII upper -> lower
        }
        window[next] = c;
        next = (next + 1) % kWindow;

        // `next` now indexes the oldest character in the window.
        closed = true;
        for (int k = 0; k < kWindow; ++k) {
            if (window[(next + k) % kWindow] != kClosingTag[k]) {
                closed = false;
                break;
            }
        }

        if (!IsBufEnd(i)) {
            return false;
        }
    }
    parseposition = i;
    return true;
}

// Skip the body of a <style> element up to and including </style>.
//
// Returns false when the input ends first.
bool HtmlParser::ParseStyle()
{
    for (;;) {
        int i = parseposition;
        while (buf[bufparseflag][i] != '<') {
            if (!IsBufEnd(i)) {
                return false;
            }
        }
        parseposition = i;

        const tagtype type = ParseTag();
        if (type == styleend) {
            break;
        }
        if (type == fileend) {
            // Without this the loop would re-read the same '<' forever.
            return false;
        }
    }
    return true;
}

// Append the character at parseposition to `text`, unless it is a newline,
// tab, vertical tab or carriage return. Always returns true.
bool HtmlParser::ParseText()
{
    switch (buf[bufparseflag][parseposition]) {
    case '\n':
    case '\t':
    case '\v':
    case '\r':
        break;
    default:
        text += buf[bufparseflag][parseposition];
        break;
    }
    return true;
}

// Ask for the input file and the two output files, then parse the input one
// buffer at a time.
int main()
{
    char filename[100];
    cout << "Please enter the file to be parsed:" << endl;
    cin.width(sizeof(filename));      // never read more than the array holds
    cin >> filename;
    initread(filename);

    cout << "Please enter the file of saving the anchor" << endl;
    char anchorfilename[100];
    cin.width(sizeof(anchorfilename));
    cin >> anchorfilename;
    fanchor.open(anchorfilename, ios::out | ios::trunc);
    if (!fanchor) {
        cerr << "Cannot open anchor output file: " << anchorfilename << endl;
        ReadEnd();
        return 1;
    }

    char textfilename[100];
    cout << "Please enter the file of saving the text" << endl;
    cin.width(sizeof(textfilename));
    cin >> textfilename;
    ftext.open(textfilename, ios::out | ios::trunc);
    if (!ftext) {
        cerr << "Cannot open text output file: " << textfilename << endl;
        ReadEnd();
        return 1;
    }

    string URL;
    HtmlParser hp(URL);               // automatic storage: nothing to leak

    do {
        endflag = readbuf();
        parseposition = 0;
        bufparseflag = (bufparseflag + 1) % 5;
        hp.Parse();
    } while (!endflag);

    ftext << text;

    ReadEnd();
    return 0;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -