📄 page.cpp
字号:
/*Page handling */#include <iostream>#include <string>#include <map>#include <vector>#include <iterator>#include "Url.h"#include "Page.h"#include "StrFun.h"CPage::CPage(){ this->m_sLocation = ""; this->m_sCharset = ""; this->m_sContentEncoding = ""; this->m_sContentType = ""; this->m_sContentLinkInfo = ""; this->m_sLinkInfo4SE = ""; this->m_sLinkInfo4History = ""; this->m_sContentNoTags = ""; this->m_nRefLink4SENum = 0; this->m_nRefLink4HistoryNum = 0; this->m_eType = PLAIN_TEXT; for(int i=0; i< MAX_URL_REFERENCES; i++ ){ this->m_RefLink4SE[i].link = NULL; this->m_RefLink4SE[i].anchor_text = NULL; this->m_RefLink4SE[i].strCharset = ""; if(i < MAX_URL_REFERENCES/2){ this->m_RefLink4History[i].link = NULL; } }}CPage::CPage( string strUrl, string strLocation, char* header, char* body, int nLenBody){ assert( header != NULL ); assert( body != NULL ); assert( nLenBody > 0 ); //CPage(); this->m_sLocation = ""; this->m_sCharset = ""; this->m_sContentEncoding = ""; this->m_sContentType = ""; this->m_sContentLinkInfo = ""; this->m_sLinkInfo4SE = ""; this->m_sLinkInfo4History = ""; this->m_sContentNoTags = ""; this->m_nRefLink4SENum = 0; this->m_nRefLink4HistoryNum = 0; this->m_eType = PLAIN_TEXT; for(int i=0; i< MAX_URL_REFERENCES; i++ ){ this->m_RefLink4SE[i].link = NULL; this->m_RefLink4SE[i].anchor_text = NULL; this->m_RefLink4SE[i].strCharset = ""; if(i < MAX_URL_REFERENCES/2){ this->m_RefLink4History[i].link = NULL; } } this->m_sUrl = strUrl; this->m_sLocation = strLocation; //this->m_sHeader.assign(header,strlen(header)); this->m_sHeader = header; this->m_nLenHeader = strlen(header); this->m_sContent.assign(body, nLenBody); this->m_nLenContent = nLenBody;}CPage::~CPage(){}int CPage::GetContentLinkInfo(){ assert( this->m_sContent.empty() == false ); this->m_sContentLinkInfo = this->m_sContent; string& s = this->m_sContentLinkInfo; // transform all separators into one space character const string delims(" \t\r\n"); string::size_type idx=0,pre_idx; while( (idx = s.find_first_of(delims, idx )) != string::npos ){ pre_idx = idx; s.replace(idx,1,1,' '); idx++; while( (idx = s.find_first_of(delims, idx)) != string::npos ){ if( idx-pre_idx == 1){ s.erase(idx, 1); }else{ break; } } idx--; } // transform all "<br>" into one space character //replace(s.begin(), s.end(), "<br>", " "); while( (idx = s.find("<br>")) != string::npos ){ s.replace(idx,4,1,' '); } if( s.length() < 20 ) return -1; // Keep only <img ...>, <area ...>,<script ...> and <a href ...> tags. string::size_type idxHref=0,idxArea=0,idxImg=0; string dest; do{ if( s.empty() ) break; //idxHref = CStrFun::FindCase(s, "<a href"); idxHref = CStrFun::FindCase(s, "href"); idxArea = CStrFun::FindCase(s, "<area"); idxImg = CStrFun::FindCase(s, "<img"); pre_idx = idxHref > idxArea? idxArea: idxHref; pre_idx = idxImg > pre_idx? pre_idx: idxImg; if( pre_idx == string::npos) break; s = s.substr(pre_idx); idx = s.find_first_of('<',1); if( idx != string::npos ){ dest = dest + s.substr(0,idx); }else{ break; } s = s.substr(idx); idxHref=0; idxArea=0; idxImg=0; }while(1); s = dest; /* erase all '\' character * too avoid the following situations: * document.write("<A href=\"/~webg/refpaper/index.html\">t2</A>"); */ while( (idx = s.find_first_of('\\', idx )) != string::npos ){ s.erase(idx, 1); } return( s.length() < 20 ? -1: 0 ); return 0;}int CPage::GetLinkInfo4SE(){ assert( this->m_sContentLinkInfo.empty() == false ); this->m_sLinkInfo4SE = this->m_sContentLinkInfo; string& s = this->m_sLinkInfo4SE; // Keep only <area ...>,and <a href ...> tags. string::size_type idxHref=0,idxArea=0, idx,pre_idx; string dest; do{ if( s.empty() ) break; //idxHref = CStrFun::FindCase(s, "<a href"); idxHref = CStrFun::FindCase(s, "href"); idxArea = CStrFun::FindCase(s, "<area "); pre_idx = idxHref > idxArea? idxArea: idxHref; //pre_idx = idxHref; if( pre_idx == string::npos) break; s = s.substr(pre_idx); idx = s.find_first_of('<',1); if( !(s.length() < 4) ){ idxHref = CStrFun::FindCaseFrom(s, "href", 4); idx = idx > idxHref ? idxHref: idx; } if( idx != string::npos ){ dest = dest + s.substr(0,idx); }else if (idx == string::npos && pre_idx != string::npos){ dest = dest + s; break; }else{ break; } s = s.substr(idx); idxHref=0; idxArea=0; }while(1); s = dest; if( s.length() < 20 ) return -1; //cout << endl << "in it: " << s << endl; // erase all '"' , '\'', " ". while( (idx = s.find('"')) != string::npos ){ s.erase(idx,1); } while( (idx = s.find('\'')) != string::npos ){ s.erase(idx,1); } while( (idx = s.find(" ")) != string::npos ){ s.erase(idx,6); } //cout << endl << "in it: " << s << endl; // Keep URLs and anchor text. idxHref=0; const string delims( " #>"); dest.clear(); do{ if( s.empty() ) break; idxHref = CStrFun::FindCase(s, "href"); if( idxHref == string::npos) break; pre_idx = idxHref; //#### idx = s.find('=', idxHref); if( idx == string::npos ) break; s = s.substr(idx+1); while( s.length() > 0 && s[0] == ' ' ){ s.erase(0,1); } if( s.length() == 0 ) break; idx = s.find_first_of(delims,1); //cout << endl << s.substr(0, idx) << endl; if( idx == string::npos ) break; dest += '"' + s.substr(0, idx); //cout << endl << dest << endl; idx = s.find('>'); if( idx == string::npos ) break; dest += '>'; s = s.substr(idx +1); idx = s.find('<'); if( !s.empty() ){ idxHref = CStrFun::FindCase(s, "href"); idx = idx > idxHref ? idxHref: idx; } if( idx == string::npos ){ dest += s; break; }/* if( idx == idxHref ){ dest += '"' + s.substr(0,idx); }else{*/ dest += s.substr(0,idx); //} //#### idxHref=0; }while(1); // look for empty filenames. idx = 0; while( (idx = dest.find("\"\"",idx)) != string::npos ){ dest.erase(idx, 1); } s = dest; return( s.length() < 20 ? -1: 0 );}int CPage::GetLinkInfo4History(){ assert( this->m_sContentLinkInfo.empty() == false ); this->m_sLinkInfo4History = this->m_sContentLinkInfo; string& s = this->m_sLinkInfo4History; // Keep only <img ...> tags. string::size_type idxImg=0, idx,pre_idx; string dest; do{ if( s.empty() ) break; idxImg = CStrFun::FindCase(s, "<img"); if( idxImg == string::npos) break; pre_idx = idxImg; s = s.substr(pre_idx); idx = s.find_first_of('<',1); if( idx != string::npos ){ dest = dest + s.substr(0,idx); }else if (idx == string::npos && pre_idx != string::npos){ dest = dest + s; break; }else{ break; } s = s.substr(idx); idxImg=0; }while(1); s = dest; if( s.length() < 20 ) return -1; // erase all '"'. while( (idx = s.find('"')) != string::npos ){ s.erase(idx,1); } // Keep URLs and anchor text. idxImg=0; string::size_type idxSrc = 0; const string delims( " \"#>"); dest.clear(); do{ if( s.empty() ) break; idxImg = CStrFun::FindCase(s, "img"); if( idxImg == string::npos) break; pre_idx = idxImg; s = s.substr(idxImg+3); // skip "img" //#### idx = s.find('>', idxImg); if( idxImg == string::npos) break; if( s.empty() ) break; idxSrc = CStrFun::FindCase(s, "src"); if( idxSrc > idxImg ) continue; s = s.substr(idxSrc); idx = s.find('=', idxImg); if( idx == string::npos ) break; s = s.substr(idx+1); while( s.length() > 0 && s[0] == ' ' ){ s.erase(0,1); } if( s.length() == 0 ) break; idx = s.find_first_of(delims,1); if( idx == string::npos ) break; if( s.at(0) == '"'){ dest += s.substr(0, idx); }else{ dest += '"' + s.substr(0, idx); } idx = s.find('>'); if( idx == string::npos ) break; dest += '>'; s = s.substr(idx +1); idx = s.find('<'); if( idx == string::npos ){ dest += s; break; } dest += s.substr(0,idx); //#### idxImg=0; }while(1); // look for empty filenames. idx = 0; while( (idx = dest.find("\"\"",idx)) != string::npos ){ dest.erase(idx, 1); } s = dest; return( s.length() < 20 ? -1: 0 );}int CPage::GetCharset(){ string headerBuf=this->m_sHeader, strCharset; string::size_type pre_idx,idx; const string delims(" \",;>"); CStrFun::Str2Lower( headerBuf, headerBuf.length() ); idx = headerBuf.find("charset="); if( idx != string::npos) { strCharset = headerBuf.substr(idx + sizeof("charset=") -1); } headerBuf = this->m_sContent; headerBuf = headerBuf.substr(0,2024) ; CStrFun::Str2Lower( headerBuf, headerBuf.length() ); idx = headerBuf.find("charset=");
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -