📄 webdatafile.cpp
字号:
// WebDataFile.cpp: implementation of the CWebDataFile class.
//
//////////////////////////////////////////////////////////////////////
#include <assert.h>
#include <string.h>
#include <fstream>
#include <iostream>
#include <new>
#include <string>
#include "WebDataFile.h"
#include "Page.h"
#include "StrFun.h"
using namespace std;
// Default data buffer length: 32 MB (32 * 1024 * 1024 bytes).
int CWebDataFile::m_nDefaultBufferLen = 32 * 1024 * 1024;
// Forward declaration; the function is defined later in this file.
void SaveLink4SE(CPage *iPage, string );
// Builds an idle reader: no file attached, no buffers allocated and every
// counter/flag cleared.
CWebDataFile::CWebDataFile()
{
    // File handle and the flags describing it.
    m_fRaw = NULL;
    m_sFileName = NULL;
    m_bFileOpen = false;
    m_bRawFormat = false;
    m_bEOF = false;
    m_bPageError = false;

    // Nothing is allocated here; the buffers start out empty.
    m_sDataBuffer = NULL;
    m_sUnzipBuffer = NULL;
    m_nBufferLen = 0;

    // Read/parse progress counters.
    m_nReadFileLen = 0;
    m_nDataLen = 0;
    m_nProcessedLen = 0;
}
// Releases everything the reader owns: the open file (if any), the stored
// file name and both buffers.
CWebDataFile::~CWebDataFile()
{
    CloseFile();

    // delete[] on a null pointer is a no-op, so no guards are needed.
    delete[] m_sFileName;
    m_sFileName = NULL;
    delete[] m_sDataBuffer;
    delete[] m_sUnzipBuffer;
}
bool CWebDataFile::OpenFile(const char * sFileName)
{
InitDataBuffer();
if(m_sFileName != NULL){
delete[] m_sFileName;
m_sFileName = NULL;
}
m_sFileName = new char[strlen(sFileName) +1];
strcpy(m_sFileName, sFileName);
m_fRaw = fopen(m_sFileName, "rb");
m_bFileOpen = (m_fRaw != NULL);
// Init file relative variables.
m_bRawFormat = false;
m_nReadFileLen = 0;
m_nDataLen = 0;
m_nProcessedLen = 0;
m_bEOF = false;
if(m_bFileOpen){
char sFormat[16];
if(fread(sFormat, 1, 15, m_fRaw) < 8){
//fclose(m_fRaw);
//m_fRaw = NULL;
//m_bFileOpen = false;
m_bRawFormat = true;
}else{
rewind(m_fRaw);
m_bRawFormat = (strncmp(sFormat, "version:", 8) == 0);
}
}
return m_bFileOpen;
}
// Closes the underlying file if one is open and marks the reader as closed.
// Safe to call when no file is open.
void CWebDataFile::CloseFile()
{
    m_bFileOpen = false;
    if(m_fRaw == NULL){
        return;
    }
    fclose(m_fRaw);
    m_fRaw = NULL;
}
// Allocates the data buffer on first use.
//
// nLen: requested buffer size in bytes. Ignored when a buffer already exists;
//       the existing buffer and its recorded length are kept.
// Returns true when a buffer is available, false when the allocation failed.
bool CWebDataFile::InitDataBuffer(unsigned int nLen)
{
    if(m_sDataBuffer != NULL)
        return true;

    // Plain new[] throws on failure and never yields NULL, which made the
    // failure branch unreachable; the nothrow form lets it report false.
    char *sBuffer = new (std::nothrow) char[nLen];
    if(sBuffer == NULL){
        return false;
    }
    m_sDataBuffer = sBuffer;
    m_nBufferLen = nLen;
    return true;
}
// Reads more data from the file and processes the next page record.
//
// sFileName: forwarded unchanged to GetARawWebPage().
// Returns the result of GetARawWebPage() for raw-format files, and false for
// any other format. Requires an allocated buffer and an open file (asserted).
bool CWebDataFile::GetAWebPage(string sFileName)
{
    assert(m_sDataBuffer != NULL);
    assert(m_fRaw != NULL);

    m_bPageError = false;

    // Top up the buffer from the file system before parsing.
    ReadFromFile();

    if(!m_bRawFormat){
        return false;
    }
    return GetARawWebPage(sFileName);
}
// Consumes the next record from the data buffer, builds a CPage from it and
// hands the page to SaveLink4SE().
//
// Record layout as read by this function:
//   - head lines, ended by an empty line ("\n\n");
//   - a "url:<value>" field within the last few head lines;
//   - "length:<n>" as the last head line;
//   - <n> bytes of data followed by one '\n'.
// The data part is split at its first empty line into a first part (passed to
// CPage::ParseHeaderInfo) and a second part (stored as the page content).
//
// sFileName: forwarded unchanged to SaveLink4SE().
// Returns true when a record was consumed, including records that are skipped
// because the content exceeds 600 KB or link parsing failed. Returns false at
// end of file or when the record is malformed; m_bPageError is set for
// malformed input.
bool CWebDataFile::GetARawWebPage(string sFileName)
{
    // Process from last point.
    bool bFound = false;
    int nPageBegin = m_nProcessedLen;

    // Try to find the end of the record head (second '\n' of an empty line).
    int nHeadEnd = nPageBegin;
    do{
        for( ; m_nProcessedLen<m_nDataLen; m_nProcessedLen++){
            if(m_sDataBuffer[m_nProcessedLen] == '\n'){
                if(m_nProcessedLen == nPageBegin){
                    // First byte of the record: no predecessor to compare with.
                    continue;
                }
                if(m_sDataBuffer[m_nProcessedLen-1] == '\n'){
                    bFound = true;
                    nHeadEnd = m_nProcessedLen;
                    break;
                }
            }
        }
        if(! bFound){
            // Now m_nProcessedLen = m_nDataLen;
            if(m_bEOF){
                if(m_nProcessedLen > (nPageBegin+3)){
                    m_bPageError = true;
                    cout << "This file seems to be not correctly ended." << endl;
                }
                cout << "return from bFound" << endl;
                return false;
            }
            if(nPageBegin == 0){
                if(m_nDataLen == m_nBufferLen){
                    // The buffer is full and still holds no complete head.
                    assert(m_nBufferLen > (4 * 1024));
                    m_bPageError = true;
                    cout << "Fail to get head's end." << endl;
                    return false;
                }
            }else{
                // Move the partial record to the front to make room for more data.
                memmove(m_sDataBuffer, m_sDataBuffer+nPageBegin,
                        static_cast<size_t>(m_nDataLen-nPageBegin));
                m_nDataLen -= nPageBegin;
                m_nProcessedLen -= nPageBegin;
                nPageBegin = 0;
            }
            const int nLenBeforeRead = m_nDataLen;
            ReadFromFile();
            if(!m_bEOF && m_nDataLen == nLenBeforeRead){
                // No bytes arrived and no EOF was reported (read error):
                // give up instead of looping forever.
                m_bPageError = true;
                cout << "Fail to read from file." << endl;
                return false;
            }
        }
    }while(! bFound); // Must find a Web page. except of EOF file.

    // Get the URL: look at up to the last five head lines (the original note
    // says this is "incase origin:").
    int nLineBegin = nPageBegin;
    int nCount = 0;
    for(int j=nHeadEnd-2; j>nPageBegin; j--){
        if(m_sDataBuffer[j] == '\n'){
            nCount++;
            if( nCount==5) {
                nLineBegin = j + 1;
                break;
            }
        }
    }
    string strHeadTail(m_sDataBuffer+nLineBegin, nHeadEnd-nLineBegin-1);
    char sURL[256];
    memset(sURL, 0, 256);
    // strstr() yields NULL when the field is missing, and the width limit
    // keeps an over-long URL from overflowing sURL.
    const char *pUrlField = strstr(strHeadTail.c_str(), "url:");
    if(pUrlField == NULL || sscanf(pUrlField, "url:%255s", sURL) != 1){
        m_bPageError = true;
        cout << "Fail to get url." << endl;
        return false;
    }
    cout << sURL << endl;

    // Get data size from the last head line.
    nLineBegin = nPageBegin;
    for(int j=nHeadEnd-2; j>nPageBegin; j--){
        if(m_sDataBuffer[j] == '\n'){
            nLineBegin = j + 1;
            break;
        }
    }
    string strLenLine(m_sDataBuffer+nLineBegin, nHeadEnd-nLineBegin-1);
    int nDataLen = 0;
    // A negative length would produce negative buffer indices further down.
    if(sscanf(strLenLine.c_str(), "length:%d", &nDataLen) != 1 || nDataLen < 0){
        m_bPageError = true;
        cout << "Fail to get data size." << endl;
        return false;
    }

    // Work out the full record length: head + '\n' + data + '\n'.
    // nDataLen is checked on its own first so the sum cannot overflow on an
    // absurd length value.
    int nHeadLen = nHeadEnd - nPageBegin;
    if(nDataLen > m_nBufferLen || nHeadLen + nDataLen + 2 > m_nBufferLen){
        cout << "Find a web page whose size is larger than " << m_nBufferLen << "!" << endl;
        m_bPageError = true;
        return false;
    }
    int nPageLen = nHeadLen + nDataLen + 2;

    m_nProcessedLen = nPageBegin + nPageLen;
    if(m_nProcessedLen > m_nBufferLen){
        // The record does not fit behind nPageBegin: move it to the front.
        memmove(m_sDataBuffer, m_sDataBuffer+nPageBegin,
                static_cast<size_t>(m_nDataLen-nPageBegin));
        m_nDataLen -= nPageBegin;
        m_nProcessedLen -= nPageBegin;
        // nHeadEnd is an absolute buffer index and must move with the data.
        nHeadEnd -= nPageBegin;
        nPageBegin = 0;
    }
    while(m_nProcessedLen > m_nDataLen){
        if(m_bEOF){
            cout << "This file is not correctly ended." << endl;
            m_bPageError = true;
            return false;
        }
        const int nLenBeforeRead = m_nDataLen;
        ReadFromFile();
        if(!m_bEOF && m_nDataLen == nLenBeforeRead){
            // Read error without EOF: stop rather than spin.
            cout << "Fail to read from file." << endl;
            m_bPageError = true;
            return false;
        }
    }
    if(m_sDataBuffer[m_nProcessedLen-1] != '\n'){
        cout << "Find a page that is not correctly ended." << endl;
        m_bPageError = true;
        return false;
    }
    m_sDataBuffer[m_nProcessedLen-1] = '\0';

    // Split the data at its first empty line into the two body parts.
    const int nBodyBegin = nHeadEnd + 1;
    const int nBodyEnd = nPageBegin + nPageLen - 1; // index of the terminator
    string sBuf1;
    string sBuf2;
    for(int i=nBodyBegin; i<nBodyEnd; i++){
        if (m_sDataBuffer[i]=='\n' && m_sDataBuffer[i-1]=='\n') {
            // When the empty line directly follows the head the first part is
            // empty; clamp so the length never becomes -1.
            int nFirstLen = i - 1 - nBodyBegin;
            if(nFirstLen < 0){
                nFirstLen = 0;
            }
            sBuf1.assign(m_sDataBuffer+nBodyBegin, nFirstLen);
            sBuf2.assign(m_sDataBuffer+i+1, nBodyEnd-1-i);
            break;
        }
    }

    // yhf
    CPage iPage;
    iPage.m_sUrl = sURL;
    // Debugging aid for one specific URL. sURL is a char array, so the
    // contents must be compared with strcmp(); '==' compared addresses.
    if (strcmp(sURL, "http://www.yanruyu.net/service/domainwhois.asp") == 0){
        cout << "\n" << sBuf1 << endl;
    }
    iPage.m_sContent = sBuf2;
    if (iPage.m_sContent.size() > 600*1024){
        return true; // donot stop
    }
    iPage.ParseHeaderInfo(sBuf1);
    if (iPage.m_sContentType == "text/html") {
        if (iPage.ParseHyperLinks() == false){
            cout << "error! ParseHyperLinks" << endl;
            return true; // donot stop
        }
    }
    SaveLink4SE( &iPage, sFileName);
    return true;
}
// Appends file data to the free tail of the data buffer and updates the
// byte counters. Does nothing once the end of data has been recorded.
//
// m_bEOF is set when the stream reports end-of-file or a read error; a read
// error delivers no further bytes, so callers that loop until enough data is
// available must be able to stop on it as well.
void CWebDataFile::ReadFromFile()
{
    if(m_bEOF) // end of file.
        return;
    if(m_nDataLen < m_nBufferLen){
        int nRead = static_cast<int>(fread(m_sDataBuffer+m_nDataLen, 1,
            (m_nBufferLen - m_nDataLen), m_fRaw));
        m_nDataLen += nRead;
        m_nReadFileLen += nRead;
    }
    m_bEOF = (feof(m_fRaw) != 0) || (ferror(m_fRaw) != 0);
}
// Returns the file offset up to which data has been processed: the number of
// bytes read so far minus the bytes still waiting unprocessed in the buffer.
int CWebDataFile::GetProcessedFileLen()
{
    return m_nReadFileLen - (m_nDataLen - m_nProcessedLen);
}
void CWebDataFile::SeekToLength(int nLen)
{
if(m_bFileOpen){
fseek(m_fRaw, nLen, SEEK_SET);
m_nReadFileLen = nLen;
m_nProcessedLen = 0;
m_nDataLen = 0;
}
}
// Appends the link map of one page to the file "link.<name>", where <name> is
// sFileName with any leading directory part (up to the last '/') removed.
//
// iPage:     page whose m_sUrl, m_sCharset, m_nRefLink4SENum and m_mapLink4SE
//            are written.
// sFileName: name the output file name is derived from.
// The output file is opened in append mode before the link count is checked;
// nothing is written when the open fails or the page reports no links.
void SaveLink4SE(CPage *iPage, string sFileName)
{
    // Keep only the part after the last '/', if there is one.
    const string::size_type nSlash = sFileName.rfind('/');
    string strBaseName = sFileName;
    if(nSlash != string::npos){
        strBaseName = sFileName.substr(nSlash + 1);
    }
    const string strLinkFile = "link." + strBaseName;

    ofstream ofsLinks(strLinkFile.c_str(), ios::out|ios::app|ios::binary);
    if( !ofsLinks || !(iPage->m_nRefLink4SENum > 0) ){
        return;
    }

    // Record header.
    ofsLinks << "root_url: " << iPage->m_sUrl << endl;
    ofsLinks << "charset: " << iPage->m_sCharset << endl;
    ofsLinks << "number: " << iPage->m_nRefLink4SENum << endl;

    // One "key<TAB>value" line per map entry.
    map<string,string>::iterator itLink = iPage->m_mapLink4SE.begin();
    while( itLink != iPage->m_mapLink4SE.end() ){
        ofsLinks << itLink->first << '\t' << itLink->second << endl;
        ++itLink;
    }

    // An empty line ends the record.
    ofsLinks << endl;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -