⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 webdatafile.cpp

📁 此源码功能是捕获网页中的链接并进行分析
💻 CPP
字号:
// WebDataFile.cpp: implementation of the CWebDataFile class.
//
//////////////////////////////////////////////////////////////////////

#include <assert.h>
#include <string.h>

#include <fstream>
#include <iostream>
#include <new>
#include <string>

#include "WebDataFile.h"
#include "Page.h"
#include "StrFun.h"
                                  
using namespace std;

// Default data-buffer length: 32 MB (32 * 1024 * 1024 bytes).
int CWebDataFile::m_nDefaultBufferLen = 32 * 1024 * 1024;

// Forward declaration of the link-dump helper defined later in this file;
// it is called from CWebDataFile::GetARawWebPage().
void SaveLink4SE(CPage *iPage, string );

// Constructs an idle reader: no file attached, no buffers allocated, and all
// flags and counters cleared.
CWebDataFile::CWebDataFile()
{
	// Status flags.
	m_bRawFormat = m_bPageError = m_bFileOpen = m_bEOF = false;

	// Length / position counters.
	m_nReadFileLen = m_nBufferLen = m_nDataLen = m_nProcessedLen = 0;

	// Resources acquired later (file name copy, FILE handle, buffers).
	m_sFileName = NULL;
	m_fRaw = NULL;
	m_sDataBuffer = NULL;
	m_sUnzipBuffer = NULL;
}

// Closes the file (if any) and frees the file-name copy and both buffers.
CWebDataFile::~CWebDataFile()
{
	CloseFile();

	// delete[] on a null pointer does nothing, so no explicit guards are needed.
	delete[] m_sFileName;
	m_sFileName = NULL;

	delete[] m_sDataBuffer;
	delete[] m_sUnzipBuffer;
}

bool CWebDataFile::OpenFile(const char * sFileName)
{
	InitDataBuffer();
	if(m_sFileName != NULL){
		delete[] m_sFileName;
		m_sFileName = NULL;
	}
	m_sFileName = new char[strlen(sFileName) +1];
	strcpy(m_sFileName, sFileName);

	m_fRaw = fopen(m_sFileName, "rb");
	m_bFileOpen = (m_fRaw != NULL);

	// Init file relative variables.
	m_bRawFormat = false;
	m_nReadFileLen = 0;
	m_nDataLen = 0;
	m_nProcessedLen = 0;
	m_bEOF = false;
	
	if(m_bFileOpen){
                char sFormat[16];
                if(fread(sFormat, 1, 15, m_fRaw) < 8){
                        //fclose(m_fRaw);
                        //m_fRaw = NULL;
                        //m_bFileOpen = false;
                        m_bRawFormat = true;
                }else{
                        rewind(m_fRaw);
                        m_bRawFormat = (strncmp(sFormat, "version:", 8) == 0);
                }
        }
	
	return m_bFileOpen;
}

// Closes the underlying FILE, if one is open, and marks the reader as closed.
// Calling it when no file is open is harmless.
void CWebDataFile::CloseFile()
{
	m_bFileOpen = false;

	if(m_fRaw == NULL)
		return;

	fclose(m_fRaw);
	m_fRaw = NULL;
}

// Allocates the data buffer on first use.
//
// Parameters:
//   nLen - requested buffer size in bytes; ignored when a buffer already
//          exists (the existing buffer and its recorded length are kept).
// Returns:
//   true when a buffer is available afterwards, false when allocation failed.
bool CWebDataFile::InitDataBuffer(unsigned int nLen)
{
	if(m_sDataBuffer != NULL)
		return true;

	// Plain new[] reports failure by throwing std::bad_alloc, which would make
	// the NULL check below unreachable; the nothrow form returns NULL instead,
	// so the bool result of this function is meaningful.
	m_sDataBuffer = new (std::nothrow) char[nLen];
	if(m_sDataBuffer == NULL){
		return false;
	}
	m_nBufferLen = nLen;
	return true;
}

// Reads more data from the file and extracts the next page.
//
// Requires an allocated data buffer and an open file (both asserted).
// Only raw-format files are handled; for any other format the call
// returns false after the read.
//
// Parameters:
//   sFileName - forwarded to GetARawWebPage().
// Returns:
//   the result of GetARawWebPage() for raw-format files, false otherwise.
bool CWebDataFile::GetAWebPage(string sFileName)
{
	assert(m_sDataBuffer != NULL);
	assert(m_fRaw != NULL);

	m_bPageError = false;

	// Top up the buffer from the file before parsing.
	ReadFromFile();

	if(! m_bRawFormat)
		return false;

	return GetARawWebPage(sFileName);
}

// Extracts the next record from the raw-format data buffer, parses it into a
// CPage and appends the page's links via SaveLink4SE().
//
// Record layout as parsed by this function:
//   <record header lines; one contains "url:<url>", the last is "length:<N>">
//   <empty line>
//   <N bytes of data: first part, empty line, second part>
//   '\n'
// The first data part is handed to CPage::ParseHeaderInfo(), the second part
// becomes CPage::m_sContent.
//
// Parameters:
//   sFileName - passed through to SaveLink4SE().
// Returns:
//   true  when a record was consumed (including records whose content is
//         larger than 600 KB or whose link parsing failed; those are skipped
//         so that the caller does not stop);
//   false at end of file or on a malformed record; in the latter case
//         m_bPageError is set.
bool CWebDataFile::GetARawWebPage(string sFileName)
{
	// Process from last point.
	bool bFound = false;
	int nPageBegin = m_nProcessedLen;
	// Will hold the index of the second '\n' of the empty line ending the header.
	int nHeadEnd = nPageBegin;

	do{
		for( ; m_nProcessedLen<m_nDataLen; m_nProcessedLen++){
			if(m_sDataBuffer[m_nProcessedLen] != '\n'){
				continue;
			}
			if(m_nProcessedLen == nPageBegin){
				// Newline at the very start of the record: no predecessor to pair with.
				continue;
			}
			if(m_sDataBuffer[m_nProcessedLen-1] == '\n'){
				bFound = true;
				nHeadEnd = m_nProcessedLen;
				break;
			}
		}
		if(! bFound){
			// Now m_nProcessedLen = m_nDataLen;
			if(m_bEOF){
				if(m_nProcessedLen > (nPageBegin+3)){
					m_bPageError = true;
					cout << "This file seems to be not correctly ended." << endl;
				}

				cout << "return from bFound" << endl;
				return false;
			}
			if(nPageBegin == 0){
				if(m_nDataLen == m_nBufferLen){
					// The whole buffer holds one unterminated header.
					assert(m_nBufferLen > (4 * 1024));
					m_bPageError = true;
					cout << "Fail to get head's end." << endl;
					return false;
				}
			}else{
				// Move the partial record to the front of the buffer to make room.
				if(m_nDataLen > nPageBegin){
					memmove(m_sDataBuffer, m_sDataBuffer + nPageBegin,
						static_cast<size_t>(m_nDataLen - nPageBegin));
				}
				m_nDataLen -= nPageBegin;
				m_nProcessedLen -= nPageBegin;
				nPageBegin = 0;
			}
			ReadFromFile();
		}
	}while(! bFound); // Must find a Web page. except of EOF file.

	// Get the URL: look at (up to) the last five header lines, in case an
	// "origin:" style line sits between the url line and the end of the header.
	int nLineBegin = nPageBegin;
	int nCount = 0;
	for(int j=nHeadEnd-2; j>nPageBegin; j--){
		if(m_sDataBuffer[j] == '\n'){
			nCount++;
			if( nCount==5) {
				nLineBegin = j + 1;
				break;
			}
		}
	}
	const string strUrlLines(m_sDataBuffer+nLineBegin, nHeadEnd-nLineBegin-1);
	char sURL[256];
	memset(sURL, 0, sizeof(sURL));

	// strstr() returns NULL when the header has no "url:" field; the field
	// width keeps an over-long URL from overflowing sURL.
	const char *pUrlField = strstr(strUrlLines.c_str(), "url:");
	if(pUrlField == NULL || sscanf(pUrlField, "url:%255s", sURL) != 1){
		m_bPageError = true;
		cout << "Fail to get url." << endl;
		return false;
	}
	cout << sURL << endl;

	// Get data size from the last header line.
	nLineBegin = nPageBegin;
	for(int j=nHeadEnd-2; j>nPageBegin; j--){
		if(m_sDataBuffer[j] == '\n'){
			nLineBegin = j + 1;
			break;
		}
	}

	const string strLenLine(m_sDataBuffer+nLineBegin, nHeadEnd-nLineBegin-1);
	int nDataLen = 0;

	// A negative length would place the record end before its own header.
	if(sscanf(strLenLine.c_str(), "length:%d", &nDataLen) != 1 || nDataLen < 0){
		m_bPageError = true;
		cout << "Fail to get data size." << endl;
		return false;
	}

	// Total record size; computed in a wider type so that an absurd "length:"
	// value cannot overflow int before the size check.
	const int nHeadLen = nHeadEnd - nPageBegin;
	const long long nPageLen64 = static_cast<long long>(nHeadLen) + nDataLen + 2;

	if(nPageLen64 > static_cast<long long>(m_nBufferLen)){
		cout << "Find a web page whose size is larger than " << m_nBufferLen << "!" << endl;
		m_bPageError = true;
		return false;
	}
	const int nPageLen = static_cast<int>(nPageLen64);

	m_nProcessedLen = nPageBegin + nPageLen;
	if(m_nProcessedLen > m_nBufferLen){
		// The record does not fit behind nPageBegin: move it to the front.
		if(m_nDataLen > nPageBegin){
			memmove(m_sDataBuffer, m_sDataBuffer + nPageBegin,
				static_cast<size_t>(m_nDataLen - nPageBegin));
		}
		m_nDataLen -= nPageBegin;
		m_nProcessedLen -= nPageBegin;
		// nHeadEnd is a buffer index too and must follow the move; otherwise
		// the body below would be sliced from the wrong offsets.
		nHeadEnd -= nPageBegin;
		nPageBegin = 0;
	}
	while(m_nProcessedLen > m_nDataLen){
		if(m_bEOF){
			cout << "This file is not correctly ended." << endl;
			m_bPageError = true;
			return false;
		}
		ReadFromFile();
	}

	if(m_sDataBuffer[m_nProcessedLen-1] != '\n'){
		cout << "Find a page that is not correctly ended." << endl;
		m_bPageError = true;
		return false;
	}
	m_sDataBuffer[m_nProcessedLen-1] = '\0';

	// Split the data at its first empty line into two parts.
	string sBuf1;
	string sBuf2;
	const int nBodyBegin = nHeadEnd + 1;
	const int nBodyEnd = nPageBegin + nPageLen - 1;	// index of the terminator written above
	for(int i=nBodyBegin; i<nBodyEnd; i++){
		if (m_sDataBuffer[i]=='\n' && m_sDataBuffer[i-1]=='\n') {
			// When the data starts with '\n' the first part is empty; the raw
			// difference would be -1, which is not a valid string length.
			int nFirstLen = i - 1 - nBodyBegin;
			if(nFirstLen < 0){
				nFirstLen = 0;
			}
			sBuf1.assign(m_sDataBuffer+nBodyBegin, nFirstLen);
			sBuf2.assign(m_sDataBuffer+i+1, nBodyEnd-1-i);
			break;
		}
	}

	// yhf
	CPage iPage;
	iPage.m_sUrl = sURL;

	// Debug dump for one specific URL; strcmp() compares the text (comparing
	// the array with the literal directly would compare addresses).
	if (strcmp(sURL, "http://www.yanruyu.net/service/domainwhois.asp") == 0){
		cout << "\n" << sBuf1 << endl;
	}

	iPage.m_sContent = sBuf2;
	if (iPage.m_sContent.size() > 600*1024){
		// Oversized content is skipped, but the caller should keep going.
		return true;	// donot stop
	}

	iPage.ParseHeaderInfo(sBuf1);
	if (iPage.m_sContentType == "text/html") {
		if (iPage.ParseHyperLinks() == false){
			cout << "error! ParseHyperLinks" << endl;
			return true;	// donot stop
		}
	}
	SaveLink4SE( &iPage, sFileName);

	return true;
}

// Fills the free tail of the data buffer from the file and updates
// m_nDataLen / m_nReadFileLen by the number of bytes read.
//
// Does nothing once m_bEOF is set. m_bEOF is set when the stream reports end
// of file or a read error: callers keep calling this function until either
// enough data arrived or m_bEOF is set, so a failing stream must not look
// like "more data may come".
void CWebDataFile::ReadFromFile()
{
	if(m_bEOF) // end of file.
		return;
	if(m_nDataLen < m_nBufferLen){
		const size_t nRead = fread(m_sDataBuffer+m_nDataLen, 1,
			(m_nBufferLen - m_nDataLen), m_fRaw);
		// nRead never exceeds the free space requested above.
		m_nDataLen += static_cast<int>(nRead);
		m_nReadFileLen += static_cast<int>(nRead);
	}
	m_bEOF = (feof(m_fRaw) != 0) || (ferror(m_fRaw) != 0);
}

// Returns the file offset up to which data has been processed: the number of
// bytes read from the file minus the bytes still waiting in the buffer.
int CWebDataFile::GetProcessedFileLen()
{
	const int nPending = m_nDataLen - m_nProcessedLen;
	return m_nReadFileLen - nPending;
}


void CWebDataFile::SeekToLength(int nLen)
{
	if(m_bFileOpen){
		fseek(m_fRaw, nLen, SEEK_SET);
		m_nReadFileLen = nLen;
		m_nProcessedLen = 0;
		m_nDataLen = 0;
	}
}

// Appends the links collected for a page to "link.<basename of sFileName>"
// in the current directory.
//
// The output file is opened in append mode on every call. A record is written
// only when the stream is usable and the page has at least one link:
//   root_url: <url>
//   charset: <charset>
//   number: <link count>
//   <key>\t<value>     (one line per entry of m_mapLink4SE)
//   <empty line>
//
// Parameters:
//   iPage     - page whose link data is written.
//   sFileName - source file path; only the part after the last '/' is used.
void SaveLink4SE(CPage *iPage, string sFileName)
{
	// Strip the directory part, if any.
	const string::size_type nSlash = sFileName.rfind('/');
	const string sBaseName =
		(nSlash == string::npos) ? sFileName : sFileName.substr(nSlash + 1);
	const string sLinkFile = "link." + sBaseName;

	ofstream ofsLinks(sLinkFile.c_str(), ios::out|ios::app|ios::binary);
	if( !ofsLinks || !(iPage->m_nRefLink4SENum > 0) )
		return;

	ofsLinks << "root_url: " << iPage->m_sUrl << endl;
	ofsLinks << "charset: " << iPage->m_sCharset << endl;
	ofsLinks << "number: " << iPage->m_nRefLink4SENum << endl;

	map<string,string>::iterator itLink = iPage->m_mapLink4SE.begin();
	while( itLink != iPage->m_mapLink4SE.end() ){
		ofsLinks << itLink->first << '\t' << itLink->second << endl;
		++itLink;
	}
	ofsLinks << endl;
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -