⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 pagestat.cpp

📁 概述:数据的纵向收集
💻 CPP
字号:
#include "stdafx.h"
#include "PageStat.h"
#include "Template.h"
#include "Priorizer.h"
#include <WinInet.h>
#include "BloomFilter.h"

//////////////////////////////////////////////////////////////////////////
// Handle of the page-statistics database. It stays INVALID_FDBHANDLE until
// Init() opens it with fdb_openrw(), and is reset to that value by Fini().
FDBHANDLE CPageStat::s_hDbStat = INVALID_FDBHANDLE;
// Operation counter bumped by Del(), Put() and successful Get() calls;
// fdb_sync() is issued whenever it reaches a multiple of 20.
long	CPageStat::s_uOpCount = 0;

//////////////////////////////////////////////////////////////////////////
// Fills objPage from scratch for a URL that has no stored statistics yet.
// Original author's note: passive-mode objPage instances are all computed
// directly by this function; on the very first crawl every page is passive.
//
// objPage      - record to (re)initialize; every field is overwritten.
// lpszUrl      - URL of the page; stored in m_strUrl.
// lpszRefer    - referring URL; stored in m_strRefer.
// bPassiveMode - stored as-is in m_bPassiveMode.
// dwType       - task-type bit mask; TT_PARSE is tested before TT_STORE.
//
// Returns true when a base priority could be chosen from dwType. Returns
// false when neither TT_PARSE nor TT_STORE is set; in that case objPage has
// still been reset and its priority is left at 0.
bool CPageStat::Build(CPageStat& objPage,LPCSTR lpszUrl,LPCSTR lpszRefer,bool bPassiveMode,DWORD dwType)
{
	// Identity of the page.
	objPage.m_strUrl = lpszUrl;
	objPage.m_strRefer = lpszRefer;
	objPage.m_bPassiveMode = bPassiveMode;

	// Crawl statistics start out zeroed.
	objPage.m_dwContentFingerprint = 0;
	objPage.m_int64LastModifyTime = 0;
	objPage.m_int64LastCrawlTime = 0;
	objPage.m_wTotalCrawlCount = 0;
	objPage.m_wHintCount = 0;
	objPage.m_wPriority = 0;

	// Pick the recommended base priority; parsing wins over storing when
	// both bits are present.
	if(dwType & TT_PARSE)
	{
		objPage.m_wPriority = _PRIORITY_BASE_PARSE_;
		return true;
	}

	if(dwType & TT_STORE)
	{
		objPage.m_wPriority = _PRIORITY_BASE_STORE_;
		return true;
	}

	// Unknown task type: no priority can be recommended.
	return false;
}

//////////////////////////////////////////////////////////////////////////
// Removes the stored record whose key is objPage.m_strUrl.
// Does nothing when the database has not been opened (see Init()).
//
// objPage - only m_strUrl is read; it is used as the record key.
void CPageStat::Del(const CPageStat& objPage)
{
	if(s_hDbStat==INVALID_FDBHANDLE)
		return;

	// Borrow an item, fill it with the URL bytes and use it as the key.
	FITHANDLE key = fdb_getitem();

	fdb_writeb(key,objPage.m_strUrl,objPage.m_strUrl.GetLength());
	fdb_del(s_hDbStat,key);

	fdb_putitem(key);

	// Sync every 20th operation. The test uses the value returned by
	// InterlockedIncrement rather than re-reading s_uOpCount, so that with
	// concurrent callers exactly one of them observes each multiple of 20.
	if(::InterlockedIncrement(& s_uOpCount)%20==0)
	{
		fdb_sync(s_hDbStat);
	}
}

// Deserializes one stored record into objPage.
//
// objPage      - record to fill; every field is overwritten.
// key          - item holding the URL bytes; read into m_strUrl.
// value        - item holding, in this order: the referrer bytes, then the
//                content fingerprint (read4), priority (read2), last-modify
//                time (read8), last-crawl time (read8), total crawl count
//                (read2) and hint count (read2). This mirrors what Put()
//                writes.
// bPassiveMode - stored as-is in m_bPassiveMode (it is not persisted).
void CPageStat::Build(CPageStat& objPage,FITHANDLE key,FITHANDLE value,bool bPassiveMode)
{
	// One byte more than the maximum read size, so that a read which fills
	// all INTERNET_MAX_PATH_LENGTH bytes can still be NUL-terminated inside
	// the array. (The original buffer had no room for that terminator.)
	char buffer[INTERNET_MAX_PATH_LENGTH+1] ={0};
	WORD length = 0;

	// URL. NOTE(review): assumes fdb_readb updates `length` to the number of
	// bytes actually read - confirm against the fdb API.
	length = INTERNET_MAX_PATH_LENGTH;
	fdb_readb(key,buffer,length);
	if(length>INTERNET_MAX_PATH_LENGTH)
		length = INTERNET_MAX_PATH_LENGTH;	// never index past the buffer
	buffer[length]=0;
	objPage.m_strUrl=buffer;

	// Referrer, stored as the first field of the value item.
	length = INTERNET_MAX_PATH_LENGTH;
	fdb_readb(value,buffer,length);
	if(length>INTERNET_MAX_PATH_LENGTH)
		length = INTERNET_MAX_PATH_LENGTH;	// never index past the buffer
	buffer[length]=0;
	objPage.m_strRefer=buffer;

	// Fixed-size statistics follow the referrer in the value item.
	fdb_read4(value,objPage.m_dwContentFingerprint);
	fdb_read2(value,objPage.m_wPriority);
	fdb_read8(value,objPage.m_int64LastModifyTime);
	fdb_read8(value,objPage.m_int64LastCrawlTime);
	fdb_read2(value,objPage.m_wTotalCrawlCount);
	fdb_read2(value,objPage.m_wHintCount);
	objPage.m_bPassiveMode = bPassiveMode;
}

// Looks up the stored record for objPage.m_strUrl and, when found, loads it
// into objPage.
//
// objPage - in: m_strUrl is the lookup key. out: on success every field is
//           overwritten from the stored record and m_bPassiveMode is set to
//           true; on failure the object is left untouched.
//
// Returns true when a record was found, false when there is none or the
// database has not been opened (see Init()).
bool CPageStat::Get(CPageStat& objPage)
{
	if(s_hDbStat==INVALID_FDBHANDLE)
		return false;

	bool bResult = false;

	FITHANDLE key = fdb_getitem();
	FITHANDLE value = fdb_getitem();
	fdb_writeb(key,objPage.m_strUrl,objPage.m_strUrl.GetLength());

	if(fdb_get(s_hDbStat,key,value))
	{
		Build(objPage,key,value,true);
		bResult=true;

		// Only successful lookups count as an operation. Sync every 20th
		// one, testing the value returned by InterlockedIncrement instead
		// of re-reading s_uOpCount, so concurrent callers cannot skip or
		// duplicate a multiple of 20.
		if(::InterlockedIncrement(& s_uOpCount)%20==0)
		{
			fdb_sync(s_hDbStat);
		}
	}

	fdb_putitem(key);
	fdb_putitem(value);

	return bResult;
}

//去一条数据看看是否能取到来测试是否为空
bool CPageStat::IsEmpty()
{
	if(s_hDbStat==INVALID_FDBHANDLE)
		return true;
	
	bool bResult = true;
	FITHANDLE key,value;
	
	key = fdb_getitem();
	value = fdb_getitem();

	FDBHANDLE cursor = fdb_tfirst(s_hDbStat,key,value);
	if(cursor!=INVALID_FDBHANDLE)
	{
		bResult=false;
		fdb_tclose(s_hDbStat,cursor);
	}

	fdb_putitem(key);
	fdb_putitem(value);

	return bResult;
}

//注意被动模式的合并,被动和主动模式的合并,应该是业务层面自己的事情
//对于主动模式的,自身携带信息,在下载完毕后修改状态调用put
//对于被动模式的,自身开始时候没有携带信息,所以要先调用get,然后修改相关状态并调用put
void CPageStat::Put(const CPageStat& objPage)
{
	if(s_hDbStat==INVALID_FDBHANDLE)
		return;
/*
	CString m_strUrl;
	CString m_strRefer;
	DWORD	m_dwContentFingerprint;
	WORD	m_wPriority;
	__int64 m_int64LastModifyTime;
	__int64 m_int64LastCrawlTime;
	WORD	m_wTotalCrawlCount;
	WORD	m_wHintCount;
*/
	FITHANDLE key,value;

	key=fdb_getitem();
	value=fdb_getitem();

	fdb_writeb(key,objPage.m_strUrl,objPage.m_strUrl.GetLength());

	fdb_writeb(value,objPage.m_strRefer,objPage.m_strRefer.GetLength());
	fdb_write4(value,objPage.m_dwContentFingerprint);
	fdb_write2(value,objPage.m_wPriority);
	fdb_write8(value,objPage.m_int64LastModifyTime);
	fdb_write8(value,objPage.m_int64LastCrawlTime);
	fdb_write2(value,objPage.m_wTotalCrawlCount);
	fdb_write2(value,objPage.m_wHintCount);

	fdb_put(s_hDbStat,key,value);

	fdb_putitem(key);
	fdb_putitem(value);

	::InterlockedIncrement(& s_uOpCount);
	if(s_uOpCount%20==0)
	{
		fdb_sync(s_hDbStat);
	}
}

void CPageStat::Init()
{
	if(s_hDbStat!=INVALID_FDBHANDLE)
		return;

	CString strHost;
	strHost.Format("%d",g_nSid);
	s_hDbStat = fdb_openrw(ST_PAGESTAT,strHost);
}

// Closes the page-statistics database if it is open and marks the handle
// invalid so that Init() can open it again. Safe to call when not open.
void CPageStat::Fini()
{
	if(s_hDbStat!=INVALID_FDBHANDLE)
	{
		fdb_close(s_hDbStat);
		s_hDbStat = INVALID_FDBHANDLE;
	}
}

//////////////////////////////////////////////////////////////////////////
bool CPageStat::Updater()
{
	if(s_hDbStat==INVALID_FDBHANDLE)
		return false;

	FITHANDLE key,value;
	CPageStat objPage;

	key=fdb_getitem();
	value=fdb_getitem();

	FDBHANDLE cursor = fdb_tfirst(s_hDbStat,key,value);
	if(cursor!=INVALID_FDBHANDLE)
	{
		do 
		{ 
			Build(objPage,key,value,false);
			if(!CBloomFilter::Instance()->find(objPage.m_strUrl))
				CPriorizer::Push(objPage);

			fdb_resetitem(key);
			fdb_resetitem(value);
		} while(fdb_tnext(s_hDbStat,cursor,key,value));

		fdb_tclose(s_hDbStat,cursor);
	}

	fdb_putitem(key);
	fdb_putitem(value);

	return true;
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -