⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 priorizer.cpp

📁 概述:数据的纵向收集
💻 CPP
字号:
#include "stdafx.h"
#include "Priorizer.h"
#include "BloomFilter.h"
#include <time.h>
#include <WinInet.h>

//////////////////////////////////////////////////////////////////////////
// Handle of the page-task database. It is INVALID_FDBHANDLE until Init() opens
// the database and is reset to INVALID_FDBHANDLE by Fini() after closing it.
FDBHANDLE CPriorizer::s_hDbTask = INVALID_FDBHANDLE;
// Count of Push()/Pop() calls that ran to completion. Whenever it is a
// multiple of 20, the calling operation syncs the database.
long	CPriorizer::s_uOpCount = 0;

//////////////////////////////////////////////////////////////////////////
//这里不用判断链接的属性,只要做到点,属性和seen判断由外部决定
void CPriorizer::Push(const CPageStat& objPage)
{
	if(s_hDbTask==INVALID_FDBHANDLE)
		return;

	//
	//对于被动模式,不计算到点,外部计算是否seen;都设置bitmap
	//主动的,计算到点的,就设置bitmap;外部计算是否seen(上次没有爬完的和主动的会冲突所以要算)
	//seen判断也有外部来做
	//
	if (!objPage.m_bPassiveMode)
	{
		__int64 nowTime;
		_time64( &nowTime ); //秒钟
		if(nowTime-objPage.m_int64LastModifyTime < (1 << objPage.m_wPriority)*60)
		{
			return;
		}
	}
	CBloomFilter::Instance()->insert(objPage.m_strUrl);

/*
	CString m_strUrl;
	CString m_strRefer;
	DWORD	m_dwContentFingerprint;
	WORD	m_wPriority;
	__int64 m_int64LastModifyTime;
	__int64 m_int64LastCrawlTime;
	WORD	m_wTotalCrawlCount;
	WORD	m_wHintCount;
	BYTE	m_bPassiveMode;	//被动模式
*/
	FITHANDLE key,value;

	key=fdb_getitem();
	value=fdb_getitem();
	
	fdb_writeb(key,objPage.m_strUrl,objPage.m_strUrl.GetLength());
	
	fdb_writeb(value,objPage.m_strRefer,objPage.m_strRefer.GetLength());
	fdb_write4(value,objPage.m_dwContentFingerprint);
	fdb_write2(value,objPage.m_wPriority);
	fdb_write8(value,objPage.m_int64LastModifyTime);
	fdb_write8(value,objPage.m_int64LastCrawlTime);
	fdb_write2(value,objPage.m_wTotalCrawlCount);
	fdb_write2(value,objPage.m_wHintCount);
	fdb_write1(value,objPage.m_bPassiveMode);
	
	fdb_put(s_hDbTask,key,value);
	
	fdb_putitem(key);
	fdb_putitem(value);

	::InterlockedIncrement(& s_uOpCount);
	if(s_uOpCount%20==0)
	{
		fdb_sync(s_hDbTask);
	}
}

// Takes one record out of the task database via fdb_tpop and unpacks it into
// objPage.
//
// objPage: receives the URL (from the key) and, from the value, the referrer
//          followed by the fixed-size fields in the order Push() wrote them.
//
// Returns false when the task database is not open or fdb_tpop reports
// failure (objPage is left untouched in both cases); true otherwise.
bool CPriorizer::Pop(CPageStat& objPage)
{
	if(s_hDbTask==INVALID_FDBHANDLE)
		return false;
/*
	CPageStat fields, for reference:
	CString m_strUrl;
	CString m_strRefer;
	DWORD	m_dwContentFingerprint;
	WORD	m_wPriority;
	__int64 m_int64LastModifyTime;
	__int64 m_int64LastCrawlTime;
	WORD	m_wTotalCrawlCount;
	WORD	m_wHintCount;
	BYTE	m_bPassiveMode;	// passive mode
*/
	FITHANDLE key,value;

	key=fdb_getitem();
	value=fdb_getitem();

	if(!fdb_tpop(s_hDbTask,key,value))
	{
		fdb_putitem(key);
		fdb_putitem(value);
		return false;
	}

	// One byte larger than the maximum read size so that the terminator
	// written at buffer[length] stays in bounds even when length equals
	// INTERNET_MAX_PATH_LENGTH.
	char buffer[INTERNET_MAX_PATH_LENGTH + 1] ={0};

	// NOTE(review): fdb_readb presumably updates length to the number of
	// bytes actually read -- confirm against the fdb API.
	WORD length = INTERNET_MAX_PATH_LENGTH;
	fdb_readb(key,buffer,length);
	buffer[length]=0;
	objPage.m_strUrl=buffer;

	length = INTERNET_MAX_PATH_LENGTH;
	fdb_readb(value,buffer,length);
	buffer[length]=0;
	objPage.m_strRefer=buffer;

	// Fixed-size fields, in the order Push() serialized them.
	fdb_read4(value,objPage.m_dwContentFingerprint);
	fdb_read2(value,objPage.m_wPriority);
	fdb_read8(value,objPage.m_int64LastModifyTime);
	fdb_read8(value,objPage.m_int64LastCrawlTime);
	fdb_read2(value,objPage.m_wTotalCrawlCount);
	fdb_read2(value,objPage.m_wHintCount);
	fdb_read1(value,objPage.m_bPassiveMode);

	fdb_putitem(key);
	fdb_putitem(value);

	// Sync every 20th operation. Test the value returned by
	// InterlockedIncrement rather than re-reading s_uOpCount: a second read
	// can observe another thread's increment, so a multiple of 20 could be
	// skipped or acted on twice.
	if(::InterlockedIncrement(& s_uOpCount)%20==0)
	{
		fdb_sync(s_hDbTask);
	}

	return true;
}

//////////////////////////////////////////////////////////////////////////
void CPriorizer::Init()
{
	if(s_hDbTask!=INVALID_FDBHANDLE)
		return;

	CString strHost;
	strHost.Format("%d",g_nSid);
	s_hDbTask = fdb_openrw(ST_PAGETASK,strHost);
}

// Closes the page-task database if it is open and marks the handle invalid,
// so that Push()/Pop()/Load() become no-ops and Init() can reopen it.
void CPriorizer::Fini()
{
	if(INVALID_FDBHANDLE != s_hDbTask)
	{
		fdb_close(s_hDbTask);
		s_hDbTask = INVALID_FDBHANDLE;
	}
}

// Walks every record in the task database and inserts each key (the URL) into
// the Bloom filter. Records are only read, not removed.
//
// Does nothing when the task database is not open.
void CPriorizer::Load()
{
	if(s_hDbTask==INVALID_FDBHANDLE)
		return;

	FITHANDLE key,value;

	// One byte larger than the maximum read size so that the terminator
	// written at buffer[length] stays in bounds even when length equals
	// INTERNET_MAX_PATH_LENGTH.
	char buffer[INTERNET_MAX_PATH_LENGTH + 1] ={0};
	WORD length = INTERNET_MAX_PATH_LENGTH;

	key=fdb_getitem();
	value=fdb_getitem();

	FDBHANDLE cursor = fdb_tfirst(s_hDbTask,key,value);
	if(cursor!=INVALID_FDBHANDLE)
	{
		do 
		{ 
			// NOTE(review): fdb_readb presumably updates length to the
			// number of bytes actually read -- confirm against the fdb API.
			length = INTERNET_MAX_PATH_LENGTH;
			fdb_readb(key,buffer,length);
			buffer[length]=0;
			// This is the earliest population step, so no "seen" check is
			// needed before inserting.
			CBloomFilter::Instance()->insert(buffer);

			// Clear both items before the cursor fills them again.
			fdb_resetitem(key);
			fdb_resetitem(value);
		} while(fdb_tnext(s_hDbTask,cursor,key,value));

		fdb_tclose(s_hDbTask,cursor);
	}

	fdb_putitem(key);
	fdb_putitem(value);
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -