⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 crawlermgr.cpp

📁 概述:数据的纵向收集
💻 CPP
字号:
#include "stdafx.h"
#include "CrawlerMgr.h"
#include "Crawler.h"
#include "Template.h"
#include "IniUtil.h"
#include "StoreMgr.h"
#include "Priorizer.h"
#include "BloomFilter.h"

// Constructs the crawler manager: logs the construction, resets the task
// bookkeeping members and brings up the shared crawl subsystems.
CCrawlerMgr::CCrawlerMgr()
{
	DebugString("[CCrawlerMgr::CCrawlerMgr]","construct");

	// Nothing has been handed out yet; AssignTask() sets this to TRUE.
	m_bTaskAssigned = FALSE;

	// Start with empty "failed" and "in progress" collections.
	m_arrTaskError.clear();
	m_arrTaskDoing.clear();

	// Added 2007.3.21: subsystem start-up, kept in this exact order.
	CPriorizer::Init();
	CPageStat::Init();
	// Return value is discarded - presumably this only forces the
	// bloom-filter singleton to be created up front (confirm in BloomFilter.h).
	CBloomFilter::Instance();
}
// Destroys the crawler manager: logs the destruction, resets the task
// bookkeeping members and then releases the shared crawl subsystems.
CCrawlerMgr::~CCrawlerMgr()
{
	DebugString("[CCrawlerMgr::~CCrawlerMgr]","destruct");

	// Reset the bookkeeping state first.
	m_bTaskAssigned = FALSE;
	m_arrTaskError.clear();
	m_arrTaskDoing.clear();

	// sunwang, 2007.2.1: crawler connections are released here.
	CCrawler::FreeConnectAll();

	// sunwang, 2007.3.20: release the storage database.
	CStoreMgr::FreeDB();

	// 2007.3.21: counterparts of the Init()/Instance() calls made in the
	// constructor, issued in the same relative order.
	CPriorizer::Fini();
	CPageStat::Fini();
	CBloomFilter::Release();
}

// Stops the workers by delegating to CThreadMgr::StopAll(), then writes a
// trace line.
void CCrawlerMgr::StopAll()
{
	CThreadMgr::StopAll();

	// NOTE(review): the message mentions FreeConnectAll, but this function
	// does not call it - within this file CCrawler::FreeConnectAll() is only
	// invoked from the destructor. The text looks stale; confirm before
	// relying on it when reading logs.
	DebugString("[CCrawlerMgr::StopAll]",
	            "invoke FreeConnectAll");
}

// One iteration of a worker's loop.
//
// nIndex : index of the calling worker. Worker 0 has two extra duties: it
//          performs the initial task assignment, and it is the only worker
//          allowed to report that all work is finished.
// Returns: TRUE when the worker should stop (exit flag raised, or worker 0
//          found nothing queued and nothing in progress); FALSE when the
//          caller should invoke DoWork again.
//
// NOTE(review): m_arrTaskDoing / m_arrTaskError are modified here with no
// locking visible in this file; if several workers run DoWork concurrently
// this needs synchronisation - confirm against CThreadMgr.
BOOL CCrawlerMgr::DoWork(int nIndex)
{
	//DebugString("[CCrawlerMgr::DoWork]","Worker(%d) doing...",nIndex);

	// The thread has been told to exit.
	if(GetExitFlag())
	{
		DebugString("[CCrawlerMgr::DoWork]","Worker(%d) stop for exitflag",nIndex);
		return TRUE;
	}

	// Until the initial tasks exist: worker 0 creates them, everyone else idles.
	if(!m_bTaskAssigned)
	{
		if(nIndex==0)
		{
			DebugString("[CCrawlerMgr::DoWork]","Worker(%d) assign task now",nIndex);
			AssignTask();
			return FALSE;
		}

		DebugString("[CCrawlerMgr::DoWork]","Worker(%d) loop for dont assign task",nIndex);
		Sleep(1000);
		return FALSE;
	}

	// Too many tasks already in progress: back off briefly, then re-check the
	// exit flag before asking to be called again.
	// NOTE(review): the limit is inclusive (size <= GetConnect() still takes
	// work), so GetConnect()+1 tasks can be in progress - confirm intent.
	const int nInProgress = (int)m_arrTaskDoing.size();
	const int nLimit = CTemplate::Instance()->GetConnect();
	if(nInProgress > nLimit)
	{
		DebugString("[CCrawlerMgr::DoWork]","Worker(%d) loop for too many task in duque",nIndex);
		Sleep(50);

		if(GetExitFlag())
		{
			DebugString("[CCrawlerMgr::DoWork]","Worker(%d) stop for exitflag",nIndex);
			return TRUE;
		}

		// Still need to keep looping.
		return FALSE;
	}

	CPageStat objTask;
	if(!CPriorizer::Pop(objTask))
	{
		// Workers never quit on their own; only worker 0 may finish, and only
		// when no task is still pending in m_arrTaskDoing. (2007.2.5)
		const bool bAllDone = (nIndex==0) && (m_arrTaskDoing.size()==0);
		if(bAllDone)
		{
			DebugString("[CCrawlerMgr::DoWork]","Worker(%d) task is over so stop",nIndex);
			return TRUE;
		}

		Sleep(1000);
		return FALSE;
	}

	DebugString("[CCrawlerMgr::DoWork]","Worker(%d) get task:(%s)",nIndex,objTask.m_strUrl);

	// Track the URL as "in progress" for the duration of the handling.
	m_arrTaskDoing.insert(objTask.m_strUrl);
	const BOOL bHandled = HandleTask(objTask);
	if(!bHandled)
	{
		DebugString("[CCrawlerMgr::DoWork]","Worker(%d) put task to Error:(%s)",nIndex,objTask.m_strUrl);
		m_arrTaskError.insert(objTask.m_strUrl);
	}
	else
	{
		// Only logged; no "done" collection is updated here.
		DebugString("[CCrawlerMgr::DoWork]","Worker(%d) put task to Done:(%s)",nIndex,objTask.m_strUrl);
	}
	m_arrTaskDoing.erase(objTask.m_strUrl);

	return FALSE;
}

// Processes a single task by forwarding it to CCrawler::HandleTask.
//
// objTask : the task to process, passed through by reference.
// Returns : whatever CCrawler::HandleTask returns; DoWork() treats a zero
//           result as a failed task.
BOOL CCrawlerMgr::HandleTask(CPageStat& objTask)
{
	const BOOL bResult = CCrawler::HandleTask(objTask);
	return bResult;
}

//////////////////////////////////////////////////////////////////////////
// Performs the initial task assignment once, then marks it as done.
// When m_bTaskAssigned is already set nothing is seeded; the flag is simply
// written to TRUE again.
void CCrawlerMgr::AssignTask()
{
	if(!m_bTaskAssigned)
	{
		DebugString("[CCrawlerMgr::AssignTask]","begin assign task");

		// Choose the seeding source depending on whether the PageStat store
		// already holds entries.
		if(!CPageStat::IsEmpty())
		{
			AssignTaskFromPageStat();
		}
		else
		{
			AssignTaskFromTemplate();
		}
	}

	// Set only after seeding has finished: DoWork() starts popping tasks as
	// soon as it sees this flag.
	m_bTaskAssigned=TRUE;
}

void CCrawlerMgr::AssignTaskFromTemplate()
{
	DebugString("[CCrawlerMgr::AssignTaskFromTemplate]","begin assign task");

	CPageStat objTask;
	CString strTask;
	int nCount = 0;

	CTemplate* objTemplate = CTemplate::Instance();
	if(g_nSid!=objTemplate->GetSid())
	{
		DebugString("[CCrawlerMgr::AssignTask]","stop for g_nSid(%d)!=objTemplate->GetSid()(%d)",g_nSid,objTemplate->GetSid());
		return;
	}

	const std::vector<CSeedItem>& arrSeed = objTemplate->GetSeedArray();
	for(int i=0;i<(int)arrSeed.size();i++)
	{
		if(arrSeed[i].nType==ST_PATTERN)
		{
			DebugString("[CCrawlerMgr::AssignTask]","find seed pattern supported:(%s)",arrSeed[i].objPattern.strPattern);				
			for(int j=arrSeed[i].objPattern.nStart;j<=arrSeed[i].objPattern.nStop;j++)
			{
				strTask.Format(arrSeed[i].objPattern.strPattern,j);
				CPageStat::Build(objTask,strTask,NULL,true,objTemplate->GetTaskTypeX(strTask));
				CPriorizer::Push(objTask); //最初始设置种子,所以没有必要判断seen
				nCount++;
			}
		}
		else if(arrSeed[i].nType==ST_PAGE)
		{
			for(int j=0;j<(int)arrSeed[i].objPage.arrPage.size();j++)
			{
				DebugString("[CCrawlerMgr::AssignTask]","find task page supported:(%s)",arrSeed[i].objPage.arrPage[j]);				
				strTask = arrSeed[i].objPage.arrPage[j];
				CPageStat::Build(objTask,strTask,NULL,true,objTemplate->GetTaskTypeX(strTask));
				CPriorizer::Push(objTask);
				nCount++;
			}
		}
		else
		{
			DebugString("[CCrawlerMgr::AssignTask]","task is not supported:(%d)",arrSeed[i].nType);				
		}
	}

	DebugString("[CCrawlerMgr::AssignTask]","assign task finish,task count=(%d)",nCount);
}

// Seeds the work from previously stored state instead of from the template.
// AssignTask() takes this path when CPageStat::IsEmpty() reports that the
// PageStat store is not empty.
void CCrawlerMgr::AssignTaskFromPageStat()
{
	DebugString("[CCrawlerMgr::AssignTaskFromPageStat]",
	            "begin assign task");

	// Continue what the previous run did not finish. The original note says
	// the entries go into the bloom filter first - presumably done inside
	// Load(); confirm in Priorizer.
	CPriorizer::Load();

	CPageStat::Updater();
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -