// crawlermgr.cpp
#include "stdafx.h"
#include "CrawlerMgr.h"
#include "Crawler.h"
#include "Template.h"
#include "IniUtil.h"
#include "StoreMgr.h"
#include "Priorizer.h"
#include "BloomFilter.h"
// Constructor: resets the task bookkeeping members and calls the static
// start-up routines of the priorizer, the page-stat class and the bloom
// filter, in that order.
CCrawlerMgr::CCrawlerMgr()
{
    DebugString("[CCrawlerMgr::CCrawlerMgr]","construct");

    // Nothing has been assigned, is in flight, or has failed yet.
    m_bTaskAssigned = FALSE;
    m_arrTaskError.clear();
    m_arrTaskDoing.clear();

    // 2007.3.21
    CPriorizer::Init();
    CPageStat::Init();
    // Return value is deliberately unused here; presumably this call only
    // forces the bloom-filter instance to exist -- confirm in BloomFilter.h.
    CBloomFilter::Instance();
}
// Destructor: clears the task bookkeeping members, then calls the static
// tear-down routines of the crawler, the store manager, the priorizer, the
// page-stat class and the bloom filter, in that order.
CCrawlerMgr::~CCrawlerMgr()
{
    DebugString("[CCrawlerMgr::~CCrawlerMgr]","destruct");

    // Drop whatever bookkeeping is left.
    m_bTaskAssigned = FALSE;
    m_arrTaskError.clear();
    m_arrTaskDoing.clear();

    // sunwang, 2007.2.1
    CCrawler::FreeConnectAll();

    // sunwang, 2007.3.20
    CStoreMgr::FreeDB();

    // 2007.3.21
    CPriorizer::Fini();
    CPageStat::Fini();
    CBloomFilter::Release();
}
// Delegates to CThreadMgr::StopAll() and then writes a debug line.
// NOTE(review): the log text says "invoke FreeConnectAll", but no such call is
// made in this function; CCrawler::FreeConnectAll() is called from the
// destructor instead. Confirm whether the message is stale.
void CCrawlerMgr::StopAll()
{
    CThreadMgr::StopAll();

    DebugString("[CCrawlerMgr::StopAll]","invoke FreeConnectAll");
}
// One iteration of a worker's loop.
//
// nIndex  - index of the calling worker. Worker 0 is the one that assigns the
//           initial tasks and the only one that may report "all work done".
// Returns - TRUE when the worker should stop (exit flag set, or worker 0 found
//           nothing queued and nothing in flight); FALSE when the caller
//           should invoke DoWork again.
//
// NOTE(review): m_bTaskAssigned, m_arrTaskDoing and m_arrTaskError are read
// and modified here with no locking visible in this function. If several
// workers call DoWork concurrently, confirm synchronisation exists elsewhere.
// NOTE(review): objTask.m_strUrl is handed to a "%s" placeholder through
// DebugString's argument list; if it is a CString, verify that DebugString's
// signature makes this safe (an explicit LPCTSTR cast is the usual practice).
BOOL CCrawlerMgr::DoWork(int nIndex)
{
    const bool bFirstWorker = (nIndex == 0);

    //DebugString("[CCrawlerMgr::DoWork]","Worker(%d) doing...",nIndex);

    // The thread has been told to exit.
    if (GetExitFlag())
    {
        DebugString("[CCrawlerMgr::DoWork]","Worker(%d) stop for exitflag",nIndex);
        return TRUE;
    }

    if (!m_bTaskAssigned)
    {
        // The first worker performs the task assignment.
        if (bFirstWorker)
        {
            DebugString("[CCrawlerMgr::DoWork]","Worker(%d) assign task now",nIndex);
            AssignTask();
            return FALSE;
        }

        // Every other worker idles until the tasks have been assigned.
        DebugString("[CCrawlerMgr::DoWork]","Worker(%d) loop for dont assign task",nIndex);
        Sleep(1000);
        return FALSE;
    }

    // Too many tasks are already being processed: back off briefly.
    // NOTE(review): the limit test is "size <= GetConnect()", so one more task
    // than GetConnect() can be in flight -- confirm this is intended.
    if ((int)m_arrTaskDoing.size() > CTemplate::Instance()->GetConnect())
    {
        DebugString("[CCrawlerMgr::DoWork]","Worker(%d) loop for too many task in duque",nIndex);
        Sleep(50);

        // Re-check the exit request after the short sleep.
        if (GetExitFlag())
        {
            DebugString("[CCrawlerMgr::DoWork]","Worker(%d) stop for exitflag",nIndex);
            return TRUE;
        }

        // Still need to keep looping.
        return FALSE;
    }

    // Tasks are assigned and there is room to process another one.
    CPageStat objTask;
    if (!CPriorizer::Pop(objTask))
    {
        // Nothing queued. Worker threads may not stop on their own; only the
        // main worker (index 0) may, and only when no task is still pending.
        // 2007.2.5
        if (bFirstWorker && m_arrTaskDoing.size()==0)
        {
            DebugString("[CCrawlerMgr::DoWork]","Worker(%d) task is over so stop",nIndex);
            return TRUE;
        }

        Sleep(1000);
        return FALSE;
    }

    DebugString("[CCrawlerMgr::DoWork]","Worker(%d) get task:(%s)",nIndex,objTask.m_strUrl);

    // Track the URL as in flight for the duration of the handling.
    m_arrTaskDoing.insert(objTask.m_strUrl);

    const BOOL bHandled = HandleTask(objTask);
    if (!bHandled)
    {
        DebugString("[CCrawlerMgr::DoWork]","Worker(%d) put task to Error:(%s)",nIndex,objTask.m_strUrl);
        m_arrTaskError.insert(objTask.m_strUrl);
    }
    else
    {
        DebugString("[CCrawlerMgr::DoWork]","Worker(%d) put task to Done:(%s)",nIndex,objTask.m_strUrl);
    }

    m_arrTaskDoing.erase(objTask.m_strUrl);
    return FALSE;
}
// Processes a single task by forwarding it to CCrawler::HandleTask and
// returning that call's result unchanged.
BOOL CCrawlerMgr::HandleTask(CPageStat& objTask)
{
    const BOOL bResult = CCrawler::HandleTask(objTask);
    return bResult;
}
//////////////////////////////////////////////////////////////////////////
// Performs the one-time task assignment. If tasks were not assigned yet, the
// seeds come from the template when CPageStat::IsEmpty() reports true, and
// from the page-stat path otherwise. m_bTaskAssigned is set to TRUE on every
// call, whatever path was taken.
void CCrawlerMgr::AssignTask()
{
    if (!m_bTaskAssigned)
    {
        DebugString("[CCrawlerMgr::AssignTask]","begin assign task");

        // Check whether the PageStat store is empty.
        if (!CPageStat::IsEmpty())
        {
            AssignTaskFromPageStat();
        }
        else
        {
            AssignTaskFromTemplate();
        }
    }

    m_bTaskAssigned = TRUE;
}
void CCrawlerMgr::AssignTaskFromTemplate()
{
DebugString("[CCrawlerMgr::AssignTaskFromTemplate]","begin assign task");
CPageStat objTask;
CString strTask;
int nCount = 0;
CTemplate* objTemplate = CTemplate::Instance();
if(g_nSid!=objTemplate->GetSid())
{
DebugString("[CCrawlerMgr::AssignTask]","stop for g_nSid(%d)!=objTemplate->GetSid()(%d)",g_nSid,objTemplate->GetSid());
return;
}
const std::vector<CSeedItem>& arrSeed = objTemplate->GetSeedArray();
for(int i=0;i<(int)arrSeed.size();i++)
{
if(arrSeed[i].nType==ST_PATTERN)
{
DebugString("[CCrawlerMgr::AssignTask]","find seed pattern supported:(%s)",arrSeed[i].objPattern.strPattern);
for(int j=arrSeed[i].objPattern.nStart;j<=arrSeed[i].objPattern.nStop;j++)
{
strTask.Format(arrSeed[i].objPattern.strPattern,j);
CPageStat::Build(objTask,strTask,NULL,true,objTemplate->GetTaskTypeX(strTask));
CPriorizer::Push(objTask); //最初始设置种子,所以没有必要判断seen
nCount++;
}
}
else if(arrSeed[i].nType==ST_PAGE)
{
for(int j=0;j<(int)arrSeed[i].objPage.arrPage.size();j++)
{
DebugString("[CCrawlerMgr::AssignTask]","find task page supported:(%s)",arrSeed[i].objPage.arrPage[j]);
strTask = arrSeed[i].objPage.arrPage[j];
CPageStat::Build(objTask,strTask,NULL,true,objTemplate->GetTaskTypeX(strTask));
CPriorizer::Push(objTask);
nCount++;
}
}
else
{
DebugString("[CCrawlerMgr::AssignTask]","task is not supported:(%d)",arrSeed[i].nType);
}
}
DebugString("[CCrawlerMgr::AssignTask]","assign task finish,task count=(%d)",nCount);
}
// Assignment path used when the page-stat store is not empty: calls
// CPriorizer::Load() followed by CPageStat::Updater().
void CCrawlerMgr::AssignTaskFromPageStat()
{
    DebugString("[CCrawlerMgr::AssignTaskFromPageStat]","begin assign task");

    // Original note: continue the crawl that was left unfinished last time,
    // feeding the bloom filter first (presumably done inside Load() -- confirm).
    CPriorizer::Load();

    CPageStat::Updater();
}
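// Code-viewer page residue (not part of the program) -- keyboard shortcuts:
//   Copy code:              Ctrl + C
//   Search code:            Ctrl + F
//   Full-screen mode:       F11
//   Toggle theme:           Ctrl + Shift + D
//   Show shortcuts:         ?
//   Increase font size:     Ctrl + =
//   Decrease font size:     Ctrl + -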