// crawler.cpp
#include "stdafx.h"
#include "Crawler.h"
#include "TimeUtil.h"
#include "Template.h"
#include <WinInet.h>
#include "ThreadMgr.h"
#include "TemplateEx.h"
#include "IniUtil.h"
#include "ZLib.h"
#include "ZUtils.h"
#include <strstream>
#include <fstream>
#include "BloomFilter.h"
#include "Priorizer.h"
#include "Md5.h"
//////////////////////////////////////////////////////////////////////////
// Critical section taken by DoTaskParse around the link-extraction calls.
CIOCriticalSection CCrawler::s_csLinkParse;
// HTTP crawler instance shared by every CCrawler download routine in this file.
CHttpCrawler CCrawler::crawler;
// Downloads objTask.m_strUrl and processes the response.
//
// The shared crawler is configured from the template (timeout, accept types,
// optional user-agent), the URL is fetched with the task's referer and the
// template's optional cookie, and then, for a 200 response:
//   1. NUL bytes in the body are replaced by spaces when the template asks for it;
//   2. the body is run through NIniUtil::DecodePureUTF when it starts with a
//      UTF-8 BOM (which is then removed) or when the template's UTF-8 flag is set;
//   3. CheckContent fingerprints the body; unchanged content ends the task;
//   4. the referring page is pushed to CPriorizer unless the bloom filter
//      already contains it;
//   5. links are parsed and/or the page is stored, depending on the task type
//      the template assigns to the URL.
//
// @param objTask  page record to crawl; updated (and persisted) by CheckContent.
// @return TRUE only when the page was fetched with status 200, its content
//         changed and its task type is not TT_SKIP; FALSE in every other case.
BOOL CCrawler::HandleTask(CPageStat& objTask)
{
    // CString must be cast explicitly when passed through a printf-style "...".
    DebugString("[CCrawler::HandleTask]","task:(%s)",static_cast<const char*>(objTask.m_strUrl));

    // Per-request configuration of the shared crawler.
    crawler.SetTimeOut(CTemplate::Instance()->GetCTimeOut());
    crawler.SetAcceptType(CTemplate::Instance()->GetContentTypeX());
    if (CTemplate::Instance()->GetUserAgent().GetLength())
        crawler.SetUserAgent(CTemplate::Instance()->GetUserAgent());

    CHttpCrawler::Task task(objTask.m_strUrl,"");
    // The referer is set unconditionally, even when it is empty.
    task.SetRefer(objTask.m_strRefer);
    if (CTemplate::Instance()->GetCookie().GetLength())
        task.SetCookie(CTemplate::Instance()->GetCookie());

    // NOTE(review): the crawler is started, stopped and waited on before the
    // task is handled synchronously below - presumably to reset its worker
    // state; confirm against CHttpCrawler.
    crawler.Start();
    crawler.Stop();
    crawler.Wait(INFINITE);

    if(!crawler.HandleTask(&task))
    {
        LOGE("HandleTask crash!!!!");
        LOGE("url=(%s)",static_cast<const char*>(objTask.m_strUrl));
        LOGE("refer=(%s)",static_cast<const char*>(objTask.m_strRefer));
        DebugString("[CCrawler::HandleTask]","HandleTask crash!!!!");
        return FALSE;
    }

    if(task.GetRetCode() != 200)
    {
        DebugString("[CCrawler::HandleTask]","error ret code:(%d) URL:(%s)",task.GetRetCode(),static_cast<const char*>(objTask.m_strUrl));
        return FALSE;
    }

    CString strHeader, strBody;
    std::map<CString, CString> strHeaders;
    task.GetHeaderArray(strHeaders);
    strHeader.Append(task.GetHeader(),task.GetHeaderLen());
    strBody.Append(task.GetResponse(),task.GetResponseLen());

    // Replace 0x00 bytes in the body with 0x20 when the template requests it.
    if(CTemplate::Instance()->GetNULLInBody())
    {
        strBody.Replace('\0',' ');
    }

    // UTF-8 conversion. A body consisting of exactly the 3-byte BOM is also
    // handled (>= 3), so the BOM never leaks into the fingerprint or the store.
    const bool bHasBom = strBody.GetLength() >= 3
        && (BYTE)strBody[0] == 0xef
        && (BYTE)strBody[1] == 0xbb
        && (BYTE)strBody[2] == 0xbf;
    if(bHasBom)
    {
        // Decode in place starting after the BOM, then drop the BOM itself.
        NIniUtil::DecodePureUTF((BYTE*)strBody.GetBuffer()+3,0);
        strBody.ReleaseBuffer();
        strBody.Delete(0,3);
    }
    else if(CTemplate::Instance()->GetUtf8())
    {
        NIniUtil::DecodePureUTF((BYTE*)strBody.GetBuffer(),0);
        strBody.ReleaseBuffer();
    }

    // Fingerprint the page and update its record; stop when nothing changed.
    if(!CheckContent(objTask,strBody))
    {
        DebugString("[CCrawler::HandleTask]","content no changed. URL:(%s)",static_cast<const char*>(objTask.m_strUrl));
        return FALSE;
    }

    // The page changed: push its parent (referer) page unless the bloom
    // filter already contains it.
    if(objTask.m_strRefer.GetLength() && !CBloomFilter::Instance()->find(objTask.m_strRefer))
    {
        CPageStat objParent;
        objParent.m_strUrl = objTask.m_strRefer;
        if(!CPageStat::Get(objParent))
        {
            CPageStat::Build(objParent,objTask.m_strRefer,NULL,true,
                CTemplate::Instance()->GetTaskTypeX(objTask.m_strRefer));
        }
        CPriorizer::Push(objParent);
    }

    // Dispatch on the task type configured for this URL.
    const DWORD dwType = CTemplate::Instance()->GetTaskTypeX(objTask.m_strUrl);
    if(dwType&TT_SKIP)
    {
        DebugString("[CCrawler::HandleTask]","link skip. URL:(%s)",static_cast<const char*>(objTask.m_strUrl));
        return FALSE;
    }
    if(dwType&TT_PARSE)
    {
        DoTaskParse(objTask.m_strUrl,strBody);
    }
    if(dwType&TT_STORE)
    {
        DoTaskStore(objTask.m_strUrl,strHeader, strBody);
    }
    return TRUE;
}
// Hands a downloaded page to the template's store callbacks.
//
// The three callback variants are invoked in a fixed order, each receiving a
// progressively richer view of the page: URL only, URL + body, and
// URL + header + body.
//
// @param strUrl      URL of the page being stored.
// @param strHeader   header text of the response.
// @param strContent  body text of the response.
void CCrawler::DoTaskStore(const CString& strUrl,const CString& strHeader,const CString& strContent)
{
    // URL only.
    CTemplateEx::CallStoreCallback(strUrl);

    // URL and body.
    CTemplateEx::CallStoreCallbackEx(strUrl, strContent);

    // URL, header and body.
    CTemplateEx::CallStoreCallbackEx2(strUrl, strHeader, strContent);
}
int CCrawler::OnFindLink(const char *elem, const char *attr, struct uri *uri, void *arg)
{
CSimpleArray<CString>* arrUrls = (CSimpleArray<CString>*)arg;
char uristr[INTERNET_MAX_PATH_LENGTH]={0};
memset(uristr,0,INTERNET_MAX_PATH_LENGTH);
if (luri_recombine(uri, uristr, INTERNET_MAX_PATH_LENGTH, C_URI ) >= 0)
{
arrUrls->Add(uristr);
DebugString("[CCrawler::OnFindLink]","find link:(%d:%s)",arrUrls->GetSize(),uristr);
}
luri_destroy(uri);
lmm_free(uri);
return 1;
}
//分析网页超级链接,由于libhlink不是多线程安全的,所以这里要加锁
//由于网页分析耗时间比较多,所以要锁的作用域要小,所以把对网页的处理拿到锁的外面来处理
//2007.2.28,sunwang
void CCrawler::DoTaskParse(const CString& strUrl,const CString& strContent)
{
CSimpleArray<CString> arrUrls;
CPageStat objTask;
//parse
do
{
CIOLocker locker(&s_csLinkParse);
struct uri page_uri;
luri_parse_string(strUrl,&page_uri);
DebugString("[CCrawler::DoTaskParse]","page link:(%s)",strUrl);
lhlink_detect_string(strContent,&page_uri,OnFindLink,&arrUrls);
luri_destroy(&page_uri);
DebugString("[CCrawler::DoTaskParse]","total find (%d) links",arrUrls.GetSize());
} while(FALSE);
//analysis
for (int i=0;i<arrUrls.GetSize();i++)
{
DWORD dwType = CTemplate::Instance()->GetTaskTypeX(arrUrls[i]);
do
{
if(dwType&TT_SKIP)
{
break;
}
if(dwType&TT_REPLACE)
{
CTemplateEx::CallReplaceCallback(arrUrls[i]);
break;
}
if((dwType&TT_PARSE) || (dwType&TT_STORE))
{
if(!CBloomFilter::Instance()->find(arrUrls[i]))
{
objTask.m_strUrl=arrUrls[i];
if(!CPageStat::Get(objTask))
{
CPageStat::Build(objTask,arrUrls[i],strUrl,true,dwType);
}
CPriorizer::Push(objTask);
}
break;
}
} while(FALSE);
}
}
// Fingerprints a downloaded page and updates its crawl record.
//
// The fingerprint is the first element of the MD5 state computed over either
// the whole page, or - when one of the template's content-finger filters has
// a URL pattern matching the task URL and its finger pattern yields a
// non-empty result - over that extracted result only.
//
// Side effects on objTask (persisted with CPageStat::Put before returning):
//   - m_int64LastCrawlTime is set to now and m_wTotalCrawlCount is incremented;
//   - fingerprint unchanged: outside passive mode, m_wPriority is incremented
//     while it is below _PRIORITY_MAX_;
//   - fingerprint changed: the new fingerprint and m_int64LastModifyTime are
//     stored; outside passive mode, m_wPriority is decremented while it is
//     above _PRIORITY_MIN_ and m_wHintCount is incremented.
//
// CPageStat fields, for reference:
//   CString m_strUrl;
//   CString m_strRefer;
//   DWORD   m_dwContentFingerprint;
//   WORD    m_wPriority;
//   __int64 m_int64LastModifyTime;
//   __int64 m_int64LastCrawlTime;
//   WORD    m_wTotalCrawlCount;
//   WORD    m_wHintCount;
//
// @param objTask     page record to compare against and update.
// @param strContent  downloaded (already decoded) page body.
// @return true when the fingerprint differs from the stored one, false when
//         the content is unchanged.
bool CCrawler::CheckContent(CPageStat& objTask,CString& strContent)
{
    // Find the first filter whose URL pattern matches this task's URL.
    CString strFingerPattern;
    bool bFingerFound = false;
    const std::vector<CFingerItem>& arrFilter = CTemplate::Instance()->GetContentFingerArray();
    for (int i = 0; i < (int)arrFilter.size(); i++)
    {
        // On failure pcre_compile points lpszError at a static message owned
        // by the library; it must never be freed.
        const char* lpszError = NULL;
        int nErrorOffset = 0;
        int ovector[30] = {0};/* should be a multiple of 3 */
        pcre* re = pcre_compile(arrFilter[i].strURLPattern,0,&lpszError,&nErrorOffset,NULL);
        if(!re)
        {
            // A pattern that does not compile is skipped.
            continue;
        }

        const int rc = pcre_exec(re,NULL,objTask.m_strUrl,objTask.m_strUrl.GetLength(),0,0,ovector,30);
        pcre_free(re);

        // Any non-negative result is a match; negative codes (PCRE_ERROR_NOMATCH
        // and real errors alike) move on to the next filter.
        if(rc >= 0)
        {
            strFingerPattern = arrFilter[i].strFingerPattern;
            bFingerFound = true;
            break;
        }
    }

    // Default to fingerprinting the whole page; narrow it to the extracted
    // part when the finger pattern produces something.
    CString strFingerContent = strContent;
    if(bFingerFound)
    {
        CString strResult;
        CTemplateEx::CheckContent(strFingerPattern,strContent,strResult);
        if(strResult.GetLength() > 0)
        {
            strFingerContent = strResult;
        }
    }

    // Compute the fingerprint.
    CMD5 md5;
    md5.GenerateMD5((unsigned char*)(LPCSTR)strFingerContent,strFingerContent.GetLength());
    const DWORD dwContentFingerprint = md5.m_data[0];

    _time64(&objTask.m_int64LastCrawlTime);
    objTask.m_wTotalCrawlCount++;

    bool bResult = false;
    if(dwContentFingerprint == objTask.m_dwContentFingerprint)
    {
        // Unchanged content.
        if(!objTask.m_bPassiveMode && objTask.m_wPriority < _PRIORITY_MAX_)
        {
            objTask.m_wPriority++;
        }
    }
    else
    {
        // Changed content. A stored fingerprint of 0 is not logged as an update.
        if(objTask.m_dwContentFingerprint != 0)
        {
            // CString must be cast explicitly when passed through a printf-style "...".
            // NOTE(review): the "pattern=" field receives the fingerprinted
            // content rather than the finger pattern - confirm this is intended.
            LOG("update finger.newfinger=%d pattern=%s url=%s",
                dwContentFingerprint,
                static_cast<const char*>(strFingerContent),
                static_cast<const char*>(objTask.m_strUrl));
        }
        bResult = true;
        objTask.m_dwContentFingerprint = dwContentFingerprint;
        _time64(&objTask.m_int64LastModifyTime);
        if(!objTask.m_bPassiveMode)
        {
            if(objTask.m_wPriority > _PRIORITY_MIN_)
            {
                objTask.m_wPriority--;
            }
            // Counted on every detected change, whether or not the priority moved.
            objTask.m_wHintCount++;
        }
    }

    CPageStat::Put(objTask);
    return bResult;
}
// Forwards to FreeConnectAll() on the shared HTTP crawler instance.
void CCrawler::FreeConnectAll()
{
    CCrawler::crawler.FreeConnectAll();
}
// Downloads a page and returns its body, optionally filtered by a pattern.
//
// The shared crawler is configured with the template's timeout and optional
// user-agent. The HTTP status code is not inspected here.
//
// @param szURL      URL to download.
// @param szCookie   cookie to send; ignored when null or empty.
// @param szPattern  pattern applied to the body through
//                   CTemplateEx::CheckContent; when null the whole body is returned.
// @param strResult  receives the result; left untouched when the download fails.
void CCrawler::DownloadAndRegexPage(char* szURL,char* szCookie,char* szPattern,CString& strResult)
{
    DebugString("[CCrawler::DownloadPage]","task:(%s)",szURL);

    // Crawler-wide settings: timeout and, when configured, the user-agent.
    crawler.SetTimeOut(CTemplate::Instance()->GetCTimeOut());
    if (CTemplate::Instance()->GetUserAgent().GetLength())
    {
        crawler.SetUserAgent(CTemplate::Instance()->GetUserAgent());
    }

    // Request description: URL plus optional cookie.
    CHttpCrawler::Task task(szURL,"");
    const bool bSendCookie = szCookie && szCookie[0] != '\0';
    if (bSendCookie)
    {
        task.SetCookie(szCookie);
    }

    crawler.Start();
    crawler.Stop();
    crawler.Wait(INFINITE);
    if(!crawler.HandleTask(&task))
    {
        return;
    }

    CString strBody;
    strBody.Append(task.GetResponse(),task.GetResponseLen());

    // Without a pattern the caller gets the raw body.
    if(!szPattern)
    {
        strResult = strBody;
        return;
    }
    // Otherwise run the pattern over the body.
    CTemplateEx::CheckContent(szPattern,strBody,strResult);
}
// Downloads a page and returns its HTTP header, optionally filtered by a pattern.
//
// The shared crawler is configured with the template's timeout and optional
// user-agent. The HTTP status code is not inspected here.
//
// @param szURL      URL to download.
// @param szCookie   cookie to send; ignored when null or empty.
// @param szPattern  pattern applied to the header through
//                   CTemplateEx::CheckContent; when null the whole header is returned.
// @param strResult  receives the result; left untouched when the download fails.
void CCrawler::DownloadAndRegexHeader(char* szURL,char* szCookie,char* szPattern,CString& strResult)
{
    DebugString("[CCrawler::DownloadPageHeader]","task:(%s)",szURL);

    // Crawler-wide settings: timeout and, when configured, the user-agent.
    crawler.SetTimeOut(CTemplate::Instance()->GetCTimeOut());
    if (CTemplate::Instance()->GetUserAgent().GetLength())
    {
        crawler.SetUserAgent(CTemplate::Instance()->GetUserAgent());
    }

    // Request description: URL plus optional cookie.
    CHttpCrawler::Task task(szURL,"");
    const bool bSendCookie = szCookie && szCookie[0] != '\0';
    if (bSendCookie)
    {
        task.SetCookie(szCookie);
    }

    crawler.Start();
    crawler.Stop();
    crawler.Wait(INFINITE);
    if(!crawler.HandleTask(&task))
    {
        return;
    }

    CString strHttpHeader;
    strHttpHeader.Append(task.GetHeader(),task.GetHeaderLen());

    // Without a pattern the caller gets the raw header.
    if(!szPattern)
    {
        strResult = strHttpHeader;
        return;
    }
    // Otherwise run the pattern over the header.
    CTemplateEx::CheckContent(szPattern,strHttpHeader,strResult);
}
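// ---------------------------------------------------------------------------
// Residue from the web code viewer this file was copied from (not source code).
// Keyboard shortcuts:
//   Copy code            Ctrl + C
//   Search code          Ctrl + F
//   Full-screen mode     F11
//   Toggle theme         Ctrl + Shift + D
//   Show shortcuts       ?
//   Increase font size   Ctrl + =
//   Decrease font size   Ctrl + -
// ---------------------------------------------------------------------------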