// httpcrawler.cpp
// (code-viewer residue removed here: a "font size:" label that is not part of the source)
#include "StdAfx.h"
#include <assert.h>
#include "httpcrawler.h"
#include "ZUtils.h"
#pragma comment(lib, "ws2_32.lib")
////////////////////////////////////////////////////////////////////////////////
//
// class CHttpCrawler::Task
//
////////////////////////////////////////////////////////////////////////////////
// Creates a task for one URL and pre-splits the URL into host, port and path.
//
// szURL    - absolute URL; must not be NULL (asserted). Stored verbatim in m_rawURL.
// szHeadEx - extra header text; must not be NULL (asserted). Stored in m_szHeadEx.
//
// The URL is expected to look like "<scheme>://<host>[:<port>][/<path...>]".
// When it does not, parsing stops early: m_szHost / m_szPath are not assigned
// and m_nPort keeps its initial value 0. A default port (80) is filled in only
// for the "http" scheme; an explicit ":<port>" is converted with atoi().
CHttpCrawler::Task::Task(const char* szURL, const char* szHeadEx)
    : m_pPostData(NULL), m_szHeadEx(szHeadEx), m_szMethod("GET"), m_nPort(0), m_rawURL(szURL),m_nRetCode(0)
{
    assert(szURL != NULL);
    assert(szHeadEx != NULL);
#if 1
    CString strUrl = szURL;

    // Scheme = text before the first ':'; at least two more characters
    // (the "//") have to follow the colon.
    const int nColon = strUrl.Find(':', 0);
    if (nColon <= 0 || nColon > strUrl.GetLength() - 3)
        return;
    CString strScheme = strUrl.Mid(0, nColon);

    if (strUrl[nColon + 1] != '/' || strUrl[nColon + 2] != '/')
        return;

    // The authority ("host[:port]") starts right after "://".
    const int nAuthority = nColon + 3;
    const int nSlash = strUrl.Find('/', nAuthority);
    if (nSlash == 0)
        return;

    CString strAuthority;
    if (nSlash < 0)
    {
        // No path component at all: request the root.
        strAuthority = strUrl.Mid(nAuthority);
        m_szPath = "/";
    }
    else
    {
        // Everything from the first '/' on (including any query) is the path.
        strAuthority = strUrl.Mid(nAuthority, nSlash - nAuthority);
        m_szPath = strUrl.Mid(nSlash);
    }

    const int nPortSep = strAuthority.Find(':');
    if (nPortSep < 0)
    {
        m_szHost = strAuthority;
        if (strScheme.CompareNoCase("http") == 0)
        {
            m_nPort = 80;
        }
        return;
    }

    m_szHost = strAuthority.Mid(0, nPortSep);
    m_nPort = atoi(strAuthority.Mid(nPortSep + 1));
#else
    // Disabled alternative that delegates URL parsing to the luri_* helpers.
    struct uri task_uri;
    if (::luri_parse_string(szURL, &task_uri) > 0)
    {
        if (task_uri.authority->host != 0)
            m_szHost = task_uri.authority->host;
        if (task_uri.path != 0)
            m_szPath = task_uri.path;
        if (task_uri.query != 0)
        {
            m_szPath += "?";
            m_szPath += task_uri.query;
        }
        if (task_uri.authority->port != 0)
            m_nPort = atoi(task_uri.authority->port);
        else if (0 == strcmpi(task_uri.scheme, "http"))
            m_nPort = 80;
        luri_destroy(&task_uri);
    }
#endif
}
// Frees the POST payload copy owned by this task, if one was set.
CHttpCrawler::Task::~Task()
{
    // delete[] on a null pointer is a no-op, so no explicit check is needed.
    delete[] m_pPostData;
}
// Selects the HTTP method string stored in m_szMethod:
// "POST" when enabled is true, "GET" otherwise.
void CHttpCrawler::Task::EnabledPostMethod(bool enabled/* = true*/)
{
    m_szMethod = enabled ? "POST" : "GET";
}
// Replaces the task's POST payload with a private copy of the given bytes.
//
// pPostData - source bytes; may be NULL, in which case the payload becomes empty.
// size      - number of bytes to copy from pPostData.
//
// The copy is always followed by one extra 0 byte, so m_pPostData can also be
// read as a C string. Note that the payload length itself is not recorded here.
//
// The new buffer is fully built before the old one is released. That keeps
// m_pPostData valid if the allocation throws (the destructor would otherwise
// delete a dangling pointer) and makes it safe to pass the task's current
// buffer as the source.
void CHttpCrawler::Task::SetPostData(BYTE* pPostData, size_t size)
{
    if (pPostData == NULL)
        size = 0;   // memcpy from a null pointer is undefined, even for 0 bytes

    BYTE* pCopy = new BYTE[size + 1];
    if (size > 0)
        memcpy(pCopy, pPostData, size);
    pCopy[size] = 0;

    delete[] m_pPostData;
    m_pPostData = pCopy;
}
void CHttpCrawler::Task::SetCookie(const char* szCookie)
{
m_szCookie = szCookie;
}
void CHttpCrawler::Task::SetRefer(const char* szRefer)
{
m_szRefer = szRefer;
}
// Splits the parsed "set-cookie" response header into name/value pairs.
//
// cookies - receives one entry per "name=value" item found; existing entries
//           with the same name are overwritten, other entries are kept.
// Returns the total number of entries in `cookies` after the call.
//
// The header value is split on ';', and each piece on its first '='. Names and
// values are trimmed. Pieces without '=' (e.g. "HttpOnly") or with an empty
// value are skipped.
//
// Fixes over the previous version:
//  * CString::Tokenize leaves its index just AFTER the delimiter, i.e. already
//    on the first value character. The old code used Mid(index + 1), which cut
//    the first character off every value and dropped one-character values.
//  * The header is looked up with find() so that a missing "set-cookie" header
//    no longer inserts an empty entry into m_head_m.
int CHttpCrawler::Task::GetResponseCookies(map<CString, CString>& cookies)
{
    map<CString, CString>::const_iterator itHeader = m_head_m.find("set-cookie");
    if (itHeader == m_head_m.end())
        return (int)cookies.size();

    const CString& strHeader = itHeader->second;
    int nPiecePos = 0;
    for (;;)
    {
        CString strPiece = strHeader.Tokenize(";", nPiecePos);
        if (strPiece.IsEmpty())
            break;  // Tokenize returns an empty string once the input is exhausted

        int nValuePos = 0;
        CString strName = strPiece.Tokenize("=", nValuePos);
        strName.Trim();
        if (strName.IsEmpty())
            continue;

        // nValuePos is past the end when the piece has no '=' and equal to
        // the length when nothing follows the '='.
        if (nValuePos <= 0 || nValuePos >= strPiece.GetLength())
            continue;

        CString strValue = strPiece.Mid(nValuePos);
        strValue.Trim();
        cookies[strName] = strValue;
    }
    return (int)cookies.size();
}
////////////////////////////////////////////////////////////////////////////////
//
// class CHttpCrawler
//
////////////////////////////////////////////////////////////////////////////////
// Sets up the crawler: creates the I/O completion port used as the task queue,
// initialises libcurl, and creates the worker threads in suspended state
// (they begin running when Start() resumes them).
//
// thread_num - number of worker threads; asserted to be in 1..1023.
// task_max   - upper bound checked by AddTask() against the pending-task count.
// timeout    - value handed to libcurl as connect and total timeout.
CHttpCrawler::CHttpCrawler(UINT thread_num/* = 5*/, UINT task_max/* = 0xFFFFFFFF*/, UINT timeout/* = 10*/)
    : m_thread_num(thread_num)
    , m_bGzip(true)
    , m_hThreads(NULL)
    , m_notify(NULL)
    , m_task_num(0)
    , m_iocp(::CreateIoCompletionPort(INVALID_HANDLE_VALUE, 0, 0, 0))
    , m_szUserAgent("Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; EmbeddedWB 14.52; .NET CLR 1.1.4322; .NET CLR 2.0.50727)")
    , m_szAcceptType("*/*")
    , m_task_max(task_max)
    , m_timeout(timeout)
    , m_bStop(false)
{
    assert(m_iocp != NULL);
    assert(m_thread_num > 0 && m_thread_num < 1024);

    ::curl_global_init(CURL_GLOBAL_WIN32);

    m_hThreads = new HANDLE[thread_num];
    UINT idx = 0;
    while (idx < m_thread_num)
    {
        // Each worker receives `this` as its argument and stays suspended for now.
        HANDLE hWorker = ::CreateThread(0, 0, (LPTHREAD_START_ROUTINE)WorkerThreadProc, this, CREATE_SUSPENDED, 0);
        m_hThreads[idx] = hWorker;
        assert(m_hThreads[idx] != NULL);
        ++idx;
    }
}
// Shuts the crawler down and releases everything it owns.
//
// Sequence: request stop, wait for every worker thread to exit, then close the
// thread handles and the completion port, free the per-thread curl handles
// cached by HandleTask(), and finally undo curl_global_init().
//
// Fix: the previous version called Stop() followed by Wait(INFINITE), but
// Wait() returns immediately once m_bStop is set, so nothing was actually
// waited for and workers could still be running while their handles, the
// completion port and libcurl were being torn down. The wait is now done
// directly on each thread handle. Waiting per handle also avoids the
// MAXIMUM_WAIT_OBJECTS limit of WaitForMultipleObjects.
CHttpCrawler::~CHttpCrawler(void)
{
    // Posts one "exit" packet (key 0) per worker unless that already happened.
    Stop();

    for (UINT i = 0; i < m_thread_num; ++i)
    {
        HANDLE hWorker = m_hThreads[i];
        if (hWorker == NULL)
            continue;   // CreateThread failed for this slot

        // Workers are created suspended; if Start() was never called they must
        // be resumed so they can pick up their exit packet. Resuming a thread
        // that is already running has no effect.
        ::ResumeThread(hWorker);
        ::WaitForSingleObject(hWorker, INFINITE);
        ::CloseHandle(hWorker);
    }
    delete[] m_hThreads;

    CloseHandle(m_iocp);
    FreeConnectAll();
    ::curl_global_cleanup();
}
void CHttpCrawler::Start(void)
{
if (m_bStop)
return;
for (UINT i = 0; i < m_thread_num; ++i)
{
ResumeThread(m_hThreads[i]);
}
}
void CHttpCrawler::Stop(void)
{
if (m_bStop)
return;
m_bStop = true;
for (UINT i = 0; i < m_thread_num; ++i)
{
::PostQueuedCompletionStatus(m_iocp, 0, 0, 0);
}
}
// Waits until all worker threads have terminated or the timeout elapses.
//
// dwTimeout - timeout passed to WaitForMultipleObjects (INFINITE allowed).
// Returns false only when the wait timed out; every other wait result,
// including a failed wait, yields true.
//
// NOTE(review): once Stop() has been called this returns true immediately
// without waiting for the workers, so "Stop(); Wait(...)" does not actually
// block until the threads are gone - confirm whether that is intended.
bool CHttpCrawler::Wait(DWORD dwTimeout)
{
    if (!m_bStop)
    {
        const DWORD dwResult = WaitForMultipleObjects(m_thread_num, m_hThreads, TRUE, dwTimeout);
        return dwResult != WAIT_TIMEOUT;
    }
    return true;
}
void CHttpCrawler::SetUserAgent(const char* szAgent)
{
assert(szAgent != NULL);
m_szUserAgent = szAgent;
}
void CHttpCrawler::SetAcceptType(const char* szAccept)
{
assert(szAccept != NULL);
m_szAcceptType = szAccept;
}
// Registers the observer whose OnCompleted() is invoked by ProcessTask() for
// each successfully fetched task. Pass NULL to disable notifications.
void CHttpCrawler::SetNotify(CHttpCrawlerNotify* pNotify)
{
    this->m_notify = pNotify;
}
void CHttpCrawler::EnabledGzip(bool bEnabled/* = true*/)
{
m_bGzip = bEnabled;
}
bool CHttpCrawler::AddTask(const char* szURL, const char* szHeadEx)
{
if (m_bStop)
return false;
if ((UINT)InterlockedExchange(&m_task_num, m_task_num) < m_task_max)
{
InterlockedIncrement(&m_task_num);
::PostQueuedCompletionStatus(m_iocp, 0, reinterpret_cast<ULONG_PTR>(new CHttpCrawler::Task(szURL, szHeadEx)), 0);
}
else
{
return false;
}
return true;
}
bool CHttpCrawler::ParseHead(const char* buffer, int size, map<CString, CString>& head_m,int* pRetCode)
{
assert(buffer != NULL);
assert(size > 0);
const char* temp = strstr(buffer, "\r\n\r\n");
if (temp && temp-buffer < size)
{
temp = buffer;
const char* splitc = strchr(temp, ' ');
const char* splitl = NULL;
if(splitc)
{
splitl = strchr(splitc+1, ' ');
if(splitl)
{
CString strRetCode(splitc+1, splitl-splitc-1);
*pRetCode = atoi(strRetCode);
}
}
for(;;)
{
temp = strstr(temp, "\r\n");
if (temp == NULL || (temp[2] == '\r' && temp[3] == '\n'))
break;
temp += 2;
const char* splitc = strchr(temp, ':');
const char* splitl = strchr(temp, '\r');
if (splitc && splitl && splitc<splitl)
{
CString key(temp, splitc-temp);
CString value(splitc+1, splitl-splitc-1);
key.MakeLower();
value.Trim();
if (head_m.find(key) != head_m.end())
{
head_m[key].Append(";");
head_m[key].Append(value);
}
else
{
head_m[key] = value;
}
}
}
}
else
{
return false;
}
return true;
}
// Worker thread entry point.
//
// pThis - the crawler that owns this worker.
// Always returns 0.
//
// Each worker owns one curl easy handle and loops on the crawler's completion
// port. A packet whose key is non-zero carries a Task*: the task is processed,
// deleted, and the pending-task counter is decremented. A packet with key 0
// (posted by Stop()) ends the thread.
//
// Fix: the result of curl_easy_init() was not checked, so a failed
// initialisation passed a NULL handle to ProcessTask(). Initialisation is now
// retried before each task; while no handle is available the task is dropped
// (it is still deleted and counted as done).
DWORD CALLBACK CHttpCrawler::WorkerThreadProc(CHttpCrawler* pThis)
{
    CURL *curl = ::curl_easy_init();
    for(;;)
    {
        DWORD dwNumberOfBytes = 0;
        LPOVERLAPPED pOverlapped = NULL;
        ULONG_PTR pKey = NULL;
        if (!::GetQueuedCompletionStatus(pThis->m_iocp, &dwNumberOfBytes, &pKey, &pOverlapped, INFINITE))
        {
            // get queue failed!
            assert(false);
            break;
        }
        if (pKey == 0)
            break; // exit request: leave the thread

        CHttpCrawler::Task* pTask = reinterpret_cast<CHttpCrawler::Task*>(pKey);
        if (curl == NULL)
            curl = ::curl_easy_init();  // earlier initialisation failed; try again
        if (curl != NULL)
            pThis->ProcessTask(curl, pTask);
        delete pTask;
        ::InterlockedDecrement(&pThis->m_task_num);
    }
    if (curl != NULL)
        ::curl_easy_cleanup(curl);
    return 0;
}
// libcurl header callback: appends the received header bytes to a CBuffer.
//
// pData        - received bytes.
// nSize, nMemb - element size and count; the chunk is nSize*nMemb bytes.
// pArg         - the CBuffer* registered via CURLOPT_HEADERDATA.
// Returns the number of bytes stored, or 0 when pData or pArg is NULL.
int CHttpCrawler::libcurl_head_callback(BYTE *pData, int nSize, int nMemb,void *pArg)
{
    CBuffer* pSink = (CBuffer*) pArg;
    if (pSink == NULL || pData == NULL)
        return 0;

    const int cbChunk = nSize*nMemb;
    pSink->Write(pData, cbChunk);
    return cbChunk;
}
// libcurl write callback: appends the received body bytes to a CBuffer.
//
// pData        - received bytes.
// nSize, nMemb - element size and count; the chunk is nSize*nMemb bytes.
// pArg         - the CBuffer* registered via CURLOPT_WRITEDATA.
// Returns the number of bytes stored, or 0 when pData or pArg is NULL.
int CHttpCrawler::libcurl_body_callback(BYTE *pData, int nSize, int nMemb,void *pArg)
{
    CBuffer* pSink = (CBuffer*) pArg;
    if (pSink == NULL || pData == NULL)
        return 0;

    const int cbChunk = nSize*nMemb;
    pSink->Write(pData, cbChunk);
    return cbChunk;
}
// Processes a task synchronously on the calling thread.
//
// pTask - task to process; ownership stays with the caller.
// Returns the result of ProcessTask(), or false when no curl handle could be
// created for the calling thread.
//
// One curl easy handle per calling thread id is cached in m_curl_connect and
// released by FreeConnectAll().
//
// NOTE(review): m_curl_connect is accessed here without any locking visible in
// this file - confirm that callers serialise calls from different threads.
//
// Fix: a failed curl_easy_init() used to be cached as NULL and then passed to
// ProcessTask() on this and every later call from the same thread. A NULL
// handle is now neither cached nor used, so a later call can retry.
bool CHttpCrawler::HandleTask(CHttpCrawler::Task* pTask)
{
    const DWORD threadid = ::GetCurrentThreadId();

    CURL *curl = NULL;
    map<DWORD, CURL*>::iterator itr = m_curl_connect.find(threadid);
    if (itr != m_curl_connect.end())
        curl = itr->second;

    if (curl == NULL)
    {
        curl = ::curl_easy_init();
        if (curl == NULL)
            return false;
        m_curl_connect[threadid] = curl;
    }
    return ProcessTask(curl, pTask);
}
// Fetches the task's URL with libcurl and stores the result in the task.
//
// curl  - easy handle to use; it is reset at the start of every attempt.
// pTask - task describing the request; receives the parsed headers
//         (m_head_m, m_nRetCode), the raw header text (m_httpHeader) and the
//         body (m_szResponseBody).
// Returns true only when the transfer succeeded, the header could be parsed
// and the status code is 200; in that case m_notify->OnCompleted(pTask) is
// invoked first (if a notify object is set).
//
// A transfer that fails at the libcurl level is retried, up to 3 attempts in
// total. A transfer that completes with an unparsable header or a status
// other than 200 is not retried.
//
// The request uses the task's URL and referrer plus the crawler's User-Agent
// and timeout. The task's cookie, extra headers, method and POST data, and the
// crawler's accept type and gzip flag are not consulted in this function.
//
// Fixes over the previous version:
//  * curl_easy_setopt() is a C varargs function: string options need a
//    `char*` and numeric options a `long`. The URL and User-Agent were passed
//    as CString objects and the timeout as UINT; they are now converted
//    explicitly, like the referrer already was.
//  * NULL curl/pTask no longer crash in release builds.
bool CHttpCrawler::ProcessTask(CURL* curl, CHttpCrawler::Task* pTask)
{
    assert(curl != NULL);
    assert(pTask != NULL);
    if (curl == NULL || pTask == NULL)
        return false;

    bool bRet = false;
#ifdef _DEBUG
    pTask->m_dwStartTick = ::GetTickCount();
#endif
    const long nTimeout = static_cast<long>(m_timeout);
    int retry = 0;
    do {
        CBuffer body,head;
        CURLcode res = CURLE_OK;
        curl_slist *slist = NULL;

        // Start every attempt from a clean option set.
        curl_easy_reset(curl);

        //if (pTask->m_szRefer.GetLength() > 0)
        curl_easy_setopt(curl, CURLOPT_REFERER, (LPCSTR)pTask->m_szRefer);
        curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, libcurl_body_callback);
        curl_easy_setopt(curl, CURLOPT_WRITEDATA, &body);
        curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, libcurl_head_callback);
        curl_easy_setopt(curl, CURLOPT_HEADERDATA, &head);
        curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, nTimeout);
        curl_easy_setopt(curl, CURLOPT_TIMEOUT, nTimeout);
        //curl_easy_setopt(curl, CURLOPT_COOKIE, pTask->getc);

        // Request line and headers.
        curl_easy_setopt(curl, CURLOPT_URL, (LPCSTR)pTask->m_rawURL);
        curl_easy_setopt(curl, CURLOPT_USERAGENT, (LPCSTR)m_szUserAgent);
        curl_easy_setopt(curl, CURLOPT_ENCODING, "gzip, deflate");
        slist = curl_slist_append(slist, "Connection: Keep-Alive");
        slist = curl_slist_append(slist, "Accept-Language: zh-cn");
        curl_easy_setopt(curl, CURLOPT_HTTPHEADER, slist);

        res = curl_easy_perform(curl);
        curl_slist_free_all(slist);
#ifdef _DEBUG
        pTask->m_dwStopTick = ::GetTickCount();
#endif
        if (CURLE_OK != res)
            continue;   // transfer failed: try again (the loop condition limits attempts)

        if (ParseHead((char*)head.GetBuffer(), head.GetBufferLen(), pTask->m_head_m, &pTask->m_nRetCode) &&
            pTask->m_nRetCode == 200)
        {
            pTask->m_httpHeader.Append((char*)head.GetBuffer(), head.GetBufferLen());
            if (pTask->m_head_m["content-encoding"].Find("gzip") >= 0)
            {
                // Un-gzip via zlib (deflate algorithm). The output buffer is
                // sized at 25x the compressed size.
                size_t dwIn = body.GetBufferLen();
                size_t dwOut = dwIn*25;
                if (dwIn > 0)
                {
                    char* pBuffer = new char[dwOut];
                    UnZFilter ungzip;
                    if (ungzip(body.GetBuffer(), dwIn, (unsigned char*)pBuffer, dwOut))
                    {
                        // Only accept the result when there is room left for
                        // the terminating NUL.
                        if (dwOut < dwIn*25)
                        {
                            pBuffer[dwOut]=0;
                            pTask->m_szResponseBody = pBuffer;
                        }
                    }
                    delete[] pBuffer;
                }
            }
            else
            {
                pTask->m_szResponseBody.Empty();
                pTask->m_szResponseBody.Append((char*)body.GetBuffer(),body.GetBufferLen());
            }
            if (m_notify)
                m_notify->OnCompleted(pTask);
            bRet = true;
        }
        break; // done: the transfer itself completed, so no retry
    }while((++retry) < 3);
    return bRet;
}
// Releases every curl easy handle cached in m_curl_connect (one per thread id,
// filled by HandleTask()) and empties the cache.
void CHttpCrawler::FreeConnectAll()
{
    for (map<DWORD, CURL*>::iterator itr = m_curl_connect.begin(); itr != m_curl_connect.end(); itr++)
    {
        ::curl_easy_cleanup(itr->second);
    }
    m_curl_connect.clear();
}
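// NOTE: everything below is residue from the web code viewer this file was
// copied from (its keyboard-shortcut help); it is not part of the program.
//   Keyboard shortcuts
//   Copy code:          Ctrl + C
//   Search code:        Ctrl + F
//   Full-screen mode:   F11
//   Toggle theme:       Ctrl + Shift + D
//   Show shortcuts:     ?
//   Increase font size: Ctrl + =
//   Decrease font size: Ctrl + -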