// httpcrawler.h
#pragma once
#include <winsock2.h>
#include <map>
#include <list>
#include <atlstr.h>
#include "Buffer.h"
using std::map;
using std::list;
using std::pair;
using std::make_pair;
class CHttpCrawlerNotify;
class CHttpCrawler
{
class Connection;
public:
class Task;
public:
explicit CHttpCrawler(UINT thread_num = 5, UINT task_max = 0xFFFFFFFF, UINT timeout = 10);
~CHttpCrawler(void);
void Start(void);
void Stop(void);
bool Stoped(void) { return m_bStop; }
bool Wait(DWORD dwTimeout);
void SetUserAgent(const char* szAgent);
void SetAcceptType(const char* szAccept);
void SetNotify(CHttpCrawlerNotify* pNotify);
void SetTimeOut(int timeout){m_timeout=timeout;}
void EnabledGzip(bool bEnabled = true);
bool AddTask(const char* szURL, const char* szHeadEx);
long GetTaskCount(void) { return InterlockedExchange(&m_task_num, m_task_num); }
bool HandleTask(CHttpCrawler::Task* pTask);
void FreeConnectAll();
protected:
bool ProcessTask(CURL* curl, CHttpCrawler::Task* pTask);
bool ParseHead(const char* buffer, int size, map<CString, CString>& head_m,int* pRetCode);
private:
static int libcurl_body_callback(BYTE *pData, int nSize, int nMemb,void *pArg);
static int libcurl_head_callback(BYTE *pData, int nSize, int nMemb,void *pArg);
private:
bool m_bStop;
UINT m_timeout; // min
UINT m_task_max; // max task number
UINT m_thread_num; // working thread number
long volatile m_task_num; // current task number
HANDLE m_iocp; // task queue
HANDLE* m_hThreads; // thread handle
bool m_bGzip; // allow gzip encode
CString m_szAcceptType; // http accept
CString m_szUserAgent; // http user agent
CHttpCrawlerNotify* m_notify; // complated notify object pointer
map<DWORD, CURL*> m_curl_connect;
private:
static DWORD CALLBACK WorkerThreadProc(CHttpCrawler* pThis);
private:
// don't cpy object
CHttpCrawler(const CHttpCrawler& cpy);
CHttpCrawler& operator=(const CHttpCrawler& cpy);
};
// notify
// Notification interface for CHttpCrawler.
//
// Implementations override OnCompleted. The hook is private, so only
// CHttpCrawler (a friend) can invoke it.
class CHttpCrawlerNotify
{
    friend class CHttpCrawler;

public:
    // Virtual so that destroying an implementation through a
    // CHttpCrawlerNotify* is well defined.
    virtual ~CHttpCrawlerNotify() {}

private:
    // Called by CHttpCrawler with the task being reported.
    // NOTE(review): the calling thread and the lifetime of pTask after the
    // call are decided by the implementation file - confirm before retaining
    // the pointer.
    virtual void OnCompleted(CHttpCrawler::Task* pTask) = 0;
};
// task
// One crawl task: the request description (URL, host, path, method, extra
// headers, cookie, referer, post data) together with the response data
// (header text, parsed header map, body, return code) stored in its members.
//
// CHttpCrawler is a friend and accesses the private members directly.
//
// NOTE(review): the class holds a raw LPBYTE (m_pPostData) and declares a
// destructor, but copying is not disabled. If the destructor releases
// m_pPostData, copying a Task would be unsafe - confirm in the implementation.
class CHttpCrawler::Task
{
    friend class CHttpCrawler;

public:
    explicit Task(const char* szURL, const char* szHeadEx);
    ~Task();

    // Returns the raw URL string held in m_rawURL.
    const char* GetURL(void) const { return m_rawURL; }

    void EnabledPostMethod(bool enabled = true);
    void SetPostData(BYTE* pPostData, size_t size);

    // Plain accessors for the stored request parts. The returned pointers
    // refer to the CString members and stay valid only while the Task lives
    // and the corresponding member is not modified.
    const char* GetHost(void) const   { return m_szHost; }
    const char* GetPath(void) const   { return m_szPath; }
    const char* GetMethod(void) const { return m_szMethod; }

    // Response body buffer and its length as reported by CString::GetLength().
    const char* GetResponse(void) const { return static_cast<LPCSTR>(m_szResponseBody); }
    int GetResponseLen(void) const      { return m_szResponseBody.GetLength(); }

    // Header text and its length as reported by CString::GetLength().
    const char* GetHeader(void) const { return static_cast<LPCSTR>(m_httpHeader); }
    int GetHeaderLen(void) const      { return m_httpHeader.GetLength(); }

    // Returns the value stored under the "content-type" key of the header
    // map, or an empty string when that key is absent.
    //
    // Uses find() rather than operator[]: operator[] would insert an empty
    // "content-type" entry on a miss, so a getter would silently modify the
    // map and the extra entry would show up in GetHeaderArray().
    const char* GetContentType(void) const
    {
        map<CString, CString>::const_iterator it = m_head_m.find(CString("content-type"));
        return (it != m_head_m.end()) ? static_cast<LPCSTR>(it->second) : "";
    }

    void SetCookie(const char* szCookie);
    void SetRefer(const char* szRefer);
    int GetResponseCookies(map<CString, CString>& cookies);

    // Copies the whole header map into `array`, replacing its previous content.
    void GetHeaderArray(map<CString, CString>& array) const { array = m_head_m; }

    // Returns the stored return code (m_nRetCode).
    int GetRetCode() const { return m_nRetCode; }

#ifdef _DEBUG
    // Debug-only tick values; public so callers can read and write them directly.
    DWORD m_dwStartTick;
    DWORD m_dwStopTick;
#endif //_DEBUG

private:
    map<CString, CString> m_head_m;          // header map; GetContentType looks up "content-type" here
    CString               m_szResponseBody;
    CString               m_rawURL;
    CString               m_szHost;
    u_short               m_nPort;
    CString               m_szPath;
    CString               m_szMethod;
    CString               m_szHeadEx;
    CString               m_szCookie;
    CString               m_szRefer;
    CString               m_httpHeader;
    LPBYTE                m_pPostData;       // raw pointer; ownership is decided in the implementation file
    int                   m_nRetCode;
};
// socket connection
// Socket connection helper nested inside CHttpCrawler.
//
// Wraps a SOCKET together with a timeout value and a CBuffer used for
// receiving. The connect/send/receive logic is defined in the implementation
// file; only the declarations are visible here.
class CHttpCrawler::Connection
{
public:
    Connection(int timeout = 10);
    ~Connection();

    bool Connect(sockaddr* addr);

    // True when m_socket holds anything other than INVALID_SOCKET.
    bool IsConnection(void) { return INVALID_SOCKET != m_socket; }

    int Send(const char* buffer, int size);

    // ReadedBytes is an output reference; pToken is optional and defaults to
    // NULL. Their exact meaning is defined by the implementation.
    int Recv(char* buffer, int size, int& ReadedBytes, LPCSTR pToken = NULL);

private:
    int     m_timeout;       // ms
    SOCKET  m_socket;
    CBuffer m_recv_buffer;

    // Not copyable: copy operations are declared private.
    Connection(const Connection&);
    Connection& operator=(const Connection&);
};
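// ---------------------------------------------------------------------------
// Code-viewer page residue (not part of the header) - keyboard shortcuts:
//   Copy code             Ctrl + C
//   Search code           Ctrl + F
//   Full-screen mode      F11
//   Switch theme          Ctrl + Shift + D
//   Show shortcuts        ?
//   Increase font size    Ctrl + =
//   Decrease font size    Ctrl + -
// ---------------------------------------------------------------------------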