⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 httpcrawler.h

📁 概述:数据的纵向收集
💻 H
字号:
#pragma once
#include <winsock2.h>
#include <map>
#include <list>
#include <atlstr.h>
#include "Buffer.h"

using std::map;
using std::list;
using std::pair;
using std::make_pair;

class CHttpCrawlerNotify;

class CHttpCrawler
{
    class Connection;
public:
    class Task;

public:
    explicit CHttpCrawler(UINT thread_num = 5, UINT task_max = 0xFFFFFFFF, UINT timeout = 10);
    ~CHttpCrawler(void);

    void Start(void);
    void Stop(void);
    bool Stoped(void) { return m_bStop; }
    bool Wait(DWORD dwTimeout);
    void SetUserAgent(const char* szAgent);
    void SetAcceptType(const char* szAccept);
    void SetNotify(CHttpCrawlerNotify* pNotify);
    void SetTimeOut(int timeout){m_timeout=timeout;}
    void EnabledGzip(bool bEnabled = true);

    bool AddTask(const char* szURL, const char* szHeadEx);
    long GetTaskCount(void) { return InterlockedExchange(&m_task_num, m_task_num); }
    bool HandleTask(CHttpCrawler::Task* pTask);
    void FreeConnectAll();

protected:
    bool   ProcessTask(CURL* curl, CHttpCrawler::Task* pTask);
    bool   ParseHead(const char* buffer, int size, map<CString, CString>& head_m,int* pRetCode);
private:
    static int  libcurl_body_callback(BYTE *pData, int nSize, int nMemb,void *pArg);
    static int  libcurl_head_callback(BYTE *pData, int nSize, int nMemb,void *pArg);

private:
    bool m_bStop;
    UINT  m_timeout;      // min
    UINT  m_task_max;     // max task number
    UINT  m_thread_num;   // working thread number

    long volatile m_task_num;   // current task number
    HANDLE  m_iocp;     // task queue
    HANDLE* m_hThreads; // thread handle

    bool    m_bGzip;         // allow gzip encode
    CString m_szAcceptType;  // http accept
    CString m_szUserAgent;   // http user agent

    CHttpCrawlerNotify* m_notify;   // complated notify object pointer

    map<DWORD, CURL*>   m_curl_connect;

private:
    static DWORD CALLBACK WorkerThreadProc(CHttpCrawler* pThis);

private:
    // don't cpy object
    CHttpCrawler(const CHttpCrawler& cpy);
    CHttpCrawler& operator=(const CHttpCrawler& cpy);
};


// Completion notification interface for CHttpCrawler.
// Derive from this class, override OnCompleted, and pass the object to
// CHttpCrawler::SetNotify.
class CHttpCrawlerNotify
{
    friend class CHttpCrawler;

public:
    // Virtual so that an implementation can be destroyed safely through a
    // CHttpCrawlerNotify pointer; without it such a delete is undefined
    // behaviour.
    virtual ~CHttpCrawlerNotify() {}

private:
    // Completion hook for a task. Kept private: derived classes override it,
    // and only the befriended CHttpCrawler can call it.
    virtual void OnCompleted(CHttpCrawler::Task* pTask) = 0;
};

// One crawl task: the request description (URL, method, extra headers,
// cookie, referer, post data) together with the response (status code,
// parsed headers, raw header text, body).
//
// The const char* getters return pointers into this object's CString
// members; they stay valid only until that member is modified or the Task is
// destroyed.
class CHttpCrawler::Task
{
    friend class CHttpCrawler;

public:
    explicit Task(const char* szURL, const char* szHeadEx);
    ~Task();

    const char* GetURL(void) const { return m_rawURL; }

    void EnabledPostMethod(bool enabled = true);
    // NOTE(review): only a data pointer member (m_pPostData) is visible in
    // this header; how `size` is kept and who owns the buffer must be checked
    // in the implementation.
    void SetPostData(BYTE* pPostData, size_t size);

    const char* GetHost(void) const   { return m_szHost; }
    const char* GetPath(void) const   { return m_szPath; }
    const char* GetMethod(void) const { return m_szMethod; }

    const char* GetResponse(void) const    { return (LPCSTR)m_szResponseBody; }
    int         GetResponseLen(void) const { return m_szResponseBody.GetLength(); }
    const char* GetHeader(void) const      { return (LPCSTR)m_httpHeader; }
    int         GetHeaderLen(void) const   { return m_httpHeader.GetLength(); }

    // Returns the value stored under the "content-type" key of the parsed
    // headers, or "" when there is no such key.
    // Uses find() rather than operator[]: operator[] would insert an empty
    // "content-type" entry into m_head_m whenever the header is missing, so a
    // plain getter would change what GetHeaderArray() later copies out.
    const char* GetContentType(void) const
    {
        map<CString, CString>::const_iterator it = m_head_m.find("content-type");
        return (it != m_head_m.end()) ? (LPCSTR)it->second : "";
    }

    void  SetCookie(const char* szCookie);
    void  SetRefer(const char* szRefer);

    int GetResponseCookies(map<CString, CString>& cookies);

    // Copies the whole parsed header map into `array`.
    void GetHeaderArray(map<CString, CString>& array) const { array = m_head_m; }

    int GetRetCode() const { return m_nRetCode; }

#ifdef  _DEBUG
    // Tick counts, available in debug builds only.
    DWORD   m_dwStartTick;
    DWORD   m_dwStopTick;
#endif //_DEBUG

private:
    map<CString, CString> m_head_m;   // parsed response headers
    CString m_szResponseBody;         // response body

    CString m_rawURL;                 // URL as returned by GetURL()
    CString m_szHost;
    u_short m_nPort;
    CString m_szPath;

    CString m_szMethod;
    CString m_szHeadEx;
    CString m_szCookie;
    CString m_szRefer;
    CString m_httpHeader;             // header text as returned by GetHeader()
    LPBYTE  m_pPostData;
    int  m_nRetCode;
};

// Socket connection helper nested in CHttpCrawler.
// Holds one SOCKET together with a receive buffer and a timeout value.
class CHttpCrawler::Connection
{
public:
    Connection(int timeout = 10);
    ~Connection();

    bool Connect(sockaddr* addr);

    // True while m_socket holds something other than INVALID_SOCKET.
    bool IsConnection()
    {
        return INVALID_SOCKET != m_socket;
    }

    int Send(const char* buffer, int size);
    int Recv(char* buffer, int size, int& ReadedBytes, LPCSTR pToken = NULL);

private:
    int     m_timeout;       // ms
    SOCKET  m_socket;
    CBuffer m_recv_buffer;

    // Not copyable: the copy operations are private.
    Connection(const Connection&);
    Connection& operator=(const Connection&);
};

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -