⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 httpcrawler.cpp

📁 概述:数据的纵向收集
💻 CPP
字号:
#include "StdAfx.h"
#include <assert.h>
#include "httpcrawler.h"
#include "ZUtils.h"

#pragma comment(lib, "ws2_32.lib")


////////////////////////////////////////////////////////////////////////////////
//
// class CHttpCrawler::Task
//
////////////////////////////////////////////////////////////////////////////////


// Builds a task from an absolute URL of the form "<scheme>://<host>[:<port>][/<path>]".
//
// szURL    - URL to fetch; stored verbatim in m_rawURL and split into
//            m_szHost / m_nPort / m_szPath by the code below.
// szHeadEx - extra header text, stored verbatim in m_szHeadEx.
//
// If the URL does not match the expected shape, parsing stops early and the
// host/path stay empty with m_nPort == 0. The port defaults to 80 only for the
// "http" scheme (case-insensitive); any other scheme without an explicit port
// leaves m_nPort at 0. The method defaults to "GET".
CHttpCrawler::Task::Task(const char* szURL, const char* szHeadEx)
: m_pPostData(NULL), m_szHeadEx(szHeadEx), m_szMethod("GET"), m_nPort(0), m_rawURL(szURL),m_nRetCode(0)
{
    assert(szURL != NULL);
    assert(szHeadEx != NULL);

#if 1
    const CString url = szURL;

    // Scheme: everything before the first ':'. There must be at least one
    // scheme character and room for the two slashes after the colon.
    const int colonPos = url.Find(':', 0);
    if (colonPos <= 0 || colonPos > url.GetLength() - 3)
        return;

    const CString scheme = url.Mid(0, colonPos);

    // The colon must be followed by "//".
    if (url[colonPos + 1] != '/' || url[colonPos + 2] != '/')
        return;

    // Authority ("host[:port]") runs up to the next '/', or to the end of the URL.
    const int authorityPos = colonPos + 3;
    const int slashPos     = url.Find('/', authorityPos);

    CString authority;
    if (slashPos < 0)
    {
        authority = url.Mid(authorityPos);
        m_szPath  = "/";                    // no path given: request the root
    }
    else
    {
        authority = url.Mid(authorityPos, slashPos - authorityPos);
        m_szPath  = url.Mid(slashPos);      // path keeps its leading '/' and anything after it
    }

    // Split an optional ":port" suffix off the authority.
    const int portPos = authority.Find(':');
    if (portPos >= 0)
    {
        m_szHost = authority.Mid(0, portPos);
        m_nPort  = atoi(authority.Mid(portPos + 1));
    }
    else
    {
        m_szHost = authority;
        if (scheme.CompareNoCase("http") == 0)
        {
            m_nPort = 80;
        }
    }

#else

    // Alternative parser (compiled out).
    struct uri task_uri;

    if (::luri_parse_string(szURL, &task_uri) > 0)
    {
        if (task_uri.authority->host != 0)
            m_szHost = task_uri.authority->host;

        if (task_uri.path != 0)
            m_szPath = task_uri.path;

        if (task_uri.query != 0)
        {
            m_szPath += "?";
            m_szPath += task_uri.query;
        }

        if (task_uri.authority->port != 0)
            m_nPort = atoi(task_uri.authority->port);
        else if (0 == strcmpi(task_uri.scheme, "http"))
            m_nPort = 80;

        luri_destroy(&task_uri);
    }

#endif
}

// Releases the POST payload buffer allocated by SetPostData(), if any.
CHttpCrawler::Task::~Task()
{
    // delete[] on a null pointer is a no-op, so no explicit check is needed.
    delete[] m_pPostData;
}

// Selects the HTTP method string stored in m_szMethod:
// "POST" when enabled is true, "GET" otherwise.
void CHttpCrawler::Task::EnabledPostMethod(bool enabled/* = true*/)
{
    m_szMethod = enabled ? "POST" : "GET";
}

// Stores a private, NUL-terminated copy of the POST payload in m_pPostData.
//
// pPostData - bytes to copy; NULL is treated as an empty payload.
// size      - number of bytes to copy from pPostData.
//
// The new buffer is built completely before the previous one is released, so:
//   * pPostData may point at (or into) the currently stored buffer;
//   * if the allocation throws, the task keeps its previous payload instead of
//     holding a dangling pointer that the destructor would delete again.
void CHttpCrawler::Task::SetPostData(BYTE* pPostData, size_t size)
{
    const size_t nCopy = (pPostData != NULL) ? size : 0;

    BYTE* pCopy = new BYTE[nCopy + 1];
    if (nCopy > 0)
        memcpy(pCopy, pPostData, nCopy);
    pCopy[nCopy] = 0;           // keep the payload usable as a C string

    delete[] m_pPostData;       // safe on NULL
    m_pPostData = pCopy;
}

void  CHttpCrawler::Task::SetCookie(const char* szCookie)
{
    m_szCookie = szCookie;
}

void  CHttpCrawler::Task::SetRefer(const char* szRefer)
{
    m_szRefer = szRefer;
}

// Splits the stored "set-cookie" header value into name/value pairs.
//
// cookies - receives one entry per ';'-separated "name=value" segment; existing
//           entries with the same name are overwritten, others are left alone.
// Returns the total number of entries in `cookies` after the call.
//
// Segments without '=', with an empty name, or with nothing after '=' are
// skipped. Names and values are trimmed of surrounding whitespace.
//
// The '=' is located with Find() rather than Tokenize(): Tokenize() leaves its
// position argument one past the delimiter, so using that position + 1 as the
// start of the value dropped the first character of every value.
int CHttpCrawler::Task::GetResponseCookies(map<CString, CString>& cookies)
{
    // Look the header up without operator[] so that a missing header does not
    // insert an empty "set-cookie" entry into m_head_m.
    map<CString, CString>::const_iterator itHead = m_head_m.find(CString("set-cookie"));
    if (itHead != m_head_m.end())
    {
        const CString& szString = itHead->second;

        int curPos = 0;
        for (;;)
        {
            CString szTemp = szString.Tokenize(";", curPos);
            if (szTemp.IsEmpty())
                break;                          // no more segments

            const int eqPos = szTemp.Find('=');
            if (eqPos <= 0 || eqPos >= szTemp.GetLength() - 1)
                continue;                       // no '=', empty name, or empty value

            CString key = szTemp.Mid(0, eqPos);
            key.Trim();
            if (key.IsEmpty())
                continue;

            CString value = szTemp.Mid(eqPos + 1);
            cookies[key] = value.Trim();
        }
    }

    return (int)cookies.size();
}


////////////////////////////////////////////////////////////////////////////////
//
// class CHttpCrawler
//
////////////////////////////////////////////////////////////////////////////////

// Creates the crawler: an I/O completion port used as the task queue, libcurl
// global state, and thread_num worker threads running WorkerThreadProc.
//
// thread_num - number of worker threads (asserted to be in 1..1023).
// task_max   - stored in m_task_max; AddTask() compares the pending count against it.
// timeout    - stored in m_timeout; ProcessTask() passes it to libcurl as the
//              connect and transfer timeout.
//
// Workers are created suspended; they do not run until Start() resumes them.
CHttpCrawler::CHttpCrawler(UINT thread_num/* = 5*/, UINT task_max/* = 0xFFFFFFFF*/, UINT timeout/* = 10*/)
: m_thread_num(thread_num)
, m_bGzip(true)
, m_hThreads(NULL)
, m_notify(NULL)
, m_task_num(0)
, m_iocp(::CreateIoCompletionPort(INVALID_HANDLE_VALUE, 0, 0, 0))
, m_szUserAgent("Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1;  EmbeddedWB 14.52; .NET CLR 1.1.4322; .NET CLR 2.0.50727)")
, m_szAcceptType("*/*")
, m_task_max(task_max)
, m_timeout(timeout)
, m_bStop(false)
{
    assert(m_iocp != NULL);
    assert(m_thread_num > 0 && m_thread_num < 1024);

    ::curl_global_init(CURL_GLOBAL_WIN32);

    m_hThreads = new HANDLE[thread_num];

    UINT idx = 0;
    while (idx < m_thread_num)
    {
        // Each worker receives `this` as its thread argument.
        HANDLE hWorker = ::CreateThread(0, 0, (LPTHREAD_START_ROUTINE)WorkerThreadProc, this, CREATE_SUSPENDED, 0);
        m_hThreads[idx] = hWorker;
        assert(hWorker != NULL);
        ++idx;
    }
}

// Shuts the crawler down and releases every resource acquired by the constructor.
//
// The destructor blocks until every worker thread has exited. It waits on the
// thread handles directly instead of calling Wait(), because Wait() returns
// immediately once m_bStop is set, which Stop() has just done. Without a real
// wait the handles, the completion port and the object itself would be torn
// down while workers may still be using them.
//
// Must not be invoked from one of the worker threads (it would wait on itself).
CHttpCrawler::~CHttpCrawler(void)
{
    // Queue one quit packet per worker (no-op if Stop() was already called).
    Stop();

    // Workers are created suspended and Start() may never have been called (or
    // may have been refused after Stop()). Resume them so each one can dequeue
    // its quit packet and return; resuming an already running thread is harmless.
    for (UINT i = 0; i < m_thread_num; ++i)
    {
        ::ResumeThread(m_hThreads[i]);
    }

    // Wait per handle: WaitForMultipleObjects is limited to MAXIMUM_WAIT_OBJECTS
    // handles, while the constructor allows up to 1023 workers.
    for (UINT i = 0; i < m_thread_num; ++i)
    {
        ::WaitForSingleObject(m_hThreads[i], INFINITE);
        ::CloseHandle(m_hThreads[i]);
    }
    delete[] m_hThreads;

    ::CloseHandle(m_iocp);

    FreeConnectAll();

    ::curl_global_cleanup();
}

void CHttpCrawler::Start(void)
{
	if (m_bStop)
		return;

    for (UINT i = 0; i < m_thread_num; ++i)
    {
        ResumeThread(m_hThreads[i]);
    }
}

void CHttpCrawler::Stop(void)
{
	if (m_bStop)
		return;

    m_bStop = true;
    for (UINT i = 0; i < m_thread_num; ++i)
    {
        ::PostQueuedCompletionStatus(m_iocp, 0, 0, 0);
    }
}

// Waits for all worker threads to finish.
//
// dwTimeout - timeout in milliseconds passed to WaitForMultipleObjects.
// Returns false only when the wait timed out; any other wait result
// (including a failed wait) is reported as true.
//
// NOTE(review): once m_bStop is set this returns true immediately without
// waiting on the threads, so Stop() followed by Wait() does not actually block
// until the workers have exited - confirm whether that is intended.
bool CHttpCrawler::Wait(DWORD dwTimeout)
{
    if (m_bStop)
    {
        return true;
    }

    const DWORD dwResult = WaitForMultipleObjects(m_thread_num, m_hThreads, TRUE, dwTimeout);
    return dwResult != WAIT_TIMEOUT;
}

void CHttpCrawler::SetUserAgent(const char* szAgent)
{
    assert(szAgent != NULL);
    m_szUserAgent = szAgent;
}

void CHttpCrawler::SetAcceptType(const char* szAccept)
{
    assert(szAccept != NULL);
    m_szAcceptType = szAccept;
}

// Registers the observer whose OnCompleted() is called by ProcessTask() for
// each successfully fetched task. Passing NULL disables the notification.
void CHttpCrawler::SetNotify(CHttpCrawlerNotify* pNotify)
{
    this->m_notify = pNotify;
}

void CHttpCrawler::EnabledGzip(bool bEnabled/* = true*/)
{
    m_bGzip = bEnabled;
}

bool CHttpCrawler::AddTask(const char* szURL, const char* szHeadEx)
{
    if (m_bStop)
        return false;

    if ((UINT)InterlockedExchange(&m_task_num, m_task_num) < m_task_max)
    {
        InterlockedIncrement(&m_task_num);
        ::PostQueuedCompletionStatus(m_iocp, 0,  reinterpret_cast<ULONG_PTR>(new CHttpCrawler::Task(szURL, szHeadEx)), 0);
    }
    else
    {
        return false;
    }

    return true;
}

bool   CHttpCrawler::ParseHead(const char* buffer, int size, map<CString, CString>& head_m,int* pRetCode)
{
    assert(buffer != NULL);
    assert(size > 0);

    const char* temp = strstr(buffer, "\r\n\r\n");
    if (temp && temp-buffer < size)
    {
        temp = buffer;
        const char* splitc = strchr(temp, ' ');
        const char* splitl = NULL;
        if(splitc)
        {
            splitl = strchr(splitc+1, ' ');
            if(splitl)
            {
                CString strRetCode(splitc+1, splitl-splitc-1);
                *pRetCode = atoi(strRetCode);
            }
        }

        for(;;)
        {
            temp = strstr(temp, "\r\n");
            if (temp == NULL || (temp[2] == '\r' && temp[3] == '\n'))
                break;
            temp += 2;

            const char* splitc = strchr(temp, ':');
            const char* splitl = strchr(temp, '\r');
            if (splitc && splitl && splitc<splitl)
            {
                CString key(temp, splitc-temp);
                CString value(splitc+1, splitl-splitc-1);

                key.MakeLower();
                value.Trim();
                if (head_m.find(key) != head_m.end())
                {
                    head_m[key].Append(";");
                    head_m[key].Append(value);
                }
                else
                {
                    head_m[key] = value;
                }
            }
        }
    }
    else
    {
        return false;
    }

    return true;
}

// Worker thread entry point.
//
// pThis - the owning crawler (passed as the thread argument by the constructor).
//
// Each worker creates one libcurl easy handle, then repeatedly dequeues
// completion packets from the crawler's completion port. The completion key
// carries a Task pointer; a zero key tells the worker to exit. Every dequeued
// task is processed, deleted, and subtracted from the pending-task counter.
// Always returns 0.
DWORD CALLBACK CHttpCrawler::WorkerThreadProc(CHttpCrawler* pThis)
{
    CURL* const easy = ::curl_easy_init();

    for(;;)
    {
        DWORD        cbTransferred = 0;
        LPOVERLAPPED pOv           = NULL;
        ULONG_PTR    key           = 0;

        const BOOL dequeued = ::GetQueuedCompletionStatus(pThis->m_iocp, &cbTransferred, &key, &pOv, INFINITE);
        if (!dequeued)
        {
            // Dequeue failed: flag it in debug builds and leave the loop.
            assert(false);
            break;
        }

        if (key == 0)
            break; // quit packet: leave the loop and end the thread

        CHttpCrawler::Task* pJob = reinterpret_cast<CHttpCrawler::Task*>(key);

        pThis->ProcessTask(easy, pJob);

        delete pJob;
        ::InterlockedDecrement(&pThis->m_task_num);
    }

    ::curl_easy_cleanup(easy);

    return 0;
}

// libcurl header callback: appends the received header bytes to a CBuffer.
//
// pData        - received bytes.
// nSize, nMemb - element size and element count; nSize*nMemb bytes are valid.
// pArg         - the CBuffer* registered via CURLOPT_HEADERDATA.
// Returns the number of bytes written to the buffer, or 0 when either pointer
// is NULL.
int CHttpCrawler::libcurl_head_callback(BYTE *pData, int nSize, int nMemb,void *pArg)
{
    CBuffer* pSink = (CBuffer*) pArg;
    if (!pSink || !pData)
        return 0;

    const int nBytes = nSize*nMemb;
    pSink->Write(pData, nBytes);
    return nBytes;
}

// libcurl write callback: appends the received body bytes to a CBuffer.
//
// pData        - received bytes.
// nSize, nMemb - element size and element count; nSize*nMemb bytes are valid.
// pArg         - the CBuffer* registered via CURLOPT_WRITEDATA.
// Returns the number of bytes written to the buffer, or 0 when either pointer
// is NULL.
int CHttpCrawler::libcurl_body_callback(BYTE *pData, int nSize, int nMemb,void *pArg)
{
    CBuffer* pSink = (CBuffer*) pArg;
    if (!pSink || !pData)
        return 0;

    const int nBytes = nSize*nMemb;
    pSink->Write(pData, nBytes);
    return nBytes;
}

// Processes a task on the calling thread, using one libcurl easy handle per
// thread id. The handle is created on first use and remembered in
// m_curl_connect; FreeConnectAll() releases the remembered handles.
//
// Returns the result of ProcessTask().
//
// NOTE(review): m_curl_connect is accessed here without any locking visible in
// this file - confirm callers serialise access if several threads use this.
bool CHttpCrawler::HandleTask(CHttpCrawler::Task* pTask)
{
    const DWORD tid = ::GetCurrentThreadId();

    CURL* handle = NULL;
    map<DWORD, CURL*>::iterator found = m_curl_connect.find(tid);
    if (found != m_curl_connect.end())
    {
        handle = found->second;
    }
    else
    {
        handle = ::curl_easy_init();
        m_curl_connect[tid] = handle;
    }

    return ProcessTask(handle, pTask);
}

// Performs the HTTP request described by pTask on the given libcurl handle.
//
// curl  - easy handle owned by the caller; it is reset and reconfigured for
//         every attempt.
// pTask - task to execute. On success its m_head_m, m_nRetCode, m_httpHeader
//         and m_szResponseBody are filled in.
//
// Returns true only when the transfer succeeded, the header block could be
// parsed and the status code is 200; in that case m_notify->OnCompleted() is
// invoked (if a notify object is set) before returning.
//
// The transfer is attempted up to three times, but only libcurl-level failures
// are retried: once curl_easy_perform() returns CURLE_OK the loop ends,
// whatever the HTTP status was.
bool CHttpCrawler::ProcessTask(CURL* curl, CHttpCrawler::Task* pTask)
{
    assert(curl != NULL);
    assert(pTask != NULL);

    // Release builds compile the asserts out; curl_easy_init() can return NULL.
    if (curl == NULL || pTask == NULL)
        return false;

    bool bRet = false;

#ifdef  _DEBUG
    pTask->m_dwStartTick = ::GetTickCount();
#endif

    int retry = 0;
    do {
        CBuffer body,head;

        CURLcode    res   = CURLE_OK;
        curl_slist *slist = NULL;

        // reset curl handle
        curl_easy_reset(curl);

        // curl_easy_setopt() is a C varargs function: string options must be
        // passed as plain char pointers and numeric options as long, so every
        // string object is cast explicitly.
        curl_easy_setopt(curl, CURLOPT_REFERER, (LPCSTR)pTask->m_szRefer);

        curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, libcurl_body_callback);
        curl_easy_setopt(curl, CURLOPT_WRITEDATA, &body);
        curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, libcurl_head_callback);
        curl_easy_setopt(curl, CURLOPT_HEADERDATA, &head);
        curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, (long)m_timeout);
        curl_easy_setopt(curl, CURLOPT_TIMEOUT, (long)m_timeout);
        //curl_easy_setopt(curl, CURLOPT_COOKIE, pTask->getc);

        //header
        curl_easy_setopt(curl, CURLOPT_URL, (LPCSTR)pTask->m_rawURL);
        curl_easy_setopt(curl, CURLOPT_USERAGENT, (LPCSTR)m_szUserAgent);

        // Only advertise compressed transfer when it is enabled (EnabledGzip()).
        if (m_bGzip)
            curl_easy_setopt(curl, CURLOPT_ENCODING, "gzip, deflate");

        slist = curl_slist_append(slist, "Connection: Keep-Alive");
        slist = curl_slist_append(slist, "Accept-Language: zh-cn");
        curl_easy_setopt(curl, CURLOPT_HTTPHEADER, slist);

        res = curl_easy_perform(curl);
        curl_slist_free_all(slist);

#ifdef _DEBUG
        pTask->m_dwStopTick = ::GetTickCount();
#endif

        if(CURLE_OK == res)
        {
            if (ParseHead((char*)head.GetBuffer(), head.GetBufferLen(), pTask->m_head_m, &pTask->m_nRetCode) &&
                pTask->m_nRetCode == 200)
            {
                pTask->m_httpHeader.Append((char*)head.GetBuffer(), head.GetBufferLen());

                if(pTask->m_head_m["content-encoding"].Find("gzip") >= 0 )
                {
                    // un-gzip based on zlib (deflate algorithm)
                    // NOTE(review): the output buffer is a fixed 25x the input
                    // size; if decompression fails or does not fit, the body is
                    // left unset while the task is still reported as completed.
                    size_t dwIn  = body.GetBufferLen();
                    size_t dwOut = dwIn*25;

                    if (dwIn > 0)
                    {
                        char* pBuffer = new char[dwOut];
                        UnZFilter ungzip;

                        if( ungzip( body.GetBuffer(), dwIn, (unsigned char*)pBuffer, dwOut ) )
                        {
                            // Strictly smaller, so there is room for the terminator.
                            if( dwOut < dwIn*25 )
                            {
                                pBuffer[dwOut]=0;
                                pTask->m_szResponseBody = pBuffer;
                            }
                        }
                        delete[] pBuffer;
                    }
                }
                else
                {
                    pTask->m_szResponseBody.Empty();
                    pTask->m_szResponseBody.Append((char*)body.GetBuffer(),body.GetBufferLen());
                }

                if (m_notify)
                    m_notify->OnCompleted(pTask);

                bRet = true;
            }

            break; // done: the transfer itself succeeded, do not retry
        }

    }while((++retry) < 3);

    return bRet;
}

void CHttpCrawler::FreeConnectAll()
{
    map<DWORD, CURL*>::iterator itr = m_curl_connect.begin();
    while(itr != m_curl_connect.end())
    {
        ::curl_easy_cleanup(itr->second);
        itr++;
    }
    m_curl_connect.clear();
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -