⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 html.cpp

📁 利用IE接口分析HTML文件
💻 CPP
字号:
/*
    Implement an HTML parser using IE4's IHTMLDocument2 interface.
*/


#include <windows.h>
#include <comdef.h>
#include <io.h>
#include "html.h"

#include <iostream>
using namespace std;


/*
	Factory function: every HTMLParser is allocated on the heap here.
	The constructor sets the reference count to 1, and Release() deletes
	the object once the count drops to zero.
*/
HTMLParser *HTMLParser::Create()
{
	HTMLParser *pParser = new HTMLParser;
	return pParser;
}

// constructor/destructor

/*
	Constructor: creates the MSHTML document object, installs this object as
	its client site and connects it as an IPropertyNotifySink.  Setup stops
	at the first failing step; the result of the final Advise call is kept
	in m_hrConnected.
*/
HTMLParser::HTMLParser()
{
	LPCONNECTIONPOINTCONTAINER pContainer = NULL;

	// initialize all the class member variables
	m_dwRef = 1;	// must start at 1 for the current instance
	m_hrConnected = S_FALSE;
	m_dwCookie = 0;
	m_pMSHTML = NULL;
	m_pCP = NULL;
	m_pAnchorLinks = NULL;
	m_pImageLinks = NULL;

	// single-pass block: "break" abandons the remaining setup steps
	do
	{
		// Create an instance of a dynamic HTML document
		if (FAILED(CoCreateInstance( CLSID_HTMLDocument, NULL, CLSCTX_INPROC_SERVER,
									 IID_IHTMLDocument2, (LPVOID*)&m_pMSHTML )))
			break;

		// Install this object as the document's client site
		// (the result of SetClientSite itself is not checked)
		LPOLEOBJECT pSiteTarget = NULL;
		if (FAILED(m_pMSHTML->QueryInterface(IID_IOleObject, (LPVOID*)&pSiteTarget)))
			break;
		pSiteTarget->SetClientSite((IOleClientSite*)this);
		pSiteTarget->Release();

		// Signal that the DLCONTROL ambient property changed; the value
		// itself is supplied by Invoke() (result not checked)
		LPOLECONTROL pControl = NULL;
		if (FAILED(m_pMSHTML->QueryInterface(IID_IOleControl, (LPVOID*)&pControl)))
			break;
		pControl->OnAmbientPropertyChange(DISPID_AMBIENT_DLCONTROL);
		pControl->Release();

		// Hook up sink to catch ready state property change
		if (FAILED(m_pMSHTML->QueryInterface(IID_IConnectionPointContainer, (LPVOID*)&pContainer)))
			break;

		if (FAILED(pContainer->FindConnectionPoint(IID_IPropertyNotifySink, &m_pCP)))
			break;

		m_hrConnected = m_pCP->Advise((LPUNKNOWN)(IPropertyNotifySink*)this, &m_dwCookie);
	} while (0);

	// the container is only needed to look up the connection point
	if (pContainer)
		pContainer->Release();
}

/*
	Destructor: releases the cached link/image collections, disconnects the
	property-notify sink, then releases the connection point and the
	document object.
*/
HTMLParser::~HTMLParser()
{
	if ( m_pAnchorLinks )
		m_pAnchorLinks->Release();

	if ( m_pImageLinks )
		m_pImageLinks->Release();

	if ( m_pCP )
	{
		// m_hrConnected is initialised to S_FALSE, which SUCCEEDED() treats
		// as success.  Testing it only when a connection point exists keeps
		// a partially constructed object from calling Unadvise through a
		// NULL m_pCP.
		if (SUCCEEDED(m_hrConnected))
			m_pCP->Unadvise(m_dwCookie);

		m_pCP->Release();
	}

	if ( m_pMSHTML )
		m_pMSHTML->Release();
}



/*
	IUnknown::QueryInterface.

	Supported interfaces: IUnknown, IPropertyNotifySink, IOleClientSite and
	IDispatch.

	Returns NOERROR with an AddRef'ed pointer in *ppv on success,
	E_POINTER when ppv is NULL, and E_NOINTERFACE (with *ppv set to NULL)
	for any other interface id.
*/
STDMETHODIMP HTMLParser::QueryInterface(REFIID riid, LPVOID* ppv)
{
	if (!ppv)
		return E_POINTER;

	*ppv = NULL;

	if (IID_IUnknown == riid || IID_IPropertyNotifySink == riid)
	{
		// IUnknown is always handed out through the IPropertyNotifySink
		// base so the same pointer is returned for it every time
		*ppv = (LPUNKNOWN)(IPropertyNotifySink*)this;
	}
	else if (IID_IOleClientSite == riid)
	{
		*ppv = (IOleClientSite*)this;
	}
	else if (IID_IDispatch == riid)
	{
		*ppv = (IDispatch*)this;
	}
	else
	{
		// the COM contract requires E_NOINTERFACE (not E_NOTIMPL) here
		return E_NOINTERFACE;
	}

	AddRef();
	return NOERROR;
}

/*
	IUnknown::AddRef: bumps the reference count and returns the new value.
*/
STDMETHODIMP_(ULONG) HTMLParser::AddRef()
{
	m_dwRef += 1;
	return m_dwRef;
}

/*
	IUnknown::Release: drops one reference and returns the remaining count.
	The object deletes itself when the count reaches zero.
*/
STDMETHODIMP_(ULONG) HTMLParser::Release()
{
	const ULONG ulRemaining = --m_dwRef;

	// last reference gone: the object must not be touched after this point
	if (0 == ulRemaining)
		delete this;

	return ulRemaining;
}

/*
	IPropertyNotifySink::OnChanged.

	Only DISPID_READYSTATE notifications are acted on: the document's ready
	state is read back through IDispatch::Invoke and, once it is
	READYSTATE_COMPLETE, WM_USER_LOAD_COMPLETE is posted to the current
	thread's message queue (LoadHTMLFile waits for that message).

	Always returns NOERROR.
*/
STDMETHODIMP HTMLParser::OnChanged(DISPID dispID)
{
	// every other property change is ignored
	if (DISPID_READYSTATE != dispID)
		return NOERROR;

	VARIANT varState = {0};
	EXCEPINFO excepInfo;
	UINT uArgErr;
	DISPPARAMS dpNoArgs = {NULL, NULL, 0, 0};

	// property-get of the ready state; nothing to do if it cannot be read
	if (FAILED(m_pMSHTML->Invoke(DISPID_READYSTATE, IID_NULL, LOCALE_SYSTEM_DEFAULT,
								 DISPATCH_PROPERTYGET, &dpNoArgs, &varState, &excepInfo, &uArgErr)))
		return NOERROR;

	const long lState = (READYSTATE)V_I4(&varState);

	// UNINITIALIZED / LOADING / LOADED / INTERACTIVE need no action
	if (READYSTATE_COMPLETE == lState)
	{
		// IE4 is finished parsing the file
		PostThreadMessage(GetCurrentThreadId(),
						  WM_USER_LOAD_COMPLETE,
						  (WPARAM)0,
						  (LPARAM)0);
	}

	VariantClear(&varState);
	return NOERROR;
}

/*
	IDispatch::Invoke.

	Answers only the DISPID_AMBIENT_DLCONTROL ambient property, whose value
	is written to *pVarResult as a VT_I4 set of DLCTL_* flags.

	Returns E_POINTER when pVarResult is NULL (checked before the dispid),
	DISP_E_MEMBERNOTFOUND for any other dispid, NOERROR otherwise.
	The remaining parameters are not used.
*/
STDMETHODIMP HTMLParser::Invoke(DISPID dispIdMember,
            REFIID riid,
            LCID lcid,
            WORD wFlags,
            DISPPARAMS __RPC_FAR *pDispParams,
            VARIANT __RPC_FAR *pVarResult,
            EXCEPINFO __RPC_FAR *pExcepInfo,
            UINT __RPC_FAR *puArgErr)
{
	if (NULL == pVarResult)
		return E_POINTER;

	if (DISPID_AMBIENT_DLCONTROL != dispIdMember)
		return DISP_E_MEMBERNOTFOUND;

	// This tells IE4 that we want to download the page,
	// but we don't want to run scripts, Java applets, or
	// ActiveX controls
	V_VT(pVarResult) = VT_I4;
	V_I4(pVarResult) = DLCTL_DOWNLOADONLY
					 | DLCTL_NO_SCRIPTS
					 | DLCTL_NO_JAVA
					 | DLCTL_NO_DLACTIVEXCTLS
					 | DLCTL_NO_RUNACTIVEXCTLS;

	return NOERROR;
}




BOOL HTMLParser::LoadHTMLFile(LPCSTR pcszFile)
{
    HRESULT        hr;
	LPPERSISTFILE  pPF;
	IHTMLElementCollection* pColl = NULL;
    MSG msg;


	if ( !IsConnected() )
		return FALSE;

    // kill any previous links
    if ( m_pAnchorLinks )
    {
        m_pAnchorLinks->Release();
        m_pAnchorLinks = NULL;
    }

    if ( m_pImageLinks )
    {
        m_pImageLinks->Release();
        m_pImageLinks = NULL;
    }

	// avoid IE error msg box if the file does not exist
    if ( access(pcszFile, 0x00) != 0x00 )
    {
        return FALSE;
    }

    _bstr_t bstrFile(pcszFile);


	// use IPersistFile to load the HTML
    if ( SUCCEEDED(hr = m_pMSHTML->QueryInterface(IID_IPersistFile, (LPVOID*) &pPF)))
	{
		hr = pPF->Load((LPCWSTR)bstrFile, 0);
		pPF->Release();
	}

    BOOL bOK = FALSE;

    if (SUCCEEDED(hr))
    {
		while (GetMessage(&msg, NULL, 0, 0))
		{
			// notification from OnChanged
			if (WM_USER_LOAD_COMPLETE == msg.message && NULL == msg.hwnd)
			{
                bOK = TRUE;
                break;
			}
			else
			{
				DispatchMessage(&msg);
			}
		}
    }


    if ( bOK )
    {
		try
		{
			if ( FAILED(m_pMSHTML->get_links(&m_pAnchorLinks)) ||
				 FAILED(m_pMSHTML->get_images(&m_pImageLinks)) ) 
			{
				throw exception();
			}
		} 
		catch ( exception e )
		{
			if ( m_pAnchorLinks )
			{
				m_pAnchorLinks->Release();
				m_pAnchorLinks = NULL;
			}

			if ( m_pImageLinks )
			{
				m_pImageLinks->Release();
				m_pImageLinks = NULL;
			}

			bOK = FALSE;
		}
    }

	return bOK;
}

/*
	Number of links in the currently loaded HTML file.
	Returns 0 when no links collection is held.
*/
long HTMLParser::GetLinkCount()
{
	if ( !m_pAnchorLinks )
		return 0;

	long nLinks = 0;
	m_pAnchorLinks->get_length(&nLinks);	// result code is not checked
	return nLinks;
}


/*
	Number of images in the currently loaded HTML file.
	Returns 0 when no images collection is held.
*/
long HTMLParser::GetImageCount()
{
	if ( !m_pImageLinks )
		return 0;

	long nImages = 0;
	m_pImageLinks->get_length(&nImages);	// result code is not checked
	return nImages;
}


/*
	Fetch the URL of the link at position lIndex into rstrURL.
	Returns FALSE when the parser is not connected or no links collection
	is held; otherwise returns the result of GetURLFromCollection.
*/
BOOL HTMLParser::GetLinkURL(long lIndex, string &rstrURL)
{
	if ( !IsConnected() || !m_pAnchorLinks )
		return FALSE;

	return GetURLFromCollection(m_pAnchorLinks, IID_IHTMLAnchorElement, lIndex, rstrURL);
}

/*
	Fetch the URL of the image at position lIndex into rstrURL.
	Returns FALSE when the parser is not connected or no images collection
	is held; otherwise returns the result of GetURLFromCollection.
*/
BOOL HTMLParser::GetImageURL(long lIndex, string &rstrURL)
{
	if ( !IsConnected() || !m_pImageLinks )
		return FALSE;

	return GetURLFromCollection(m_pImageLinks, IID_IHTMLImgElement, lIndex, rstrURL);
}

/*
	Get the URL associated with an element in a collection.  The element must
	be an image or an anchor.

	pCollection - collection to read from (callers pass a non-NULL pointer).
	rIID        - IID_IHTMLImgElement or IID_IHTMLAnchorElement; any other
	              value yields FALSE.
	lIndex      - position of the element in the collection.
	rstrURL     - receives the element's href, converted to a narrow string;
	              left untouched when FALSE is returned.

	Returns TRUE when the element was found, supports the requested
	interface and produced a non-NULL href.
*/
BOOL HTMLParser::GetURLFromCollection(IHTMLElementCollection *pCollection, REFIID rIID, long lIndex, string &rstrURL)
{
	VARIANT     varIndex;
	VARIANT     varUnused;
	IDispatch*  pDisp = NULL;
	BOOL        bFound = FALSE;

	// The index is a signed 32-bit integer, so tag it VT_I4 to match the
	// lVal member that carries it (the original tagged it VT_UINT).
	VariantInit( &varIndex );
	V_VT(&varIndex) = VT_I4;
	V_I4(&varIndex) = lIndex;

	VariantInit( &varUnused );

	HRESULT hr = pCollection->raw_item( varIndex, varUnused, &pDisp );

	if ( SUCCEEDED(hr) && pDisp )
	{
		BSTR bstr = NULL;

		if ( rIID == IID_IHTMLImgElement )
		{
			IHTMLImgElement* pImgElem = NULL;
			if ( SUCCEEDED(pDisp->QueryInterface(rIID, (void **)&pImgElem)) )
			{
				pImgElem->get_href(&bstr);
				pImgElem->Release();
			}
		}
		else if ( rIID == IID_IHTMLAnchorElement )
		{
			IHTMLAnchorElement* pAnchorElem = NULL;
			if ( SUCCEEDED(pDisp->QueryInterface(rIID, (void **)&pAnchorElem)) )
			{
				pAnchorElem->get_href(&bstr);
				pAnchorElem->Release();
			}
		}

		pDisp->Release();

		if ( bstr )
		{
			// _bstr_t wrapper will delete since fCopy is FALSE
			_bstr_t bstrHREF(bstr, FALSE);

			// never hand a NULL pointer to std::string
			LPCSTR pszHREF = (LPCSTR)bstrHREF;
			rstrURL = pszHREF ? pszHREF : "";
			bFound = TRUE;
		}
	}

	return bFound;
}




⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -