📄 html.cpp
字号:
/*
Implement an HTML parser using IE4's IHTMLDocument2 interface.
*/
#include <windows.h>
#include <comdef.h>
#include <io.h>
#include "html.h"
#include <iostream>
using namespace std;
/*
Factory for HTMLParser objects. Instances are handed out from the heap
because Release() disposes of the object with "delete this".
Returns the newly allocated parser.
*/
HTMLParser *HTMLParser::Create()
{
    HTMLParser *pParser = new HTMLParser;
    return pParser;
}
// constructor/destructor
/*
Builds the parser: creates the HTML document object, installs this object as
its client site, tells it the download-control ambient property changed, and
subscribes to its IPropertyNotifySink notifications.

No error is reported to the caller. Construction stops at the first failing
step and the remaining members keep the initial values assigned below.
*/
HTMLParser::HTMLParser()
{
    // Put every member into a known state before the first COM call.
    m_dwRef = 1;               // the creator owns the first reference
    m_dwCookie = 0;
    m_pMSHTML = NULL;
    m_pCP = NULL;
    m_pAnchorLinks = NULL;
    m_pImageLinks = NULL;
    // NOTE(review): S_FALSE satisfies SUCCEEDED(), so code that tests
    // m_hrConnected that way cannot distinguish "never advised" from
    // "advised" - confirm this initial value is intended.
    m_hrConnected = S_FALSE;

    // Create an instance of a dynamic HTML document
    HRESULT hrStep = CoCreateInstance( CLSID_HTMLDocument, NULL, CLSCTX_INPROC_SERVER,
                                       IID_IHTMLDocument2, (LPVOID*)&m_pMSHTML );
    if (FAILED(hrStep))
        return;

    // Register this object as the document's client site.
    LPOLEOBJECT pDocObject = NULL;
    hrStep = m_pMSHTML->QueryInterface(IID_IOleObject, (LPVOID*)&pDocObject);
    if (FAILED(hrStep))
        return;
    hrStep = pDocObject->SetClientSite((IOleClientSite*)this);
    pDocObject->Release();

    // Notify the document that the download-control ambient property changed.
    LPOLECONTROL pDocControl = NULL;
    hrStep = m_pMSHTML->QueryInterface(IID_IOleControl, (LPVOID*)&pDocControl);
    if (FAILED(hrStep))
        return;
    hrStep = pDocControl->OnAmbientPropertyChange(DISPID_AMBIENT_DLCONTROL);
    pDocControl->Release();

    // Hook up sink to catch ready state property change
    LPCONNECTIONPOINTCONTAINER pContainer = NULL;
    hrStep = m_pMSHTML->QueryInterface(IID_IConnectionPointContainer, (LPVOID*)&pContainer);
    if (SUCCEEDED(hrStep))
    {
        hrStep = pContainer->FindConnectionPoint(IID_IPropertyNotifySink, &m_pCP);
        if (SUCCEEDED(hrStep))
        {
            m_hrConnected = m_pCP->Advise((LPUNKNOWN)(IPropertyNotifySink*)this, &m_dwCookie);
        }
    }

    // The container is only needed to locate the connection point.
    if (pContainer)
        pContainer->Release();
}
/*
Releases every interface the parser still holds: the cached link and image
collections, the property-notification connection (after unadvising the
sink), and finally the HTML document itself.
*/
HTMLParser::~HTMLParser()
{
    if ( m_pAnchorLinks )
        m_pAnchorLinks->Release();
    if ( m_pImageLinks )
        m_pImageLinks->Release();
    if ( m_pCP )
    {
        // The constructor initialises m_hrConnected to S_FALSE, which
        // SUCCEEDED() accepts, so the HRESULT alone does not prove that a
        // connection point exists. Only touch m_pCP when it is non-NULL;
        // otherwise a parser whose construction failed early would call
        // Unadvise() through a NULL pointer.
        if (SUCCEEDED(m_hrConnected))
            m_pCP->Unadvise(m_dwCookie);
        m_pCP->Release();
    }
    if ( m_pMSHTML )
        m_pMSHTML->Release();
}
/*
IUnknown::QueryInterface.

Supported interfaces: IUnknown, IPropertyNotifySink, IOleClientSite and
IDispatch. On success *ppv receives the interface pointer and the reference
count is incremented.

Returns NOERROR on success, E_POINTER if ppv is NULL, and E_NOINTERFACE
(with *ppv set to NULL) for any other interface ID.
*/
STDMETHODIMP HTMLParser::QueryInterface(REFIID riid, LPVOID* ppv)
{
    if (NULL == ppv)
        return E_POINTER;

    *ppv = NULL;

    if (IID_IUnknown == riid || IID_IPropertyNotifySink == riid)
    {
        // IUnknown is always answered through the IPropertyNotifySink base
        // so that every IUnknown request yields the same pointer value.
        *ppv = (LPUNKNOWN)(IPropertyNotifySink*)this;
    }
    else if (IID_IOleClientSite == riid)
    {
        *ppv = (IOleClientSite*)this;
    }
    else if (IID_IDispatch == riid)
    {
        *ppv = (IDispatch*)this;
    }
    else
    {
        // E_NOINTERFACE is the result the COM contract defines for an
        // unsupported interface; E_NOTIMPL is not a valid answer here.
        return E_NOINTERFACE;
    }

    AddRef();
    return NOERROR;
}
/*
IUnknown::AddRef. Increments the reference count and returns the new value.
*/
STDMETHODIMP_(ULONG) HTMLParser::AddRef()
{
    m_dwRef += 1;
    return m_dwRef;
}
/*
IUnknown::Release. Decrements the reference count and returns the new value.
When the count reaches zero the object deletes itself and 0 is returned.
*/
STDMETHODIMP_(ULONG) HTMLParser::Release()
{
    --m_dwRef;
    if (m_dwRef != 0)
        return m_dwRef;

    // Last reference gone: no member may be touched after this point.
    delete this;
    return 0;
}
/*
Property-change notification from the HTML document.

Only DISPID_READYSTATE is examined. When the document reports
READYSTATE_COMPLETE, a WM_USER_LOAD_COMPLETE thread message is posted to the
calling thread so the wait loop in LoadHTMLFile can stop.

Always returns NOERROR.
*/
STDMETHODIMP HTMLParser::OnChanged(DISPID dispID)
{
    // Every other property change is ignored.
    if (DISPID_READYSTATE != dispID)
        return NOERROR;

    VARIANT vState = {0};
    EXCEPINFO excepInfo;
    UINT uBadArg;
    DISPPARAMS noArgs = {NULL, NULL, 0, 0};

    // Read the current ready state back from the document.
    const HRESULT hrGet = m_pMSHTML->Invoke(DISPID_READYSTATE, IID_NULL, LOCALE_SYSTEM_DEFAULT,
                                            DISPATCH_PROPERTYGET, &noArgs, &vState, &excepInfo, &uBadArg);
    if (FAILED(hrGet))
        return NOERROR;

    const long lState = (READYSTATE)V_I4(&vState);
    if (READYSTATE_COMPLETE == lState)
    {
        // IE4 is finished parsing the file
        PostThreadMessage(GetCurrentThreadId(),
                          WM_USER_LOAD_COMPLETE,
                          (WPARAM)0,
                          (LPARAM)0);
    }
    // UNINITIALIZED, LOADING, LOADED and INTERACTIVE need no action.

    VariantClear(&vState);
    return NOERROR;
}
/*
IDispatch::Invoke. Only DISPID_AMBIENT_DLCONTROL is handled; the remaining
parameters are not used.

Returns E_POINTER if pVarResult is NULL (checked before the DISPID),
DISP_E_MEMBERNOTFOUND for any other DISPID, and NOERROR once the
download-control flags have been stored in *pVarResult as a VT_I4.
*/
STDMETHODIMP HTMLParser::Invoke(DISPID dispIdMember,
                                REFIID riid,
                                LCID lcid,
                                WORD wFlags,
                                DISPPARAMS __RPC_FAR *pDispParams,
                                VARIANT __RPC_FAR *pVarResult,
                                EXCEPINFO __RPC_FAR *pExcepInfo,
                                UINT __RPC_FAR *puArgErr)
{
    if (NULL == pVarResult)
        return E_POINTER;

    if (DISPID_AMBIENT_DLCONTROL != dispIdMember)
        return DISP_E_MEMBERNOTFOUND;

    // This tells IE4 that we want to download the page,
    // but we don't want to run scripts, Java applets, or
    // ActiveX controls
    V_VT(pVarResult) = VT_I4;
    V_I4(pVarResult) = DLCTL_DOWNLOADONLY
                     | DLCTL_NO_SCRIPTS
                     | DLCTL_NO_JAVA
                     | DLCTL_NO_DLACTIVEXCTLS
                     | DLCTL_NO_RUNACTIVEXCTLS;
    return NOERROR;
}
/*
Load an HTML file from disk and cache its link and image collections.

pcszFile - path of the file to load; NULL or a non-existent path fails.

The call blocks in a message loop on the current thread until OnChanged
posts WM_USER_LOAD_COMPLETE. All other messages retrieved meanwhile are
dispatched. If WM_QUIT is retrieved the loop ends and the load fails; the
WM_QUIT message is consumed here and not re-posted.

Returns TRUE when the file was parsed and both collections were obtained,
FALSE otherwise. Collections from a previous load are released first, even
when the new load then fails.
*/
BOOL HTMLParser::LoadHTMLFile(LPCSTR pcszFile)
{
    if ( !IsConnected() )
        return FALSE;

    // kill any previous links
    if ( m_pAnchorLinks )
    {
        m_pAnchorLinks->Release();
        m_pAnchorLinks = NULL;
    }
    if ( m_pImageLinks )
    {
        m_pImageLinks->Release();
        m_pImageLinks = NULL;
    }

    // avoid IE error msg box if the file does not exist
    // (a NULL path is rejected here too instead of being passed to access())
    if ( NULL == pcszFile || access(pcszFile, 0x00) != 0x00 )
    {
        return FALSE;
    }

    _bstr_t bstrFile(pcszFile);

    // use IPersistFile to load the HTML
    LPPERSISTFILE pPF = NULL;
    HRESULT hr = m_pMSHTML->QueryInterface(IID_IPersistFile, (LPVOID*) &pPF);
    if ( SUCCEEDED(hr) )
    {
        hr = pPF->Load((LPCWSTR)bstrFile, 0);
        pPF->Release();
    }
    if ( FAILED(hr) )
        return FALSE;

    // Pump messages until the completion notification arrives.
    BOOL bOK = FALSE;
    MSG msg;
    BOOL bGot;
    while ( (bGot = GetMessage(&msg, NULL, 0, 0)) != 0 )
    {
        if ( -1 == bGot )
        {
            // GetMessage itself failed; msg holds nothing usable
            break;
        }
        // notification from OnChanged
        if (WM_USER_LOAD_COMPLETE == msg.message && NULL == msg.hwnd)
        {
            bOK = TRUE;
            break;
        }
        DispatchMessage(&msg);
    }
    if ( !bOK )
        return FALSE;

    // Fetch both collections; if either call fails keep neither.
    if ( FAILED(m_pMSHTML->get_links(&m_pAnchorLinks)) ||
         FAILED(m_pMSHTML->get_images(&m_pImageLinks)) )
    {
        if ( m_pAnchorLinks )
        {
            m_pAnchorLinks->Release();
            m_pAnchorLinks = NULL;
        }
        if ( m_pImageLinks )
        {
            m_pImageLinks->Release();
            m_pImageLinks = NULL;
        }
        return FALSE;
    }
    return TRUE;
}
/*
Number of entries in the cached link collection.
Returns 0 when no link collection is currently held.
*/
long HTMLParser::GetLinkCount()
{
    if ( !m_pAnchorLinks )
        return 0;

    long nLinks = 0;
    m_pAnchorLinks->get_length(&nLinks);
    return nLinks;
}
/*
Number of entries in the cached image collection.
Returns 0 when no image collection is currently held.
*/
long HTMLParser::GetImageCount()
{
    if ( !m_pImageLinks )
        return 0;

    long nImages = 0;
    m_pImageLinks->get_length(&nImages);
    return nImages;
}
/*
Fetch the URL of the link at position lIndex in the cached link collection.
On success rstrURL receives the URL and TRUE is returned. FALSE is returned
when the parser is not connected, no link collection is held, or the lookup
fails.
*/
BOOL HTMLParser::GetLinkURL(long lIndex, string &rstrURL)
{
    if ( !IsConnected() )
        return FALSE;
    if ( !m_pAnchorLinks )
        return FALSE;

    return GetURLFromCollection(m_pAnchorLinks, IID_IHTMLAnchorElement, lIndex, rstrURL);
}
/*
Fetch the URL of the image at position lIndex in the cached image collection.
On success rstrURL receives the URL and TRUE is returned. FALSE is returned
when the parser is not connected, no image collection is held, or the lookup
fails.
*/
BOOL HTMLParser::GetImageURL(long lIndex, string &rstrURL)
{
    if ( !IsConnected() )
        return FALSE;
    if ( !m_pImageLinks )
        return FALSE;

    return GetURLFromCollection(m_pImageLinks, IID_IHTMLImgElement, lIndex, rstrURL);
}
/*
Get the URL associated with an element in a collection. The element must
be an image or an anchor.

pCollection - collection to index; NULL yields FALSE.
rIID        - IID_IHTMLImgElement or IID_IHTMLAnchorElement, selecting the
              interface used to read the element's href. Any other IID
              yields FALSE.
lIndex      - position of the element within the collection.
rstrURL     - receives the href converted to a narrow string; left untouched
              when FALSE is returned.

Returns TRUE only if the element was found, exposes the requested interface,
and returned a non-NULL href.
*/
BOOL HTMLParser::GetURLFromCollection(IHTMLElementCollection *pCollection, REFIID rIID, long lIndex, string &rstrURL)
{
    if ( !pCollection )
        return FALSE;

    // Fully initialise both VARIANTs; the index is passed as VT_I4, matching
    // the lVal member it is stored in.
    VARIANT varIndex;
    VARIANT varSubIndex;
    VariantInit( &varIndex );
    VariantInit( &varSubIndex );   // stays VT_EMPTY
    V_VT(&varIndex) = VT_I4;
    V_I4(&varIndex) = lIndex;

    IDispatch* pDisp = NULL;
    HRESULT hr = pCollection->raw_item( varIndex, varSubIndex, &pDisp );
    if ( FAILED(hr) || !pDisp )
        return FALSE;

    // Read href through whichever element interface the caller asked for.
    BSTR bstr = NULL;
    if ( rIID == IID_IHTMLImgElement )
    {
        IHTMLImgElement* pImgElem = NULL;
        if ( SUCCEEDED(pDisp->QueryInterface(rIID, (void **)&pImgElem)) )
        {
            pImgElem->get_href(&bstr);
            pImgElem->Release();
        }
    }
    else if ( rIID == IID_IHTMLAnchorElement )
    {
        IHTMLAnchorElement* pAnchorElem = NULL;
        if ( SUCCEEDED(pDisp->QueryInterface(rIID, (void **)&pAnchorElem)) )
        {
            pAnchorElem->get_href(&bstr);
            pAnchorElem->Release();
        }
    }
    pDisp->Release();

    if ( !bstr )
        return FALSE;

    // _bstr_t wrapper will delete since fCopy is FALSE
    _bstr_t bstrHREF(bstr, FALSE);
    rstrURL = (LPCSTR)bstrHREF;
    return TRUE;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -