snagpars.cpp

来自「下载整个站点,将某一站点的资源全部下载,只要输入正确合法的站址!」· C++ 代码 · 共 267 行

CPP

267 行

/*
	Snagpars.cpp : implementation of the CSnaggerHtmlParser class

	Implements a derived HTML parser that specifically handles the types
	of objects needed by SiteSnagger.

	Author: Steven E. Sipe
*/

#include "stdafx.h"

#include "parser.h"
#include "snagpars.h"
#include "inet.h"


// The list of file extensions (lower case, leading dot included) that
// SiteSnagger is willing to download.  WantFileType() scans this table.
// The entries are string literals, so both the characters and the table
// itself are declared const -- binding a literal to a plain char* is not
// valid in standard C++ and would allow writes into read-only storage.
// g_nValidFiles must match the number of entries below.
const int g_nValidFiles = 17;
static const char * const g_aszValidFiles[g_nValidFiles] = 
{
	".htm",
	".html",
	".shtm",
	".htms",
	".shtml",
	".gif",
	".jpg",
	".jpeg",
	".wav",
	".au",
	".asp",
	".bmp",
	".pdf",
	".txt",
	".class",
	".mid",
	".midi",
};

// Decides whether a link refers to a text (HTML-like) page.
//
// The EXT part of strFileName (as produced by CInet::SplitFileName) is
// lower-cased and cut at the first '#'.  The result is TRUE when what
// remains is empty or is one of the page suffixes listed below;
// otherwise FALSE.
BOOL CSnaggerHtmlParser::IsTextPage(CString& strFileName)
{
	// Suffixes treated as text pages.  The empty entry covers names
	// that have no extension at all.
	static const char * const aszPageSuffix[] =
	{
		".htm", ".html", ".shtml", ".shtm", ".htms", "", ".com", ".asp"
	};
	const int nSuffixes = sizeof(aszPageSuffix) / sizeof(aszPageSuffix[0]);

	CString strSuffix = CInet::SplitFileName(strFileName,CInet::EXT);
	strSuffix.MakeLower();
	strSuffix = strSuffix.SpanExcluding("#");

	for(int n = 0; n < nSuffixes; n++)
	{
		if(strSuffix == aszPageSuffix[n])
			return(TRUE);
	}

	return(FALSE);
}

// Decides whether a file of this type should be retrieved.
//
// The EXT part of strFileName (as produced by CInet::SplitFileName) is
// lower-cased and cut at the first '#'.  A name with no extension, or
// one ending in ".com", is always accepted; anything else must appear
// in the g_aszValidFiles table.
BOOL CSnaggerHtmlParser::WantFileType(CString& strFileName)
{
	CString strSuffix = CInet::SplitFileName(strFileName,CInet::EXT);
	strSuffix.MakeLower();
	strSuffix = strSuffix.SpanExcluding("#");

	// Special cases that never need the table lookup
	BOOL bWanted = (strSuffix == ".com" || strSuffix.IsEmpty());

	// Otherwise walk the table until a match turns up or it is exhausted
	int nEntry = 0;
	while(!bWanted && nEntry < g_nValidFiles)
	{
		if(strSuffix == g_aszValidFiles[nEntry])
			bWanted = TRUE;
		nEntry++;
	}

	return(bWanted);
}

// Clears every collection the parser fills while processing a page and
// restores the per-page defaults: the title goes back to "Untitled" and
// media collection is switched off.
//
// The three m_Fixups arrays are appended to together by ProcessTag(),
// one entry each per fixup, so they must also be emptied together;
// leaving any one of them populated would put the entries out of step
// with each other on the next page.
void CSnaggerHtmlParser::ResetArrays() 	
{ 
	m_arrStrMedia.RemoveAll(); 
	m_arrStrLinks.RemoveAll(); 
	m_arrbyIsOffsite.RemoveAll();
	m_Fixups.arrURL.RemoveAll();
	m_Fixups.arrIndex.RemoveAll();
	m_Fixups.arrTextPage.RemoveAll();
	m_strTitle = "Untitled";
	m_bGetMedia = FALSE;
}

// Tells the base HTML parser, which calls this while parsing a page,
// whether the given tag is one this parser handles.  Returns TRUE when
// strTag exactly matches one of the tag names listed below, FALSE
// otherwise.
BOOL CSnaggerHtmlParser::WantTag(const CString& strTag)
{
	// Tags that ProcessTag() knows how to deal with
	static const char * const aszHandledTags[] =
	{
		"frame", "a", "area",
		"meta", "img", "body",
		"bgsound", "title", "embed",
		"applet"
	};
	const int nHandled = sizeof(aszHandledTags) / sizeof(aszHandledTags[0]);

	for(int n = 0; n < nHandled; n++)
	{
		if(strTag == aszHandledTags[n])
			return(TRUE);
	}

	return(FALSE);
}

// Resolves strRelative against the page identified by strPageURL and
// stores the combined URL in strResult.
//
// The base URL is rebuilt from the server and object parts of
// strPageURL with an "http://" prefix, then handed to
// ::InternetCombineUrl together with strRelative.
// NOTE(review): the scheme is always written as "http://" and the port
// parsed from the page URL is not carried over -- confirm that is intended.
//
// Returns TRUE on success.  Returns FALSE, leaving strResult untouched,
// when ::InternetCombineUrl reports failure (for instance because the
// result does not fit in the local buffer), so callers never see the
// contents of an unfilled buffer.
static BOOL CombineWithPageURL(const CString& strPageURL,
										const CString& strRelative,
										CString& strResult)
{
	CString strServer, strObject, strUser, strPassword;
	INTERNET_PORT nPort = 0;
	DWORD dwServiceType = 0;
	AfxParseURLEx(strPageURL,dwServiceType,strServer,strObject,nPort,
							strUser,strPassword,
							ICU_NO_ENCODE);

	CString strBase = "http://"+strServer+strObject;
	char szOut[1000] = "";
	unsigned long ulLen = sizeof(szOut);
	if(!::InternetCombineUrl(strBase,strRelative,szOut,&ulLen,
								ICU_NO_ENCODE))
		return(FALSE);

	strResult = szOut;
	return(TRUE);
}

// Called to process the specific tag that we've encountered during parsing.
// This method is called by the base HTML parser class (only if a previous
// call to WantTag() returned TRUE).
//
//   strTag     - name of the tag being processed
//   nIndex     - position in m_pRawBuffer associated with the tag; only
//                used for "title", where the title text is read starting
//                at nIndex+1
//   arrOptions - the tag's options (name/value pairs)
//
// Link tags (frame, a, area) add to m_arrStrLinks / m_arrbyIsOffsite,
// media tags add to m_arrStrMedia, and when m_bInFixupMode is set the
// URL, option index and text-page flag are recorded in m_Fixups instead.
// A link whose URL cannot be combined with the page URL is skipped.
// Always returns TRUE.
BOOL CSnaggerHtmlParser::ProcessTag(const CString& strTag, int nIndex, 
										const HTML_ARGS& arrOptions) 
{
	CString strText;

	// Frame, anchor or area tag?  -- these are all linked pages
	if(strTag == "frame" || strTag == "a" || strTag == "area")
	{
		for(int i = 0; i < arrOptions.GetSize(); i++)
		{
			strText = arrOptions[i].GetParm();

			// Get the src or href item -- this gives us the page
			if(strText == "src" || strText == "href")
			{
				strText = arrOptions[i].GetValue();

				// Only extract up to the parameters
				strText = strText.SpanExcluding("?");
				if(!WantFileType(strText))
					break;

				// Pull the server out of the link itself (empty for a
				// relative link); used below for the offsite test
				CString strServer, strObject, strUser, strPassword;
				INTERNET_PORT nPort = 0;
				DWORD dwServiceType = 0;
				AfxParseURLEx(strText,dwServiceType,strServer,strObject,nPort,
										strUser,strPassword,
										ICU_NO_ENCODE);

				CString strTemp = strText;
				strTemp.MakeLower();
				BOOL bMail = (strTemp.Left(7) == "mailto:");

				// Is this a mailto link then save it and get out now
				if(bMail)
				{
					strText.MakeLower();
					m_arrStrMedia.Add(strText);
					return(TRUE);
				}

				CString strOldServer, strOldObject;
				AfxParseURLEx(m_strPageURL,dwServiceType,strOldServer,
										strOldObject,nPort,
										strUser,strPassword,
										ICU_NO_ENCODE);

				// Put the completely qualified name back together now.
				// This is done before any array is touched so that a
				// link which cannot be resolved is dropped as a whole.
				CString strFullURL;
				if(!CombineWithPageURL(m_strPageURL,strText,strFullURL))
					break;

				// If the starting server is different than the new page's
				// server, then we must be offsite
				if(IsTextPage(strText))
				{
					if(!strServer.IsEmpty() && strOldServer != strServer)
						m_arrbyIsOffsite.Add(TRUE);
					else m_arrbyIsOffsite.Add(FALSE);
				}

				strText = strFullURL;

				// Save some fixup information
				BOOL bText = IsTextPage(strText);

				if(m_bInFixupMode)
				{
					if(bText || m_bGetMedia)
					{
						m_Fixups.arrURL.Add(strText);
						m_Fixups.arrIndex.Add(arrOptions[i].GetIndex());
						m_Fixups.arrTextPage.Add(bText);
					}
				}
				else
				{
					// Save this link -- either links or multimedia
					if(bText)
						m_arrStrLinks.Add(strText);
					else if(m_bGetMedia)
						m_arrStrMedia.Add(strText);
				}

				// Found our matching option, so get out now
				break;
			}
		}
	}	// Handle the multimedia tags too
	else if(strTag == "meta" || strTag == "img" || strTag == "body" ||
						strTag == "bgsound" || strTag == "embed" || 
						strTag == "applet")
	{
		for(int i = 0; i < arrOptions.GetSize(); i++)
		{
			strText = arrOptions[i].GetParm();

			// Get the URL for the item
			if(strText == "src" || strText == "background" || strText == "code")
			{
				strText = arrOptions[i].GetValue();
				strText = strText.SpanExcluding("?");

				// Do we want this item?
				if(!WantFileType(strText))
					break;

				// Put the complete URL back together; skip the item if
				// that is not possible
				CString strFullURL;
				if(!CombineWithPageURL(m_strPageURL,strText,strFullURL))
					break;
				strText = strFullURL;

				// Save some fixup information
				if(m_bGetMedia && !IsTextPage(strText))
				{
					if(m_bInFixupMode)
					{
						m_Fixups.arrURL.Add(strText);
						m_Fixups.arrIndex.Add(arrOptions[i].GetIndex());
						m_Fixups.arrTextPage.Add(FALSE);
					}
					else m_arrStrMedia.Add(strText);
				}

				// Found our matching option, so get out now
				break;
			}
		}
	}	// Get the page title (if we need to)
	else if(strTag == "title")
	{
		// Make sure we haven't gotten a title for this page yet...
		if(m_strTitle == "Untitled")
		{
			char ch = m_pRawBuffer[++nIndex];
			m_strTitle.Empty();

			// Grab the title up to the start of the </TITLE> tag
			// (or the end of the buffer, whichever comes first)
			while(ch && ch != '<')
			{
				m_strTitle += ch;
				nIndex++;
				ch = m_pRawBuffer[nIndex];
			}
		}
	}

	return(TRUE);
}

snagpars.cpp - 源码说明

本页面展示了「下载整个站点,将某一站点的资源全部下载,只要输入正确合法的站址!」中的 snagpars.cpp 源码文件，采用 C++ 编程语言编写，共 267 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。

虫虫下载站收录了大量与站点相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。

⌨️ 快捷键说明

复制代码Ctrl + C

搜索代码Ctrl + F

全屏模式F11

增大字号Ctrl + =

减小字号Ctrl + -

显示快捷键?