📄 htmlparser.cpp

📁 网络图片收集软件
💻 CPP
📖 第 1 页 / 共 2 页
字号:
12 下一页
// HtmlParser.cpp: implementation of the CHtmlParser class.
//
//////////////////////////////////////////////////////////////////////

#include "stdafx.h"
#include "image.h"
#include "HtmlParser.h"
#include <afxinet.h>

#ifdef _DEBUG
#undef THIS_FILE
static char THIS_FILE[]=__FILE__;
#define new DEBUG_NEW
#endif
/*
const int g_nValidFiles=9;
static char * g_aszValidFiles[g_nValidFiles]=
{
	".htm",
	".html",
	".shtm",
	".htms",
	".shtml",
	".aspx",
	".jsp",
	".php",
	".jhtml",
};
*/
// File extensions treated as "valid files". Only Flash movies are currently
// listed; the commented-out table above holds the earlier HTML-page list.
// NOTE(review): string literals are bound to non-const char* here; switching
// to const char* is preferable once every user of the table is confirmed.
static char *g_aszValidFiles[]={ ".swf" };
// Number of entries in g_aszValidFiles, derived from the table itself.
const int g_nValidFiles=sizeof(g_aszValidFiles)/sizeof(g_aszValidFiles[0]);
// Number of entries in g_aszValidImages.
const int g_nValidImages=6;
// File extensions recognised as images.
// The PNG entry previously read ".pnp", so PNG files could never match.
// NOTE(review): string literals are bound to non-const char* here; switching
// to const char* is preferable once every user of the table is confirmed.
static char *g_aszValidImages[g_nValidImages]=
{
	".gif",
	".jpg",
	".jpeg",
	".bmp",
	".png",
	".tiff",
};



//////////////////////////////////////////////////////////////////////
// Construction/Destruction
//////////////////////////////////////////////////////////////////////

// Builds a parser by copying its settings out of `option`.
//   option      - source of the image type, size limits and level settings.
//   unlikelevel - stored as-is in m_unlike_level.
CHtmlParser::CHtmlParser(Option &option,int unlikelevel)
{
	// Image type filter.
	m_image_type=option.image_type;

	// Size limits (minimum first, then maximum).
	m_min_width=option.min_width;
	m_min_length=option.min_length;
	m_max_width=option.max_width;
	m_max_length=option.max_length;

	// Level settings.
	m_exclusive_level=option.exclusive_level;
	m_unlike_level=unlikelevel;
	// Remembers the maximum level to download for this site.
	m_site_maxlevel=option.max_level;
}

// Nothing to release explicitly; the body is intentionally empty.
CHtmlParser::~CHtmlParser()
{
}

// Scans pBuffer forward from nIndex for the first character that occurs in
// the null-terminated set pTokens.
//   pTokens - characters to look for.
//   pBuffer - text being scanned.
//   nIndex  - in: start position; out: position of the match when one is
//             found, otherwise the position at which scanning stopped.
//   nMaxLen - scanning only visits positions below this bound.
// Returns true when a matching character was found, false otherwise.
bool CHtmlParser::FindToken(const char *pTokens, const char *pBuffer, int &nIndex, int nMaxLen)
{
	const int cTokens=strlen(pTokens);
	const char *pTokensEnd=pTokens+cTokens;

	for(;nIndex<nMaxLen;nIndex++)
	{
		const char chCurrent=pBuffer[nIndex];

		// Compare the current character against every candidate token.
		for(const char *pCandidate=pTokens;pCandidate!=pTokensEnd;pCandidate++)
		{
			if(*pCandidate==chCurrent)
				return true;
		}
	}
	return false;
}

// Walks the first nMaxLen characters of pBuffer and calls ProcessTag() for
// every tag that GetTag() accepts. GetTag() advances the scan position and
// resets the per-tag state itself, so no reset is needed here.
void CHtmlParser::ParseText(char *pBuffer, int nMaxLen)
{
	for(int nPos=0;nPos<nMaxLen;)
	{
		// GetTag moves nPos forward; skip anything it did not accept.
		if(!GetTag(pBuffer,nPos,nMaxLen))
			continue;

		ProcessTag();
	}
}

// Returns the text that runs from the incoming nIndex up to (not including)
// the next character found in pTerminals.
//   pTerminals - null-terminated set of characters that end the string.
//   pBuffer    - text being scanned.
//   nIndex     - in: start position; out: index of the terminal character
//                when one is found (this differs from the original
//                definition, which stepped past it).
//   nMaxLen    - scanning only visits positions below this bound.
// Returns an empty string when no terminal character is found.
// Leading spaces are skipped before searching for a terminal, but the copy
// still starts at the incoming position, so they remain in the result.
// NOTE(review): strncpy into the CString buffer assumes a non-Unicode build.
CString CHtmlParser::GetString(const char *pTerminals, const char *pBuffer, int &nIndex, int nMaxLen)
{
	const int nOldIndex=nIndex;
	CString strText;

	// Skip leading spaces so that a space terminal does not match immediately.
	while(nIndex<nMaxLen && pBuffer[nIndex]==' ')
		nIndex++;

	// Look for one of the terminal characters.
	if(FindToken(pTerminals,pBuffer,nIndex,nMaxLen))
	{
		const int nLength=nIndex-nOldIndex;
		char *pDest=strText.GetBuffer(nLength);
		strncpy(pDest,pBuffer+nOldIndex,nLength);
		// strncpy does not terminate when the source is longer than nLength,
		// and ReleaseBuffer() measures the string by its terminator, so add
		// one explicitly. GetBuffer(nLength) leaves room for it.
		pDest[nLength]='\0';
		strText.ReleaseBuffer();
	}
	return strText;
}

// Reports whether the current tag (m_strTag) is one the parser handles.
// Whether this list is complete is to be decided later.
bool CHtmlParser::WantTag()
{
	static const char * const s_apszWanted[]=
	{
		"frame",
		"a",
		"area",
		"img",
		"body",
		"embed",
		"input",
		"param",
	};
	const int nWanted=sizeof(s_apszWanted)/sizeof(s_apszWanted[0]);

	for(int iTag=0;iTag<nWanted;iTag++)
	{
		if(m_strTag==s_apszWanted[iTag])
			return true;
	}
	return false;
}



// Locates the next tag in pBuffer at or after nIndex. For a wanted tag
// (see WantTag) it stores the lower-cased name in m_strTag and calls
// GetParaPair on the tag's text.
//   pBuffer - text being scanned.
//   nIndex  - in: start position; out: advanced past the text examined. On
//             success it is the index of the tag's closing '>'.
//   nMaxLen - scanning only visits positions below this bound.
// Returns true only when a wanted tag was found. Returns false when there is
// no further '<', when the tag starts with ' ', '>' or '!', when it has no
// closing '>', or when it is not a wanted tag.
bool CHtmlParser::GetTag(const char *pBuffer, int &nIndex, int nMaxLen)
{
	m_strTag.Empty();//will hold the tag name found by this call
	m_arrparapair.RemoveAll();//will hold the tag's attributes and their values

	//find the start of the tag
	if(FindToken("<",pBuffer,nIndex,nMaxLen))
	{
		//move nIndex to the first character after '<'
		int nStart=++nIndex;

		//'<' was the last character of the buffer: there is nothing to
		//inspect, and reading pBuffer[nIndex] would be out of bounds
		if(nIndex>=nMaxLen)
			return false;

		//reject anything that cannot be a tag we handle
		if(pBuffer[nIndex]==' '||pBuffer[nIndex]=='>'||pBuffer[nIndex]=='!')
			return false;

		//find the end of the tag
		if(FindToken(">",pBuffer,nIndex,nMaxLen))
		{
			//take only the tag name, not its parameters and values, e.g. for
			//<a href="http://www.china.com"> only "a" is taken
			m_strTag=GetString(" >\r\n",pBuffer,nStart,nIndex+1);
			m_strTag.MakeLower();
			//Disabled: skipping over the contents of <script> elements.
			//CString script;
			//bool noFind=true;
			//if(m_strTag=="script")
			//{
			//	while(noFind)
			//	{
			//		FindToken("<",pBuffer,nStart,nMaxLen);
			//		nStart++;
			//		if(nStart>=nMaxLen)
			//		break;
			//		_tcsncpy(script.GetBuffer(8),pBuffer+nStart,7);
			//		script.ReleaseBuffer();
			//		if(script=="/script")
			//		noFind=false;

			//	}
			//	nStart+=7;
				//handle the script contents
			//	nIndex=nStart;

			//}
			//else
			if(WantTag())
			{
				//nStart now indexes the character that ended the tag name
				GetParaPair(pBuffer,nStart,nIndex+1);
				return true;
			}
		}

	}
	return false;

}

bool CHtmlParser::ProcessTag()
{
	CString strText;
	CString temp;

	int ii;
	int level;
	//需要加入input 类型对于像www.eluxury.com类型的网站

	if(m_strTag=="frame" ||m_strTag=="a"||m_strTag=="area"||m_strTag=="input")
	{
		for(int i=0;i<m_arrparapair.GetSize();i++)
		{
			strText=m_arrparapair[i].GetPara();

			/*
			 *我们感兴趣的有onclick=""之类的东西
			 *
			 */
			level=m_nLevel;
			CString substr;
			if(strText=="onclick")
			{
				strText=m_arrparapair[i].GetValue();
				substr=strText.Left(11);
				substr.MakeLower();
				if(substr=="window.open")
				{
					//处理相http://wwww.shoebuy.com一类的网站
					int begin=strText.Find('(');
					if(begin!=-1)
						substr=strText.Mid(begin+1);
					else
						substr=strText.Mid(11);
					substr.TrimLeft();

					if(substr.Left(1)=="\"")
						substr=substr.SpanExcluding("\"");
					else if(substr.Left(1)=="'")
						substr=substr.SpanExcluding("'");
					else
						substr=substr.SpanExcluding(" ,");
					//现在str为window.open的第一个参数也既是url
					MakeQualifiedUrl(substr,level);
					if(level>0)
					{
						URL_LEVEL url_level;
						url_level.m_url=substr;
						url_level.level=level;
						m_url_level.Add(url_level);
					}

				}
				else if(substr=="javascript:"){
					//留下来供将来扩充

				}
			
				
			}
			else if(strText=="href")
			{
				//定做要在本段中进行大量的改造,需要留下扩充的接口

				strText=m_arrparapair[i].GetValue().SpanExcluding("?;");				
				strText.MakeLower();

				if(strText.Left(11)=="javascript:")
				{
					//定做主要在本部分完成而且要返回
					substr=strText.Mid(11);
					//CString temp=substr.Left(
					if(substr.Left(5)=="popup")
					{
						//本部分要处理的网站是www.zappos.com
						CString url,tmp;
						substr=substr.SpanExcluding(")");
						int index=substr.Find("(",0);
						substr=substr.Mid(index+1);
						//url是PopUp的第一个参数
						url=substr.SpanExcluding(",");

						index=substr.Find(",");
						substr=substr.Mid(index+1);
						
						//现在tmp是PopUp的第二个参数
						tmp=substr.SpanExcluding(",");
						tmp.MakeLower();
						index=tmp.Find("large",0);
						//如果不含large则不是我们感兴趣的东西退出
						if(index==-1)
							return true;
						tmp=tmp.SpanExcluding("l");

						for(index=0;index<url.GetLength();index++)
							if(url[index]=='\\'||url[index]=='\'')
								url.SetAt(index,' ');
						//现在url是已经提取的第一个参数
						url.TrimLeft();
						url.TrimRight();

						for(index=0;index<tmp.GetLength();index++)
							if(tmp[index]=='\\'||tmp[index]=='\'')
								tmp.SetAt(index,' ');

						//现在tmp是我们需要的第二个参数
						tmp.TrimLeft();
						tmp.TrimRight();
						
						//url是我们要求的超链
						substr=url+"?product_id="+tmp+"&color_id=";
						

						MakeQualifiedUrl(substr,level);
						if(level>0)
						{
							URL_LEVEL url_level;
							url_level.m_url=substr;
							url_level.level=level;
							m_url_level.Add(url_level);
						}

					}else if(substr.Left(11)=="executelink")//现在要处理的网站是www.coldwatercreek.com
					{
						strText=m_arrparapair[i].GetValue();
						int ss=strText.Find(',');
						if(ss!=-1)
						{
							strText=strText.Mid(ss+1);
							strText=strText.SpanExcluding(")");
							for(ss=0;ss<strText.GetLength();ss++)
							if((strText[ss]=='\'')||(strText[ss]=='"'))
								strText.SetAt(ss,' ');
							
							strText.TrimLeft();
							strText.TrimRight();
							if(strText.Find("productlist")==-1)
							{
								substr=strText.Mid(15);
								substr="/aspx/product.aspx"+substr;
							}
							else
								substr="/"+strText;

							MakeQualifiedUrl(substr,level);
							if(level>0)
							{
								URL_LEVEL url_level;
								url_level.m_url=substr;
								url_level.level=level;
								m_url_level.Add(url_level);
							}							
						}//end if(ss!=-1)
					}//end else if(substr.Left(11)=="executelink")

					continue;
				}//end 	if(strText.Left(11)=="javascript:")//下面判断是否是邮件的链接如果是简单的返回true
				else if(strText.Left(7)=="mailto:")
					continue;
				//此链接为指向本页面内的一个链接ignore it
				else if(strText.Left(1)=="#")
					continue;
				else if(WantFileType(strText))
				{
					//下面被我修改看是否可以实现本网站的下载
					strText=m_arrparapair[i].GetValue();
					MakeQualifiedUrl(strText,level);
					if(level>0)
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -