⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 webhost.cpp

📁 自动分析HTML页面中的超链接程序
💻 CPP
字号:
/*########################################################################
	【文件名】: 	WebHost.cpp
	【名  称】:		解析web链接的封装类.
	----------------------------------------------------
	Remarks:	...
	----------------------------------------------------
	Author:		huawenNie
	Email:		nie173@vip.sina.com
	MSN:		nie173@msn.com
	Created:	03/05/2004
  ########################################################################*/

#include "stdafx.h"
#include "WebHost.h"

// Constructor.
// Stores the URL and title of the page being analysed and then immediately
// scans the page source, appending every link it finds to m_vec_All_URL.
//   m_str_webcode - HTML source of the page
//   m_vec_All_URL - container that receives the extracted links
//   str_URL       - link describing the page itself: str_Hyperlink becomes the
//                   base URL used to resolve relative links, str_HyperlinkText
//                   becomes the page title used as fallback link text
CWebHost::CWebHost(const CString& m_str_webcode ,  vector<HyperLink>& m_vec_All_URL ,HyperLink& str_URL)
{
	// No frame / redirect link has been seen yet.
	mb_ifFream = FALSE;

	str_pagetitle = str_URL.str_HyperlinkText;
	str_Page_URL = str_URL.str_Hyperlink;

	// All state needed by the scanners is in place; run them.
	OnRetrunWebContent(m_str_webcode, m_vec_All_URL);
}

//=============================================================
//
// Collects the links of one page into m_vec_All_URL.
// The scanners run in a fixed order: script blocks, frames, href= links and,
// only if the container still holds fewer than five entries afterwards,
// "url=" redirect targets.
//   str_htmlcode  - HTML source of the page
//   m_vec_All_URL - container that receives the extracted links
//
//=============================================================
void CWebHost::OnRetrunWebContent(const  CString& str_htmlcode ,  vector<HyperLink>& m_vec_All_URL)
{
	// Links embedded in <script> blocks, then frame sources.
	OnAnalyseJavascrript(str_htmlcode, m_vec_All_URL);
	OnReturnFrameURL(str_htmlcode, m_vec_All_URL);

	// The page length is handed to the remaining scanners as their iteration bound.
	const int nHtmlLength = str_htmlcode.GetLength();

	// Ordinary href= links.
	OnGetHtmlURL(str_htmlcode, m_vec_All_URL, nHtmlLength);

	// Enough links already: skip the redirect scan.
	if (m_vec_All_URL.size() >= 5)
	{
		return;
	}

	// Redirect ("url=") targets.
	OnGetJumpURL(str_htmlcode, m_vec_All_URL, nHtmlLength);
}
//////////////////////////////////////////////////////////////////////////
//返回Javascript代码里面合要求的URL 参数说明
//[页面代码],[查重URL容器],[返回的URL容器]
void CWebHost::OnAnalyseJavascrript(const CString& str_htmlcode ,  vector<HyperLink>& m_vec_All_URL  )
{
	CStringArray str_javascript;
	int pos1 = 0;
	int pos2 = 0;
	//int iLinkTextpos=0;
	//获取页面所有javascipt代码
	int isafety=0;
	while( pos1 != -1&&isafety<100 )
	{
		pos1 = str_htmlcode.Find( "<script" , pos1 );
		if( pos1!=-1 )
		{
			pos2 = str_htmlcode.Find( "</script>" , pos1 );
			pos2 +=9;
			str_javascript.Add( str_htmlcode.Mid( pos1 , pos2-pos1 ) );
			pos1 = pos2-1;
		}
		isafety++;
	}
	//返回页面Javascrript里面的URL
	CString str_link;
	for(int a = 0 ; a < str_javascript.GetSize() ; a++)
	{

		pos1=0;
		pos2=0;
		for(int b=0;pos1!=-1&&b<str_javascript[a].GetLength();b++)
		{
			pos1 = str_javascript[a].Find( "('" , pos1 );
			if( pos1!=-1 )
			{
				pos1 +=1 ;
				pos2 = str_javascript[a].Find("')",pos1);
				pos2 -=pos1;

				str_link = str_javascript[a].Mid(pos1,pos2);

				if(str_link.Find(".htm",0)!=-1||str_link.Find(".html",0)!=-1||
					str_link.Find(".php",0)!=-1||str_link.Find(".asp",0)!=-1||
					str_link.Find(".xml",0)!=-1||str_link.Find("http://",0)!=-1)
			 {
				 str_link.Replace(" ","");
				 str_link.Replace("'","");
				 str_link.Replace("\"","");
				 //,逗号处理
				 if(str_link.Find(",",0)!=-1)
				 {
					 str_link=str_link.Left(str_link.Find(",",0));
				 }
				 //=号处理
				 if(str_link.Find("=",0)!=-1)
				 {
					 str_link=str_link.Right(str_link.Find("=",0));
				 }
				 //>处理
				 if(str_link.Find(">",0)!=-1)
				 {
					 str_link=str_link.Left(str_link.Find(">",0));
				 }
				 //-----------------------------------------------------------------
				 if(str_link.Find("http://",0)<-1)//URL如果是相对地址的话
				 { 
					 str_link = OnConversionURL ( str_Page_URL , str_link);
				 }	

					 if( str_link.GetLength()>5)
					 {
						 m_HyperLink.str_Hyperlink=str_link;
						 m_HyperLink.str_HyperlinkText.Empty();
						 m_vec_All_URL.push_back( m_HyperLink );
					     
					 }
				 
			 }
			}
		}
	}
}
//====================================================
//
// Converts a relative address into an absolute URL.
//   sURL       - URL of the page the link was found on (the base)
//   str_fafURL - the link as written in the page; single quotes are removed
// Returns the resolved URL.
//
//   "../x.htm"  one directory up per "../" (never above scheme://host/)
//   "/x.htm"    relative to the site root
//   "x.htm"     relative to the directory of the base page
//   A link that already starts with http:// or https:// is returned as is.
CString CWebHost::OnConversionURL(CString sURL,CString str_fafURL)
{
	// A bare "http://host" gets a trailing slash so it always has a path part.
	if (sURL.Find("/", 8) < 0)
	{
		sURL += "/";
	}
	str_fafURL.Replace("'", "");

	// Already absolute: callers only filter on "http://", so an https link
	// would otherwise be glued onto the base URL.
	if (str_fafURL.Left(7).CompareNoCase("http://") == 0 ||
		str_fafURL.Left(8).CompareNoCase("https://") == 0)
	{
		return str_fafURL;
	}

	// Root relative: keep "scheme://host" (everything before the 3rd slash).
	// The emptiness test avoids indexing an empty string.
	if (!str_fafURL.IsEmpty() && str_fafURL[0] == '/')
	{
		int nSlashes = 0;
		for (int a = 0; a < sURL.GetLength(); a++)
		{
			if (sURL[a] == '/' && ++nSlashes == 3)
			{
				sURL = sURL.Left(a);
				break;
			}
		}
		sURL += str_fafURL;
		return sURL;
	}

	// Parent relative: the link contains one or more "../".
	if (str_fafURL.Find("../", 0) != -1)
	{
		int i = 0;

		// Count the "../" occurrences without reading past the last character.
		int nUpLevels = 0;
		const int nLinkLength = str_fafURL.GetLength();
		for (i = 0; i + 2 < nLinkLength; i++)
		{
			if (str_fafURL[i] == '.' && str_fafURL[i + 1] == '.' && str_fafURL[i + 2] == '/')
			{
				nUpLevels++;
			}
		}
		str_fafURL.Replace("../", "");

		// Count the slashes of the base URL.
		int nBaseSlashes = 0;
		const int nBaseLength = sURL.GetLength();
		for (i = 0; i < nBaseLength; i++)
		{
			if (sURL[i] == '/')
			{
				nBaseSlashes++;
			}
		}

		// Number of slashes to keep; the 3rd slash ends "scheme://host/",
		// so never climb above it.
		int nKeepSlashes = nBaseSlashes - nUpLevels;
		if (nKeepSlashes < 3)
		{
			nKeepSlashes = 3;
		}

		// Cut the base right after the nKeepSlashes-th slash.
		int nSeen = 0;
		for (i = 0; i < nBaseLength; i++)
		{
			if (sURL[i] == '/' && ++nSeen == nKeepSlashes)
			{
				sURL = sURL.Left(i + 1);
				break;
			}
		}

		// Safety net: the base must end with a slash before appending.
		if (sURL[sURL.GetLength() - 1] != '/')
		{
			sURL += "/";
		}
		sURL += str_fafURL;
		return sURL;
	}

	// Directory relative: replace everything after the last slash of the base.
	// A slash at index 0 is ignored, as before.
	const int nLastSlash = sURL.ReverseFind('/');
	if (nLastSlash > 0)
	{
		sURL = sURL.Left(nLastSlash + 1);
	}
	sURL += str_fafURL;
	return sURL;
}
//////////////////////////////////////////////////////////////////////////
// Extracts the targets of all href= attributes of a page.
//   str_htmlcode  - HTML source of the page
//   m_vec_All_URL - container that receives the extracted links
//   pagesize      - upper bound for the number of scan passes (the caller
//                   passes the length of str_htmlcode)
// Each link is resolved against the page URL when it does not contain
// "http://" and is stored together with its link text (see OnGetLinkText).
void CWebHost::OnGetHtmlURL(const CString& str_htmlcode, vector<HyperLink>& m_vec_All_URL,int pagesize)
{
	int tag0 = 0;

	for (int i = 0; i < pagesize; i++)
	{
		tag0 = str_htmlcode.Find("href=", tag0);
		if (tag0 == -1)
		{
			// No further href: stop here instead of searching from index -1.
			break;
		}
		tag0 += 5; // length of "href="

		const int tag1 = str_htmlcode.Find(">", tag0);
		if (tag1 == -1)
		{
			// No tag end after this href, hence none after any later one.
			break;
		}

		// Raw attribute text up to the end of the tag (str_link is a member).
		str_link = str_htmlcode.Mid(tag0, tag1 - tag0);
		str_link.Replace("\"", "");
		str_link.Replace("\'", "");
		str_link.TrimLeft();

		// The value ends at the first whitespace; a line break counts as well
		// even when a blank follows later in the tag.
		const int nValueEnd = str_link.FindOneOf(" \t\r\n");
		if (nValueEnd != -1)
		{
			str_link = str_link.Left(nValueEnd);
		}

		if (str_link.IsEmpty())
		{
			continue;
		}

		// Relative address: resolve it against the page URL.
		if (str_link.Find("http://", 0) < 0)
		{
			str_link = OnConversionURL(str_Page_URL, str_link);
		}

		m_HyperLink.str_Hyperlink = str_link;
		// The link text starts right behind the '>' that closes the tag.
		m_HyperLink.str_HyperlinkText = OnGetLinkText(tag1 + 1, str_htmlcode);
		m_vec_All_URL.push_back(m_HyperLink);
	}
}
//////////////////////////////////////////////////////////////////////////
// Extracts redirect targets, i.e. the text following every "url=" in the page.
//   str_htmlcode  - HTML source of the page
//   m_vec_All_URL - container that receives the extracted links
//   pagesize      - upper bound for the number of scan passes (the caller
//                   passes the length of str_htmlcode)
// Each target is resolved against the page URL when it does not contain
// "http://". Targets longer than 5 characters are stored and mb_ifFream is
// set to TRUE.
void CWebHost::OnGetJumpURL(const CString& str_htmlcode, vector<HyperLink>& m_vec_All_URL, int pagesize)
{
	int tag0 = 0;

	for (int i = 0; i < pagesize; i++)
	{
		tag0 = str_htmlcode.Find("url=", tag0);
		if (tag0 == -1)
		{
			// No further "url=": stop here instead of searching from index -1.
			break;
		}
		tag0 += 4; // length of "url="

		const int tag1 = str_htmlcode.Find(">", tag0);
		if (tag1 == -1)
		{
			// No tag end after this match, hence none after any later one.
			break;
		}

		// Raw text up to the end of the tag (str_link is a member).
		str_link = str_htmlcode.Mid(tag0, tag1 - tag0);
		str_link.Replace("\"", "");
		str_link.Replace("\'", "");
		str_link.TrimLeft();

		// The value ends at the first whitespace; a line break counts as well
		// even when a blank follows later in the tag.
		const int nValueEnd = str_link.FindOneOf(" \t\r\n");
		if (nValueEnd != -1)
		{
			str_link = str_link.Left(nValueEnd);
		}

		if (str_link.IsEmpty())
		{
			continue;
		}

		// Relative address: resolve it against the page URL.
		if (str_link.Find("http://", 0) == -1)
		{
			str_link = OnConversionURL(str_Page_URL, str_link);
		}

		if (str_link.GetLength() > 5)
		{
			mb_ifFream = TRUE;
			m_HyperLink.str_Hyperlink = str_link;
			// The link text starts right behind the '>' that closes the tag.
			m_HyperLink.str_HyperlinkText = OnGetLinkText(tag1 + 1, str_htmlcode);
			m_vec_All_URL.push_back(m_HyperLink);
		}
	}
}

//////////////////////////////////////////////////////////////////////////

// Returns the visible text of a link.
//   iIn      - index in htmlcode of the first character after the opening tag
//   htmlcode - HTML source of the page
// The text runs from iIn to the next "</a>" (or to an earlier "<a>"), with
// all embedded tags removed. It is empty when no end is found or when the
// span contains another "href=". A result of one or two characters is
// replaced by the page title.
CString CWebHost::OnGetLinkText(int iIn,const CString& htmlcode)
{
	CString str_return;

	int nTextEnd = htmlcode.Find("</a>", iIn);
	// NOTE(review): this matches only a literal "<a>"; "<a " may have been
	// intended - confirm before changing.
	const int nNextAnchor = htmlcode.Find("<a>", iIn);
	if (nNextAnchor > 0 && nNextAnchor < nTextEnd)
	{
		nTextEnd = nNextAnchor;
	}

	if (nTextEnd > -1)
	{
		str_return = htmlcode.Mid(iIn, nTextEnd - iIn);
		if (str_return.Find("href=", 0) < 0)
		{
			// Remove embedded tags one by one (at most 500 passes).
			for (int iSafety = 0; iSafety < 500; iSafety++)
			{
				const int nTagOpen = str_return.Find("<", 0);
				if (nTagOpen < 0)
				{
					// No tag left; do not search for '>' from index -1.
					break;
				}
				const int nTagClose = str_return.Find(">", nTagOpen);
				if (nTagClose < 0)
				{
					break;
				}
				str_return.Delete(nTagOpen, nTagClose - nTagOpen + 1);
			}
		}
		else
		{
			// The span reaches into another link: no usable text.
			str_return.Empty();
		}
	}

	// Very short texts are replaced by the page title.
	const int nTextLength = str_return.GetLength();
	if (nTextLength == 1 || nTextLength == 2)
	{
		str_return = str_pagetitle;
	}
	return str_return;
}
//处理嵌套代码的URL
void CWebHost::OnReturnFrameURL(const CString& str_htmlcode ,vector<HyperLink>& m_vec_All_URL  )
{
   CStringArray str_javascript;
   int pos1 = 0;
   int pos2 = 0;
   str_javascript.RemoveAll();
   //获取页面所有 frame 代码
    CString str_link;
   while( pos1 != -1 )
   {
    pos1 = str_htmlcode.Find( "<frame" , pos1 );
    if( pos1!=-1 )
    {
	pos2 = str_htmlcode.Find( "</frameset>" , pos1);
    pos1 += 6;
    pos2 -= pos1;
	str_javascript.Add( str_htmlcode.Mid( pos1 , pos2 ) );
    }
   }
   for(int a = 0 ; a < str_javascript.GetSize() ; a++)
   {
	   
       //AfxMessageBox( str_javascript[a] );
	   pos1=0;
	   pos2=0;
	   for(int b=0;pos1!=-1&&b<str_javascript[a].GetLength();b++)
	   {
		   
           pos1 =str_javascript[a].Find( "src" , pos1 );
		   if( pos1!=-1 )
		   {
			   pos1 = str_javascript[a].Find( "=" , pos1 );
			   pos2 = str_javascript[a].Find( ">" , pos1 );
			   pos1 += 1;
			   pos2 -= pos1;
               str_link = str_javascript[a].Mid(pos1,pos2);
			  
               if(str_link.Find("http://",0)<0 
				   && str_link.Find(">",0)<0 && str_link.Find("<",0)<0 )//URL如果是相对地址的话
				 { 
				   
			      str_link = OnConversionURL ( str_Page_URL , str_link);
				 }		  
			    str_link.Replace( "\'" , "");
				
				if( str_link.GetLength() > 5)//URL未被处理
				{
					mb_ifFream =TRUE;		  
				m_HyperLink.str_Hyperlink=str_link;
                m_HyperLink.str_HyperlinkText.Empty();
			    m_vec_All_URL.push_back(m_HyperLink);
				}
		   }
	   }
   }
}

//====================================================
//URL模板,有4个通配符,$ 表示 一个数字,^ 表示一串数字,# 表示一个字符,!表示一串字符 文本
//判断URL是否是
// http://news.sina.com.cn
//====================================================


⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -