⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 dedehtml2.class.php

📁 这是matlab的一个小程序
💻 PHP
字号:
<?php
// Include guard: this file may only be loaded from a script that has already
// defined the DEDEINC constant; otherwise stop immediately, emitting 'dedecms'.
defined('DEDEINC') or exit('dedecms');
/*******************************
* 织梦HTML解析类V1.6 PHP版 www.dedecms.com
* function c____DedeHtml2();
* 这个类针对于采集程序,主要是获取某区域内的图片、超链接等信息
* 最后修改 2008-7-8 ,修改links获取方式,能识别link内是否有图片,去除不使用的识别HTML头的功能
********************************/
// Lightweight HTML scanner used by the collection code. Given a page and the
// URL it came from, it records the hyperlinks (and, in 'media' mode, the
// <img>/<embed> sources) found in the markup.
//
// Typical use:
//   $dh = new DedeHtml2();
//   $dh->SetSource($html, 'http://www.example.com/news/index.html', 'link');
//   foreach($dh->Links as $url => $info) { ... }
class DedeHtml2
{
	var $CAtt;          // DedeAttribute2 instance created by SetSource() ('' before that)
	var $SourceHtml;    // HTML text being analysed
	var $Title;         // set to BaseUrl at the end of Analyser() when still empty
	var $Medias;        // media url => media type ('img' or 'embed')
	var $MediaInfos;    // embed url => array(0 => width, 1 => height)
	var $Links;         // url => array('title' => ..., 'link' => ..., 'image' => ...)
	var $CharSet;       // not assigned anywhere in this class
	var $BaseUrl;       // URL of the analysed document as passed to SetSource()
	var $BaseUrlPath;   // host + directory of BaseUrl (no scheme, no trailing slash)
	var $HomeUrl;       // host part of BaseUrl
	var $IsHead;        // set to true by Analyser() in 'media' mode
	var $ImgHeight;
	var $ImgWidth;
	var $GetLinkType;   // 'link' (anchors only) or 'media' (anchors, images, embeds)

	// Constructor: puts every property into its initial state.
	function __construct()
	{
		$this->CAtt = '';
		$this->SourceHtml = '';
		$this->Title = '';
		$this->Medias = array();
		$this->MediaInfos = array();
		$this->Links = array();
		$this->BaseUrl = '';
		$this->BaseUrlPath = '';
		$this->HomeUrl = '';
		$this->IsHead = false;
		$this->ImgHeight = 30;
		$this->ImgWidth = 50;
		$this->GetLinkType = 'link';
	}

	// PHP 4 style constructor, kept so existing callers that invoke it
	// explicitly keep working.
	function DedeHtml2()
	{
		$this->__construct();
	}

	// Sets the HTML to analyse and the URL it was fetched from, then runs the
	// analysis if the HTML is not empty.
	//   $html     - page source (taken by reference, not modified)
	//   $url      - address of the page, used to resolve relative links
	//   $linktype - 'link' or 'media'; '' keeps the default ('link')
	function SetSource(&$html,$url = '',$linktype='')
	{
		$this->__construct();
		$this->CAtt = new DedeAttribute2();
		$url = trim($url);
		$this->SourceHtml = $html;
		$this->BaseUrl = $url;

		// parse_url() returns false for malformed URLs and omits keys that are
		// absent, so both parts are read defensively.
		$urls = @parse_url($url);
		$host = (is_array($urls) && isset($urls['host'])) ? $urls['host'] : '';
		$path = (is_array($urls) && isset($urls['path'])) ? $urls['path'] : '';
		$this->HomeUrl = $host;

		// Directory of the document: drop a trailing "/file.ext", then the
		// trailing slash.
		$basePath = preg_replace("/\/([^\/]*)\.(.*)$/","/",$host.$path);
		$this->BaseUrlPath = preg_replace("/\/$/",'',$basePath);

		if($linktype!='')
		{
			$this->GetLinkType = $linktype;
		}
		if($html != '')
		{
			$this->Analyser();
		}
	}

	// Scans SourceHtml once and fills Links / Medias / MediaInfos.
	function Analyser()
	{
		$cAtt = new DedeAttribute2();
		$cAtt->IsTagName = false;
		$html = $this->SourceHtml;

		// Any link type other than 'media' collects anchors only, so $needTags
		// is always defined.
		if($this->GetLinkType=='media')
		{
			$needTags = array('img','embed','a');
			$this->IsHead = true;
		}
		else
		{
			$needTags = array('a');
		}
		$tagbreaks = array(' ','<','>',"\r","\n","\t");

		for($i=0; isset($html[$i]); $i++)
		{
			if($html[$i] !== '<')
			{
				continue;
			}

			// Comments are recognised by their opening marker, so "<!--text"
			// (no whitespace after the marker) is handled too.
			if(substr($html,$i+1,3) === '!--')
			{
				$endPos = strpos($html,'-->',$i+2);
				if($endPos !== false)
				{
					// Land on the closing '>' so that the loop increment
					// resumes at the first character after the comment.
					$i = $endPos+2;
				}
				else
				{
					// Unterminated comment: just step over the marker.
					$i += 3;
				}
				continue;
			}

			// Read the tag name (at most 11 characters), stopping at the first
			// break character; $i ends up on that character.
			$tagName = '';
			for($i=$i+1,$j=0; isset($html[$i]) && $j<=10; $i++,$j++)
			{
				if( in_array($html[$i],$tagbreaks,true) )
				{
					break;
				}
				$tagName .= $html[$i];
			}
			$tagName = strtolower($tagName);

			if( !in_array($tagName,$needTags,true) )
			{
				// If we stopped on another '<', step back so the loop
				// increment re-examines it as a tag start.
				if(isset($html[$i]) && $html[$i] === '<')
				{
					$i--;
				}
				continue;
			}

			// Attribute text runs from the break character to the closing '>'.
			$endPos = strpos($html,'>',$i);
			if($endPos===false)
			{
				break;
			}
			$attStr = ($endPos > $i) ? (string)substr($html,$i+1,$endPos-$i-1) : '';
			$cAtt->SetSource($attStr);

			if($tagName=='img')
			{
				$this->InsertMedia($cAtt->GetAtt('src'),'img');
				$i = $endPos;
			}
			else if($tagName=='embed')
			{
				$rurl = $this->InsertMedia($cAtt->GetAtt('src'),'embed');
				if($rurl != '')
				{
					$this->MediaInfos[$rurl][0] = $cAtt->GetAtt('width');
					$this->MediaInfos[$rurl][1] = $cAtt->GetAtt('height');
				}
				$i = $endPos;
			}
			else
			{
				$href = trim($cAtt->GetAtt('href'));
				$innerPos = $i;
				$innerText = $this->GetInnerText($innerPos,'a');

				// Script and fragment links must be rejected on the raw href:
				// once FillUrl() has prefixed the value it can no longer be
				// recognised by InsertLink().
				if( !$this->_IsIgnoredUrl($href) )
				{
					$this->InsertLink($this->FillUrl($href),$innerText);
				}

				// GetInnerText() leaves $innerPos just after "</a>"; the loop
				// increment adds one, so stop one character earlier. When no
				// closing tag was found ($innerPos == $endPos+4) only the
				// opening tag itself is skipped.
				$i = ($innerPos > $endPos+4) ? $innerPos-1 : $endPos;
			}
		}//End for

		if($this->Title == '')
		{
			$this->Title = $this->BaseUrl;
		}
	}

	// Releases the analysed data. Collections are reset to empty arrays so the
	// object can keep being used afterwards.
	function Clear()
	{
		$this->CAtt = '';
		$this->SourceHtml = '';
		$this->Title = '';
		$this->Links = array();
		$this->Medias = array();
		$this->MediaInfos = array();
		$this->BaseUrl = '';
		$this->BaseUrlPath = '';
	}

	// Returns true for values that are not real addresses: javascript: links,
	// pure fragments and values that start with a quote character.
	function _IsIgnoredUrl($url)
	{
		return preg_match('/^(javascript:|#|\'|")/i',(string)$url) == 1;
	}

	// Records a media address.
	//   $url   - media address
	//   $mtype - media type ('img' or 'embed')
	// Returns $url when it was recorded, '' when it was rejected.
	function InsertMedia($url,$mtype)
	{
		if($url == '' || $this->_IsIgnoredUrl($url))
		{
			return '';
		}
		$this->Medias[$url]=$mtype;
		return $url;
	}

	// Records a hyperlink.
	//   $url    - link address
	//   $atitle - link text, or "img:<image url>:txt:<text>" (as produced by
	//             GetInnerText()) when the anchor contains an image
	// Returns $url when it was recorded, '' when it was rejected.
	function InsertLink($url,$atitle)
	{
		if($url == '' || $this->_IsIgnoredUrl($url))
		{
			return '';
		}
		if(strncmp((string)$atitle,'img:',4) == 0)
		{
			// Remove only the leading marker and split on the first ':txt:',
			// so neither the image address nor the text gets altered.
			$parts = explode(':txt:',(string)substr($atitle,4),2);
			$aimg = $parts[0];
			$atitle = isset($parts[1]) ? $parts[1] : '';
			if(!isset($this->Links[$url]))
			{
				$this->Links[$url] = array(
					'title' => ($atitle != '' ? cn_substr($atitle,50) : $aimg),
					'link'  => $url
				);
			}
			$this->Links[$url]['image'] = $aimg;
			$this->InsertMedia($aimg,'img');
		}
		else
		{
			if(!isset($this->Links[$url]))
			{
				$this->Links[$url] = array(
					'image' => '',
					'title' => $atitle,
					'link'  => $url
				);
			}
			else if(strlen($this->Links[$url]['title']) < strlen($atitle))
			{
				// The same address seen again: keep the longer title.
				$this->Links[$url]['title'] = $atitle;
			}
		}
		return $url;
	}

	// Returns the trimmed text after the first '=' in $att (for example the
	// charset in "text/html; charset=utf-8"), or '' when there is none.
	function ParCharSet($att)
	{
		$startdd = strpos($att,'=');
		if($startdd===false)
		{
			return '';
		}
		return trim((string)substr($att,$startdd+1));
	}

	// Turns a possibly relative address into an absolute one, using HomeUrl
	// and BaseUrlPath. Returns '' when the address cannot be resolved.
	// The result starts with 'http://', except for absolute https:// input,
	// which keeps its scheme.
	function FillUrl($surl)
	{
		$pathStep = 0;
		$dstr = $okurl = '';
		$scheme = 'http://';

		$surl = trim($surl);
		if($surl == '')
		{
			return '';
		}

		// Drop a fragment, unless the whole value is one.
		$pos = strpos($surl,'#');
		if($pos>0)
		{
			$surl = substr($surl,0,$pos);
		}

		if(preg_match('#^//[^/]#',$surl))
		{
			// Protocol-relative address: "//host/path".
			$okurl = substr($surl,2);
		}
		else if($surl[0]=='/')
		{
			// Relative to the site root.
			$okurl = $this->HomeUrl.'/'.$surl;
		}
		else if($surl[0]=='.')
		{
			if(!isset($surl[2]))
			{
				return '';
			}
			else if($surl[1]=='/')
			{
				// "./file": relative to the directory of the document.
				$okurl = $this->BaseUrlPath.'/'.substr($surl,2);
			}
			else
			{
				// "../" segments: count how many levels to climb and collect
				// the remaining segments.
				$parts = explode('/',$surl);
				$last = count($parts)-1;
				foreach($parts as $idx => $part)
				{
					if($part=='..')
					{
						$pathStep++;
					}
					else
					{
						$dstr .= ($idx < $last) ? $part.'/' : $part;
					}
				}
				$baseParts = explode('/',$this->BaseUrlPath);
				$keep = count($baseParts)-$pathStep;
				if($keep <= 0)
				{
					// Would climb above the host.
					return '';
				}
				$okurl = implode('/',array_slice($baseParts,0,$keep)).'/'.$dstr;
			}
		}
		else if(preg_match('#^(https?)://#i',$surl,$m))
		{
			// Already absolute: keep the host and path, remember the scheme.
			$scheme = strtolower($m[1]).'://';
			$okurl = (string)substr($surl,strlen($m[0]));
		}
		else
		{
			$okurl = $this->BaseUrlPath.'/'.$surl;
		}

		// Collapse repeated slashes.
		$okurl = preg_replace('#/+#','/',$okurl);
		return $scheme.$okurl;
	}

	// Returns the content between the tag that ends at the first '>' at or
	// after $pos and the next closing tag.
	//   $pos     - scan position (by reference); moved past the closing tag
	//   $tagname - 'title' or 'a'
	// For 'title' the trimmed text up to the next '<' is returned. For anchors
	// the plain link text is returned, or "img:<url>:txt:<text>" when the
	// anchor contains an image. When no '>' exists at or after $pos, '' is
	// returned and $pos is left unchanged.
	function GetInnerText(&$pos,$tagname)
	{
		$textLen = 0;
		$str = '';
		if($pos < 0 || $pos > strlen($this->SourceHtml))
		{
			return '';
		}
		$startPos = strpos($this->SourceHtml,'>',$pos);
		if($startPos === false)
		{
			return '';
		}

		if($tagname=='title')
		{
			$endPos = strpos($this->SourceHtml,'<',$startPos);
		}
		else
		{
			// Case-insensitive search finds "</a" and "</A" alike.
			$endPos = stripos($this->SourceHtml,'</a',$startPos);
		}
		if($endPos !== false && $endPos > $startPos)
		{
			$textLen = $endPos-$startPos;
			$str = (string)substr($this->SourceHtml,$startPos+1,$textLen-1);
		}
		$pos = $startPos + $textLen + strlen("</".$tagname) + 1;

		if($tagname=='title')
		{
			return trim($str);
		}

		preg_match_all("/<img(.*)src=[\"']{0,1}(.*)[\"']{0,1}[> \r\n\t]{1,}/isU",$str,$imgs);
		if(isset($imgs[2][0]))
		{
			$txt = trim(Html2Text($str));
			$imgUrl = str_replace(array('"',"'"),'',$imgs[2][0]);
			return "img:".$this->FillUrl($imgUrl).':txt:'.$txt;
		}

		// No image: cut everything from the first closing tag to the end, then
		// everything up to the last remaining '>'.
		$str = preg_replace('#</(.*)$#s','',$str);
		$str = trim(preg_replace('#^(.*)>#s','',$str));
		return $str;
	}
}//End class

/*******************************
//属性解析器
function c____DedeAttribute2();
********************************/
// Parser for the attribute text of a single HTML tag, for example
//   img src="a.gif" width=10 alt='x'
// Attribute names are stored in lower case. Values may be enclosed in single
// quotes, double quotes, or be unquoted (ended by a space).
class DedeAttribute2
{
	var $SourceString = '';
	var $SourceMaxSize = 1024; // longer sources are not parsed at all
	var $CharToLow = FALSE;    // when true, attribute values are lower-cased as well
	var $IsTagName = TRUE;     // when true, the first word of the source is the tag name
	var $Count = -1;           // number of parsed items minus one (see GetCount())
	var $Items = array();      // attribute name => value ('tagname' holds the tag name)

	// Sets the text to parse and parses it immediately.
	// Whitespace runs are collapsed to single spaces first; empty sources and
	// sources longer than SourceMaxSize leave Items empty.
	function SetSource($str = '')
	{
		$this->Count = -1;
		// Must be an array: writing string keys into a string is an error on
		// current PHP versions.
		$this->Items = array();
		$this->SourceString = trim(preg_replace("/[ \t\r\n]{1,}/"," ",$str));
		$strLen = strlen($this->SourceString);
		// A trailing space terminates the last unquoted value and makes tags
		// without attributes easy to handle.
		$this->SourceString .= " ";
		if($strLen>0 && $strLen<=$this->SourceMaxSize)
		{
			$this->PrivateAttParse();
		}
	}

	// Returns the value of attribute $str (case-insensitive name), or '' when
	// the attribute does not exist.
	function GetAtt($str)
	{
		if($str == '')
		{
			return '';
		}
		$str = strtolower($str);
		return isset($this->Items[$str]) ? $this->Items[$str] : '';
	}

	// Tells whether attribute $str (case-insensitive name) exists.
	function IsAtt($str)
	{
		if($str == '')
		{
			return false;
		}
		return isset($this->Items[strtolower($str)]);
	}

	// Returns the tag name (only available when IsTagName was true).
	function GetTagName()
	{
		return $this->GetAtt("tagname");
	}

	// Returns the number of parsed items: attributes that have a value, plus
	// the tag name when it was parsed.
	function GetCount()
	{
		return $this->Count+1;
	}

	// Parses SourceString into Items (meant to be called by SetSource() only).
	function PrivateAttParse()
	{
		$tmpatt = '';
		$tmpvalue = '';
		$startdd = -1; // -1: reading a name, 0: waiting for a value, 1: reading a value
		$ddtag = '';   // character that ends the value being read
		$strLen = strlen($this->SourceString);
		$j = 0;

		// Optional leading tag name.
		if($this->IsTagName)
		{
			// A comment is not parsed any further.
			if(substr($this->SourceString,0,3) === '!--')
			{
				$this->Items['tagname'] = '!--';
				return ;
			}
			// The name ends at the first space or quote character.
			$nameLen = strcspn($this->SourceString," '\"\r\n\t");
			if($nameLen < $strLen)
			{
				$this->Count++;
				$this->Items['tagname'] = strtolower(trim((string)substr($this->SourceString,0,$nameLen)));
				// Attribute parsing resumes on the delimiter itself.
				$j = $nameLen;
			}
			else
			{
				// No delimiter at all: nothing left to parse.
				$j = $strLen;
			}
		}

		// Walk through the remaining characters and collect the attributes.
		for($i=$j;$i<$strLen;$i++)
		{
			$d = $this->SourceString[$i];

			if($startdd==-1)
			{
				// Attribute name: everything up to '='.
				if($d !== '=')
				{
					$tmpatt .= $d;
				}
				else
				{
					$tmpatt = strtolower(trim($tmpatt));
					$startdd = 0;
				}
			}
			else if($startdd==0)
			{
				// Find out how the value is delimited: ' ', '' or "".
				switch($d)
				{
					case ' ':
						// Spaces before the value are skipped.
						break;
					case '\'':
						$ddtag = '\'';
						$startdd = 1;
						break;
					case '"':
						$ddtag = '"';
						$startdd = 1;
						break;
					default:
						// Unquoted value: this is its first character and it
						// ends at the next space.
						$tmpvalue .= $d;
						$ddtag = ' ';
						$startdd = 1;
						break;
				}
			}
			else if($startdd==1)
			{
				// Attribute value: everything up to the delimiter.
				if($d === $ddtag)
				{
					$this->Count++;
					if($this->CharToLow)
					{
						$this->Items[$tmpatt] = strtolower(trim($tmpvalue));
					}
					else
					{
						$this->Items[$tmpatt] = trim($tmpvalue);
					}
					$tmpatt = '';
					$tmpvalue = '';
					$startdd = -1;
				}
				else
				{
					$tmpvalue .= $d;
				}
			}
		}//End for

		// A name left over at the end is an attribute without a value; this is
		// only recognised in last position, e.g.
		// "input type=radio name=t1 value=aaa checked".
		// The name is normalised like every other one so GetAtt() can find it.
		if($startdd==-1)
		{
			$tmpatt = strtolower(trim($tmpatt));
		}
		if($tmpatt != '' && !isset($this->Items[$tmpatt]))
		{
			$this->Items[$tmpatt] = '';
		}
	}//End Function PrivateAttParse

}//End Class DedeAttribute2

?>

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -