⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 pub_dedehtml2.php

📁 强大的PHP内容管理系统尽量不要让站长把时间都花费在为您修正说明上。压缩包解压
💻 PHP
字号:
<?php 
/*******************************
//织梦HTML解析类V1.1 PHP版
//www.dedecms.com
function c____DedeHtml2();
这个类针对于采集程序,与DedeHtml类功能不尽相同
********************************/
/**
 * Lightweight HTML scanner for the collection (spider) code.
 *
 * Given an HTML string and the URL it came from, the scanner walks the
 * markup once and collects:
 *   - $Title      : text of the <title> element (falls back to the base URL)
 *   - $CharSet    : charset announced by a <meta> tag (lower-cased)
 *   - $Links      : map  href => anchor text        (from <a>)
 *   - $Medias     : map  src  => "img" | "embed"    (from <img>/<embed>)
 *   - $MediaInfos : map  src  => array(width, height) for <embed>
 *
 * URLs are stored exactly as written in the page; use FillUrl() to turn a
 * relative URL into an absolute one.
 */
class DedeHtml2
{
	var $CAtt;
	var $SourceHtml;
	var $Title;
	var $Medias;
	var $MediaInfos;
	var $Links;
	var $CharSet;
	var $BaseUrl;
	var $BaseUrlPath;
	var $HomeUrl;
	var $IsHead;
	var $ImgHeight;
	var $ImgWidth;
	var $GetLinkType;
	//-------------------------
	// Constructor: reset every property to its default
	//-------------------------
	function __construct()
	{
		$this->CAtt = "";
		$this->SourceHtml = "";
		$this->Title = "";
		$this->Medias = array();
		$this->MediaInfos = array();
		$this->Links = array();
		$this->CharSet = "";
		$this->BaseUrl = "";
		$this->BaseUrlPath = "";
		$this->HomeUrl = "";
		$this->IsHead = false;
		$this->ImgHeight = 30;
		$this->ImgWidth = 50;
		$this->GetLinkType = "all";
	}
	// PHP 4 style constructor, kept so old "new DedeHtml2()" callers and
	// explicit ->DedeHtml2() calls keep working.
	function DedeHtml2()
	{
		$this->__construct();
	}
	//
	// Set the HTML to analyse and the URL it was fetched from, then parse it.
	//
	// $html    : the markup (full page or fragment)
	// $url     : source URL, used as the base for FillUrl()
	// $gethead : true  => also analyse the <head> (title / meta charset)
	//            false => treat the input as body content only; this MUST be
	//                     false for HTML fragments, otherwise nothing is
	//                     collected until a </head> or <body> tag is seen
	//
	// NOTE(review): every property is reset through __construct() first, so a
	// GetLinkType assigned before this call is reset to "all". That is the
	// original behaviour and is deliberately kept; confirm with callers
	// whether it is intended.
	//
	function SetSource(&$html,$url="",$gethead=false)
	{
		$this->__construct();
		// IsHead == true means "the head section is already behind us"
		$this->IsHead = $gethead ? false : true;
		$this->CAtt = new DedeAttribute2();
		$url = trim($url);
		$this->SourceHtml = $html;
		$this->BaseUrl = $url;
		// Work out host and directory of the document. parse_url() returns
		// false for badly malformed URLs and omits keys that are absent, so
		// every component is read defensively.
		$urls = @parse_url($url);
		if(!is_array($urls)) $urls = array();
		$host = isset($urls["host"]) ? $urls["host"] : "";
		$path = isset($urls["path"]) ? $urls["path"] : "";
		$this->HomeUrl = $host;
		$this->BaseUrlPath = $host.$path;
		// strip a trailing "file.ext" and then a trailing slash
		$this->BaseUrlPath = preg_replace("/\/([^\/]*)\.(.*)$/","/",$this->BaseUrlPath);
		$this->BaseUrlPath = preg_replace("/\/$/","",$this->BaseUrlPath);
		if($html!="") $this->Analyser();
	}
	//-----------------------
	// Parse SourceHtml and fill Title / CharSet / Links / Medias / MediaInfos
	//-----------------------
	function Analyser()
	{
		$cAtt = new DedeAttribute2();
		// the tag name is consumed here, the attribute parser only sees attributes
		$cAtt->IsTagName = false;
		$slen = strlen($this->SourceHtml);

		if($this->GetLinkType=="link")
		{ $needTag = "a|meta|title|/head|body"; }
		else if($this->GetLinkType=="media")
		{ $needTag = "img|embed|a"; $this->IsHead = true; }
		else
		{ $needTag = "img|embed|a|meta|title|/head|body"; }
		$needTags = explode("|",$needTag);

		for($i = 0; $i < $slen; $i++)
		{
			if($this->SourceHtml[$i]!="<") continue;

			// Read the tag name: at most 11 characters, stopping at the first
			// whitespace, "<" or ">". Afterwards $i points at that delimiter.
			$tagStart = $i;
			$tagName = "";
			$j = 0;
			for($i=$i+1; $i < $slen; $i++){
				if($j>10) break;
				$j++;
				$ch = $this->SourceHtml[$i];
				if(strpos(" <>\r\n\t",$ch)!==false) break;
				$tagName .= $ch;
			}
			$tagName = strtolower($tagName);

			// Skip comments, including ones with no space after "<!--".
			if(substr($tagName,0,3)=="!--"){
				$endPos = strpos($this->SourceHtml,"-->",$tagStart+4);
				// land on the ">" of "-->"; the loop's $i++ then moves to the
				// first character after the comment
				if($endPos!==false) $i = $endPos+2;
				continue;
			}

			// Only exact tag names are interesting.
			if(!in_array($tagName,$needTags,true)) continue;

			// Locate the end of this tag. The search starts AT the delimiter
			// so an attribute-less tag ("<title>") ends at its own ">".
			$startPos = $i;
			$endPos = strpos($this->SourceHtml,">",$startPos);
			if($endPos===false) break;
			$attStr = "";
			if($endPos > $startPos+1){
				$attStr = substr($this->SourceHtml,$startPos+1,$endPos-$startPos-1);
			}
			$cAtt->SetSource($attStr);

			if(!$this->IsHead)
			{
				// ---- still inside <head>: title / charset detection ----
				if($tagName=="meta"){
					// <meta charset="..."> or
					// <meta http-equiv="Content-Type" content="text/html; charset=...">
					$charset = $cAtt->GetAtt("charset");
					if($charset=="" && strtolower($cAtt->GetAtt("http-equiv"))=="content-type"){
						$charset = $this->ParCharSet($cAtt->GetAtt("content"));
					}
					if($charset!="") $this->CharSet = strtolower($charset);
				}
				else if($tagName=="title"){
					$this->Title = $this->GetInnerText($i,"title");
					// continue right after "</title" (or after the opening
					// tag when no closing tag exists) so that whatever
					// follows the title is not skipped
					$closePos = stripos($this->SourceHtml,"</title",$endPos);
					if($closePos!==false) $i = $closePos+6;
					else $i = $endPos;
				}
				else if($tagName=="/head"||$tagName=="body"){
					$this->IsHead = true;
					$i = $endPos;
				}
			}
			else
			{
				// ---- body: only media and link URLs are collected, no text ----
				if($tagName=="img"){ // image URL
					$this->InsertMedia($cAtt->GetAtt("src"),"img");
				}
				else if($tagName=="embed"){ // Flash or other embedded media
					$rurl = $this->InsertMedia($cAtt->GetAtt("src"),"embed");
					if($rurl != ""){
						$this->MediaInfos[$rurl][0] = $cAtt->GetAtt("width");
						$this->MediaInfos[$rurl][1] = $cAtt->GetAtt("height");
					}
				}
				else if($tagName=="a"){ // hyperlink and its anchor text
					$this->InsertLink($cAtt->GetAtt("href"),$this->GetInnerText($i,"a"));
				}
			}
		}//End for
		if($this->Title=="") $this->Title = $this->BaseUrl;
	}
	//
	// Release the parsed data. Collections are reset to empty arrays so the
	// object can be written to again without re-running the constructor.
	//
	function Clear()
	{
		$this->CAtt = "";
		$this->SourceHtml = "";
		$this->Title = "";
		$this->Links = array();
		$this->Medias = array();
		$this->MediaInfos = array();
		$this->BaseUrl = "";
		$this->BaseUrlPath = "";
	}
	//
	// Record a media URL. Returns the URL, or "" when it was rejected
	// (empty, javascript:, fragment-only, or starting with a quote).
	//
	function InsertMedia($url,$mtype)
	{
		if( preg_match("/^(javascript:|#|'|\")/i",$url) ) return "";
		if($url=="") return "";
		$this->Medias[$url]=$mtype;
		return $url;
	}
	//
	// Record a hyperlink and its anchor text. Same rejection rules and
	// return value as InsertMedia().
	//
	function InsertLink($url,$atitle)
	{
		if( preg_match("/^(javascript:|#|'|\")/i",$url) ) return "";
		if($url=="") return "";
		$this->Links[$url]=$atitle;
		return $url;
	}
	//
	// Extract the charset from a content-type value such as
	// "text/html; charset=gb2312": returns everything after the first "=",
	// trimmed, or "" when there is no "=" or nothing follows it.
	//
	function ParCharSet($att)
	{
		$startdd = strpos($att,"=");
		if($startdd===false) return "";
		if(strlen($att)-$startdd-1 <= 0) return "";
		return trim(substr($att,$startdd+1));
	}
	//
	// Extract the URL from a refresh value such as "5; url=http://..."
	// (same "everything after the first =" rule as ParCharSet).
	//
	function ParRefresh($att)
	{
		return $this->ParCharSet($att);
	}
	//
	// Turn a URL found in the page into an absolute URL.
	//
	// Handles "/abs/path", "./rel", "../rel", plain relative paths and
	// absolute http:// or https:// URLs. Any "#fragment" is dropped and runs
	// of slashes in the path are collapsed. Relative URLs inherit https when
	// BaseUrl is https, otherwise http is used. Returns "" for empty input,
	// for "." / ".." style stubs of two characters or fewer, and when "../"
	// climbs above the host.
	//
	function FillUrl($surl)
	{
		$surl = trim($surl);
		if($surl=="") return "";
		$pos = strpos($surl,"#");
		if($pos>0) $surl = substr($surl,0,$pos);

		// scheme used for everything resolved against the base document
		$scheme = preg_match('#^https://#i',$this->BaseUrl) ? "https://" : "http://";

		if($surl[0]=="/"){
			// host-relative
			$okurl = $this->HomeUrl."/".$surl;
		}
		else if($surl[0]=="."){
			if(strlen($surl)<=2) return "";
			else if($surl[1]=="/"){
				// "./file" : same directory as the base document
				$okurl = $this->BaseUrlPath."/".substr($surl,2);
			}
			else{
				// "../file" : count the ".." segments and keep the rest
				$parts = explode("/",$surl);
				$last = count($parts)-1;
				$dstr = "";
				$pathStep = 0;
				foreach($parts as $k => $u){
					if($u=="..") $pathStep++;
					else if($k<$last) $dstr .= $u."/";
					else $dstr .= $u;
				}
				$baseParts = explode("/",$this->BaseUrlPath);
				$keep = count($baseParts)-$pathStep;
				if($keep<=0) return "";
				$okurl = implode("/",array_slice($baseParts,0,$keep))."/".$dstr;
			}
		}
		else if(preg_match('#^(https?://)(.*)$#is',$surl,$m)){
			// already absolute: keep its own scheme
			$scheme = strtolower($m[1]);
			$okurl = $m[2];
		}
		else{
			// plain relative path
			$okurl = $this->BaseUrlPath."/".$surl;
		}
		$okurl = preg_replace('#/{1,}#',"/",$okurl);
		return $scheme.$okurl;
	}
	//
	// Return the text between the tag that ends at or after $pos and its
	// terminator: the next "<" for "title", the next "</a" (any case) for
	// everything else. For anchors, nested markup is reduced to the text
	// between the last ">" and the first "</". Returns "" when the tag end
	// or the terminator cannot be found.
	//
	function GetInnerText($pos,$tagname)
	{
		$str = "";
		if($pos > strlen($this->SourceHtml)) return "";
		$startPos = strpos($this->SourceHtml,'>',$pos);
		if($startPos===false) return "";
		if($tagname=="title")
			$endPos = strpos($this->SourceHtml,'<',$startPos);
		else
			$endPos = stripos($this->SourceHtml,'</a',$startPos);
		if($endPos!==false && $endPos>$startPos+1){
			$str = substr($this->SourceHtml,$startPos+1,$endPos-$startPos-1);
		}
		if($tagname!="title"){
			// "s": the old POSIX patterns let "." match newlines as well
			$str = (string)preg_replace('#</(.*)$#isD',"",$str);
			$str = (string)preg_replace('#^(.*)>#s',"",$str);
		}
		return trim($str);
	}
}//End class
/*******************************
//属性解析器
function c____DedeAttribute2();
********************************/
/**
 * Parser for the inside of an HTML tag ("a href='x' title=y").
 *
 * After SetSource() the attributes are available through GetAtt()/IsAtt().
 * Attribute names are always lower-cased; values keep their case unless
 * $CharToLow is set. When $IsTagName is true the first token is treated as
 * the tag name and stored under the key "tagname".
 */
class DedeAttribute2
{
	var $SourceString = "";
	var $SourceMaxSize = 1024;  // longer attribute strings are not parsed at all
	var $CharToLow = FALSE;  // lower-case attribute values (names are always lower-cased)
	var $IsTagName = TRUE; // whether the source starts with the tag name
	var $Count = -1;  // index of the last parsed item, -1 when empty
	var $Items = array(); // attribute name => value
	//
	// Set the string to parse and parse it. Runs of whitespace are collapsed
	// to a single space first. Empty strings and strings longer than
	// $SourceMaxSize leave the attribute set empty.
	//
	function SetSource($str="")
	{
		$this->Count = -1;
		// must be an array: writing string keys into "" fails on PHP >= 7.1
		$this->Items = array();
		$this->SourceString = trim(preg_replace("/[ \t\r\n]{1,}/"," ",$str));
		$strLen = strlen($this->SourceString);
		$this->SourceString .= " "; // trailing space terminates the last unquoted value / bare tag name
		if($strLen>0&&$strLen<=$this->SourceMaxSize){
			$this->PrivateAttParse();
		}
	}
	// Return the value of an attribute, or "" when it is not present.
	function GetAtt($str){
		if($str=="") return "";
		$str = strtolower($str);
		if(isset($this->Items[$str])) return $this->Items[$str];
		else return "";
	}
	// Tell whether an attribute is present (valueless attributes count).
	function IsAtt($str){
		if($str=="") return false;
		$str = strtolower($str);
		if(isset($this->Items[$str])) return true;
		else return false;
	}
	// Return the tag name (only set when $IsTagName was true while parsing).
	function GetTagName(){
		return $this->GetAtt("tagname");
	}
	// Number of items counted while parsing: the tag name (if parsed) plus
	// every attribute that had a value.
	function GetCount(){
		return $this->Count+1;
	}
	//
	// Parse $SourceString into $Items (called by SetSource only).
	//
	// State machine over $startdd:
	//   -1 : reading an attribute name, up to "="
	//    0 : after "=", looking for the value delimiter (' or " or none)
	//    1 : reading the value, up to the delimiter ($ddtag)
	//
	function PrivateAttParse()
	{
		$d = "";
		$tmpatt="";
		$tmpvalue="";
		$startdd=-1;
		$ddtag="";
		$strLen = strlen($this->SourceString);
		$j = 0;
		// optional leading tag name
		if($this->IsTagName)
		{
			// a comment is not parsed any further
			if(substr($this->SourceString,0,3)=="!--")
			{ $this->Items["tagname"] = "!--"; return ;}
			// the name ends at the first whitespace or quote
			for($i=0;$i<$strLen;$i++){
				$d = $this->SourceString[$i];
				$j++;
				if(strpos(" '\"\r\n\t",$d)!==false){
					$this->Count++;
					$this->Items["tagname"]=strtolower(trim($tmpvalue));
					$tmpvalue = ""; break;
				}
				else
				{	$tmpvalue .= $d;}
			}
			// resume at the delimiter that ended the name
			if($j>0) $j = $j-1;
		}
		// walk the rest of the string and collect the attributes
		for($i=$j;$i<$strLen;$i++)
		{
			$d = $this->SourceString[$i];
			// attribute name
			if($startdd==-1){
				if($d!="=")	$tmpatt .= $d;
				else{
					$tmpatt = strtolower(trim($tmpatt));
					$startdd=0;
				}
			}
			// value delimiter: '', "" or plain whitespace
			else if($startdd==0){
				switch($d){
					case ' ':
						// spaces between "=" and the value are ignored
						break;
					case '\'':
						$ddtag='\'';
						$startdd=1;
						break;
					case '"':
						$ddtag='"';
						$startdd=1;
						break;
					default:
						// unquoted value: this character already belongs to it
						$tmpvalue.=$d;
						$ddtag=' ';
						$startdd=1;
						break;
				}
			}
			// attribute value
			else if($startdd==1)
			{
				if($d==$ddtag){
					$this->Count++;
					if($this->CharToLow) $this->Items[$tmpatt] = strtolower(trim($tmpvalue));
					else $this->Items[$tmpatt] = trim($tmpvalue);
					$tmpatt = "";
					$tmpvalue = "";
					$startdd=-1;
				}
				else
					$tmpvalue.=$d;
			}
		}//End for
		// A valueless attribute is only recognised at the very end, e.g.
		// "input type=radio name=t1 value=aaa checked". The leftover name is
		// normalised first so that it is stored as "checked" (not "checked ")
		// and a lone trailing space does not create an empty entry.
		$tmpatt = strtolower(trim($tmpatt));
		if($tmpatt!="")
		{ $this->Items[$tmpatt] = "";}
	}//End Function PrivateAttParse

}//End Class DedeAttribute2

?>

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -