pub_dedehtml.php

来自「强大的PHP内容管理系统尽量不要让站长把时间都花费在为您修正说明上。压缩包解压」· PHP 代码 · 共 615 行 · 第 1/2 页
PHP
615 行
<?php 
require_once(dirname(__FILE__)."/pub_charset.php");
/*******************************
//HTML解析器
function c____DedeHtml();
********************************/
class DedeHtml
{
	// Raw HTML handed to SetSource().
	var $SourceHtml = "";
	// Text of the <title> element; Analyser() falls back to BaseUrl when none was found.
	var $Title = "";
	// Set to true when a meta refresh target or a frame named like "main"/"body" is found.
	var $IsJump = false;
	// Set to true when a <frameset> tag is seen.
	var $IsFrame = false;
	// Target URL recorded together with IsJump.
	var $JumpUrl = "";
	// Accumulated text: meta keywords/description, img alt texts and text between tags.
	var $BodyText = "";
	// Text taken from tags that IsHot() accepts; only appended while shorter than 512 bytes.
	var $KeywordText = "";
	// Collected URLs. Starts out as an empty string; InsertUrl() stores entries by numeric index.
	var $Links = "";
	// Number of entries stored in Links.
	var $LinkCount = 0;
	// Lower-cased value parsed from a meta content-type tag (only filled in when still empty).
	var $CharSet = "";
	// Trimmed URL passed to SetSource().
	var $BaseUrl = "";
	// Host plus path of BaseUrl with the trailing file name and trailing slash stripped.
	var $BaseUrlPath = "";
	// Host component of BaseUrl.
	var $HomeUrl = "";
	// Whether the HTML head (<head></head>) has already been analysed.
	// To skip head analysis altogether, set this to true before calling SetSource().
	var $IsHead = false;
	// Whether the text inside the HTML should be collected.
	var $IsParseText = true;
	// Minimum width / height an <img> must declare for its src to be collected.
	var $ImgWidth = 0;
	var $ImgHeight = 0;
	// If non-empty when Analyser() finishes, it is passed through TrimSymbol(),
	// prepended to BodyText and then cleared.
	var $NotEncodeText = "";
	// NOTE(review): SetSource() also assigns $this->CAtt, which is not declared here.
	//设置HTML的内容和来源网址
	//
	// Store the HTML and the URL it came from, derive HomeUrl / BaseUrlPath
	// from that URL and, when $html is not empty, run Analyser().
	//   $html - HTML text to analyse
	//   $url  - address the HTML was fetched from (optional)
	//
	function SetSource($html,$url="")
	{
		$this->CAtt = new DedeAttribute();
		$url = trim($url);
		$this->SourceHtml = $html;
		$this->BaseUrl = $url;
		// Work out the directory of the document relative to its host.
		// parse_url() returns false for seriously malformed URLs and leaves
		// out "host" for empty or relative ones, so both cases are guarded
		// instead of reading an undefined index.
		$urls = @parse_url($url);
		if(!is_array($urls)) $urls = array();
		$this->HomeUrl = isset($urls["host"]) ? $urls["host"] : "";
		if(isset($urls["path"])) $this->BaseUrlPath = $this->HomeUrl.$urls["path"];
		else $this->BaseUrlPath = $this->HomeUrl;
		// Drop a trailing "/file.ext" segment, then any trailing slash.
		$this->BaseUrlPath = preg_replace("/\/([^\/]*)\.(.*)$/","/",$this->BaseUrlPath);
		$this->BaseUrlPath = preg_replace("/\/$/","",$this->BaseUrlPath);
		if($html!="") $this->Analyser();
	}
	//
	//解析HTML
	//
	//
	// Scan $this->SourceHtml tag by tag and fill in the result properties
	// (Title, CharSet, IsJump/JumpUrl, IsFrame, Links, BodyText, KeywordText).
	//
	// Two modes, selected by $this->IsParseText:
	//   true  - full mode: every tag's attributes are parsed, head meta data,
	//           links, frames and the text between tags are collected;
	//   false - simple mode: only <img>/<embed> attributes are parsed and only
	//           their src URLs are collected.
	//
	// Regular expressions use preg_match(); the POSIX ereg()/eregi() functions
	// were removed in PHP 7.
	//
	function Analyser()
	{
		$cAtt = new DedeAttribute();
		$cAtt->IsTagName = false;
		$c = "";
		$i = 0;
		$startPos = 0;
		$endPos = 0;
		$wt = 0;
		$ht = 0;
		$scriptdd = 0; // counter: text between tags is only collected while this is 0
		$attStr = "";
		$tmpValue = "";
		$tmpValue2 = "";
		$tagName = "";
		$slen = strlen($this->SourceHtml);
		for(;$i < $slen; $i++)
		{
			$c = $this->SourceHtml[$i];
			if($c=="<")
			{
				// Read the tag name: at most 11 bytes, stopping at whitespace, "<" or ">".
				$tagName = "";
				$j = 0;
				for($i=$i+1; $i < $slen; $i++)
				{
					if($j>10) break;
					$j++;
					if(!preg_match("/[ <>\r\n\t]/",$this->SourceHtml[$i])){
						$tagName .= $this->SourceHtml[$i];
					}
					else break;
				}
				$tagName = strtolower($tagName);
				// Skip HTML comments.
				if($tagName=="!--")
				{
					$endPos = strpos($this->SourceHtml,"-->",$i);
					if($endPos!==false) $i=$endPos+2;
					continue;
				}
				// The input ended inside a tag: nothing left to parse. Stopping here
				// also keeps the strpos() offsets below inside the string (PHP 8
				// throws on an out-of-range offset).
				if($i >= $slen) break;
				// When IsParseText==false the extra resources of the page are not
				// collected, only multimedia information; this mode is generally
				// used by the content collector.
				// Simple mode: only parse attributes of multimedia tags.
				if(!$this->IsParseText)
				{
					$needTag = "/img|embed/";
					if(preg_match($needTag,$tagName))
					{
						$startPos = $i;
						$endPos = strpos($this->SourceHtml,">",$i+1);
						if($endPos===false) break;
						$attStr = substr($this->SourceHtml,$i+1,$endPos-$startPos-1);
						$cAtt->SetSource($attStr);
					}
				}
				// Full mode: parse the attributes of every tag.
				else
				{
					// NOTE(review): when the tag has no attributes $i already points at
					// its ">", so the search from $i+1 runs on to the next ">" in the
					// document and the slice handed to DedeAttribute contains the text
					// that follows the tag. Left unchanged; confirm against DedeAttribute.
					$startPos = $i;
					$endPos = strpos($this->SourceHtml,">",$i+1);
					if($endPos===false) break;
					$attStr = substr($this->SourceHtml,$i+1,$endPos-$startPos-1);
					$cAtt->SetSource($attStr);
				}
				// Analyse the HTML head.
				if(!$this->IsHead && $this->IsParseText)
				{
					if($tagName=="meta")
					{
						// name attribute: keywords and description go into the body text.
						$tmpValue = strtolower($cAtt->GetAtt("name"));
						if($tmpValue=="keywords")
						{
							$this->BodyText .= trim($this->TrimSymbol($cAtt->GetAtt("content")))." ";
						}
						if($tmpValue=="description")
						{
							$this->BodyText .= trim($this->TrimSymbol($cAtt->GetAtt("content")))." ";
						}
						// http-equiv attribute.
						$tmpValue = strtolower($cAtt->GetAtt("http-equiv"));
						if($tmpValue=="refresh")
						{
							// InsertUrl() is a method of this class; it has to be called
							// through $this (a bare InsertUrl() call is an undefined function).
							$tmpValue2 = $this->InsertUrl($this->ParRefresh($cAtt->GetAtt("content")),"meta");
							if($tmpValue2!=""){
								$this->IsJump = true;
								$this->JumpUrl = $tmpValue2;
							}
						}
						if($tmpValue=="content-type")
						{
							if($this->CharSet=="")
							{ $this->CharSet = strtolower($this->ParCharSet($cAtt->GetAtt("content"))); }
						}
					} // end of meta analysis
					else if($tagName=="title") // page title
					{
						$t_startPos = strpos($this->SourceHtml,'>',$i);
						$t_endPos = strpos($this->SourceHtml,'<',$t_startPos);
						if($t_endPos>$t_startPos){
							$textLen = $t_endPos-$t_startPos;
							$this->Title = substr($this->SourceHtml,$t_startPos+1,$textLen-1);
						}
						if($t_endPos > $i) $i = $t_endPos + 6;
					}
					else if($tagName=="/head"||$tagName=="body")
					{
						$this->IsHead = true;
						$i = $i+5;
					}
				}
				else
				{
					// Data collected in both modes:
					// multimedia resource links found in the content (no text).
					if($tagName=="img") // URL of an image
					{
						if($cAtt->GetAtt("alt")!="" && $this->IsParseText)
						{ $this->BodyText .= trim($this->TrimSymbol($cAtt->GetAtt("alt")))." "; }
						$wt = $cAtt->GetAtt("width");
						$ht = $cAtt->GetAtt("height");
						if(!preg_match("/[^0-9]/",$wt)&&!preg_match("/[^0-9]/",$ht)){
							// intval() keeps a missing size ("") counting as 0, as it did
							// before PHP 8 changed string/number comparison.
							if(intval($wt) >= $this->ImgWidth && intval($ht) >= $this->ImgHeight){
								$this->InsertUrl($cAtt->GetAtt("src"),"images");
							}
						}
					}
					else if($tagName=="embed") // Flash or other embedded media
					{
						$wt = $cAtt->GetAtt("width");
						$ht = $cAtt->GetAtt("height");
						if(!preg_match("/[^0-9]/",$wt)&&!preg_match("/[^0-9]/",$ht))
						{ $this->InsertUrl($cAtt->GetAtt("src"),$cAtt->GetAtt("type")); }
					}
					//
					// The following only applies when all additional information of
					// the HTML is wanted (spider mode).
					//
					if($this->IsParseText)
					{
						if($tagName=="a"||$tagName=="area") // hyperlinks
							$this->InsertUrl($cAtt->GetAtt("href"),"hyperlink");
						else if($tagName=="frameset") // framed page
							$this->IsFrame = true;
						else if($tagName=="frame"){
							$tmpValue = $this->InsertUrl($cAtt->GetAtt("src"),"frame");
							if($tmpValue!=""){
								$tmpValue2 = $cAtt->GetAtt("name");
								if(preg_match("/(main|body)/i",$tmpValue2)){
									$this->IsJump = true;
									$this->JumpUrl = $tmpValue;
								}
							}
						}
						// NOTE(review): the "sc"/"st" prefixes presumably target
						// <script>/<style>, but they also match e.g. <strong> and
						// <strike>; behaviour kept as is.
						else if(preg_match("/^(sc|st)/",$tagName)){
							$scriptdd++;
						}
						else if(preg_match("/^(\/sc|\/st)/",$tagName)){
							$scriptdd--;
						}
						// Collect the text between tags.
						if($scriptdd==0){
							$tmpValue = trim($this->GetInnerText($i));
							if($tmpValue!=""){
								if(strlen($this->KeywordText)<512){
									if($this->IsHot($tagName,$cAtt)){
										$this->KeywordText .= $tmpValue;
									}
								}
								$this->BodyText .= $tmpValue." ";
							}
						}
					} // IsParseText
				} // end of body content analysis
			} // end if char
		} // end for

		// Simple clean-up of the extracted text.
		if($this->BodyText!="")
		{
			$this->BodyText = $this->TrimSymbol($this->BodyText);
			if($this->NotEncodeText!="") $this->BodyText = $this->TrimSymbol($this->NotEncodeText).$this->BodyText;
			$this->BodyText = preg_replace("/&#{0,1}([a-zA-Z0-9]{3,5})( {0,1})/"," ",$this->BodyText);
			$this->BodyText = preg_replace("/[ -]{1,}/"," ",$this->BodyText);
			$this->BodyText = preg_replace("/-{1,}/","-",$this->BodyText);
			$this->NotEncodeText = "";
		}

		if($this->KeywordText!="")
		{
			$this->KeywordText = $this->TrimSymbol($this->KeywordText);
			$this->KeywordText = preg_replace("/&#{0,1}([a-zA-Z0-9]{3,5})( {0,1})/"," ",$this->KeywordText);
			$this->KeywordText = preg_replace("/ {1,}/"," ",$this->KeywordText);
			$this->KeywordText = preg_replace("/-{1,}/","-",$this->KeywordText);
		}

		// Without a <title> the source URL stands in as the title.
		if($this->Title==""){
			$this->Title = $this->BaseUrl;
		}else{
			$this->Title = $this->TrimSymbol($this->Title);
			$this->Title = preg_replace("/&#{0,1}([a-zA-Z0-9]{3,5})( {0,1})/"," ",$this->Title);
			$this->Title = preg_replace("/ {1,}/"," ",$this->Title);
			$this->Title = preg_replace("/-{1,}/","-",$this->Title);
		}
	}
	//
	//重置资源
	//
	//
	// Put the per-document result fields back to their initial values.
	// NOTE(review): IsHead, IsParseText, ImgWidth and ImgHeight are not
	// touched here - confirm that IsHead is meant to survive a Clear().
	//
	function Clear()
	{
		// Every string-valued field goes back to the empty string.
		$textFields = array(
			"SourceHtml", "Title", "JumpUrl", "BodyText", "KeywordText", "Links",
			"CharSet", "BaseUrl", "BaseUrlPath", "HomeUrl", "NotEncodeText"
		);
		foreach($textFields as $field)
		{
			$this->$field = "";
		}
		// Flags and the link counter.
		$this->IsJump = $this->IsFrame = false;
		$this->LinkCount = 0;
	}
	//
	//分析URL，并加入指定分类中
	//
	//
	// Record a URL in $this->Links unless it is blank, a javascript:/anchor
	// pseudo link, starts with a quote, or has already been recorded.
	//   $url     - URL as found in the tag attribute
	//   $tagname - category of the link ("hyperlink", "images", "frame", ...);
	//              accepted for the callers' sake, not used here
	// Returns the URL when it is acceptable (new or already known),
	// otherwise an empty string.
	//
	function InsertUrl($url,$tagname)
	{
		if(trim($url)=="") return "";
		if(preg_match("/^(javascript:|#|'|\")/",$url)) return "";
		// Links starts out as "" (see the property default and Clear());
		// turn it into an array before the first entry is stored, because
		// indexed assignment on a string writes a string offset instead.
		if(!is_array($this->Links)) $this->Links = array();
		// Strict comparison so that distinct numeric-looking URLs are not merged.
		if(!in_array($url,$this->Links,true))
		{
			$this->Links[$this->LinkCount] = $url;
			$this->LinkCount++;
		}
		return $url;
	}
	//
	//分析content-type中的字符类型
	//
	//
	// Return the trimmed text that follows the first "=" of a content-type
	// value such as "text/html; charset=gb2312".
	// Returns "" when there is no "=" or nothing comes after it.
	//
	function ParCharSet($att)
	{
		$eqPos = strpos($att,"=");
		// No "=" at all, or the "=" is the very last character.
		if($eqPos===false || $eqPos >= strlen($att)-1) return "";
		return trim(substr($att,$eqPos+1));
	}
	//
	//分析refresh中的网址
pub_dedehtml.php - 源码说明

本页面展示了「强大的PHP内容管理系统尽量不要让站长把时间都花费在为您修正说明上。压缩包解压」中的 pub_dedehtml.php 源码文件，采用 PHP 编程语言编写，共 615 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与PHP相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?