📄 pub_dedehtml.php
字号:
<?php
require_once(dirname(__FILE__)."/pub_charset.php");
/*******************************
//HTML parser
function c____DedeHtml();
********************************/
class DedeHtml
{
// Raw HTML handed to SetSource(); Analyser() scans this string.
var $SourceHtml = "";
// Text of the <title> element; falls back to BaseUrl when none is found.
var $Title = "";
// Set to true when a meta refresh or a frame named like "main"/"body" is found.
var $IsJump = false;
// Set to true when a <frameset> tag is seen.
var $IsFrame = false;
// Target URL that goes with IsJump.
var $JumpUrl = "";
// Text gathered from meta keywords/description, img alt attributes and tag inner text.
var $BodyText = "";
// Inner text of tags that IsHot() accepts; no more is appended once it reaches 512 bytes.
var $KeywordText = "";
// URLs collected by InsertUrl(); starts as "" and is indexed like an array once URLs are added.
var $Links = "";
// Number of entries stored in Links.
var $LinkCount = 0;
// Lower-cased value taken from the meta content-type attribute.
var $CharSet = "";
// URL passed to SetSource(), trimmed.
var $BaseUrl = "";
// Host plus directory part of the URL path, without a trailing slash.
var $BaseUrlPath = "";
// Host part of BaseUrl.
var $HomeUrl = "";
var $IsHead = false; // Whether the HTML <head></head> section has already been analysed.
// To skip head analysis, set this to true before calling SetSource().
var $IsParseText = true; // Whether the text inside the HTML should be extracted
// Minimum width an <img> must declare for its src to be recorded.
var $ImgWidth = 0;
// Minimum height an <img> must declare for its src to be recorded.
var $ImgHeight = 0;
// Extra text that Analyser() prepends to BodyText and then clears.
var $NotEncodeText = "";
//设置HTML的内容和来源网址
/**
 * Load an HTML document together with the URL it came from, then analyse it.
 *
 * Stores the host of $url in HomeUrl and the host plus directory part of the
 * path (no trailing slash) in BaseUrlPath. When $html is not empty, Analyser()
 * is run on it.
 *
 * @param string $html Raw HTML source.
 * @param string $url  URL the document was fetched from; may be empty or relative.
 */
function SetSource($html,$url="")
{
    $this->CAtt = new DedeAttribute();
    $url = trim($url);
    $this->SourceHtml = $html;
    $this->BaseUrl = $url;

    // parse_url() returns false for seriously malformed URLs and leaves out
    // the "host" key for empty or relative ones, so it must not be indexed
    // blindly. The @ stays because old PHP versions warn on a parse failure.
    $urls = @parse_url($url);
    if(!is_array($urls)) $urls = array();
    $this->HomeUrl = isset($urls["host"]) ? $urls["host"] : "";

    // Work out the directory the document lives in.
    $basePath = $this->HomeUrl;
    if(isset($urls["path"])) $basePath .= $urls["path"];
    // Strip a trailing "name.ext" segment only. The match may not span a
    // slash, otherwise a dot in a directory name ("/a.b/c.html") would wipe
    // out the whole path.
    $basePath = preg_replace("/\/[^\/]*\.[^\/]*$/","/",$basePath);
    $basePath = preg_replace("/\/$/","",$basePath);
    $this->BaseUrlPath = $basePath;

    if($html!="") $this->Analyser();
}
//
//解析HTML
//
/**
 * Scan SourceHtml tag by tag and fill in the result properties.
 *
 * While IsHead is false and IsParseText is true, only head-level tags are
 * examined: meta (keywords, description, refresh, content-type) and title.
 * The first </head> or <body> tag ends that phase. After it, img/embed
 * sources are recorded and, when IsParseText is true, hyperlinks, frames
 * and the text between tags are collected as well. Text is not collected
 * while inside script/style elements.
 *
 * When IsParseText is false only the attributes of img/embed tags are parsed.
 *
 * Results are written to Title, BodyText, KeywordText, CharSet, IsJump,
 * JumpUrl, IsFrame and, through InsertUrl(), to Links/LinkCount.
 */
function Analyser()
{
    $cAtt = new DedeAttribute();
    $cAtt->IsTagName = false;
    $c = "";
    $i = 0;
    $startPos = 0;
    $endPos = 0;
    $wt = 0;
    $ht = 0;
    $scriptdd = 0; // nesting depth of script/style elements
    $attStr = "";
    $tmpValue = "";
    $tmpValue2 = "";
    $tagName = "";
    $slen = strlen($this->SourceHtml);
    for(;$i < $slen; $i++)
    {
        $c = $this->SourceHtml[$i];
        if($c=="<")
        {
            // Read the tag name: at most 11 characters, ended by a space,
            // a line break, a tab, "<" or ">".
            $tagName = "";
            $j = 0;
            for($i=$i+1; $i < $slen; $i++)
            {
                if($j>10) break;
                $j++;
                if(strpos(" <>\r\n\t",$this->SourceHtml[$i])===false){
                    $tagName .= $this->SourceHtml[$i];
                }
                else break;
            }
            // The document ended in the middle of a tag name: nothing is left
            // to parse, and searching from $i+1 would be past the end of the string.
            if($i >= $slen) break;
            $tagName = strtolower($tagName);
            // Skip HTML comments entirely
            if($tagName=="!--")
            {
                $endPos = strpos($this->SourceHtml,"-->",$i);
                if($endPos!==false) $i=$endPos+2;
                continue;
            }
            // Simple mode (typically used by collectors): only media tags get their attributes parsed
            if(!$this->IsParseText)
            {
                if(preg_match("/img|embed/",$tagName))
                {
                    $startPos = $i;
                    $endPos = strpos($this->SourceHtml,">",$i+1);
                    if($endPos===false) break;
                    $attStr = substr($this->SourceHtml,$i+1,$endPos-$startPos-1);
                    $cAtt->SetSource($attStr);
                }
            }
            // Full mode: parse the attributes of every tag
            else
            {
                $startPos = $i;
                $endPos = strpos($this->SourceHtml,">",$i+1);
                if($endPos===false) break;
                $attStr = substr($this->SourceHtml,$i+1,$endPos-$startPos-1);
                $cAtt->SetSource($attStr);
            }
            // Head phase: inspect meta/title until </head> or <body> shows up
            if(!$this->IsHead && $this->IsParseText)
            {
                if($tagName=="meta")
                {
                    // name attribute
                    $tmpValue = strtolower($cAtt->GetAtt("name"));
                    if($tmpValue=="keywords")
                    {
                        $this->BodyText .= trim($this->TrimSymbol($cAtt->GetAtt("content")))." ";
                    }
                    if($tmpValue=="description")
                    {
                        $this->BodyText .= trim($this->TrimSymbol($cAtt->GetAtt("content")))." ";
                    }
                    // http-equiv attribute
                    $tmpValue = strtolower($cAtt->GetAtt("http-equiv"));
                    if($tmpValue=="refresh")
                    {
                        // InsertUrl is a method of this class; calling it as a
                        // global function is a fatal "undefined function" error.
                        $tmpValue2 = $this->InsertUrl($this->ParRefresh($cAtt->GetAtt("content")),"meta");
                        if($tmpValue2!=""){
                            $this->IsJump = true;
                            $this->JumpUrl = $tmpValue2;
                        }
                    }
                    if($tmpValue=="content-type")
                    {
                        if($this->CharSet=="")
                        { $this->CharSet = strtolower($this->ParCharSet($cAtt->GetAtt("content"))); }
                    }
                } // end of meta handling
                else if($tagName=="title") // page title
                {
                    $t_startPos = strpos($this->SourceHtml,'>',$i);
                    $t_endPos = strpos($this->SourceHtml,'<',$t_startPos);
                    if($t_endPos>$t_startPos){
                        $textLen = $t_endPos-$t_startPos;
                        $this->Title = substr($this->SourceHtml,$t_startPos+1,$textLen-1);
                    }
                    // Jump over the title text and most of the closing tag
                    if($t_endPos > $i) $i = $t_endPos + 6;
                }
                else if($tagName=="/head"||$tagName=="body")
                {
                    $this->IsHead = true;
                    $i = $i+5;
                }
            }
            else
            {
                // Media resources are collected in both modes
                if($tagName=="img") // image URL
                {
                    if($cAtt->GetAtt("alt")!="" && $this->IsParseText)
                    { $this->BodyText .= trim($this->TrimSymbol($cAtt->GetAtt("alt")))." "; }
                    $wt = (string)$cAtt->GetAtt("width");
                    $ht = (string)$cAtt->GetAtt("height");
                    // Only plain-digit (or missing) sizes are accepted
                    if(!preg_match("/[^0-9]/",$wt)&&!preg_match("/[^0-9]/",$ht)){
                        // Compare as integers: since PHP 8 an empty string is
                        // no longer treated as 0 when compared with a number.
                        if((int)$wt >= $this->ImgWidth && (int)$ht >= $this->ImgHeight){
                            $this->InsertUrl($cAtt->GetAtt("src"),"images");
                        }
                    }
                }
                else if($tagName=="embed") // Flash or other embedded media
                {
                    $wt = (string)$cAtt->GetAtt("width");
                    $ht = (string)$cAtt->GetAtt("height");
                    if(!preg_match("/[^0-9]/",$wt)&&!preg_match("/[^0-9]/",$ht))
                    { $this->InsertUrl($cAtt->GetAtt("src"),$cAtt->GetAtt("type")); }
                }
                //
                // Everything below applies only when all page information is wanted (spider mode)
                //
                if($this->IsParseText)
                {
                    if($tagName=="a"||$tagName=="area") // hyperlinks
                    {
                        $this->InsertUrl($cAtt->GetAtt("href"),"hyperlink");
                    }
                    else if($tagName=="frameset") // framed page
                    {
                        $this->IsFrame = true;
                    }
                    else if($tagName=="frame"){
                        $tmpValue = $this->InsertUrl($cAtt->GetAtt("src"),"frame");
                        if($tmpValue!=""){
                            $tmpValue2 = (string)$cAtt->GetAtt("name");
                            if(preg_match("/(main|body)/i",$tmpValue2)){
                                $this->IsJump = true;
                                $this->JumpUrl = $tmpValue;
                            }
                        }
                    }
                    // Match script/style exactly: a prefix test on "sc"/"st"
                    // also catches <strong> and <strike> and drops their text.
                    else if(preg_match("/^(script|style)$/",$tagName)){
                        $scriptdd++;
                    }
                    else if(preg_match("/^\/(script|style)$/",$tagName)){
                        // A stray closing tag must not push the depth below
                        // zero, or no text would ever be collected again.
                        if($scriptdd>0) $scriptdd--;
                    }
                    // Collect the text that follows this tag
                    if($scriptdd==0){
                        $tmpValue = trim($this->GetInnerText($i));
                        if($tmpValue!=""){
                            if(strlen($this->KeywordText)<512){
                                if($this->IsHot($tagName,$cAtt)){
                                    $this->KeywordText .= $tmpValue;
                                }
                            }
                            $this->BodyText .= $tmpValue." ";
                        }
                    }
                } // IsParseText
            } // end of body-phase handling
        } // end of "<" handling
    } // end of scan loop
    // Light clean-up of the collected text
    if($this->BodyText!="")
    {
        $this->BodyText = $this->TrimSymbol($this->BodyText);
        if($this->NotEncodeText!="") $this->BodyText = $this->TrimSymbol($this->NotEncodeText).$this->BodyText;
        $this->BodyText = preg_replace("/&#{0,1}([a-zA-Z0-9]{3,5})( {0,1})/"," ",$this->BodyText);
        $this->BodyText = preg_replace("/[ -]{1,}/"," ",$this->BodyText);
        $this->BodyText = preg_replace("/-{1,}/","-",$this->BodyText);
        $this->NotEncodeText = "";
    }
    if($this->KeywordText!="")
    {
        $this->KeywordText = $this->TrimSymbol($this->KeywordText);
        $this->KeywordText = preg_replace("/&#{0,1}([a-zA-Z0-9]{3,5})( {0,1})/"," ",$this->KeywordText);
        $this->KeywordText = preg_replace("/ {1,}/"," ",$this->KeywordText);
        $this->KeywordText = preg_replace("/-{1,}/","-",$this->KeywordText);
    }
    // A page without a title is labelled with its URL
    if($this->Title==""){
        $this->Title = $this->BaseUrl;
    }else{
        $this->Title = $this->TrimSymbol($this->Title);
        $this->Title = preg_replace("/&#{0,1}([a-zA-Z0-9]{3,5})( {0,1})/"," ",$this->Title);
        $this->Title = preg_replace("/ {1,}/"," ",$this->Title);
        $this->Title = preg_replace("/-{1,}/","-",$this->Title);
    }
}
//
//重置资源
//
/**
 * Reset the per-document state so the object can take another document.
 *
 * IsHead, IsParseText, ImgWidth and ImgHeight are left as they are.
 */
function Clear()
{
    // Every string-valued result goes back to the empty string.
    $textFields = array(
        "SourceHtml", "Title", "JumpUrl", "BodyText", "KeywordText", "Links",
        "CharSet", "BaseUrl", "BaseUrlPath", "HomeUrl", "NotEncodeText"
    );
    foreach($textFields as $fieldName)
    {
        $this->$fieldName = "";
    }
    // Flags and counters.
    $this->IsJump = false;
    $this->IsFrame = false;
    $this->LinkCount = 0;
}
//
//分析URL,并加入指定分类中
//
/**
 * Record a URL found in the document, ignoring unusable and duplicate ones.
 *
 * A URL is rejected when it is empty or starts with "javascript:", "#" or a
 * quote character. Accepted URLs are appended to Links once and counted in
 * LinkCount.
 *
 * @param string $url     URL taken from a tag attribute.
 * @param string $tagname Category supplied by the caller (e.g. "images",
 *                        "hyperlink", "frame"); not used by this method.
 * @return string The URL when it was accepted (new or already known), "" otherwise.
 */
function InsertUrl($url,$tagname)
{
    // Always return "" for rejected input so callers get one consistent type.
    if($url===null || trim($url)=="") return "";
    // PCRE replaces ereg(), which no longer exists since PHP 7.
    if(preg_match("/^(javascript:|#|'|\")/",$url)) return "";

    // Links starts out as "" (and Clear() resets it to ""). Index assignment
    // on an empty string stopped converting it to an array in PHP 7.1, so
    // switch to a real array before the first write.
    if(!is_array($this->Links))
    {
        $this->Links = array();
        $this->LinkCount = 0;
    }

    // Strict comparison: numeric-looking strings such as "1" and "01"
    // must not be treated as the same link.
    if(!in_array($url,$this->Links,true))
    {
        $this->Links[$this->LinkCount] = $url;
        $this->LinkCount++;
    }
    return $url;
}
//
//分析content-type中的字符类型
//
/**
 * Pull the character set out of a content-type value.
 *
 * Returns whatever follows the first "=" in $att, trimmed, e.g.
 * "text/html; charset=gb2312" gives "gb2312".
 *
 * @param string $att Value of the meta content attribute.
 * @return string Text after the first "=", or "" when there is no "=" or nothing follows it.
 */
function ParCharSet($att)
{
    $eqPos = strpos($att,"=");
    if($eqPos===false) return "";

    // Number of bytes that follow the "=" sign.
    $remaining = strlen($att) - $eqPos - 1;
    if($remaining<=0) return "";

    return trim(substr($att,$eqPos+1));
}
//
//分析refresh中的网址
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -