⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 pub_collection.php

📁 强大的PHP内容管理系统尽量不要让站长把时间都花费在为您修正说明上。压缩包解压
💻 PHP
📖 第 1 页 / 共 2 页
字号:
<?php 
/*------------------------
DedeCms在线采集程序V2
作者:IT柏拉图  
开发时间 2006年9月 最后更改时间 2007-1-17
-----------------------*/
require_once(dirname(__FILE__)."/pub_httpdown.php");
require_once(dirname(__FILE__)."/pub_dedetag.php");
require_once(dirname(__FILE__)."/pub_db_mysql.php");
require_once(dirname(__FILE__)."/pub_charset.php");
require_once(dirname(__FILE__)."/pub_collection_functions.php"); //采集扩展函数
require_once(dirname(__FILE__)."/inc_photograph.php");
require_once(dirname(__FILE__)."/pub_dedehtml2.php");
@set_time_limit(0);
class DedeCollection
{
	var $Item = array(); //Basic configuration of the collection node (filled from the "item" tag in LoadConfig)
	var $List = array(); //Source-list settings of the node (filled from the "list" tag in LoadConfig)
	var $Art = array();  //Article handling settings of the node
	var $ArtNote = array(); //Per-field collection rules for articles (filled from the "art" tag in LoadConfig)
	var $dsql = ""; //Replaced by a DedeSql instance in the constructor
	var $NoteId = ""; //Id of the node last passed to LoadNote/LoadFromDB
	var $CDedeHtml = ""; //Replaced by a DedeHtml2 instance in the constructor
	var $CHttpDown = ""; //Replaced by a DedeHttpDown instance in the constructor
	var $MediaCount = 0;
	var $tmpUnitValue = ""; //Content accumulated across split pages by GetSpPage; reset in DownUrl
	var $tmpLinks = array(); //Pagination URLs already followed by GetSpPage; reset in DownUrl
	var $tmpHtml = ""; //HTML of the page fetched by the current DownUrl call
	var $breImage = ""; //Reset in DownUrl
	//-------------------------------
	//兼容php5构造函数
	//-------------------------------
	//PHP5-style constructor: creates the three collaborators used by the
	//collector (database handle, HTTP downloader, HTML link parser).
	//The creation order is the same as before.
	function __construct()
	{
		$this->dsql = new DedeSql(false);
		$this->CHttpDown = new DedeHttpDown();
		$this->CDedeHtml = new DedeHtml2();
	}
	//PHP4-style constructor (method named after the class).
	//It only forwards to __construct() so both conventions share one body.
	function DedeCollection()
	{
		$this->__construct();
	}
	//Compatibility stub: intentionally does nothing and returns nothing.
	function Init()
	{
	}
	//Release resources
	//---------------------------
	//Closes the database handle, then drops the listed properties from
	//this object. NoteId, MediaCount and breImage are left in place.
	function Close()
	{
		//The handle must be closed before its property is unset below.
		$this->dsql->Close();
		$released = array(
			"Item", "List", "Art", "ArtNote", "tmpLinks",
			"dsql", "CDedeHtml", "CHttpDown", "tmpUnitValue", "tmpHtml"
		);
		foreach($released as $prop)
		{
			unset($this->$prop);
		}
	}
	//-------------------------------
	//从数据库里载入某个节点
	//-------------------------------
	//Loads the configuration of node $nid from the database.
	//This used to be a line-for-line copy of LoadFromDB(); it now delegates
	//so the two entry points cannot drift apart.
	//  $nid  id of the node row in #@__conote
	function LoadNote($nid)
	{
		$this->LoadFromDB($nid);
	}
	//-------------------------------
	//从数据库里载入某个节点
	//-------------------------------
	//Loads the configuration of node $nid from the database: remembers the
	//id in NoteId, reads the row from #@__conote and hands its "noteinfo"
	//column to LoadConfig().
	//  $nid  id of the node row
	//NOTE(review): $nid is interpolated into the SQL text unescaped, as in
	//the original; callers are presumably expected to pass a trusted id.
	function LoadFromDB($nid)
	{
		$this->NoteId = $nid;
		$this->dsql->SetSql("Select * from #@__conote where nid='$nid'");
		$this->dsql->Execute();
		$row = $this->dsql->GetArray();
		//When no row matches, $row is not an array; parse an empty
		//configuration instead of indexing a non-array value.
		$noteinfo = "";
		if(is_array($row) && isset($row["noteinfo"]))
		{
			$noteinfo = $row["noteinfo"];
		}
		$this->LoadConfig($noteinfo);
		$this->dsql->FreeResult();
	}
	//----------------------------
	//分析节点的配置信息
	//----------------------------
	//Parses a node configuration string and fills Item, List and ArtNote.
	//  $configString  text made of {dede:item}, {dede:list} and {dede:art} tags
	//
	//Changes against the previous version:
	//  - eregi() (removed in PHP 7) is replaced by preg_match().
	//  - manually listed URLs that already start with https:// are no
	//    longer prefixed with "http://".
	//  - the third tag parser is cleared as well.
	function LoadConfig($configString)
	{
		$dtp = new DedeTagParse();
		$dtp->SetNameSpace("dede","{","}");
		$dtp2 = new DedeTagParse();
		$dtp2->SetNameSpace("dede","{","}");
		$dtp3 = new DedeTagParse();
		$dtp3->SetNameSpace("dede","{","}");
		$dtp->LoadString($configString);
		//The loop bounds are inclusive, exactly as before.
		//NOTE(review): this assumes DedeTagParse::Count holds the last tag index - confirm.
		for($i=0;$i<=$dtp->Count;$i++)
		{
			$ctag = $dtp->CTags[$i];
			$tagName = $ctag->GetName();
			//item: basic settings of the node
			if($tagName=="item")
			{
				$itemAtts = array("name","typeid","imgurl","imgdir","language","matchtype","isref","refurl","exptime");
				foreach($itemAtts as $att)
				{
					$this->Item[$att] = $ctag->GetAtt($att);
				}
				if($this->Item["matchtype"]=="") $this->Item["matchtype"]="string";
				//Create the image directory (relative to this file) when it is missing
				$updir = dirname(__FILE__)."/".$this->Item["imgdir"]."/";
				$updir = str_replace("\\","/",$updir);
				$updir = preg_replace("/\/{1,}/","/",$updir);
				if(!is_dir($updir)) MkdirAll($updir,$GLOBALS['cfg_dir_purview']);
			}
			//list: settings of the list pages to collect
			else if($tagName=="list")
			{
				$this->List["varstart"]= $ctag->GetAtt("varstart");
				$this->List["varend"] = $ctag->GetAtt("varend");
				$this->List["source"] = $ctag->GetAtt("source");
				$this->List["sourcetype"] = $ctag->GetAtt("sourcetype");
				$dtp2->LoadString($ctag->GetInnerText());
				for($j=0;$j<=$dtp2->Count;$j++)
				{
					$ctag2 = $dtp2->CTags[$j];
					$tname = $ctag2->GetName();
					if($tname=="need"){
						$this->List["need"] = trim($ctag2->GetInnerText());
					}
					else if($tname=="cannot"){
						$this->List["cannot"] = trim($ctag2->GetInnerText());
					}
					else if($tname=="linkarea"){
						$this->List["linkarea"] = trim($ctag2->GetInnerText());
					}
					else if($tname=="url")
					{
						$gurl = trim($ctag2->GetAtt("value"));
						//List URLs given by hand, one per line in the tag body
						if($this->List["source"]=="app")
						{
							$turls = explode("\n",trim($ctag2->GetInnerText()));
							$l_tj = 0;
							foreach($turls as $turl){
								$turl = trim($turl);
								if($turl=="") continue;
								//Only add a scheme when none is present; https:// URLs are kept as they are
								if(!preg_match("#^https?://#i",$turl)) $turl = "http://".$turl;
								$this->List["url"][$l_tj] = $turl;
								$l_tj++;
							}
						}
						//List URLs generated from the paging variable
						else
						{
							if(preg_match("/var:分页/i",$gurl)){
								if($this->List["varstart"]=="") $this->List["varstart"]=1;
								if($this->List["varend"]=="") $this->List["varend"]=10;
								$l_tj = 0;
								for($l_em = $this->List["varstart"];$l_em<=$this->List["varend"];$l_em++){
									$this->List["url"][$l_tj] = str_replace("[var:分页]",$l_em,$gurl);
									$l_tj++;
								}
							}
							else{
								$this->List["url"][0] = $gurl;
							}
						}
					}
				}//End inner Loop1
			}
			//art: settings of the article pages to collect
			else if($tagName=="art")
			{
				$dtp2->LoadString($ctag->GetInnerText());
				for($j=0;$j<=$dtp2->Count;$j++)
				{
					$ctag2 = $dtp2->CTags[$j];
					$tname = $ctag2->GetName();
					//note: one field to collect and how to post-process it
					if($tname=="note"){
						$field = $ctag2->GetAtt('field');
						if($field == "") continue;
						$this->ArtNote[$field]["value"] = $ctag2->GetAtt('value');
						$this->ArtNote[$field]["isunit"] = $ctag2->GetAtt('isunit');
						$this->ArtNote[$field]["isdown"] = $ctag2->GetAtt('isdown');
						$dtp3->LoadString($ctag2->GetInnerText());
						for($k=0;$k<=$dtp3->Count;$k++)
						{
							$ctag3 = $dtp3->CTags[$k];
							$subName = $ctag3->GetName();
							if($subName=="trim"){
								//several trim rules may exist, so they are collected in a list
								$this->ArtNote[$field]["trim"][] = $ctag3->GetInnerText();
							}
							else if($subName=="match"){
								$this->ArtNote[$field]["match"] = $ctag3->GetInnerText();
							}
							else if($subName=="function"){
								$this->ArtNote[$field]["function"] = $ctag3->GetInnerText();
							}
						}
					}
					//sppage: rule locating the pagination area, plus its type
					else if($tname=="sppage"){
						$this->ArtNote["sppage"] = $ctag2->GetInnerText();
						$this->ArtNote["sptype"] = $ctag2->GetAtt('sptype');
					}
				}//End inner Loop2
			}
		}//End Loop
		$dtp->Clear();
		$dtp2->Clear();
		$dtp3->Clear();
	}
	//-----------------------------
	//下载其中一个网址,并保存
	//-----------------------------
	//Downloads one article URL, extracts its fields and stores the result
	//in the #@__courl row identified by $aid.
	//  $aid    id of the #@__courl row to update
	//  $dourl  address of the page to download
	//A database error is echoed, as before.
	function DownUrl($aid,$dourl)
	{
		//Reset the per-page scratch state
		$this->tmpLinks = array();
		$this->tmpUnitValue = "";
		$this->tmpHtml = "";
		$this->breImage = "";
		$GLOBALS['RfUrl'] = $dourl;
		$html = $this->DownOnePage($dourl);
		$this->tmpHtml = $html;
		//When a pagination rule exists, gather the split pages first
		if(!empty($this->ArtNote["sppage"])){
			$noteid = "";
			foreach($this->ArtNote as $k=>$sarr){
				//ArtNote also holds the plain strings "sppage"/"sptype".
				//Indexing those with ["isunit"] is an illegal string offset and
				//could pick the wrong field, so only field arrays are inspected.
				if(!is_array($sarr) || !isset($sarr["isunit"])) continue;
				if($sarr["isunit"]==1){ $noteid = $k; break; }
			}
			$this->GetSpPage($dourl,$noteid,$html);
		}
		//Extract all fields and save them
		$body = addslashes($this->GetPageFields($dourl,true));
		$query = "Update #@__courl set dtime='".time()."',result='$body',isdown='1' where aid='$aid'";
		$this->dsql->SetSql($query);
		if(!$this->dsql->ExecuteNoneQuery()){
			echo $this->dsql->GetError();
		}
	}
	//------------------------
	//获取分页区域的内容
	//------------------------
	//Collects the content of an article that is split over several pages
	//and appends it to tmpUnitValue, separating pages with "#p#副标题#e#".
	//  $dourl   address of the page whose HTML is in $html
	//  $noteid  key in ArtNote of the field whose "match" rule extracts content
	//  $html    HTML of the current page (taken by reference, not modified here)
	//  $step    number of pages followed so far in prev/next mode (stops above 50)
	//
	//Changes against the previous version:
	//  - rules are handed to GetHtmlArea() as private copies, because that
	//    method receives them by reference and may rewrite them.
	//  - in prev/next mode the start page is recorded as visited, so a
	//    "previous page" link no longer appends page one a second time.
	//  - $nhtml is initialised before use.
	function GetSpPage($dourl,$noteid,&$html,$step=0){
		$sarr = array();
		if(isset($this->ArtNote[$noteid]) && is_array($this->ArtNote[$noteid])){
			$sarr = $this->ArtNote[$noteid];
		}
		$matchRule = isset($sarr["match"]) ? $sarr["match"] : "";
		$sptype = isset($this->ArtNote["sptype"]) ? $this->ArtNote["sptype"] : "";

		$spRule = $this->ArtNote["sppage"];
		$linkareaHtml = $this->GetHtmlArea("[var:分页区域]",$spRule,$html);
		//No pagination area on this page: take its content and stop
		if($linkareaHtml==""){
			$rule = $matchRule;
			$content = $this->GetHtmlArea("[var:内容]",$rule,$html);
			if($this->tmpUnitValue=="") $this->tmpUnitValue .= $content;
			else $this->tmpUnitValue .= "#p#副标题#e#".$content;
			return;
		}
		//The pagination area lists every page
		if($sptype=="full"||$sptype==""){
			$rule = $matchRule;
			$this->tmpUnitValue .= $this->GetHtmlArea("[var:内容]",$rule,$html);
			$this->CDedeHtml->GetLinkType = "link";
			$this->CDedeHtml->SetSource($linkareaHtml,$dourl,false);
			foreach($this->CDedeHtml->Links as $k=>$t){
				$k = $this->CDedeHtml->FillUrl($k);
				if($k==$dourl) continue;
				$nhtml = $this->DownOnePage($k);
				if($nhtml!=""){
					$rule = $matchRule;
					$this->tmpUnitValue .= "#p#副标题#e#".$this->GetHtmlArea("[var:内容]",$rule,$nhtml);
				}
			}
			return;
		}
		//Prev/next style or incomplete page list: follow links recursively
		if($step>50) return;
		if($step==0){
			//Mark the start page as visited so links back to it are skipped
			$this->tmpLinks[] = $dourl;
			$rule = $matchRule;
			$this->tmpUnitValue .= "#e#".$this->GetHtmlArea("[var:内容]",$rule,$html);
		}
		$this->CDedeHtml->GetLinkType = "link";
		$this->CDedeHtml->SetSource($linkareaHtml,$dourl,false);
		$hasLink = false;
		$nhtml = "";
		foreach($this->CDedeHtml->Links as $k=>$t){
			$k = $this->CDedeHtml->FillUrl($k);
			if(in_array($k,$this->tmpLinks)) continue;
			$nhtml = $this->DownOnePage($k);
			if($nhtml!=""){
				$rule = $matchRule;
				$this->tmpUnitValue .= "#p#副标题#e#".$this->GetHtmlArea("[var:内容]",$rule,$nhtml);
			}
			$hasLink = true;
			$this->tmpLinks[] = $k;
			$dourl = $k;
			$step++;
		}
		//Continue from the last page that was downloaded in this round
		if($hasLink) $this->GetSpPage($dourl,$noteid,$nhtml,$step);
	}
	//-----------------------
	//获取特定区域的HTML
	//-----------------------
	//Returns the part of $html that lies between the two halves of a rule.
	//  $sptag     placeholder that splits the rule, e.g. "[var:内容]"
	//  $areaRule  rule text "<start>$sptag<end>" (by reference for interface
	//             compatibility; it is no longer modified)
	//  $html      page to search (by reference, not modified)
	//Returns the text found, or "" when the page or either half of the rule
	//is empty, or nothing matches. In regex mode (Item["matchtype"]=="regex")
	//both halves are used as PCRE fragments and the result is trimmed; in
	//string mode the first occurrence of each marker is used and the result
	//is returned untrimmed.
	//
	//Changes against the previous version:
	//  - the "/" escaping is applied to local copies; before, the caller's
	//    rule was rewritten on every call and became invalid on the second.
	//  - a rule without the placeholder or with an empty end marker returns ""
	//    instead of reading an undefined index.
	//  - string mode looks for the end marker after the start marker, so an
	//    end marker that also occurs inside the start marker cannot produce
	//    a negative length.
	//  - a regex match of "0" is returned instead of being treated as empty.
	function GetHtmlArea($sptag,&$areaRule,&$html){
		$areaRules = explode($sptag,$areaRule);
		if($html==""||$areaRules[0]==""){ return ""; }
		if(!isset($areaRules[1])||$areaRules[1]==""){ return ""; }
		$useRegex = (isset($this->Item["matchtype"]) && $this->Item["matchtype"]=="regex");
		//Regular-expression matching
		if($useRegex){
			$startRule = str_replace("/","\\/",$areaRules[0]);
			$endRule = str_replace("/","\\/",$areaRules[1]);
			$arr = array();
			preg_match("/".$startRule."(.*)".$endRule."/isU",$html,$arr);
			return isset($arr[1]) ? trim($arr[1]) : "";
		}
		//Plain string matching
		$posstart = strpos($html,$areaRules[0]);
		if($posstart===false){ return ""; }
		$contentStart = $posstart + strlen($areaRules[0]);
		$posend = strpos($html,$areaRules[1],$contentStart);
		if($posend===false||$posend==$contentStart){ return ""; }
		return substr($html,$contentStart,$posend-$contentStart);
	}
	//--------------------------
	//下载指定网址
	//--------------------------
	//Fetches $dourl with the shared HTTP downloader and returns the page
	//text after it has been passed through ChangeCode().
	//  $dourl  address of the page to download
	//NOTE(review): ChangeCode() is defined outside this view; it presumably
	//converts $html in place (by reference) - confirm.
	function DownOnePage($dourl)
	{
		$this->CHttpDown->OpenUrl($dourl);
		$html = $this->CHttpDown->GetHtml();
		//The connection is closed before the text is converted
		$this->CHttpDown->Close();

		$this->ChangeCode($html);
		return $html;
	}
	//---------------------
	//下载特定资源,并保存为指定文件
	//---------------------
	function DownMedia($dourl,$mtype='img'){
		//检测是否已经下载此文件
		$isError = false;
		$errfile = $GLOBALS['cfg_phpurl'].'/img/etag.gif';
		$row = $this->dsql->GetOne("Select nurl from #@__co_mediaurl where rurl like '$dourl'");
		$wi = false;
		if(!empty($row['nurl'])){
			$filename = $row['nurl'];
			return $filename;
		}else{
		   //如果不存在,下载该文件
		   $filename = $this->GetRndName($dourl,$mtype);
		   if(!ereg("^/",$filename)) $filename = "/".$filename;
		   
		   //反盗链模式
		   if($this->Item["isref"]=='yes' && $this->Item["refurl"]!=''){
		      if($this->Item["exptime"]=='') $this->Item["exptime"] = 10;
		      $rs = DownImageKeep($dourl,$this->Item["refurl"],$GLOBALS['cfg_basedir'].$filename,"",0,$this->Item["exptime"]);
		      if($rs){
		         $inquery = "INSERT INTO #@__co_mediaurl(nid,rurl,nurl) VALUES ('".$this->NoteId."', '".addslashes($dourl)."', '".addslashes($filename)."');";
		         $this->dsql->ExecuteNoneQuery($inquery);
		      }else{
		      	$inquery = "INSERT INTO #@__co_mediaurl(nid,rurl,nurl) VALUES ('".$this->NoteId."', '".addslashes($dourl)."', '".addslashes($errfile)."');";
		        $this->dsql->ExecuteNoneQuery($inquery);
		      	$isError = true;
		      }
		      if($mtype=='img'){ $wi = true; }
	     //常规模式
	     }else{
		      $this->CHttpDown->OpenUrl($dourl);
		      $this->CHttpDown->SaveToBin($GLOBALS['cfg_basedir'].$filename);
		      $inquery = "INSERT INTO #@__co_mediaurl(nid,rurl,nurl) VALUES ('".$this->NoteId."', '".addslashes($dourl)."', '".addslashes($filename)."');";
		      $this->dsql->ExecuteNoneQuery($inquery);
		      if($mtype=='img'){ $wi = true; }
	        $this->CHttpDown->Close();
	     }
	  }

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -