📄 pub_collection.php
字号:
<?php
/*------------------------
DedeCms在线采集程序V2
作者:IT柏拉图
开发时间 2006年9月 最后更改时间 2007-1-17
-----------------------*/
require_once(dirname(__FILE__)."/pub_httpdown.php");
require_once(dirname(__FILE__)."/pub_dedetag.php");
require_once(dirname(__FILE__)."/pub_db_mysql.php");
require_once(dirname(__FILE__)."/pub_charset.php");
require_once(dirname(__FILE__)."/pub_collection_functions.php"); //采集扩展函数
require_once(dirname(__FILE__)."/inc_photograph.php");
require_once(dirname(__FILE__)."/pub_dedehtml2.php");
@set_time_limit(0);
class DedeCollection
{
var $Item = array(); // Basic settings of the collection node; filled from the "item" tag by LoadConfig()
var $List = array(); // Source list-page settings of the node; filled from the "list" tag by LoadConfig()
var $Art = array(); // Article-processing settings of the node (not written by the code visible here)
var $ArtNote = array(); // Per-field extraction rules for articles, plus the "sppage"/"sptype" pagination entries
var $dsql = ""; // DedeSql instance, created in the constructor
var $NoteId = ""; // Id of the node loaded by LoadNote()/LoadFromDB()
var $CDedeHtml = ""; // DedeHtml2 instance, created in the constructor; used to extract links
var $CHttpDown = ""; // DedeHttpDown instance, created in the constructor; used for downloads
var $MediaCount = 0; // NOTE(review): presumably counts downloaded media files; not used in the code visible here
var $tmpUnitValue = ""; // Content accumulated across the pages of one article by GetSpPage()
var $tmpLinks = array(); // Page URLs already followed by GetSpPage() for the current article
var $tmpHtml = ""; // HTML of the page currently processed by DownUrl()
var $breImage = ""; // Reset to "" by DownUrl(); NOTE(review): purpose not visible here
//-------------------------------
//兼容php5构造函数
//-------------------------------
function __construct()
{
    // Database access object (the false argument is passed through to DedeSql unchanged).
    $database = new DedeSql(false);
    $this->dsql = $database;

    // HTTP client used for every page and media download.
    $downloader = new DedeHttpDown();
    $this->CHttpDown = $downloader;

    // HTML analyser used to extract links from downloaded pages.
    $htmlParser = new DedeHtml2();
    $this->CDedeHtml = $htmlParser;
}
function DedeCollection()
{
    // Class-named (PHP 4 style) constructor: forwards to __construct()
    // so both construction styles perform the same initialisation.
    $this->__construct();
}
function Init()
{
    // Intentionally empty: this method exists for compatibility only.
}
//Release resources
//---------------------------
function Close()
{
    // Close the database object first, while $this->dsql still exists.
    $this->dsql->Close();

    // Drop the configuration arrays, helper objects and scratch buffers.
    unset(
        $this->Item,
        $this->List,
        $this->Art,
        $this->ArtNote,
        $this->tmpLinks,
        $this->dsql,
        $this->CDedeHtml,
        $this->CHttpDown,
        $this->tmpUnitValue,
        $this->tmpHtml
    );
}
//-------------------------------
//从数据库里载入某个节点
//-------------------------------
/**
 * Load the configuration of one collection node from the database.
 *
 * Alias of LoadFromDB(): both methods performed exactly the same steps,
 * so this one delegates to keep a single implementation.
 *
 * @param mixed $nid id of the node (the "nid" column of #@__conote)
 * @return void
 */
function LoadNote($nid)
{
    $this->LoadFromDB($nid);
}
//-------------------------------
//从数据库里载入某个节点
//-------------------------------
/**
 * Load the configuration of one collection node from the database.
 *
 * Remembers the node id in $this->NoteId, reads the matching row of
 * #@__conote and hands its "noteinfo" column to LoadConfig().
 *
 * @param mixed $nid id of the node (the "nid" column of #@__conote)
 * @return void
 */
function LoadFromDB($nid)
{
    $this->NoteId = $nid;
    // NOTE(review): $nid is interpolated into the SQL text as-is; callers must pass a trusted/escaped value.
    $this->dsql->SetSql("Select * from #@__conote where nid='$nid'");
    $this->dsql->Execute();
    $row = $this->dsql->GetArray();
    // No matching row (or no "noteinfo" column): parse an empty configuration
    // instead of indexing a non-array value.
    $noteInfo = "";
    if(is_array($row) && isset($row["noteinfo"])){
        $noteInfo = $row["noteinfo"];
    }
    $this->LoadConfig($noteInfo);
    $this->dsql->FreeResult();
}
//----------------------------
//分析节点的配置信息
//----------------------------
/**
 * Parse a node's configuration text and fill $this->Item, $this->List and $this->ArtNote.
 *
 * The text is read as {dede:...} tags. Three top-level tags are handled:
 *  - "item": basic node settings; the image directory it names is created when missing.
 *  - "list": list-page settings, including the list URLs to crawl.
 *  - "art" : per-field extraction rules ("note" tags) and the pagination rule ("sppage").
 *
 * @param string $configString raw configuration text of the node
 * @return void
 */
function LoadConfig($configString)
{
    // One parser per nesting level: top-level tags, their children, and the children of "note" tags.
    $dtp = new DedeTagParse();
    $dtp->SetNameSpace("dede","{","}");
    $dtp2 = new DedeTagParse();
    $dtp2->SetNameSpace("dede","{","}");
    $dtp3 = new DedeTagParse();
    $dtp3->SetNameSpace("dede","{","}");
    $dtp->LoadString($configString);
    // NOTE(review): the loops run up to and including Count, which presumes
    // DedeTagParse::Count holds the index of the last tag — confirm.
    for($i=0;$i<=$dtp->Count;$i++)
    {
        $ctag = $dtp->CTags[$i];
        $tagName = $ctag->GetName();
        // "item": basic information about the node
        if($tagName=="item")
        {
            $itemAtts = array("name","typeid","imgurl","imgdir","language","matchtype","isref","refurl","exptime");
            foreach($itemAtts as $att){
                $this->Item[$att] = $ctag->GetAtt($att);
            }
            // Plain string matching is the default when no match type is configured.
            if($this->Item["matchtype"]=="") $this->Item["matchtype"]="string";
            // Create the image directory (relative to this file), normalising
            // backslashes and repeated slashes in the path first.
            $updir = dirname(__FILE__)."/".$this->Item["imgdir"]."/";
            $updir = str_replace("\\","/",$updir);
            $updir = preg_replace("/\/{1,}/","/",$updir);
            if(!is_dir($updir)) MkdirAll($updir,$GLOBALS['cfg_dir_purview']);
        }
        // "list": information about the list pages to crawl
        else if($tagName=="list")
        {
            $this->List["varstart"]= $ctag->GetAtt("varstart");
            $this->List["varend"] = $ctag->GetAtt("varend");
            $this->List["source"] = $ctag->GetAtt("source");
            $this->List["sourcetype"] = $ctag->GetAtt("sourcetype");
            $dtp2->LoadString($ctag->GetInnerText());
            for($j=0;$j<=$dtp2->Count;$j++)
            {
                $ctag2 = $dtp2->CTags[$j];
                $tname = $ctag2->GetName();
                if($tname=="need"){
                    $this->List["need"] = trim($ctag2->GetInnerText());
                }
                else if($tname=="cannot"){
                    $this->List["cannot"] = trim($ctag2->GetInnerText());
                }
                else if($tname=="linkarea"){
                    $this->List["linkarea"] = trim($ctag2->GetInnerText());
                }
                else if($tname=="url")
                {
                    $gurl = trim($ctag2->GetAtt("value"));
                    // List URLs given by hand, one per line of the tag's inner text
                    if($this->List["source"]=="app")
                    {
                        $turls = explode("\n",trim($ctag2->GetInnerText()));
                        $l_tj = 0;
                        foreach($turls as $turl){
                            $turl = trim($turl);
                            if($turl=="") continue;
                            // Add a scheme only when none is present. preg_match replaces eregi(),
                            // which was removed in PHP 7; "https://" is accepted too, so such URLs
                            // are no longer turned into "http://https://...".
                            if(!preg_match("/^https?:\/\//i",$turl)) $turl = "http://".$turl;
                            $this->List["url"][$l_tj] = $turl;
                            $l_tj++;
                        }
                    }
                    // List URLs generated from the page-number variable
                    else
                    {
                        if(preg_match("/var:分页/i",$gurl)){
                            // Default page range is 1..10 when the bounds are not configured.
                            if($this->List["varstart"]=="") $this->List["varstart"]=1;
                            if($this->List["varend"]=="") $this->List["varend"]=10;
                            $l_tj = 0;
                            for($l_em = $this->List["varstart"];$l_em<=$this->List["varend"];$l_em++){
                                $this->List["url"][$l_tj] = str_replace("[var:分页]",$l_em,$gurl);
                                $l_tj++;
                            }
                        }
                        else{
                            // No page variable: the value is the single list URL.
                            $this->List["url"][0] = $gurl;
                        }
                    }
                }
            }//End inner Loop1
        }
        // "art": information about the article pages to crawl
        else if($tagName=="art")
        {
            $dtp2->LoadString($ctag->GetInnerText());
            for($j=0;$j<=$dtp2->Count;$j++)
            {
                $ctag2 = $dtp2->CTags[$j];
                $tname = $ctag2->GetName();
                // "note": one field to extract and how to post-process it
                if($tname=="note"){
                    $field = $ctag2->GetAtt('field');
                    if($field == "") continue;
                    $this->ArtNote[$field]["value"] = $ctag2->GetAtt('value');
                    $this->ArtNote[$field]["isunit"] = $ctag2->GetAtt('isunit');
                    $this->ArtNote[$field]["isdown"] = $ctag2->GetAtt('isdown');
                    $dtp3->LoadString($ctag2->GetInnerText());
                    for($k=0;$k<=$dtp3->Count;$k++)
                    {
                        $ctag3 = $dtp3->CTags[$k];
                        $subName = $ctag3->GetName();
                        if($subName=="trim"){
                            // Several "trim" rules may be given; keep them all.
                            $this->ArtNote[$field]["trim"][] = $ctag3->GetInnerText();
                        }
                        else if($subName=="match"){
                            $this->ArtNote[$field]["match"] = $ctag3->GetInnerText();
                        }
                        else if($subName=="function"){
                            $this->ArtNote[$field]["function"] = $ctag3->GetInnerText();
                        }
                    }
                }
                // "sppage": rule locating the pagination area, and its type
                else if($tname=="sppage"){
                    $this->ArtNote["sppage"] = $ctag2->GetInnerText();
                    $this->ArtNote["sptype"] = $ctag2->GetAtt('sptype');
                }
            }//End inner Loop2
        }
    }//End Loop
    $dtp->Clear();
    $dtp2->Clear();
    // The third parser was previously left uncleared.
    $dtp3->Clear();
}
//-----------------------------
//下载其中一个网址,并保存
//-----------------------------
/**
 * Download one article URL, extract its fields and store the result.
 *
 * Resets the per-article scratch state, downloads the page, pre-collects
 * multi-page content when a pagination rule is configured, then writes the
 * extracted fields into the #@__courl row identified by $aid.
 *
 * @param mixed  $aid   id of the #@__courl row to update
 * @param string $dourl URL of the article page
 * @return void
 */
function DownUrl($aid,$dourl)
{
    $this->tmpLinks = array();
    $this->tmpUnitValue = "";
    $this->tmpHtml = "";
    $this->breImage = "";
    $GLOBALS['RfUrl'] = $dourl;
    $html = $this->DownOnePage($dourl);
    $this->tmpHtml = $html;
    // When a pagination rule exists, gather the content of all pages first
    if(!empty($this->ArtNote["sppage"])){
        $noteid = "";
        foreach($this->ArtNote as $k=>$sarr){
            // ArtNote also holds the scalar "sppage"/"sptype" entries; only the
            // per-field rule arrays carry an "isunit" flag, so skip everything else
            // instead of indexing a string with a non-numeric offset.
            if(!is_array($sarr) || !isset($sarr["isunit"])) continue;
            if($sarr["isunit"]==1){ $noteid = $k; break;}
        }
        $this->GetSpPage($dourl,$noteid,$html);
    }
    // Extract all fields and save them
    $body = addslashes($this->GetPageFields($dourl,true));
    // NOTE(review): $aid is interpolated into the SQL text as-is; callers must pass a trusted value.
    $query = "Update #@__courl set dtime='".time()."',result='$body',isdown='1' where aid='$aid'";
    $this->dsql->SetSql($query);
    if(!$this->dsql->ExecuteNoneQuery()){
        echo $this->dsql->GetError();
    }
}
//------------------------
//获取分页区域的内容
//------------------------
/**
 * Collect the content of a multi-page article into $this->tmpUnitValue.
 *
 * The pagination area is located with the "sppage" rule. Pages after the
 * first are appended with the "#p#副标题#e#" separator in front of them.
 *  - sptype "full" (or empty): every link of the pagination area is downloaded once.
 *  - any other sptype: prev/next style; links are followed recursively, with
 *    visited URLs remembered in $this->tmpLinks and the recursion stopped
 *    once $step exceeds 50.
 *
 * @param string $dourl  URL of the page whose HTML is in $html
 * @param string $noteid key in $this->ArtNote of the field holding the paged content
 * @param string $html   HTML of the current page (by reference)
 * @param int    $step   number of links followed so far
 * @return void
 */
function GetSpPage($dourl,$noteid,&$html,$step=0){
    // Only the "match" rule of the field is needed; fall back to an empty rule
    // when the field is unknown so that nothing is indexed blindly.
    $sarr = array("match"=>"");
    if(isset($this->ArtNote[$noteid]) && is_array($this->ArtNote[$noteid]) && isset($this->ArtNote[$noteid]["match"])){
        $sarr["match"] = $this->ArtNote[$noteid]["match"];
    }
    $linkareaHtml = $this->GetHtmlArea("[var:分页区域]",$this->ArtNote["sppage"],$html);
    // No pagination area on this page: take the page's own content and stop
    if($linkareaHtml==""){
        $content = $this->GetHtmlArea("[var:内容]",$sarr["match"],$html);
        if($this->tmpUnitValue=="") $this->tmpUnitValue .= $content;
        else $this->tmpUnitValue .= "#p#副标题#e#".$content;
        return;
    }
    // Complete page list: every page is linked from the first one
    if($this->ArtNote["sptype"]=="full"||$this->ArtNote["sptype"]==""){
        $this->tmpUnitValue .= $this->GetHtmlArea("[var:内容]",$sarr["match"],$html);
        $this->CDedeHtml->GetLinkType = "link";
        $this->CDedeHtml->SetSource($linkareaHtml,$dourl,false);
        foreach($this->CDedeHtml->Links as $k=>$t){
            $k = $this->CDedeHtml->FillUrl($k);
            // The current page is already collected
            if($k==$dourl) continue;
            $nhtml = $this->DownOnePage($k);
            if($nhtml!=""){
                $this->tmpUnitValue .= "#p#副标题#e#".$this->GetHtmlArea("[var:内容]",$sarr["match"],$nhtml);
            }
        }
    }
    // Prev/next style or incomplete page list: follow links page by page
    else{
        if($step>50) return;
        if($step==0){
            $this->tmpUnitValue .= "#e#".$this->GetHtmlArea("[var:内容]",$sarr["match"],$html);
            // Mark the starting page as visited. Otherwise a "previous page" link on
            // page 2 downloads page 1 again, duplicating its content and possibly
            // steering the recursion back to page 1 so later pages are never reached.
            if(!in_array($dourl,$this->tmpLinks)) $this->tmpLinks[] = $dourl;
        }
        $this->CDedeHtml->GetLinkType = "link";
        $this->CDedeHtml->SetSource($linkareaHtml,$dourl,false);
        $hasLink = false;
        $nhtml = "";
        foreach($this->CDedeHtml->Links as $k=>$t){
            $k = $this->CDedeHtml->FillUrl($k);
            if(in_array($k,$this->tmpLinks)) continue;
            $nhtml = $this->DownOnePage($k);
            if($nhtml!=""){
                $this->tmpUnitValue .= "#p#副标题#e#".$this->GetHtmlArea("[var:内容]",$sarr["match"],$nhtml);
            }
            $hasLink = true;
            $this->tmpLinks[] = $k;
            $dourl = $k;
            $step++;
        }
        // Continue from the last page downloaded; an empty download has nothing to
        // follow and would only add an empty page separator.
        // NOTE(review): if that page has no pagination area, the branch at the top
        // appends its content a second time — confirm whether that is intended.
        if($hasLink && $nhtml!="") $this->GetSpPage($dourl,$noteid,$nhtml,$step);
    }
}
//-----------------------
//获取特定区域的HTML
//-----------------------
/**
 * Extract the part of $html that sits where $sptag appears in $areaRule.
 *
 * $areaRule has the form "<start marker><sptag><end marker>". The text
 * between the first start marker and the next end marker is returned. In
 * "regex" match mode the markers are regular-expression fragments and the
 * result is trimmed; otherwise they are literal strings and the result is
 * returned as found.
 *
 * @param string $sptag    placeholder separating the two markers, e.g. "[var:内容]"
 * @param string $areaRule the rule (by reference for compatibility; never modified)
 * @param string $html     the HTML to search (by reference; never modified)
 * @return string the extracted text, or "" when nothing matches
 */
function GetHtmlArea($sptag,&$areaRule,&$html){
    $useRegex = isset($this->Item["matchtype"]) && $this->Item["matchtype"]=="regex";
    // Work on a copy: the rule arrives by reference, and escaping it in place
    // re-escaped the caller's rule on every call ("/" -> "\/" -> "\\/"),
    // which breaks the pattern from the second call onwards.
    $rule = $areaRule;
    if($useRegex){
        $rule = str_replace("/","\\/",$rule);
    }
    $areaRules = explode($sptag,$rule);
    // Both markers are required. A missing or empty end marker never produced
    // a result before either; now it no longer raises notices/warnings.
    if($html=="" || $areaRules[0]=="" || !isset($areaRules[1]) || $areaRules[1]==""){
        return "";
    }
    // Regular-expression matching
    if($useRegex){
        $arr = array();
        preg_match("/".$areaRules[0]."(.*)".$areaRules[1]."/isU",$html,$arr);
        // isset() rather than empty(), so that a captured "0" is not discarded
        if(isset($arr[1])){ return trim($arr[1]); }
        return "";
    }
    // Plain string matching
    $posstart = strpos($html,$areaRules[0]);
    if($posstart===false){ return ""; }
    // Look for the end marker only after the start marker. Searching from the
    // start position could find it inside the start marker and yield a
    // negative substr() length.
    $contentStart = $posstart + strlen($areaRules[0]);
    $posend = strpos($html,$areaRules[1],$contentStart);
    if($posend===false || $posend==$contentStart){ return ""; }
    return substr($html,$contentStart,$posend-$contentStart);
}
//--------------------------
//下载指定网址
//--------------------------
/**
 * Download one URL and return its HTML.
 *
 * @param string $dourl URL to fetch
 * @return string the page source after ChangeCode() has been applied to it
 */
function DownOnePage($dourl)
{
    $this->CHttpDown->OpenUrl($dourl);
    $pageSource = $this->CHttpDown->GetHtml();
    $this->CHttpDown->Close();

    // ChangeCode() is defined elsewhere in this class.
    // NOTE(review): presumably it converts the page's character set in place — confirm.
    $this->ChangeCode($pageSource);

    return $pageSource;
}
//---------------------
//下载特定资源,并保存为指定文件
//---------------------
function DownMedia($dourl,$mtype='img'){
//检测是否已经下载此文件
$isError = false;
$errfile = $GLOBALS['cfg_phpurl'].'/img/etag.gif';
$row = $this->dsql->GetOne("Select nurl from #@__co_mediaurl where rurl like '$dourl'");
$wi = false;
if(!empty($row['nurl'])){
$filename = $row['nurl'];
return $filename;
}else{
//如果不存在,下载该文件
$filename = $this->GetRndName($dourl,$mtype);
if(!ereg("^/",$filename)) $filename = "/".$filename;
//反盗链模式
if($this->Item["isref"]=='yes' && $this->Item["refurl"]!=''){
if($this->Item["exptime"]=='') $this->Item["exptime"] = 10;
$rs = DownImageKeep($dourl,$this->Item["refurl"],$GLOBALS['cfg_basedir'].$filename,"",0,$this->Item["exptime"]);
if($rs){
$inquery = "INSERT INTO #@__co_mediaurl(nid,rurl,nurl) VALUES ('".$this->NoteId."', '".addslashes($dourl)."', '".addslashes($filename)."');";
$this->dsql->ExecuteNoneQuery($inquery);
}else{
$inquery = "INSERT INTO #@__co_mediaurl(nid,rurl,nurl) VALUES ('".$this->NoteId."', '".addslashes($dourl)."', '".addslashes($errfile)."');";
$this->dsql->ExecuteNoneQuery($inquery);
$isError = true;
}
if($mtype=='img'){ $wi = true; }
//常规模式
}else{
$this->CHttpDown->OpenUrl($dourl);
$this->CHttpDown->SaveToBin($GLOBALS['cfg_basedir'].$filename);
$inquery = "INSERT INTO #@__co_mediaurl(nid,rurl,nurl) VALUES ('".$this->NoteId."', '".addslashes($dourl)."', '".addslashes($filename)."');";
$this->dsql->ExecuteNoneQuery($inquery);
if($mtype=='img'){ $wi = true; }
$this->CHttpDown->Close();
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -