📄 dedecollection.class.php
字号:
<?php
// Refuse direct access: the script stops here unless the including script
// has already defined the DEDEINC constant.
if(!defined('DEDEINC'))
{
exit('dedecms');
}
require_once(DEDEINC."/dedecollection.func.php"); // Collection helper (extension) functions
require_once(DEDEINC."/image.func.php");
require_once(DEDEINC."/dedehtml2.class.php");
// Ask PHP to lift the execution time limit; the @ hides the warning raised
// when the limit cannot be changed.
@set_time_limit(0);
class DedeCollection
{
var $artNotes = array(); // Field settings for collecting article pages (filled by LoadItemConfig)
var $lists = array(); // Source-list processing settings of the collection node (filled by LoadListConfig)
var $noteInfos = array(); // Basic configuration of the collection node (filled by LoadListConfig)
var $dsql = ''; // Database object; assigned from $GLOBALS['dsql'] in the constructor
var $noteId = ''; // Id of the node passed to LoadNote()
var $cDedeHtml = ''; // DedeHtml2 instance created in the constructor
var $cHttpDown = ''; // DedeHttpDown instance created in the constructor
var $mediaCount = 0; // Incremented by GetRndName() for every generated media name
var $tmpUnitValue = ''; // Content accumulated by GetSpPage() across the pages of one article
var $tmpLinks = array(); // Pagination URLs already followed by GetSpPage()
var $tmpHtml = ''; // HTML of the page most recently fetched by DownUrl()
var $breImage = ''; // Path remembered by DownMedia() for the first image (or its "_lit" copy); reset by DownUrl()
var $errString = ''; // Not referenced in the visible part of the class
//兼容php5构造函数
/**
 * PHP 5 constructor.
 *
 * Picks up the shared database object from $GLOBALS['dsql'] and creates the
 * downloader and HTML helper objects used by the other methods.
 */
function __construct()
{
    $database   = $GLOBALS['dsql'];
    $downloader = new DedeHttpDown();
    $htmlHelper = new DedeHtml2();

    $this->dsql      = $database;
    $this->cHttpDown = $downloader;
    $this->cDedeHtml = $htmlHelper;
}
/**
 * PHP 4 style constructor (a method named after the class).
 *
 * Only forwards to __construct() so both construction styles initialise the
 * object the same way. Note that PHP 8 no longer treats a method named after
 * the class as a constructor; there this is an ordinary method.
 */
function DedeCollection()
{
// Delegate to the PHP 5 constructor.
$this->__construct();
}
// Release resources
/**
 * Placeholder for releasing resources; the body is empty, so calling it
 * currently does nothing.
 */
function Close()
{
}
//从数据库里载入某个节点
/**
 * Load one collection node from the database and parse its configuration.
 *
 * Stores $nid in $this->noteId, reads the matching row of `#@__co_note` and
 * feeds its `listconfig` / `itemconfig` columns to LoadListConfig() and
 * LoadItemConfig().
 *
 * When no row is found the method returns without parsing anything, instead
 * of indexing a non-array result.
 *
 * NOTE(review): $nid is interpolated into the SQL text as in the original;
 * callers must pass a trusted / already sanitised id.
 *
 * @param mixed $nid Id of the node to load.
 */
function LoadNote($nid)
{
    $this->noteId = $nid;
    $row = $this->dsql->GetOne("Select * from `#@__co_note` where nid='$nid'");

    // GetOne() did not return a row (unknown node): there is nothing to parse.
    if(!is_array($row))
    {
        return;
    }

    $this->LoadListConfig(isset($row['listconfig']) ? $row['listconfig'] : '');
    $this->LoadItemConfig(isset($row['itemconfig']) ? $row['itemconfig'] : '');
}
// Parse the node's basic information and its list (index) page configuration
/**
 * Parse the list configuration string of a node.
 *
 * Two tags are recognised:
 *  - "noteinfo": its attributes are copied into $this->noteInfos;
 *  - "listrule": its attributes and selected inner tags are copied into
 *    $this->lists, after which $this->lists['url'] is set to the RSS url
 *    (sourcetype "rss") or to the result of GetUrlFromListRule().
 *
 * The loop bounds are inclusive of $parser->Count, exactly as before
 * (presumably Count is the index of the last tag; confirm against
 * DedeTagParse).
 *
 * @param string $configString Raw list configuration.
 */
function LoadListConfig($configString)
{
    // Attribute / inner-tag names, in the order they are stored.
    $noteInfoAtts = array('notename','matchtype','channelid','refurl','sourcelang','cosort','isref','exptime');
    $listRuleAtts = array('sourcetype','rssurl','regxurl','startid','endid','addv','urlrule','musthas','nothas','listpic','usemore');
    $innerTagNames = array('addurls','regxrule','areastart','areaend','batchrule');

    $outerParser = new DedeTagParse();
    $innerParser = new DedeTagParse();
    $outerParser->LoadString($configString);

    $tagIndex = 0;
    while($tagIndex <= $outerParser->Count)
    {
        $tag = $outerParser->CTags[$tagIndex];
        $tagIndex++;
        $tagName = $tag->GetName();

        // Basic node information
        if($tagName == "noteinfo")
        {
            foreach($noteInfoAtts as $attName)
            {
                $this->noteInfos[$attName] = $tag->GetAtt($attName);
            }
        }
        // Information about the list pages to collect
        else if($tagName == "listrule")
        {
            foreach($listRuleAtts as $attName)
            {
                $this->lists[$attName] = $tag->GetAtt($attName);
            }

            // Inner tags carry the longer, free-text rules.
            $innerParser->LoadString($tag->GetInnerText());
            for($innerIndex = 0; $innerIndex <= $innerParser->Count; $innerIndex++)
            {
                $innerTag = $innerParser->CTags[$innerIndex];
                $innerName = $innerTag->GetName();
                foreach($innerTagNames as $knownName)
                {
                    if($innerName == $knownName)
                    {
                        $this->lists[$knownName] = trim($innerTag->GetInnerText());
                        break;
                    }
                }
            }

            // Work out the list url(s)
            if($this->lists['sourcetype'] == 'rss')
            {
                $this->lists['url'] = $this->lists['rssurl'];
            }
            else
            {
                $this->lists['url'] = GetUrlFromListRule(
                    $this->lists['regxurl'],
                    $this->lists['addurls'],
                    $this->lists['startid'],
                    $this->lists['endid'],
                    $this->lists['addv'],
                    $this->lists['usemore'],
                    $this->lists['batchrule']
                );
            }
        }
    }

    $outerParser->Clear();
    $innerParser->Clear();
}
//分析采集文章页的字段的设置
/**
 * Parse the article-page (item) configuration string into $this->artNotes.
 *
 * Recognised tags:
 *  - "sppage": inner text -> artNotes['sppage'], attribute -> artNotes['sptype'];
 *  - "previewurl", "keywordtrim", "descriptiontrim": inner text stored under
 *    the same key;
 *  - "item": one entry per non-empty "field" attribute, holding value,
 *    isunit, isdown, a list of trim rules ([pattern, replacement]), a match
 *    rule and a function body. "#n#" in trim / match text becomes a space.
 *
 * @param string $configString Raw item configuration.
 */
function LoadItemConfig($configString)
{
    $outerParser = new DedeTagParse();
    $innerParser = new DedeTagParse();
    $outerParser->LoadString($configString);

    for($tagIndex = 0; $tagIndex <= $outerParser->Count; $tagIndex++)
    {
        $tag = $outerParser->CTags[$tagIndex];
        switch($tag->GetName())
        {
            case 'sppage':
                $this->artNotes['sppage'] = $tag->GetInnerText();
                $this->artNotes['sptype'] = $tag->GetAtt('sptype');
                break;

            case 'previewurl':
                $this->artNotes['previewurl'] = $tag->GetInnerText();
                break;

            case 'keywordtrim':
                $this->artNotes['keywordtrim'] = $tag->GetInnerText();
                break;

            case 'descriptiontrim':
                $this->artNotes['descriptiontrim'] = $tag->GetInnerText();
                break;

            case 'item':
                $field = $tag->GetAtt('field');
                // Items without a field name are ignored.
                if($field != '')
                {
                    foreach(array('value','isunit','isdown') as $attName)
                    {
                        $this->artNotes[$field][$attName] = $tag->GetAtt($attName);
                    }
                    $this->artNotes[$field]['trim'] = array();
                    $this->artNotes[$field]['match'] = '';
                    $this->artNotes[$field]['function'] = '';

                    $innerParser->LoadString($tag->GetInnerText());
                    for($ruleIndex = 0; $ruleIndex <= $innerParser->Count; $ruleIndex++)
                    {
                        $ruleTag = $innerParser->CTags[$ruleIndex];
                        switch($ruleTag->GetName())
                        {
                            case 'trim':
                                // Appended in document order: [0] pattern, [1] replacement.
                                $this->artNotes[$field]['trim'][] = array(
                                    str_replace('#n#',' ',$ruleTag->GetInnerText()),
                                    $ruleTag->GetAtt('replace')
                                );
                                break;

                            case 'match':
                                $this->artNotes[$field]['match'] = str_replace('#n#',' ',$ruleTag->GetInnerText());
                                break;

                            case 'function':
                                $this->artNotes[$field]['function'] = $ruleTag->GetInnerText();
                                break;
                        }
                    }
                }
                break;
        }
    }

    $outerParser->Clear();
    $innerParser->Clear();
}
//下载其中一个网址,并保存
/**
 * Download one article url, extract its fields and optionally store the result.
 *
 * Resets the per-article scratch state, fetches the page, pre-collects
 * paginated content when a "sppage" rule is configured, then lets
 * GetPageFields() build the result.
 *
 * Change from the previous version: the removed-in-PHP-7 eregi() call is
 * replaced by stripos(), which performs the same case-insensitive search for
 * the literal "#p#".
 *
 * NOTE(review): $aid is interpolated into the UPDATE statement unchanged, as
 * before; callers must pass a trusted id.
 *
 * @param mixed  $aid    Id of the `#@__co_htmls` row to update.
 * @param string $dourl  Url of the article page.
 * @param string $litpic Thumbnail value forwarded to GetPageFields().
 * @param bool   $issave When true the result is written to the database.
 * @return mixed Whatever GetPageFields() produced for the page.
 */
function DownUrl($aid,$dourl,$litpic='',$issave=true)
{
    $this->tmpLinks = array();
    $this->tmpUnitValue = '';
    $this->breImage = '';
    $this->tmpHtml = $this->DownOnePage($dourl);

    // When a pagination rule exists, gather the content of all pages first.
    if(!empty($this->artNotes['sppage']))
    {
        // The first field flagged isunit==1 is the one that spans pages.
        $noteid = '';
        foreach($this->artNotes as $k=>$sarr)
        {
            if(isset($sarr['isunit']) && $sarr['isunit']==1)
            {
                $noteid = $k;
                break;
            }
        }
        $this->GetSpPage($dourl,$noteid,$this->tmpHtml);

        // More than one page was collected: give the first page a title
        // marker too, so every page has the same "title#e#content" shape.
        if(stripos($this->tmpUnitValue,'#p#') !== false)
        {
            $this->tmpUnitValue = '副标题#e#'.$this->tmpUnitValue;
        }
    }

    // Extract the configured fields
    $body = $this->GetPageFields($dourl,$issave,$litpic);

    // Store the result in the database
    if($issave)
    {
        $query = " Update `#@__co_htmls` set dtime='".time()."',result='".addslashes($body)."',isdown='1' where aid='$aid' ";
        if(!$this->dsql->ExecuteNoneQuery($query))
        {
            echo $this->dsql->GetError();
        }
    }
    return $body;
}
//获取分页区域的内容
/**
 * Collect the content of a paginated article into $this->tmpUnitValue.
 *
 * The pagination area is cut out of $html with the "sppage" rule. Pages are
 * joined with the separator "#p#副标题#e#".
 *  - No pagination area: the content of $html is appended and the method returns.
 *  - sptype "full" (or empty): every link of the area except $dourl itself is
 *    downloaded once and its content appended.
 *  - Otherwise (prev/next style or partial lists): links not seen before are
 *    followed, then the method recurses on the last downloaded page; recursion
 *    stops once $step exceeds 50.
 *
 * Changes from the previous version:
 *  - In prev/next mode the starting url is recorded in $this->tmpLinks, so a
 *    "previous page" link on a later page no longer downloads and appends the
 *    first page a second time.
 *  - $nhtml is initialised before the loop, and a missing unit field no longer
 *    reads an undefined index.
 *
 * @param string $dourl  Url of the page held in $html.
 * @param string $noteid Key in $this->artNotes of the field that spans pages.
 * @param string $html   Html of the current page.
 * @param int    $step   Number of links followed so far (recursion guard).
 */
function GetSpPage($dourl,$noteid,$html,$step=0)
{
    // Rules of the paginated field; an empty match rule makes GetHtmlArea() return ''.
    $sarr = isset($this->artNotes[$noteid]) ? $this->artNotes[$noteid] : array('match'=>'');
    $linkareaHtml = $this->GetHtmlArea('[内容]',$this->artNotes['sppage'],$html);

    // No pagination area on this page: just take its content.
    if($linkareaHtml=='')
    {
        if($this->tmpUnitValue=='')
        {
            $this->tmpUnitValue .= $this->GetHtmlArea('[内容]',$sarr['match'],$html);
        }
        else
        {
            $this->tmpUnitValue .= "#p#副标题#e#".$this->GetHtmlArea('[内容]',$sarr['match'],$html);
        }
        return;
    }

    // Complete page list
    if($this->artNotes["sptype"]=='full' || $this->artNotes["sptype"]=='')
    {
        $this->tmpUnitValue .= $this->GetHtmlArea('[内容]',$sarr['match'],$html);
        $this->cDedeHtml->GetLinkType = "link";
        $this->cDedeHtml->SetSource($linkareaHtml,$dourl,'link');
        foreach($this->cDedeHtml->Links as $k=>$t)
        {
            $k = $this->cDedeHtml->FillUrl($k);
            // The current page is already included.
            if($k==$dourl)
            {
                continue;
            }
            $nhtml = $this->DownOnePage($k);
            if($nhtml!='')
            {
                $ct = trim($this->GetHtmlArea('[内容]',$sarr['match'],$nhtml));
                if($ct!='')
                {
                    $this->tmpUnitValue .= "#p#副标题#e#".$ct;
                }
            }
        }
    }
    // Prev/next style or incomplete page list
    else
    {
        if($step>50)
        {
            return;
        }
        if($step==0)
        {
            $this->tmpUnitValue .= $this->GetHtmlArea('[内容]',$sarr['match'],$html);
            // Remember the starting page so links back to it are skipped.
            if(!in_array($dourl,$this->tmpLinks,true))
            {
                $this->tmpLinks[] = $dourl;
            }
        }
        $this->cDedeHtml->GetLinkType = "link";
        $this->cDedeHtml->SetSource($linkareaHtml,$dourl,'link');
        $hasLink = false;
        $nhtml = '';
        foreach($this->cDedeHtml->Links as $k=>$t)
        {
            $k = $this->cDedeHtml->FillUrl($k);
            // Already followed (or the starting page): skip.
            if(in_array($k,$this->tmpLinks))
            {
                continue;
            }
            $nhtml = $this->DownOnePage($k);
            if($nhtml!='')
            {
                $ct = trim($this->GetHtmlArea('[内容]',$sarr['match'],$nhtml));
                if($ct!='')
                {
                    $this->tmpUnitValue .= "#p#副标题#e#".$ct;
                }
            }
            $hasLink = true;
            $this->tmpLinks[] = $k;
            $dourl = $k;
            $step++;
        }
        // Continue from the last page downloaded in this round.
        if($hasLink)
        {
            $this->GetSpPage($dourl,$noteid,$nhtml,$step);
        }
    }
}
//获取特定区域的HTML
/**
 * Cut the part of $html described by a "start{placeholder}end" rule.
 *
 * The rule is split at $sptag; the text before it is the start marker and the
 * text after it the end marker. With matchtype "regex" both markers are used
 * as regular-expression fragments (a "/" in the rule is escaped here, so rules
 * are written with plain slashes); otherwise they are plain strings.
 *
 * Changes from the previous version:
 *  - The rule is no longer modified through the reference. Before, every call
 *    in regex mode escaped "/" again in the caller's variable, so the second
 *    call with the same rule produced a broken pattern.
 *  - A rule without placeholder or without end marker returns '' instead of
 *    reading an undefined offset.
 *  - String mode looks for the end marker after the start marker, so an end
 *    marker that also occurs inside the start marker cannot yield a negative
 *    length.
 *  - A captured value of "0" is returned instead of being treated as empty.
 *
 * @param string $sptag    Placeholder separating start and end marker.
 * @param string $areaRule Rule text (by reference for compatibility; not modified).
 * @param string $html     Html to search (by reference for compatibility; not modified).
 * @return string The extracted text (trimmed in regex mode) or '' when nothing matches.
 */
function GetHtmlArea($sptag,&$areaRule,&$html)
{
    $areaRules = explode($sptag,(string)$areaRule);
    if($html=='' || $areaRules[0]=='' || !isset($areaRules[1]) || $areaRules[1]=='')
    {
        return '';
    }

    // Regular-expression matching
    if($this->noteInfos['matchtype']=='regex')
    {
        // Escape the pattern delimiter on local copies only.
        $startRule = str_replace("/","\\/",$areaRules[0]);
        $endRule = str_replace("/","\\/",$areaRules[1]);
        $arr = array();
        if(!preg_match('/'.$startRule."(.*)".$endRule."/isU",$html,$arr))
        {
            return '';
        }
        return isset($arr[1]) ? trim($arr[1]) : '';
    }

    // Plain string matching
    $posstart = strpos($html,$areaRules[0]);
    if($posstart===false)
    {
        return '';
    }
    $contentStart = $posstart + strlen($areaRules[0]);
    $posend = strpos($html,$areaRules[1],$contentStart);
    if($posend===false || $posend==$contentStart)
    {
        return '';
    }
    return substr($html,$contentStart,$posend-$contentStart);
}
//下载指定网址
/**
 * Fetch one url and return its html.
 *
 * The page is downloaded with the shared downloader object, the connection is
 * closed, and the text is handed to ChangeCode() before being returned
 * (presumably ChangeCode() converts the character set in place; confirm
 * against its definition).
 *
 * @param string $dourl Url to download.
 * @return string Html of the page.
 */
function DownOnePage($dourl)
{
    $downloader = $this->cHttpDown;

    $downloader->OpenUrl($dourl);
    $pageSource = $downloader->GetHtml();
    $downloader->Close();

    $this->ChangeCode($pageSource);
    return $pageSource;
}
//下载特定资源,并保存为指定文件
/**
 * Download a media resource and store it under the site's base directory.
 *
 * Unless the global $notckpic is set, `#@__co_mediaurls` is consulted first
 * (keyed by node id and md5 of the url) so a file already present on disk is
 * reused. Otherwise the file is downloaded, either with DownImageKeep() when
 * the node uses a referer ("isref" = yes and "refurl" set) or with the shared
 * downloader, and the mapping is recorded.
 *
 * For the first non-thumbnail image of an article a "_lit" copy is created
 * and resized; its path is remembered in $this->breImage. Non-thumbnail
 * images are also passed to WaterImg().
 *
 * Changes from the previous version:
 *  - The expiry time passed to DownImageKeep() comes from
 *    $this->noteInfos['exptime'] (which is defaulted to 10 just before the
 *    call) instead of the undeclared $this->Item['exptime'].
 *  - ereg()/eregi(), removed in PHP 7, are replaced by strncmp()/preg_match().
 *  - The unused local $wi is gone.
 *
 * @param string $dourl    Url of the resource.
 * @param string $mtype    Media type, 'img' by default.
 * @param bool   $islitpic True when the resource is itself a thumbnail.
 * @return string Site-relative path of the stored file, or $dourl when the
 *                file could not be stored.
 */
function DownMedia($dourl,$mtype='img',$islitpic=false)
{
    global $notckpic;
    if(empty($notckpic))
    {
        $notckpic = 0;
    }
    $basedir = $GLOBALS['cfg_basedir'];

    // Has this url been downloaded before?
    $tofile = $filename = '';
    if($notckpic==0)
    {
        $row = $this->dsql->GetOne("Select hash,tofile from `#@__co_mediaurls` where nid='{$this->noteId}' And hash='".md5($dourl)."' ");
        if(isset($row['tofile']))
        {
            $tofile = $filename = $row['tofile'];
        }
    }

    // Not known, or the recorded file is gone: download it.
    if($tofile=='' || !file_exists($basedir.$filename))
    {
        $filename = $this->GetRndName($dourl,$mtype);
        // Stored paths always start with a slash.
        if(strncmp($filename,'/',1) !== 0)
        {
            $filename = "/".$filename;
        }

        // Anti-hotlinking mode: send the configured referer
        if($this->noteInfos['isref']=='yes' && $this->noteInfos['refurl']!='')
        {
            if($this->noteInfos['exptime']=='')
            {
                $this->noteInfos['exptime'] = 10;
            }
            DownImageKeep($dourl,$this->noteInfos['refurl'],$basedir.$filename,'',0,$this->noteInfos['exptime']);
        }
        // Normal mode
        else
        {
            $this->cHttpDown->OpenUrl($dourl);
            $this->cHttpDown->SaveToBin($basedir.$filename);
            $this->cHttpDown->Close();
        }

        // Download succeeded: record the url -> file mapping
        if(file_exists($basedir.$filename))
        {
            if($tofile=='')
            {
                $query = "INSERT INTO `#@__co_mediaurls`(nid,hash,tofile) VALUES ('".$this->noteId."', '".md5($dourl)."', '".addslashes($filename)."');";
            }
            else
            {
                $query = "Update `#@__co_mediaurls` set tofile='".addslashes($filename)."' where hash='".md5($dourl)."' ";
            }
            $this->dsql->ExecuteNoneQuery($query);
        }
    }

    // Download failed or file missing: fall back to the remote url
    if(!file_exists($basedir.$filename))
    {
        return $dourl;
    }

    // Create the thumbnail from the first image of the article
    if($mtype=='img' && !$islitpic && $this->breImage=='')
    {
        $this->breImage = $filename;
        if(!preg_match('#^http://#i',$this->breImage) && file_exists($basedir.$filename))
        {
            // "name.ext" -> "name_lit.ext", kept in the same directory
            $filenames = explode('/',$filename);
            $filenamed = $filenames[count($filenames)-1];
            $nfilename = str_replace('.','_lit.',$filenamed);
            $nfilename = str_replace($filenamed,$nfilename,$filename);
            if(@copy($basedir.$filename,$basedir.$nfilename))
            {
                ImageResize($basedir.$nfilename,$GLOBALS['cfg_ddimg_width'],$GLOBALS['cfg_ddimg_height']);
                $this->breImage = $nfilename;
            }
        }
    }

    // Watermark regular images (errors suppressed, as before)
    if($mtype=='img' && !$islitpic)
    {
        @WaterImg($basedir.$filename,'up');
    }
    return $filename;
}
//获得下载媒体的随机名称
function GetRndName($url,$v)
{
global $cfg_image_dir,$cfg_dir_purview;
$this->mediaCount++;
$mnum = $this->mediaCount;
$timedir = "c".MyDate("ymd",time());
//存放路径
$fullurl = preg_replace("/\/{1,}/","/",$cfg_image_dir."/");
if(!is_dir($GLOBALS['cfg_basedir']."/$fullurl"))
{
MkdirAll($GLOBALS['cfg_basedir']."/$fullurl",$cfg_dir_purview);
}
$fullurl = $fullurl.$timedir."/";
if(!is_dir($GLOBALS['cfg_basedir']."/$fullurl"))
{
MkdirAll($GLOBALS['cfg_basedir']."/$fullurl",$cfg_dir_purview);
}
//文件名称
$timename = str_replace('.','',ExecTime());
$threadnum = 0;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -