📄 dedecollection.class.php
字号:
if(isset($_GET['threadnum']))
{
$threadnum = intval($_GET['threadnum']);
}
$filename = dd2char($timename.$threadnum.'-'.$mnum.mt_rand(1000,9999));
//分配扩展名
$urls = explode('.',$url);
if($v=='img')
{
$shortname = '.jpg';
if(eregi("\.gif",$v))
{
$shortname = '.gif';
}
else if(eregi("\.png",$v))
{
$shortname = '.png';
}
}
else if($v=='embed')
{
$shortname = '.swf';
}
else
{
$shortname = '';
}
$fullname = $fullurl.$filename.$shortname;
return preg_replace("/\/{1,}/","/",$fullname);
}
//按载入的网页内容获取规则,从一个HTML文件中获取内容
/**
 * Extracts the configured fields from the page held in $this->tmpHtml and
 * serialises them as a sequence of {dede:field} blocks.
 *
 * Output order: keywords, description, every array entry of $this->artNotes
 * without a 'function' hook, then the entries that have one (run through
 * RunPHP()), and finally litpic.
 *
 * @param string $dourl    Page URL; handed to DownMedias()/MediasReplace().
 * @param mixed  $needDown Truthy: fields with isdown=1 go through DownMedias();
 *                         falsy: they go through MediasReplace().
 * @param string $litpic   Thumbnail URL; passed to DownMedia() only when it is
 *                         non-empty and $this->lists['listpic']==1.
 * @return string The concatenated field blocks, or '' when $this->tmpHtml is empty.
 */
function GetPageFields($dourl,$needDown,$litpic='')
{
    global $cfg_auot_description;
    if($this->tmpHtml == '')
    {
        return '';
    }
    $artitem = '';
    $isPutUnit = false;
    $tmpLtKeys = array();
    $inarr = array();

    // Keywords: read <meta name="keywords">, trying both attribute orders
    preg_match("/<meta[\s]+name=['\"]keywords['\"] content=['\"](.*)['\"]/isU",$this->tmpHtml,$inarr);
    preg_match("/<meta[\s]+content=['\"](.*)['\"] name=['\"]keywords['\"]/isU",$this->tmpHtml,$inarr2);
    if(!isset($inarr[1]) && isset($inarr2[1]))
    {
        $inarr[1] = $inarr2[1];
    }
    if(isset($inarr[1]))
    {
        $keywords = trim(cn_substr(html2text($inarr[1]),30));
        $keywords = preg_replace("/".$this->artNotes['keywordtrim']."/isU",'',$keywords);
        // ereg() was removed in PHP 7.0; a plain substring test is all that is needed here.
        // When no comma is present, spaces are treated as the keyword separator.
        if(strpos($keywords,',') === false)
        {
            $keywords = str_replace(' ',',',$keywords);
        }
        $artitem .= "{dede:field name='keywords'}".$keywords."{/dede:field}\r\n";
    }
    else
    {
        $artitem .= "{dede:field name='keywords'}{/dede:field}\r\n";
    }

    // Description: read <meta name="description">, trying both attribute orders
    preg_match("/<meta[\s]+name=['\"]description['\"] content=['\"](.*)['\"]/isU",$this->tmpHtml,$inarr);
    preg_match("/<meta[\s]+content=['\"](.*)['\"] name=['\"]description['\"]/isU",$this->tmpHtml,$inarr2);
    if(!isset($inarr[1]) && isset($inarr2[1]))
    {
        $inarr[1] = $inarr2[1];
    }
    if(isset($inarr[1]))
    {
        $description = trim(cn_substr(html2text($inarr[1]),$cfg_auot_description));
        $description = preg_replace("/".$this->artNotes['descriptiontrim']."/isU",'',$description);
        $artitem .= "{dede:field name='description'}".$description."{/dede:field}\r\n";
    }
    else
    {
        $artitem .= "{dede:field name='description'}{/dede:field}\r\n";
    }

    foreach($this->artNotes as $k=>$sarr)
    {
        // Entries that are not field rules
        if($k=='sppage' || $k=='sptype')
        {
            continue;
        }
        if(!is_array($sarr))
        {
            continue;
        }

        // Special rule, or no match expression: use the configured value verbatim
        if($sarr['match']=='' || trim($sarr['match'])=='[内容]')
        {
            if($sarr['value']!='[内容]')
            {
                $v = trim($sarr['value']);
            }
            else
            {
                $v = '';
            }
        }
        else
        {
            // Multi-page content: the pre-assembled unit value is used once,
            // for the first rule flagged isunit=1
            if($this->tmpUnitValue!='' && !$isPutUnit && $sarr['isunit']==1)
            {
                $v = $this->tmpUnitValue;
                $isPutUnit = true;
            }
            else
            {
                $v = $this->GetHtmlArea('[内容]',$sarr['match'],$this->tmpHtml);
            }

            // Apply the per-field filter rules (pattern => replacement)
            if(isset($sarr['trim']) && $v!='')
            {
                foreach($sarr['trim'] as $nv)
                {
                    if($nv[0]=='')
                    {
                        continue;
                    }
                    $nvs = str_replace("/","\\/",$nv[0]);
                    $v = preg_replace("/".$nvs."/isU",$nv[1],$v);
                }
            }

            // Either download remote resources or only rewrite their URLs
            if($needDown)
            {
                if($sarr['isdown'] == '1')
                {
                    $v = $this->DownMedias($v,$dourl);
                }
            }
            else
            {
                if($sarr['isdown'] == '1')
                {
                    $v = $this->MediasReplace($v,$dourl);
                }
            }
        }
        $v = trim($v);

        // Fields with a user-supplied processing hook are deferred to the second pass
        if($sarr['function'] != '')
        {
            $tmpLtKeys[$k]['v'] = $v;
            $tmpLtKeys[$k]['f'] = $sarr['function'];
        }
        else
        {
            // Replaces the removed ereg_replace() calls. The D modifier keeps "$"
            // anchored to the very end of the string, as it was under POSIX regex.
            // The character inside the group is kept exactly as in the original pattern.
            $v = preg_replace("/( )$/D",'',$v);
            // Same effect as the old "[\r\n\t ]{1,}$" replacement
            $v = rtrim($v,"\r\n\t ");
            $artitem .= "{dede:field name='$k'}$v{/dede:field}\r\n";
        }
    }//End Foreach

    // Second pass: fields that carry a processing hook
    foreach($tmpLtKeys as $k=>$sarr)
    {
        $v = $this->RunPHP($sarr['v'],$sarr['f']);
        $v = preg_replace("/( )$/D",'',$v);
        $v = rtrim($v,"\r\n\t ");
        $artitem .= "{dede:field name='$k'}$v{/dede:field}\r\n";
    }

    if($litpic!='' && $this->lists['listpic']==1)
    {
        $artitem .= "{dede:field name='litpic'}".$this->DownMedia($litpic,'img',true)."{/dede:field}\r\n";
    }
    else
    {
        $artitem .= "{dede:field name='litpic'}".$this->breImage."{/dede:field}\r\n";
    }
    return $artitem;
}
//下载内容里的资源
/**
 * Downloads the media referenced by an HTML fragment and rewrites the
 * fragment so it points at the values returned by DownMedia().
 *
 * Two sources are processed after $this->cDedeHtml->SetSource(): the
 * Medias map (URL => media type) and the keys of the Links map that end in
 * an image or flash extension, optionally followed by a query string.
 *
 * @param string $html HTML fragment; taken by reference and rewritten in place.
 * @param string $url  URL passed to SetSource() together with the fragment.
 * @return string The rewritten HTML (same value as left in $html).
 */
function DownMedias(&$html,$url)
{
    // Patterns replace the removed eregi() calls. "s" and "D" reproduce POSIX
    // behaviour: "." matches newlines and "$" matches only at end of string.
    $flashPattern = '/\.swf(\?.*)?$/isD';
    $imagePattern = '/\.(jpg|gif|png)(\?.*)?$/isD';

    $this->cDedeHtml->SetSource($html,$url,'media');

    // Images and flash found in media tags
    foreach($this->cDedeHtml->Medias as $mediaUrl=>$mediaType)
    {
        $furl = $this->cDedeHtml->FillUrl($mediaUrl);
        // An embed is only downloaded when it is a .swf file
        if($mediaType=='embed' && !preg_match($flashPattern,$mediaUrl))
        {
            continue;
        }
        $okurl = $this->DownMedia($furl,$mediaType);
        $html = str_replace($mediaUrl,$okurl,$html);
    }

    // Images and flash that are the target of a hyperlink (the array key is the URL)
    foreach($this->cDedeHtml->Links as $linkUrl=>$linkInfo)
    {
        if(preg_match($imagePattern,$linkUrl))
        {
            $m = "img";
        }
        else if(preg_match($flashPattern,$linkUrl))
        {
            $m = "embed";
        }
        else
        {
            continue;
        }
        $furl = $this->cDedeHtml->FillUrl($linkUrl);
        $okurl = $this->DownMedia($furl,$m);
        $html = str_replace($linkUrl,$okurl,$html);
    }
    return $html;
}
//仅替换内容里的资源为绝对网址
/**
 * Rewrites every media URL in an HTML fragment to the value returned by
 * $this->cDedeHtml->FillUrl(), without downloading anything.
 *
 * @param string $html  HTML fragment; taken by reference and rewritten in place.
 * @param string $dourl URL passed to SetSource() together with the fragment.
 * @return string The rewritten HTML (same value as left in $html).
 */
function MediasReplace(&$html,$dourl)
{
    $this->cDedeHtml->SetSource($html,$dourl,'media');

    // Only the keys of Medias (the URLs as they appear in the markup) are used
    foreach($this->cDedeHtml->Medias as $rawUrl=>$mediaType)
    {
        $mediaUrl = trim($rawUrl);
        $html = str_replace($mediaUrl,$this->cDedeHtml->FillUrl($mediaUrl),$html);
    }

    return $html;
}
//测试列表
/**
 * Test run of the list rule: fetches the first configured list page (or the
 * RSS feed) and returns the links that pass the nothas/musthas filters.
 *
 * On failure $this->errString is set and an empty array is returned.
 *
 * @param string $dourl Taken by reference; set to the URL that was actually
 *                      used (the RSS URL, the first list URL, or '' when the
 *                      list configuration has no URL).
 * @return array Links as produced by GetRssLinks() or by DedeHtml2::Links.
 */
function Testlists(&$dourl)
{
    $links = array();

    // Seed URLs come from an RSS feed
    if($this->lists['sourcetype']=='rss')
    {
        $dourl = $this->lists['rssurl'];
        $links = GetRssLinks($dourl);
        return $links;
    }

    // Normal case: use the first configured list URL
    if(isset($this->lists['url'][0][0]))
    {
        $dourl = $this->lists['url'][0][0];
    }
    else
    {
        $dourl = '';
        $this->errString = "配置中指定列表的网址错误!\r\n";
        return $links;
    }
    $dhtml = new DedeHtml2();
    $html = $this->DownOnePage($dourl);
    if($html=='')
    {
        $this->errString = "读取网址: $dourl 时失败!\r\n";
        return $links;
    }

    // Narrow the page down to the configured list area, when one is configured
    if( trim($this->lists['areastart']) !='' && trim($this->lists['areaend']) != '' )
    {
        $areabody = $this->lists['areastart'].'[var:区域]'.$this->lists['areaend'];
        $html = $this->GetHtmlArea('[var:区域]',$areabody,$html);
    }

    // NOTE(review): $t1 is never read in this method.
    $t1 = ExecTime();

    // eregi() was removed in PHP 7.0. The configured filters are compiled as
    // PCRE instead, escaping the delimiter the same way the field trim rules
    // in this class do. "i" = case-insensitive (as eregi was); "s" and "D"
    // keep the POSIX meaning of "." and "$".
    // NOTE(review): filters relying on POSIX-only syntax should be re-checked.
    $notHas = isset($this->lists['nothas']) ? $this->lists['nothas'] : '';
    $mustHas = isset($this->lists['musthas']) ? $this->lists['musthas'] : '';
    $notHasPattern = ($notHas!='') ? '/'.str_replace('/','\\/',$notHas).'/isD' : '';
    $mustHasPattern = ($mustHas!='') ? '/'.str_replace('/','\\/',$mustHas).'/isD' : '';

    $dhtml->SetSource($html,$dourl,'link');
    foreach($dhtml->Links as $s)
    {
        // Drop links that match the exclusion filter
        if($notHasPattern!='' && preg_match($notHasPattern,$s['link']))
        {
            continue;
        }
        // Drop links that do not match the inclusion filter
        if($mustHasPattern!='' && !preg_match($mustHasPattern,$s['link']))
        {
            continue;
        }
        $links[] = $s;
    }
    return $links;
}
//测试文章规则
/**
 * Test run of the article rule for a single URL.
 *
 * Delegates to DownUrl() with the fixed arguments 0, '' and false around the
 * given URL, and hands its result back unchanged.
 *
 * @param string $dourl URL of the article page to test.
 * @return mixed Whatever DownUrl() returns.
 */
function TestArt($dourl)
{
    $testResult = $this->DownUrl(0,$dourl,'',false);
    return $testResult;
}
//采集种子网址
/**
 * Collects seed URLs for this node and stores them in `#@__co_htmls`
 * (plus a hash record in `#@__co_urls`).
 *
 * List pages are processed in batches: entries $glstart+1 .. $glstart+$pagesize
 * of $this->lists['url'] are fetched on each call. An RSS source is handled in
 * one go.
 *
 * @param int $islisten -1: wipe the node's url and html records before the
 *                      first batch; any other value: only delete already
 *                      exported html records. 1 additionally skips links whose
 *                      hash is already recorded in `#@__co_urls`.
 * @param int $glstart  Number of list URLs already processed by earlier calls.
 * @param int $pagesize Number of list URLs to process in this call.
 * @return int 0 when nothing is left to process (always for RSS), -1 when the
 *             first batch yielded no links, otherwise the number of list URLs
 *             still to be processed.
 */
function GetSourceUrl($islisten=0,$glstart=0,$pagesize=10)
{
    // Clean-up is done once, before the first batch.
    if($glstart==0)
    {
        if($islisten == -1)
        {
            // Re-collect everything: forget both the URL history and the stored pages
            $this->dsql->ExecuteNoneQuery("Delete From `#@__co_urls` where nid='".$this->noteId."'");
            $this->dsql->ExecuteNoneQuery("Delete From `#@__co_htmls` where nid='".$this->noteId."' ");
        }
        else
        {
            // Listen mode: keep un-exported content and the node's URL history
            $this->dsql->ExecuteNoneQuery("Delete From `#@__co_htmls` where nid='".$this->noteId."' And isexport=1 ");
        }
    }

    // Seeds from an RSS feed
    if($this->lists['sourcetype']=='rss')
    {
        $links = GetRssLinks($this->lists['rssurl']);
        krsort($links);
        foreach($links as $v)
        {
            if($islisten==1)
            {
                $lrow = $this->dsql->GetOne("Select * From `#@__co_urls` where nid='{$this->noteId}' And hash='".md5($v['link'])."' ");
                if(is_array($lrow))
                {
                    continue;
                }
            }
            // Fix: dtime used to be written as the literal string 'dtime';
            // it now gets the current timestamp, like the list-page branch below.
            $inquery = "INSERT INTO `#@__co_htmls` (`nid` ,`typeid`, `title` , `litpic` , `url` , `dtime` , `isdown` , `isexport` , `result`) "
                ."VALUES ('{$this->noteId}' , '0', '".addslashes($v['title'])."' , '".addslashes($v['image'])."' , '".addslashes($v['link'])."' , '".time()."' , '0' , '0' , ''); ";
            $this->dsql->ExecuteNoneQuery($inquery);
            $inquery = "INSERT INTO `#@__co_urls`(hash,nid) VALUES ('".md5($v['link'])."','{$this->noteId}');";
            $this->dsql->ExecuteNoneQuery($inquery);
        }
        return 0;
    }

    // Seeds from the configured list pages
    $tmplink = array();
    $arrStart = 0;
    $moviePostion = 0;
    $endpos = $glstart + $pagesize;
    $totallen = count($this->lists['url']);
    $hasArea = ( trim($this->lists['areastart']) !='' && trim($this->lists['areaend']) != '' );

    // eregi() was removed in PHP 7.0. The configured filters are compiled as
    // PCRE instead, escaping the delimiter the same way the field trim rules
    // in this class do. "i" = case-insensitive (as eregi was); "s" and "D"
    // keep the POSIX meaning of "." and "$".
    // NOTE(review): filters relying on POSIX-only syntax should be re-checked.
    $notHas = isset($this->lists['nothas']) ? $this->lists['nothas'] : '';
    $mustHas = isset($this->lists['musthas']) ? $this->lists['musthas'] : '';
    $notHasPattern = ($notHas!='') ? '/'.str_replace('/','\\/',$notHas).'/isD' : '';
    $mustHasPattern = ($mustHas!='') ? '/'.str_replace('/','\\/',$mustHas).'/isD' : '';

    foreach($this->lists['url'] as $cururls)
    {
        $moviePostion++;
        if($moviePostion > $endpos)
        {
            break;
        }
        // Already handled by an earlier batch
        if($moviePostion <= $glstart)
        {
            continue;
        }
        $cururl = $cururls[0];
        $typeid = (empty($cururls[1]) ? 0 : $cururls[1]);

        $html = $this->DownOnePage($cururl);
        if($hasArea)
        {
            $areabody = $this->lists['areastart'].'[var:区域]'.$this->lists['areaend'];
            $html = $this->GetHtmlArea('[var:区域]',$areabody,$html);
        }
        $this->cDedeHtml->SetSource($html,$cururl,'link');
        foreach($this->cDedeHtml->Links as $v)
        {
            if($notHasPattern!='' && preg_match($notHasPattern,$v['link']))
            {
                continue;
            }
            if($mustHasPattern!='' && !preg_match($mustHasPattern,$v['link']))
            {
                continue;
            }
            $tmplink[$arrStart][0] = $v;
            $tmplink[$arrStart][1] = $typeid;
            $arrStart++;
        }
        $this->cDedeHtml->Clear();
    }

    // A first batch without any link is reported as an error; later empty
    // batches do not stop the collection of the remaining list pages.
    if(count($tmplink)==0 && $glstart==0)
    {
        return -1;
    }

    // Store in reverse discovery order
    krsort($tmplink);
    foreach($tmplink as $vs)
    {
        $v = $vs[0];
        $typeid = $vs[1];
        if($islisten==1)
        {
            $lrow = $this->dsql->GetOne("Select * From `#@__co_urls` where nid='{$this->noteId}' And hash='".md5($v['link'])."' ");
            if(is_array($lrow))
            {
                continue;
            }
        }
        $inquery = "INSERT INTO `#@__co_htmls` (`nid` ,`typeid`, `title` , `litpic` , `url` , `dtime` , `isdown` , `isexport` , `result`) "
            ."VALUES ('{$this->noteId}' ,'$typeid', '".addslashes($v['title'])."' , '".addslashes($v['image'])."' , '".addslashes($v['link'])."' , '".time()."' , '0' , '0' , ''); ";
        $this->dsql->ExecuteNoneQuery($inquery);
        $inquery = "INSERT INTO `#@__co_urls`(hash,nid) VALUES ('".md5($v['link'])."','{$this->noteId}');";
        $this->dsql->ExecuteNoneQuery($inquery);
    }

    // Number of list URLs left for later batches
    if($endpos >= $totallen)
    {
        return 0;
    }
    return ($totallen-$endpos);
}
//用扩展函数处理采集到的原始数据
/**
 * Runs a user-configured PHP snippet against a collected field value.
 *
 * Placeholders in the snippet (bare, single-quoted or double-quoted) are
 * rewritten to local variables before evaluation:
 *   @me     => $DedeMeValue     (the field value; also the return value)
 *   @body   => $DedeBodyValue   ($this->tmpHtml)
 *   @litpic => $DedeLitPicValue ($this->breImage)
 *
 * SECURITY NOTE(review): the snippet is executed with eval(). It must only
 * ever come from trusted configuration, never from collected content.
 *
 * @param mixed  $fvalue  Field value made available to the snippet as @me.
 * @param string $phpcode PHP code to evaluate.
 * @return mixed The value of $DedeMeValue after the snippet has run.
 */
function RunPHP($fvalue,$phpcode)
{
    // These variable names are part of the snippet contract; do not rename them.
    $DedeMeValue = $fvalue;
    $phpcode = preg_replace("/'@me'|\"@me\"|@me/isU",'$DedeMeValue',$phpcode);

    // eregi() was removed in PHP 7.0; preg_match with /i is the equivalent test
    if(preg_match('/@body/i',$phpcode))
    {
        $DedeBodyValue = $this->tmpHtml;
        $phpcode = preg_replace("/'@body'|\"@body\"|@body/isU",'$DedeBodyValue',$phpcode);
    }
    if(preg_match('/@litpic/i',$phpcode))
    {
        $DedeLitPicValue = $this->breImage;
        $phpcode = preg_replace("/'@litpic'|\"@litpic\"|@litpic/isU",'$DedeLitPicValue',$phpcode);
    }

    eval($phpcode.";");
    return $DedeMeValue;
}
//编码转换
/**
 * Converts a string, in place, from the node's source encoding
 * ($this->noteInfos["sourcelang"]) to the system encoding ($cfg_soft_lang).
 *
 * System 'utf-8': gb2312 and big5 sources are converted to UTF-8.
 * Any other system encoding: utf-8 and big5 sources are converted to GB.
 * Every other combination leaves the string untouched.
 *
 * @param string $str Text to convert; taken by reference, nothing is returned.
 */
function ChangeCode(&$str)
{
    global $cfg_soft_lang;

    if($cfg_soft_lang=='utf-8')
    {
        // Target is UTF-8
        if($this->noteInfos["sourcelang"]=="gb2312")
        {
            $str = gb2utf8($str);
        }
        if($this->noteInfos["sourcelang"]=="big5")
        {
            // Big5 goes through GB first
            $asGb = big52gb($str);
            $str = gb2utf8($asGb);
        }
        return;
    }

    // Target is GB
    if($this->noteInfos["sourcelang"]=="utf-8")
    {
        $str = utf82gb($str);
    }
    if($this->noteInfos["sourcelang"]=="big5")
    {
        $str = big52gb($str);
    }
}
}
?>
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -