⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 dedecollection.class.php

📁 这是matlab的一个小程序
💻 PHP
📖 第 1 页 / 共 2 页
字号:
<?php
// Refuse to run unless the DEDEINC constant has already been defined by the including script.
if(!defined('DEDEINC'))
{
	exit('dedecms');
}

require_once(DEDEINC."/dedecollection.func.php"); // collection helper functions
require_once(DEDEINC."/image.func.php");
require_once(DEDEINC."/dedehtml2.class.php");
// Remove the script execution time limit; any error from the call is suppressed.
@set_time_limit(0);
class DedeCollection
{
	var $artNotes = array(); // field rules for the article page (filled by LoadItemConfig)
	var $lists = array(); // source-list rules of the collection node (filled by LoadListConfig)
	var $noteInfos = array(); // basic configuration of the collection node (filled by LoadListConfig)
	var $dsql = ''; // database handle, taken from $GLOBALS['dsql'] in the constructor
	var $noteId = ''; // id of the node loaded by LoadNote
	var $cDedeHtml = ''; // DedeHtml2 instance, created in the constructor
	var $cHttpDown = ''; // DedeHttpDown instance, created in the constructor
	var $mediaCount = 0; // incremented by GetRndName for every generated media file name
	var $tmpUnitValue = ''; // content accumulated across the pages of one article (see GetSpPage)
	var $tmpLinks = array(); // page URLs already followed by GetSpPage for the current article
	var $tmpHtml = ''; // HTML of the page most recently fetched by DownUrl
	var $breImage = ''; // path of the first downloaded image / its thumbnail (see DownMedia)
	var $errString = ''; // NOTE(review): not written in the visible part of the class - presumably a last-error message

	// PHP 5 style constructor: picks up the global database handle and
	// creates the HTTP downloader and the HTML link parser used by the class.
	function __construct()
	{
		$this->dsql = $GLOBALS['dsql'];

		$httpDown = new DedeHttpDown();
		$this->cHttpDown = $httpDown;

		$htmlParser = new DedeHtml2();
		$this->cDedeHtml = $htmlParser;
	}

	// PHP 4 style constructor (method named after the class); simply
	// delegates to __construct() so both naming conventions work.
	function DedeCollection()
	{
		$this->__construct();
	}

	// Release resources. The body is intentionally empty: nothing is freed here.
	function Close()
	{
	}

	// Load one collection node from the database and parse its configuration.
	//
	// $nid  id of the node in the `#@__co_note` table; it is remembered in
	//       $this->noteId for later queries.
	//
	// When no row is found the method returns without touching the parsed
	// configuration instead of indexing a non-array result.
	function LoadNote($nid)
	{
		$this->noteId = $nid;

		// NOTE(review): $nid is interpolated into the SQL text as-is; callers
		// must pass a trusted / already escaped value.
		$row = $this->dsql->GetOne("Select * from `#@__co_note` where nid='$nid'");
		if(empty($row))
		{
			return;
		}
		$this->LoadListConfig($row['listconfig']);
		$this->LoadItemConfig($row['itemconfig']);
	}

	// Parse the node's basic information and its list (index page) rules.
	//
	// $configString  tag-formatted configuration text; it is parsed with
	//                DedeTagParse. A "noteinfo" tag fills $this->noteInfos,
	//                a "listrule" tag fills $this->lists and finally
	//                $this->lists['url'].
	function LoadListConfig($configString)
	{
		// Attributes copied verbatim from the <noteinfo> tag.
		$noteAtts = array('notename','matchtype','channelid','refurl','sourcelang','cosort','isref','exptime');

		// Attributes copied verbatim from the <listrule> tag.
		$listAtts = array('sourcetype','rssurl','regxurl','startid','endid','addv','urlrule','musthas','nothas','listpic','usemore');

		// Child tags of <listrule> whose trimmed inner text is stored under the tag's own name.
		$innerTags = array('addurls','regxrule','areastart','areaend','batchrule');

		$outerParser = new DedeTagParse();
		$innerParser = new DedeTagParse();
		$outerParser->LoadString($configString);

		// The loop bound is inclusive, exactly as the parser's Count is used elsewhere in this class.
		for($i=0;$i<=$outerParser->Count;$i++)
		{
			$ctag = $outerParser->CTags[$i];
			$tagName = $ctag->GetName();

			// Basic node information
			if($tagName=="noteinfo")
			{
				foreach($noteAtts as $att)
				{
					$this->noteInfos[$att] = $ctag->GetAtt($att);
				}
			}

			// Rules describing the list pages to collect
			else if($tagName=="listrule")
			{
				foreach($listAtts as $att)
				{
					$this->lists[$att] = $ctag->GetAtt($att);
				}

				$innerParser->LoadString($ctag->GetInnerText());
				for($j=0;$j<=$innerParser->Count;$j++)
				{
					$ctag2 = $innerParser->CTags[$j];
					$tname = $ctag2->GetName();
					if(in_array($tname,$innerTags))
					{
						$this->lists[$tname] = trim($ctag2->GetInnerText());
					}
				}

				// Work out the list URL(s): an RSS source uses its feed URL
				// directly, anything else is expanded from the list rule.
				if($this->lists['sourcetype'] == 'rss')
				{
					$this->lists['url'] = $this->lists['rssurl'];
				}
				else
				{
					$this->lists['url'] = GetUrlFromListRule(
						$this->lists['regxurl'],
						$this->lists['addurls'],
						$this->lists['startid'],
						$this->lists['endid'],
						$this->lists['addv'],
						$this->lists['usemore'],
						$this->lists['batchrule']
					);
				}
			}
		}

		$outerParser->Clear();
		$innerParser->Clear();
	}

	// Parse the per-field rules used when collecting an article page.
	//
	// $configString  tag-formatted configuration text. Recognised tags:
	//                sppage (pagination area + sptype attribute), previewurl,
	//                keywordtrim, descriptiontrim, and item (one per field,
	//                with nested trim / match / function tags).
	// Results are stored in $this->artNotes.
	function LoadItemConfig($configString)
	{
		// Tags whose inner text is stored directly under the tag's own name.
		$plainTags = array('previewurl','keywordtrim','descriptiontrim');

		$outerParser = new DedeTagParse();
		$innerParser = new DedeTagParse();
		$outerParser->LoadString($configString);

		for($i=0;$i<=$outerParser->Count;$i++)
		{
			$ctag = $outerParser->CTags[$i];
			$tagName = $ctag->GetName();

			if($tagName=='sppage')
			{
				$this->artNotes['sppage'] = $ctag->GetInnerText();
				$this->artNotes['sptype'] = $ctag->GetAtt('sptype');
			}
			else if(in_array($tagName,$plainTags))
			{
				$this->artNotes[$tagName] = $ctag->GetInnerText();
			}
			else if($tagName=='item')
			{
				$field = $ctag->GetAtt('field');

				// An item without a field name is ignored.
				if($field != '')
				{
					$this->artNotes[$field]['value'] = $ctag->GetAtt('value');
					$this->artNotes[$field]['isunit'] = $ctag->GetAtt('isunit');
					$this->artNotes[$field]['isdown'] = $ctag->GetAtt('isdown');
					$this->artNotes[$field]['trim'] = array();
					$this->artNotes[$field]['match'] = '';
					$this->artNotes[$field]['function'] = '';

					$innerParser->LoadString($ctag->GetInnerText());
					for($j=0;$j<=$innerParser->Count;$j++)
					{
						$ctag2 = $innerParser->CTags[$j];
						$innerName = $ctag2->GetName();

						if($innerName=='trim')
						{
							// Each trim rule is a pair: [0] pattern, [1] replacement.
							// '#n#' in the rule text stands for '&nbsp;'.
							$pattern = str_replace('#n#','&nbsp;',$ctag2->GetInnerText());
							$replacement = $ctag2->GetAtt('replace');
							$this->artNotes[$field]['trim'][] = array(0=>$pattern, 1=>$replacement);
						}
						else if($innerName=='match')
						{
							$this->artNotes[$field]['match'] = str_replace('#n#','&nbsp;',$ctag2->GetInnerText());
						}
						else if($innerName=='function')
						{
							$this->artNotes[$field]['function'] = $ctag2->GetInnerText();
						}
					}
				}
			}
		}

		$outerParser->Clear();
		$innerParser->Clear();
	}

	// Download one article URL, extract its fields and optionally save the result.
	//
	// $aid     id of the row in `#@__co_htmls` to update when saving
	// $dourl   URL of the article page
	// $litpic  thumbnail passed through to GetPageFields
	// $issave  when true, the extracted result is written back to the database
	//
	// Returns the value produced by GetPageFields (defined elsewhere in this class).
	function DownUrl($aid,$dourl,$litpic='',$issave=true)
	{
		// Reset the per-article scratch state.
		$this->tmpLinks = array();
		$this->tmpUnitValue = '';
		$this->breImage = '';
		$this->tmpHtml = $this->DownOnePage($dourl);

		// If a pagination rule exists, collect the paged field up front.
		if(!empty($this->artNotes['sppage']))
		{
			// Find the field flagged as the paged ("unit") field. Entries such
			// as 'sppage' are plain strings, so only arrays are inspected.
			$noteid = '';
			foreach($this->artNotes as $k=>$sarr)
			{
				if(is_array($sarr) && isset($sarr['isunit']) && $sarr['isunit']==1)
				{
					$noteid = $k;
					break;
				}
			}
			$this->GetSpPage($dourl,$noteid,$this->tmpHtml);

			// More than one page was collected: prefix the first page with the
			// same sub-title marker that GetSpPage puts in front of later pages.
			// preg_match replaces eregi(), which no longer exists in PHP 7+.
			if(preg_match('/#p#/i',$this->tmpUnitValue))
			{
				$this->tmpUnitValue = '副标题#e#'.$this->tmpUnitValue;
			}
		}

		// Extract all configured fields.
		$body = $this->GetPageFields($dourl,$issave,$litpic);

		// Save the result to the database.
		if($issave)
		{
			$query = " Update `#@__co_htmls` set dtime='".time()."',result='".addslashes($body)."',isdown='1' where aid='$aid' ";
			if(!$this->dsql->ExecuteNoneQuery($query))
			{
				echo $this->dsql->GetError();
			}
		}
		return $body;
	}

	// Collect the content of a paginated article into $this->tmpUnitValue.
	//
	// $dourl   URL of the page whose HTML is in $html
	// $noteid  key in $this->artNotes of the paged field (its 'match' rule is used)
	// $html    HTML of the current page
	// $step    number of pages followed so far (prev/next mode only; stops after 50)
	//
	// Pages after the first are appended as "#p#副标题#e#" followed by the content.
	function GetSpPage($dourl,$noteid,$html,$step=0)
	{
		// Match rule of the paged field; empty when no such field is configured,
		// in which case GetHtmlArea yields '' for every page.
		$matchRule = '';
		if(isset($this->artNotes[$noteid]) && is_array($this->artNotes[$noteid]) && isset($this->artNotes[$noteid]['match']))
		{
			$matchRule = $this->artNotes[$noteid]['match'];
		}

		$linkareaHtml = $this->GetHtmlArea('[内容]',$this->artNotes['sppage'],$html);

		// No pagination area on this page: take its content and stop.
		if($linkareaHtml=='')
		{
			if($this->tmpUnitValue=='')
			{
				$this->tmpUnitValue .= $this->GetHtmlArea('[内容]',$matchRule,$html);
			}
			else
			{
				$this->tmpUnitValue .= "#p#副标题#e#".$this->GetHtmlArea('[内容]',$matchRule,$html);
			}
			return;
		}

		// Complete page list: every page is linked from the first one.
		if($this->artNotes["sptype"]=='full' || $this->artNotes["sptype"]=='')
		{
			$this->tmpUnitValue .= $this->GetHtmlArea('[内容]',$matchRule,$html);
			$this->cDedeHtml->GetLinkType = "link";
			$this->cDedeHtml->SetSource($linkareaHtml,$dourl,'link');
			foreach($this->cDedeHtml->Links as $k=>$t)
			{
				$k = $this->cDedeHtml->FillUrl($k);
				if($k==$dourl)
				{
					continue;
				}
				$nhtml = $this->DownOnePage($k);
				if($nhtml!='')
				{
					$ct = trim($this->GetHtmlArea('[内容]',$matchRule,$nhtml));
					if($ct!='')
					{
						$this->tmpUnitValue .= "#p#副标题#e#".$ct;
					}
				}
			}
		}

		// Prev/next style or incomplete page list: follow links page by page.
		else
		{
			if($step>50)
			{
				return;
			}
			if($step==0)
			{
				$this->tmpUnitValue .= $this->GetHtmlArea('[内容]',$matchRule,$html);

				// Remember the starting page so that a "previous page" link on a
				// later page does not download and append page one a second time.
				if(!in_array($dourl,$this->tmpLinks))
				{
					$this->tmpLinks[] = $dourl;
				}
			}
			$this->cDedeHtml->GetLinkType = "link";
			$this->cDedeHtml->SetSource($linkareaHtml,$dourl,'link');
			$hasLink = false;
			$nhtml = '';
			foreach($this->cDedeHtml->Links as $k=>$t)
			{
				$k = $this->cDedeHtml->FillUrl($k);
				if(in_array($k,$this->tmpLinks))
				{
					continue;
				}
				$nhtml = $this->DownOnePage($k);
				if($nhtml!='')
				{
					$ct = trim($this->GetHtmlArea('[内容]',$matchRule,$nhtml));
					if($ct!='')
					{
						$this->tmpUnitValue .= "#p#副标题#e#".$ct;
					}
				}
				$hasLink = true;
				$this->tmpLinks[] = $k;
				$dourl = $k;
				$step++;
			}

			// Continue from the last page that was followed in this round.
			if($hasLink)
			{
				$this->GetSpPage($dourl,$noteid,$nhtml,$step);
			}
		}
	}

	// Extract the part of $html that lies between the two halves of a rule.
	//
	// $sptag     placeholder that splits the rule into a start and an end part
	// $areaRule  rule text, e.g. "<start>" . $sptag . "<end>"; taken by reference
	//            for interface compatibility but never modified
	// $html      HTML to search; taken by reference, never modified
	//
	// When $this->noteInfos['matchtype'] is 'regex' both halves are used as
	// regular-expression fragments (case-insensitive, dot matches newline,
	// ungreedy); otherwise they are plain strings. Returns the extracted text,
	// or '' when the rule is empty or nothing matches.
	function GetHtmlArea($sptag,&$areaRule,&$html)
	{
		// Regular-expression mode
		if($this->noteInfos['matchtype']=='regex')
		{
			// Escape the pattern delimiter on a local copy. Writing the escaped
			// text back through the reference would escape it again on the next
			// call with the same rule and corrupt the pattern.
			$escapedRule = str_replace("/","\\/",$areaRule);
			$areaRules = explode($sptag,$escapedRule);
			if($html==''||$areaRules[0]=='')
			{
				return '';
			}
			$endRule = isset($areaRules[1]) ? $areaRules[1] : '';
			$arr = array();
			preg_match('/'.$areaRules[0]."(.*)".$endRule."/isU",$html,$arr);

			// isset rather than empty so that a captured "0" is not discarded.
			return isset($arr[1]) ? trim($arr[1]) : '';
		}

		// Plain string mode
		else
		{
			$areaRules = explode($sptag,$areaRule);
			if($html=='' || $areaRules[0]=='')
			{
				return '';
			}

			// Without an end marker there is no delimited area.
			if(!isset($areaRules[1]) || $areaRules[1]=='')
			{
				return '';
			}
			$posstart = strpos($html,$areaRules[0]);
			if($posstart===false)
			{
				return '';
			}

			// Look for the end marker only after the start marker, so an end
			// marker overlapping the start marker cannot yield a negative length.
			$contentStart = $posstart + strlen($areaRules[0]);
			$posend = strpos($html,$areaRules[1],$contentStart);
			if($posend===false)
			{
				return '';
			}
			return substr($html,$contentStart,$posend-$contentStart);
		}
	}

	// Fetch the given URL with the shared HTTP downloader and return its HTML.
	// The text is handed to ChangeCode() (defined elsewhere in this class)
	// before it is returned.
	function DownOnePage($dourl)
	{
		$this->cHttpDown->OpenUrl($dourl);
		$pageHtml = $this->cHttpDown->GetHtml();
		$this->cHttpDown->Close();

		$this->ChangeCode($pageHtml);

		return $pageHtml;
	}

	// Download a media resource and store it under the site's base directory.
	//
	// $dourl     URL of the resource
	// $mtype     media type; 'img' enables thumbnail creation and watermarking
	// $islitpic  true when the resource itself is a thumbnail (no thumbnail or
	//            watermark is produced for it)
	//
	// Returns the site-relative path of the stored file, or $dourl unchanged
	// when the file could not be downloaded.
	function DownMedia($dourl,$mtype='img',$islitpic=false)
	{
		global $notckpic;
		if(empty($notckpic))
		{
			$notckpic = 0;
		}
		$urlHash = md5($dourl);

		// Check whether this resource was already downloaded for this node.
		$tofile = $filename = '';
		if($notckpic==0)
		{
			$row = $this->dsql->GetOne("Select hash,tofile from `#@__co_mediaurls` where nid='{$this->noteId}' And hash='".$urlHash."' ");
			if(isset($row['tofile']))
			{
				$tofile = $filename = $row['tofile'];
			}
		}

		// Not recorded, or the recorded file is gone: download it.
		if($tofile=='' || !file_exists($GLOBALS['cfg_basedir'].$filename))
		{
			$filename = $this->GetRndName($dourl,$mtype);

			// Make sure the path is rooted (preg_match replaces the removed ereg()).
			if(!preg_match('#^/#',$filename))
			{
				$filename = "/".$filename;
			}

			// Anti-hotlinking mode: download with a referer.
			if($this->noteInfos['isref']=='yes' && $this->noteInfos['refurl']!='')
			{
				if($this->noteInfos['exptime']=='')
				{
					$this->noteInfos['exptime'] = 10;
				}

				// Pass the node's exptime that was defaulted just above; the
				// previous code read an undefined $this->Item property here.
				DownImageKeep($dourl,$this->noteInfos['refurl'],$GLOBALS['cfg_basedir'].$filename,'',0,$this->noteInfos['exptime']);
			}

			// Normal mode
			else
			{
				$this->cHttpDown->OpenUrl($dourl);
				$this->cHttpDown->SaveToBin($GLOBALS['cfg_basedir'].$filename);
				$this->cHttpDown->Close();
			}

			// Download succeeded: record where the file was stored.
			if(file_exists($GLOBALS['cfg_basedir'].$filename))
			{
				if($tofile=='')
				{
					$query = "INSERT INTO `#@__co_mediaurls`(nid,hash,tofile) VALUES ('".$this->noteId."', '".$urlHash."', '".addslashes($filename)."');";
				}
				else
				{
					// Restrict the update to this node, matching the lookup above.
					$query = "Update `#@__co_mediaurls` set tofile='".addslashes($filename)."' where nid='".$this->noteId."' And hash='".$urlHash."' ";
				}
				$this->dsql->ExecuteNoneQuery($query);
			}
		}

		// Download failed or the file does not exist: fall back to the URL.
		if(!file_exists($GLOBALS['cfg_basedir'].$filename))
		{
			return $dourl;
		}

		// Create a thumbnail from the first image of the article.
		if($mtype=='img' && !$islitpic && $this->breImage=='')
		{
			$this->breImage = $filename;
			if(!preg_match('#^http://#i',$this->breImage) && file_exists($GLOBALS['cfg_basedir'].$filename))
			{
				// Thumbnail name: same path, with "_lit" inserted before each dot of the file name.
				$filenames = explode('/',$filename);
				$filenamed = $filenames[count($filenames)-1];
				$nfilename = str_replace('.','_lit.',$filenamed);
				$nfilename = str_replace($filenamed,$nfilename,$filename);
				if(@copy($GLOBALS['cfg_basedir'].$filename,$GLOBALS['cfg_basedir'].$nfilename))
				{
					ImageResize($GLOBALS['cfg_basedir'].$nfilename,$GLOBALS['cfg_ddimg_width'],$GLOBALS['cfg_ddimg_height']);
					$this->breImage = $nfilename;
				}
			}
		}

		// Watermark ordinary images (errors from WaterImg are suppressed).
		if($mtype=='img' && !$islitpic)
		{
			@WaterImg($GLOBALS['cfg_basedir'].$filename,'up');
		}
		return $filename;
	}

	//获得下载媒体的随机名称
	function GetRndName($url,$v)
	{
		global $cfg_image_dir,$cfg_dir_purview;
		$this->mediaCount++;
		$mnum = $this->mediaCount;
		$timedir = "c".MyDate("ymd",time());
		//存放路径
		$fullurl = preg_replace("/\/{1,}/","/",$cfg_image_dir."/");
		if(!is_dir($GLOBALS['cfg_basedir']."/$fullurl"))
		{
			MkdirAll($GLOBALS['cfg_basedir']."/$fullurl",$cfg_dir_purview);
		}

		$fullurl = $fullurl.$timedir."/";
		if(!is_dir($GLOBALS['cfg_basedir']."/$fullurl"))
		{
			MkdirAll($GLOBALS['cfg_basedir']."/$fullurl",$cfg_dir_purview);
		}

		//文件名称
		$timename = str_replace('.','',ExecTime());
		$threadnum = 0;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -