⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 dedecollection.class.php

📁 这是matlab的一个小程序
💻 PHP
📖 第 1 页 / 共 2 页
字号:
		if(isset($_GET['threadnum']))
		{
			$threadnum = intval($_GET['threadnum']);
		}
		$filename = dd2char($timename.$threadnum.'-'.$mnum.mt_rand(1000,9999));

		//分配扩展名
		$urls = explode('.',$url);
		if($v=='img')
		{
			$shortname = '.jpg';
			if(eregi("\.gif",$v))
			{
				$shortname = '.gif';
			}
			else if(eregi("\.png",$v))
			{
				$shortname = '.png';
			}
		}
		else if($v=='embed')
		{
			$shortname = '.swf';
		}
		else
		{
			$shortname = '';
		}
		$fullname = $fullurl.$filename.$shortname;
		return preg_replace("/\/{1,}/","/",$fullname);
	}

	//按载入的网页内容获取规则,从一个HTML文件中获取内容
	/**
	 * Build the field markup for one article from the page HTML held in $this->tmpHtml.
	 *
	 * Emits one "{dede:field name='...'}...{/dede:field}" line for the keywords meta tag,
	 * one for the description meta tag, one for every rule array in $this->artNotes
	 * (rules with a 'function' are emitted after all plain rules), and a final 'litpic' line.
	 *
	 * The removed POSIX regex functions (ereg/ereg_replace) are replaced by strpos/preg_replace
	 * so the method runs on PHP 7+; the "D" modifier keeps "$" anchored to the very end of the
	 * string, as it was with ereg_replace.
	 *
	 * @param string $dourl    URL handed to DownMedias()/MediasReplace() for rules flagged isdown=='1'
	 * @param mixed  $needDown truthy: flagged rules go through DownMedias(); otherwise through MediasReplace()
	 * @param string $litpic   thumbnail URL; passed to DownMedia() only when non-empty and $this->lists['listpic']==1
	 * @return string the concatenated field markup, or '' when $this->tmpHtml is empty
	 */
	function GetPageFields($dourl,$needDown,$litpic='')
	{
		global $cfg_auot_description;
		if($this->tmpHtml == '')
		{
			return '';
		}
		$artitem = '';
		$isPutUnit = false;
		$tmpLtKeys = array();
		$inarr = array();
		$inarr2 = array();

		// Keywords meta tag: try the name-then-content attribute order, then content-then-name.
		preg_match("/<meta[\s]+name=['\"]keywords['\"] content=['\"](.*)['\"]/isU",$this->tmpHtml,$inarr);
		preg_match("/<meta[\s]+content=['\"](.*)['\"] name=['\"]keywords['\"]/isU",$this->tmpHtml,$inarr2);
		if(!isset($inarr[1]) && isset($inarr2[1]))
		{
			$inarr[1] = $inarr2[1];
		}
		if(isset($inarr[1]))
		{
			$keywords = trim(cn_substr(html2text($inarr[1]),30));
			$keywords = preg_replace("/".$this->artNotes['keywordtrim']."/isU",'',$keywords);

			// No comma present: treat spaces as the keyword separator.
			if(strpos($keywords,',') === false)
			{
				$keywords = str_replace(' ',',',$keywords);
			}
			$artitem .= "{dede:field name='keywords'}".$keywords."{/dede:field}\r\n";
		}
		else
		{
			$artitem .= "{dede:field name='keywords'}{/dede:field}\r\n";
		}

		// Description meta tag, same two attribute orders.
		preg_match("/<meta[\s]+name=['\"]description['\"] content=['\"](.*)['\"]/isU",$this->tmpHtml,$inarr);
		preg_match("/<meta[\s]+content=['\"](.*)['\"] name=['\"]description['\"]/isU",$this->tmpHtml,$inarr2);
		if(!isset($inarr[1]) && isset($inarr2[1]))
		{
			$inarr[1] = $inarr2[1];
		}
		if(isset($inarr[1]))
		{
			$description = trim(cn_substr(html2text($inarr[1]),$cfg_auot_description));
			$description = preg_replace("/".$this->artNotes['descriptiontrim']."/isU",'',$description);
			$artitem .= "{dede:field name='description'}".$description."{/dede:field}\r\n";
		}
		else
		{
			$artitem .= "{dede:field name='description'}{/dede:field}\r\n";
		}

		foreach($this->artNotes as $k=>$sarr)
		{
			// Entries that are not per-field rule arrays are skipped.
			if($k=='sppage' || $k=='sptype')
			{
				continue;
			}
			if(!is_array($sarr))
			{
				continue;
			}

			// No match rule (or only the bare placeholder): use the configured fixed value.
			if($sarr['match']=='' || trim($sarr['match'])=='[内容]')
			{
				if($sarr['value']!='[内容]')
				{
					$v = trim($sarr['value']);
				}
				else
				{
					$v = '';
				}
			}
			else
			{
				// The stored multi-page value is used once, for the first rule flagged isunit==1.
				if($this->tmpUnitValue!='' && !$isPutUnit && $sarr['isunit']==1)
				{
					$v = $this->tmpUnitValue;
					$isPutUnit = true;
				}
				else
				{
					$v = $this->GetHtmlArea('[内容]',$sarr['match'],$this->tmpHtml);
				}

				// Apply the rule's filter list: each entry is (pattern, replacement).
				if(isset($sarr['trim']) && $v!='')
				{
					foreach($sarr['trim'] as $nv)
					{
						if($nv[0]=='')
						{
							continue;
						}
						// Escape the delimiter so a "/" inside the pattern cannot end it early.
						$nvs = str_replace("/","\\/",$nv[0]);
						$v = preg_replace("/".$nvs."/isU",$nv[1],$v);
					}
				}

				// Flagged rules either download their media or only rewrite the media URLs.
				if($needDown)
				{
					if($sarr['isdown'] == '1')
					{
						$v = $this->DownMedias($v,$dourl);
					}
				}
				else
				{
					if($sarr['isdown'] == '1')
					{
						$v = $this->MediasReplace($v,$dourl);
					}
				}
			}
			$v = trim($v);

			// Rules with a user function are deferred until every plain rule has been emitted.
			if($sarr['function'] != '')
			{
				$tmpLtKeys[$k]['v'] = $v;
				$tmpLtKeys[$k]['f'] = $sarr['function'];
			}
			else
			{
				$v = preg_replace("/( )$/D",'',$v);
				$v = preg_replace("/[\r\n\t ]{1,}$/D",'',$v);
				$artitem .= "{dede:field name='$k'}$v{/dede:field}\r\n";
			}
		}//End Foreach

		// Run the deferred user functions and emit their fields.
		foreach($tmpLtKeys as $k=>$sarr)
		{
			$v = $this->RunPHP($sarr['v'],$sarr['f']);
			$v = preg_replace("/( )$/D",'',$v);
			$v = preg_replace("/[\r\n\t ]{1,}$/D",'',$v);
			$artitem .= "{dede:field name='$k'}$v{/dede:field}\r\n";
		}

		// Thumbnail: download the supplied one when enabled, otherwise fall back to $this->breImage.
		if($litpic!='' && $this->lists['listpic']==1)
		{
			$artitem .= "{dede:field name='litpic'}".$this->DownMedia($litpic,'img',true)."{/dede:field}\r\n";
		}
		else
		{
			$artitem .= "{dede:field name='litpic'}".$this->breImage."{/dede:field}\r\n";
		}
		return $artitem;
	}

	//下载内容里的资源
	/**
	 * Download the media referenced by an HTML fragment and rewrite the fragment
	 * so each original reference is replaced by what DownMedia() returns.
	 *
	 * The removed eregi() calls are replaced by preg_match(); each pair of
	 * "extension + query string" / "extension at end" tests is merged into one
	 * pattern with an optional query part. Modifiers: i = case-insensitive (as eregi),
	 * s and D keep "." and "$" behaving as they did under the POSIX functions.
	 *
	 * @param string $html HTML fragment; passed by reference and modified in place
	 * @param string $url  URL handed to $this->cDedeHtml->SetSource() together with the fragment
	 * @return string the rewritten fragment (same value as $html after the call)
	 */
	function DownMedias(&$html,$url)
	{
		$swfPattern = '/\.(swf)(\?(.*))?$/isD';
		$imgPattern = '/\.(jpg|gif|png)(\?(.*))?$/isD';

		$this->cDedeHtml->SetSource($html,$url,'media');

		// Media found in tags: key is the reference as written, value is its kind.
		foreach($this->cDedeHtml->Medias as $k=>$v)
		{
			$furl = $this->cDedeHtml->FillUrl($k);

			// Embedded objects are only downloaded when they point at a .swf file.
			if($v=='embed' && !preg_match($swfPattern,$k))
			{
				continue;
			}
			$okurl = $this->DownMedia($furl,$v);
			$html = str_replace($k,$okurl,$html);
		}

		// Hyperlinks that point directly at an image or flash file (the key is the link).
		foreach($this->cDedeHtml->Links as $v=>$k)
		{
			if(preg_match($imgPattern,$v))
			{
				$m = "img";
			}
			else if(preg_match($swfPattern,$v))
			{
				$m = "embed";
			}
			else
			{
				continue;
			}
			$furl = $this->cDedeHtml->FillUrl($v);
			$okurl = $this->DownMedia($furl,$m);
			$html = str_replace($v,$okurl,$html);
		}
		return $html;
	}

	//仅替换内容里的资源为绝对网址
	/**
	 * Rewrite the media references of an HTML fragment without downloading anything:
	 * every key of $this->cDedeHtml->Medias is replaced by the result of FillUrl() for it.
	 *
	 * @param string $html  HTML fragment; passed by reference and modified in place
	 * @param string $dourl URL handed to SetSource() together with the fragment
	 * @return string the rewritten fragment
	 */
	function MediasReplace(&$html,$dourl)
	{
		$this->cDedeHtml->SetSource($html,$dourl,'media');
		foreach($this->cDedeHtml->Medias as $rawRef=>$mediaKind)
		{
			// Only the key (the reference as found in the markup) is needed here.
			$rawRef = trim($rawRef);
			$html = str_replace($rawRef,$this->cDedeHtml->FillUrl($rawRef),$html);
		}
		return $html;
	}

	//测试列表
	/**
	 * Test the list configuration: fetch the first configured source and return the links found.
	 *
	 * For an RSS source the result of GetRssLinks() is returned unfiltered. Otherwise the first
	 * list URL is downloaded, optionally narrowed to the configured area, parsed with DedeHtml2,
	 * and the links are filtered by the 'nothas' (exclude) and 'musthas' (require) patterns.
	 *
	 * The removed eregi() is replaced by preg_match(). The configured patterns are wrapped in
	 * "#" delimiters (a "#" inside the pattern is escaped) and matched case-insensitively.
	 * NOTE(review): patterns were written for POSIX regex; PCRE differs for some escape
	 * sequences — confirm existing rules still behave as intended.
	 *
	 * @param string $dourl passed by reference; set to the URL that was actually used ('' on a config error)
	 * @return array the accepted links; empty on failure, with $this->errString set
	 */
	function Testlists(&$dourl)
	{
		$links = array();

		// RSS source: the feed URL is the list URL.
		if($this->lists['sourcetype']=='rss')
		{
			$dourl = $this->lists['rssurl'];
			$links = GetRssLinks($dourl);
			return $links;
		}

		// Regular source: only the first configured list URL is tested.
		if(isset($this->lists['url'][0][0]))
		{
			$dourl = $this->lists['url'][0][0];
		}
		else
		{
			$dourl = '';
			$this->errString = "配置中指定列表的网址错误!\r\n";
			return $links;
		}
		$dhtml = new DedeHtml2();
		$html = $this->DownOnePage($dourl);
		if($html=='')
		{
			$this->errString = "读取网址: $dourl 时失败!\r\n";
			return $links;
		}

		// Restrict parsing to the configured area when both boundaries are set.
		if( trim($this->lists['areastart']) !='' && trim($this->lists['areaend']) != '' )
		{
			$areabody = $this->lists['areastart'].'[var:区域]'.$this->lists['areaend'];
			$html = $this->GetHtmlArea('[var:区域]',$areabody,$html);
		}

		// Build the filter patterns once; an empty string means "filter disabled".
		$notHasRe = '';
		if($this->lists['nothas']!='')
		{
			$notHasRe = '#'.str_replace('#','\\#',$this->lists['nothas']).'#is';
		}
		$mustHasRe = '';
		if($this->lists['musthas']!='')
		{
			$mustHasRe = '#'.str_replace('#','\\#',$this->lists['musthas']).'#is';
		}

		$dhtml->SetSource($html,$dourl,'link');
		foreach($dhtml->Links as $s)
		{
			if($notHasRe!='' && preg_match($notHasRe,$s['link']))
			{
				continue;
			}
			if($mustHasRe!='' && !preg_match($mustHasRe,$s['link']))
			{
				continue;
			}
			$links[] = $s;
		}
		return $links;
	}

	//测试文章规则
	/**
	 * Test the article rules against one URL by delegating to DownUrl().
	 *
	 * @param string $dourl URL of the article page to test
	 * @return mixed whatever DownUrl(0, $dourl, '', false) returns
	 */
	function TestArt($dourl)
	{
		$testResult = $this->DownUrl(0,$dourl,'',false);
		return $testResult;
	}

	//采集种子网址
	/**
	 * Collect seed URLs for this node and store them in `#@__co_htmls` / `#@__co_urls`.
	 *
	 * Fixes over the previous version:
	 *  - the RSS branch stored the literal string 'dtime' in the `dtime` column; it now stores
	 *    time(), as the list branch already did;
	 *  - the removed eregi() is replaced by preg_match() (patterns wrapped in "#" delimiters,
	 *    case-insensitive). NOTE(review): patterns were written for POSIX regex; PCRE differs
	 *    for some escape sequences — confirm existing rules still behave as intended;
	 *  - unused locals were removed and the inner loop no longer reuses the outer loop's key variable.
	 *
	 * @param int $islisten -1: delete all stored URLs and pages of the node first;
	 *                      1: skip links whose hash is already in `#@__co_urls`;
	 *                      any value other than -1 deletes only already exported pages on the first batch
	 * @param int $glstart  number of list URLs already processed (0 for the first batch)
	 * @param int $pagesize number of list URLs to process in this call
	 * @return int 0 when nothing is left to process (always for RSS), the number of list URLs
	 *             still to process otherwise, or -1 when the first batch produced no links
	 */
	function GetSourceUrl($islisten=0,$glstart=0,$pagesize=10)
	{
		// Cleanup happens only on the first batch.
		if($glstart==0)
		{
			if($islisten == -1)
			{
				// Re-collect everything: drop the URL history and all stored pages.
				$this->dsql->ExecuteNoneQuery("Delete From `#@__co_urls` where nid='".$this->noteId."'");
				$this->dsql->ExecuteNoneQuery("Delete From `#@__co_htmls` where nid='".$this->noteId."' ");
			}
			else
			{
				// Keep unexported pages and the URL history; drop only exported pages.
				$this->dsql->ExecuteNoneQuery("Delete From `#@__co_htmls` where nid='".$this->noteId."' And isexport=1 ");
			}
		}

		// Seeds from an RSS feed.
		if($this->lists['sourcetype']=='rss')
		{
			$links = GetRssLinks($this->lists['rssurl']);
			krsort($links);
			foreach($links as $v)
			{
				if($islisten==1)
				{
					$lrow = $this->dsql->GetOne("Select * From `#@__co_urls` where nid='{$this->noteId}' And hash='".md5($v['link'])."' ");
					if(is_array($lrow))
					{
						continue;
					}
				}

				$inquery = "INSERT INTO `#@__co_htmls` (`nid` ,`typeid`, `title` , `litpic` , `url` , `dtime` , `isdown` , `isexport` , `result`)
                    VALUES ('{$this->noteId}' , '0', '".addslashes($v['title'])."' , '".addslashes($v['image'])."' , '".addslashes($v['link'])."' , '".time()."' , '0' , '0' , ''); ";
				$this->dsql->ExecuteNoneQuery($inquery);

				$inquery = "INSERT INTO `#@__co_urls`(hash,nid) VALUES ('".md5($v['link'])."','{$this->noteId}');";
				$this->dsql->ExecuteNoneQuery($inquery);
			}
			return 0;
		}

		// Seeds from the configured list pages, processed in batches of $pagesize.
		$tmplink = array();
		$moviePostion = 0;
		$endpos = $glstart + $pagesize;
		$totallen = count($this->lists['url']);

		// Build the filter patterns once; an empty string means "filter disabled".
		$notHasRe = '';
		if($this->lists['nothas']!='')
		{
			$notHasRe = '#'.str_replace('#','\\#',$this->lists['nothas']).'#is';
		}
		$mustHasRe = '';
		if($this->lists['musthas']!='')
		{
			$mustHasRe = '#'.str_replace('#','\\#',$this->lists['musthas']).'#is';
		}

		foreach($this->lists['url'] as $cururls)
		{
			$cururl = $cururls[0];
			$typeid = (empty($cururls[1]) ? 0 : $cururls[1]);
			$moviePostion++;
			if($moviePostion > $endpos)
			{
				break;
			}
			if($moviePostion <= $glstart)
			{
				// Already handled by an earlier batch.
				continue;
			}

			$html = $this->DownOnePage($cururl);
			if( trim($this->lists['areastart']) !='' && trim($this->lists['areaend']) != '' )
			{
				$areabody = $this->lists['areastart'].'[var:区域]'.$this->lists['areaend'];
				$html = $this->GetHtmlArea('[var:区域]',$areabody,$html);
			}
			$this->cDedeHtml->SetSource($html,$cururl,'link');
			foreach($this->cDedeHtml->Links as $v)
			{
				if($notHasRe!='' && preg_match($notHasRe,$v['link']))
				{
					continue;
				}
				if($mustHasRe!='' && !preg_match($mustHasRe,$v['link']))
				{
					continue;
				}
				// Each entry pairs the link record with the type id of its list page.
				$tmplink[] = array($v,$typeid);
			}
			$this->cDedeHtml->Clear();
		}

		// Number of list URLs left for later batches.
		$remaining = ($endpos >= $totallen) ? 0 : ($totallen-$endpos);

		// Links are stored in reverse order of discovery.
		krsort($tmplink);
		if(count($tmplink)==0)
		{
			// An empty first batch is reported as an error; later empty batches just continue.
			if($glstart==0)
			{
				return -1;
			}
			return $remaining;
		}

		foreach($tmplink as $vs)
		{
			$v = $vs[0];
			$typeid = $vs[1];
			if($islisten==1)
			{
				$lrow = $this->dsql->GetOne("Select * From `#@__co_urls` where nid='{$this->noteId}' And hash='".md5($v['link'])."' ");
				if(is_array($lrow))
				{
					continue;
				}
			}
			$inquery = "INSERT INTO `#@__co_htmls` (`nid` ,`typeid`, `title` , `litpic` , `url` , `dtime` , `isdown` , `isexport` , `result`)
                    VALUES ('{$this->noteId}' ,'$typeid', '".addslashes($v['title'])."' , '".addslashes($v['image'])."' , '".addslashes($v['link'])."' , '".time()."' , '0' , '0' , ''); ";
			$this->dsql->ExecuteNoneQuery($inquery);

			$inquery = "INSERT INTO `#@__co_urls`(hash,nid) VALUES ('".md5($v['link'])."','{$this->noteId}');";
			$this->dsql->ExecuteNoneQuery($inquery);
		}
		return $remaining;
	}

	//用扩展函数处理采集到的原始数据
	/**
	 * Post-process a collected value with the PHP snippet configured for its rule.
	 *
	 * Placeholders in the snippet (bare or wrapped in single/double quotes) are rewritten to
	 * local variables before it is evaluated:
	 *   @me     -> $DedeMeValue     (the value being processed; the snippet changes it by assigning to @me)
	 *   @body   -> $DedeBodyValue   ($this->tmpHtml)
	 *   @litpic -> $DedeLitPicValue ($this->breImage)
	 *
	 * The removed eregi() is replaced by a case-insensitive preg_match().
	 *
	 * @param mixed  $fvalue  the collected value
	 * @param string $phpcode the PHP snippet; a trailing ";" is appended before evaluation
	 * @return mixed the value of $DedeMeValue after the snippet has run
	 */
	function RunPHP($fvalue,$phpcode)
	{
		// These variable names are part of the snippet contract: the rewritten code refers to them.
		$DedeMeValue = $fvalue;
		$phpcode = preg_replace("/'@me'|\"@me\"|@me/isU",'$DedeMeValue',$phpcode);
		if(preg_match('/@body/i',$phpcode))
		{
			$DedeBodyValue = $this->tmpHtml;
			$phpcode = preg_replace("/'@body'|\"@body\"|@body/isU",'$DedeBodyValue',$phpcode);
		}
		if(preg_match('/@litpic/i',$phpcode))
		{
			$DedeLitPicValue = $this->breImage;
			$phpcode = preg_replace("/'@litpic'|\"@litpic\"|@litpic/isU",'$DedeLitPicValue',$phpcode);
		}

		// NOTE(review): eval() runs the snippet with the full privileges of this process.
		// It must only ever receive code from a trusted source — confirm who can edit the rules.
		eval($phpcode.";");
		return $DedeMeValue;
	}

	//编码转换
	/**
	 * Convert a string in place from the node's source encoding to the system encoding.
	 *
	 * The system encoding is read from the global $cfg_soft_lang ('utf-8' or anything else),
	 * the source encoding from $this->noteInfos["sourcelang"]. Combinations not listed below
	 * leave the string untouched.
	 *
	 * @param string $str passed by reference; replaced by the converted text
	 * @return void
	 */
	function ChangeCode(&$str)
	{
		global $cfg_soft_lang;
		$sourceLang = $this->noteInfos["sourcelang"];

		if($cfg_soft_lang!='utf-8')
		{
			// Non-UTF-8 system: everything is brought to GB.
			if($sourceLang=="utf-8")
			{
				$str = utf82gb($str);
			}
			if($sourceLang=="big5")
			{
				$str = big52gb($str);
			}
			return;
		}

		// UTF-8 system: GB converts directly, Big5 goes through GB first.
		if($sourceLang=="gb2312")
		{
			$str = gb2utf8($str);
		}
		if($sourceLang=="big5")
		{
			$str = gb2utf8(big52gb($str));
		}
	}
}
?>

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -