⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 gather.php

📁 mv 具体不是很清楚
💻 PHP
📖 第 1 页 / 共 2 页
字号:
<?php
// Admin script for "gather" (remote content collection) rules. The rest of the file is a
// single if/elseif chain that dispatches on $job and $action; neither is assigned in this
// file (presumably populated from the request by global.php — TODO confirm).
require_once("global.php");

// No execution time limit: later branches download remote pages.
set_time_limit(0);
// 7 = E_ERROR | E_WARNING | E_PARSE, i.e. notices are not reported.
error_reporting(7);
if ($job == "add_title") {
	// Show the "new rule" form. The variables below are defaults set before the
	// edit_title template is required; their names must stay as they are because the
	// template is loaded into this same scope.
	$rulepage = '0';
	$webname = "被采集的网站名称";
	$page_step = 1;
	$link_noinclude_word = "<";

	require "head.php";
	require "template/gather/edit_title.htm";
	require "foot.php";
}
elseif ($action == "add_title") {
	// Create the rule identified by $posttime, or update it when a row with that
	// posttime already exists (e.g. the form was submitted a second time).
	if ($db->get_one("SELECT * FROM `{$pre}mv_gather_rule` WHERE posttime='$posttime' ")) {
		$db->query("UPDATE `{$pre}mv_gather_rule` SET webname='$webname',listurl='$listurl',firstpage='$firstpage',page_begin='$page_begin',page_end='$page_end',page_step='$page_step',listmoreurl='$listmoreurl',link_include_word='$link_include_word',link_noinclude_word='$link_noinclude_word',link_replace_word='$link_replace_word',title_replace_word='$title_replace_word',list_begin_code='$list_begin_code',list_end_code='$list_end_code',list_begin_preg='$list_begin_preg',list_end_preg='$list_end_preg',title_minleng='$title_minleng',title_rule='$title_rule',charset_type='$charset_type' WHERE posttime='$posttime'");
	} else {
		$db->query("INSERT INTO `{$pre}mv_gather_rule` ( `type` , `filetype` , `webname` , `listurl` , `firstpage` , `page_begin` , `page_end` , `page_step` , `listmoreurl` , `link_include_word` , `link_noinclude_word` , `link_replace_word` , `title_replace_word` , `list_begin_code` , `list_end_code` , `list_begin_preg` , `list_end_preg` , `gatherthesame` , `title_minleng` , `show_end_preg` , `show_begin_code` , `show_end_code` , `show_replace_word` , `show_morepage` , `posttime` , `list`,title_rule,charset_type ) 
		VALUES (
		'$type','$filetype','$webname','$listurl','$firstpage','$page_begin','$page_end','$page_step','$listmoreurl','$link_include_word','$link_noinclude_word','$link_replace_word','$title_replace_word','$list_begin_code','$list_end_code','$list_begin_preg','$list_end_preg','$gatherthesame','$title_minleng','$show_end_preg','$show_begin_code','$show_end_code','$show_replace_word','$show_morepage','$posttime','$timestamp','$title_rule','$charset_type'
		)");
	}

	// Look the row up again to learn its id for the follow-up page.
	$rs = $db->get_one("SELECT id FROM {$pre}mv_gather_rule WHERE posttime='$posttime' ");

	// Optional test run of the title gathering; otherwise continue with the content settings.
	// NOTE(review): both calls are kept sequential as in the original; whether refreshto()
	// terminates the request is not visible in this file.
	if ($testgather) {
		refreshto("gather.php?lfj=$lfj&action=gather_title&id=$rs[id]&showurl=1","正在测试采集标题,请耐心等待",1);
	}
	refreshto("gather.php?lfj=$lfj&job=edit_content&id=$rs[id]","继续下一步",1);
}
elseif($job=="edit_title")
{
	// Show the list-page ("title") settings form for rule $id.
	// NOTE(review): $id is interpolated into the SQL as-is; this relies on request
	// escaping done outside this file (presumably in global.php) — confirm.
	$rsdb=$db->get_one("SELECT * FROM {$pre}mv_gather_rule WHERE id='$id'");

	// Import the row's columns as variables for the template. Only do so when a row was
	// actually returned: extract() on a non-array is a TypeError on PHP 8, which the
	// former @ operator does not suppress.
	if(is_array($rsdb)){
		extract($rsdb);
	}

	// '1' when a paged list URL is configured, '0' otherwise.
	$rulepage=empty($listurl)?'0':'1';

	// Pre-check the radio button of the stored charset type (0 when there is no row).
	// The key is quoted: a bare constant as array key is an Error on PHP 8.
	$charset_typedb[is_array($rsdb)?intval($rsdb['charset_type']):0]=' checked ';

	require("head.php");
	require("template/gather/edit_title.htm");
	require("foot.php");
}
elseif ($action == "edit_title") {
	// Save the list-page ("title") settings of rule $id. The statement is assembled
	// from fragments for readability; the resulting SQL text is one UPDATE.
	$db->query(
		"UPDATE `{$pre}mv_gather_rule` SET "
		."webname='$webname',listurl='$listurl',firstpage='$firstpage',"
		."page_begin='$page_begin',page_end='$page_end',page_step='$page_step',"
		."listmoreurl='$listmoreurl',"
		."link_include_word='$link_include_word',link_noinclude_word='$link_noinclude_word',link_replace_word='$link_replace_word',"
		."title_replace_word='$title_replace_word',"
		."list_begin_code='$list_begin_code',list_end_code='$list_end_code',"
		."list_begin_preg='$list_begin_preg',list_end_preg='$list_end_preg',"
		."title_minleng='$title_minleng',title_rule='$title_rule',charset_type='$charset_type'"
		." WHERE id='$id'"
	);

	// Optional test run of the title gathering; otherwise continue with the content settings.
	// NOTE(review): kept sequential as in the original; whether refreshto() ends the
	// request is not visible in this file.
	if ($testgather) {
		refreshto("gather.php?lfj=$lfj&action=gather_title&id=$id&showurl=1","正在测试采集标题,请耐心等待",1);
	}
	refreshto("gather.php?lfj=$lfj&job=edit_content&id=$id","继续下一步设置详细参数",1);
}
elseif($job=="edit_content")
{
	// Show the detail-page ("content") settings form for rule $id.
	// NOTE(review): $id is interpolated into the SQL as-is; this relies on request
	// escaping done outside this file (presumably in global.php) — confirm.
	$rsdb=$db->get_one("SELECT * FROM {$pre}mv_gather_rule WHERE id='$id'");

	// Import the row's columns as variables for the template. Guarded because
	// extract() on a non-array is a TypeError on PHP 8 that @ cannot suppress.
	if(is_array($rsdb)){
		extract($rsdb);
	}

	// Default display type when the rule has none stored.
	if(empty($type)){
		$type='iframe';
	}
	$typedb[$type]=" checked ";

	// Normalise the two flags to integers (0 when absent) and pre-check their radio buttons.
	$gatherthesame=isset($gatherthesame)?intval($gatherthesame):0;
	$gatherthesamedb[$gatherthesame]=" checked ";
	$show_spe2page=isset($show_spe2page)?intval($show_spe2page):0;
	$show_spe2pagedb[$show_spe2page]=" checked ";

	require("head.php");
	require("template/gather/edit_content.htm");
	require("foot.php");
}
elseif ($action == "edit_content") {
	// Save the detail-page ("content") settings of rule $id.
	if (!$fixsystem) {
		$fixsystem = 'article';
	}

	// Assembled from fragments for readability; the resulting SQL text is one UPDATE.
	$db->query(
		"UPDATE {$pre}mv_gather_rule SET  type='$type',gatherthesame='$gatherthesame',"
		."show_begin_preg='$show_begin_preg',show_end_preg='$show_end_preg',show_endfile_preg='$show_endfile_preg',"
		."show_begin_code='$show_begin_code',show_end_code='$show_end_code',"
		."show_replace_word='$show_replace_word',show_morepage='$show_morepage',show_firstpage='$show_firstpage',"
		."copypic='$copypic',sort='$sort',"
		."file_type='$file_type',file_minleng='$file_minleng',file_minsize='$file_minsize',"
		."file_includeword='$file_includeword',file_noincludeword='$file_noincludeword',"
		."file_explode='$file_explode',file_picwidth='$file_picwidth',"
		."fixsystem='$fixsystem',file_star_string='$file_star_string',content_rule='$content_rule'"
		." WHERE id='$id'"
	);

	// Optional test run (starts with gathering titles); otherwise back to the rule list.
	// NOTE(review): kept sequential as in the original; whether refreshto() ends the
	// request is not visible in this file.
	if ($testgather) {
		refreshto("gather.php?lfj=gather&action=gather_title&id=$id&testgather=$testgather","请耐心等待,先采集标题,你再点击选择测试采集",1);
	}
	refreshto("gather.php?lfj=gather&job=list","修改成功",1);
}
elseif($action=="gather_title")
{
	// Gather titles: each request fetches ONE list page of rule $id, extracts
	// (url, title[, picurl]) entries from it, appends them to cache/gather_title.php and
	// then meta-refreshes to the next list page, or to job=list_title when none is left.
	//
	// Changes versus the previous revision:
	//  - ereg() (removed in PHP 7) replaced by preg_match()/strpos();
	//  - array keys quoted (bare constants as keys are an Error on PHP 8);
	//  - accumulators initialised so implode()/foreach never receive null;
	//  - word filters use strpos() !== false (strstr() yields a falsy "0" tail);
	//  - root-relative links are resolved against https list pages as well;
	//  - values written into generated PHP cache files are escaped.
	//
	// NOTE(review): $id is interpolated into the SQL as-is; this relies on request
	// escaping done outside this file (presumably in global.php) — confirm.
	$rsdb=$db->get_one("SELECT * FROM {$pre}mv_gather_rule WHERE id='$id'");
	$page=intval($page);
	if(!$page){
		// First request of a run: expand the rule into the list-page URLs, store them as
		// a PHP cache file, and reset the title cache and hook files.
		$allurldb=array();
		if($rsdb['listmoreurl']){
			// Explicit list-page URLs, one per line.
			$detail=explode("\r\n",$rsdb['listmoreurl']);
			foreach( $detail AS $key=>$value){
				// Escape \ and ' so the URL cannot terminate the generated single-quoted literal.
				$value=str_replace(array("\\","'"),array("\\\\","\\'"),$value);
				$allurldb[]="\$urldb[]='$value';";
			}
		}else{
			// Paged list URL: substitute [page] for every page in the configured range.
			if(!$rsdb['page_step']){
				$rsdb['page_step']=1;
			}

			for($i=$rsdb['page_begin'];$i<=$rsdb['page_end'];$i++ ){
				if($rsdb['page_begin']==0){
					$II=($i-1)*$rsdb['page_step'];
				}else{
					$II=($i-1)*$rsdb['page_step']+1;
				}

				$value=str_replace("[page]","$II",$rsdb['listurl']);
				// Page 1 may have its own dedicated URL.
				if($i==1&&$rsdb['firstpage']){
					$value=$rsdb['firstpage'];
				}
				$value=str_replace(array("\\","'"),array("\\\\","\\'"),$value);
				$allurldb[]="\$urldb[]='$value';";
			}
		}
		$allurl=implode("\r\n",$allurldb);
		write_file(PHP168_PATH."cache/gather_morepage.php","<?php\r\n".$allurl);
		write_file(PHP168_PATH."cache/gather_title.php","<?php\r\n");

		// SECURITY NOTE(review): the *_preg columns are written out as PHP source and
		// later include()d, i.e. whoever can edit a rule can run arbitrary PHP. Left
		// as-is because it is the feature's design, but access to rule editing must be
		// restricted accordingly.
		if($rsdb['list_begin_preg']){
			write_file(PHP168_PATH."cache/gather_list.begin_preg.php","<?php\r\n".$rsdb['list_begin_preg']);
		}
		if($rsdb['list_end_preg']){
			write_file(PHP168_PATH."cache/gather_list.end_preg.php","<?php\r\n".$rsdb['list_end_preg']);
		}
		if($rsdb['show_begin_preg']){
			write_file(PHP168_PATH."cache/gather_show.begin_preg.php","<?php\r\n".$rsdb['show_begin_preg']);
		}
		if($rsdb['show_end_preg']){
			write_file(PHP168_PATH."cache/gather_show.end_preg.php","<?php\r\n".$rsdb['show_end_preg']);
		}
		if($rsdb['show_endfile_preg']){
			write_file(PHP168_PATH."cache/gather_show.endfile_preg.php","<?php\r\n".$rsdb['show_endfile_preg']);
		}
	}

	// Load the list-page URLs; start from an empty array so only the cache file fills it.
	$urldb=array();
	@include(PHP168_PATH."cache/gather_morepage.php");
	$listurl=isset($urldb[$page])?$urldb[$page]:'';

	// Fetch the list page, trying several transports in turn.
	if(!$listurl)
	{
		// Nothing to fetch (page index out of range); an empty path would make
		// file_get_contents() throw on PHP 8.
		$list_content='';
		echo("采集失败<br><br><br><br><br><br><br><br><br><br>");
	}
	elseif($list_content=file_get_contents($listurl))
	{
	}
	elseif($list_content=file($listurl))
	{
		$list_content=implode("",$list_content);
	}
	elseif(copy($listurl,PHP168_PATH."cache/gather_cache.php"))
	{
		$list_content=read_file(PHP168_PATH."cache/gather_cache.php");
	}
	elseif($list_content=sockOpenUrl($listurl)){
	}
	else
	{
		echo("采集失败<br><br><br><br><br><br><br><br><br><br>");
	}

	// UTF8->GBK
	if($rsdb['charset_type']==1){
		require_once(PHP168_PATH."inc/class.chinese.php");
		$cnvert = new Chinese("UTF8","GB2312",$list_content,PHP168_PATH."./inc/gbkcode/");
		$list_content = $cnvert->ConvertIT();
	}

	// Leading hook: rule-supplied PHP that may rewrite the page via $htmlcode.
	if($rsdb['list_begin_preg']){
		$htmlcode=$list_content;	// hooks always work on a variable named $htmlcode
		include(PHP168_PATH."cache/gather_list.begin_preg.php");
		$list_content=$htmlcode;
	}

	// Keep only the part from the begin marker to the end of the page.
	if($rsdb['list_begin_code']){
		$list_content=strstr($list_content,$rsdb['list_begin_code']);
	}

	// Drop everything from the end marker onwards.
	if($rsdb['list_end_code']){
		$end_content=strstr($list_content,$rsdb['list_end_code']);
		$list_content=str_replace($end_content,"",$list_content);
	}

	// Apply "old|new" replacements, one pair per line.
	if($rsdb['title_replace_word']){
		$detail=explode("\r\n",$rsdb['title_replace_word']);
		foreach($detail AS $key=>$value){
			// A line without "|" replaces with the empty string.
			list($oldword,$newword)=array_pad(explode("|",$value),2,'');
			$list_content=str_replace($oldword,$newword,$list_content);
		}
	}

	$listdb=array();
	// User-defined extraction rule.
	if($rsdb['title_rule'])
	{
		// Strip whitespace from rule and page so both are matched in the same form.
		$rsdb['title_rule']=clean_blank($rsdb['title_rule']);
		$list_content=clean_blank($list_content);

		// Collect the {...} placeholders of the rule.
		preg_match_all("/\{(.*?)\}/is",$rsdb['title_rule'],$array);

		// Map placeholder position (1-based) to its field name; placeholders starting
		// with "NO" or "*" carry no field.
		$ruledb=array();
		foreach( $array[1] AS $key=>$value){
			if( !preg_match("/^NO/",$value)&&!preg_match("/^\*/",$value) ){
				$detail=explode("=",$value);
				$ruledb[++$key]=$detail[0];
			}
		}

		// Turn the rule into a usable regular expression.
		$rule = get_rule($rsdb['title_rule']);

		// Match the page against it.
		$array2=array();
		preg_match_all("/$rule/is",$list_content,$array2);

		// Regroup the captures by field name.
		foreach( $ruledb AS $key=>$value){
			if(!isset($array2[$key])||!is_array($array2[$key])){
				continue;
			}
			foreach( $array2[$key] AS $key2=>$value2){
				$listdb[$value][]=$value2;
			}
		}

		// The rule must provide a "url" field; without it there is nothing to iterate.
		$detail_content=isset($listdb['url'])?$listdb['url']:array();
	}

	else
	{
		// No rule: normalise anchors and split the page at every href=.
		$list_content=str_replace("HREF=","href=",$list_content);
		$list_content=str_replace("</A>","</a>",$list_content);
		$list_content=str_replace("href='","href=",$list_content);
		$list_content=str_replace('href="','href=',$list_content);
		$detail_content=explode("href=",$list_content);	
	}


	unset($i,$_url,$_title);
	$UT=array();
	foreach($detail_content AS $key_content=>$value_content){
		if($rsdb['title_rule'])
		{
			$url=(string)$value_content;
			$title=isset($listdb['title'][$key_content])?(string)$listdb['title'][$key_content]:'';
			$picurl=isset($listdb['picurl'][$key_content])?$listdb['picurl'][$key_content]:'';
		}
		else
		{
			// Chunk 0 is the text before the first href=.
			if($key_content==0){
				continue;
			}
			$picurl='';
			// URL: everything up to the first quote, space or ">".
			$url=(string)preg_replace("/([^'\" >]+)(.*)/is","\\1",$value_content);
			// Title: the text between the first ">" and "</a>", minus one wrapping tag pair.
			$s1_title=strstr($value_content,">");
			$s2_title=strstr($value_content,"</a>");
			$s3_title=str_replace($s2_title,"",$s1_title);
			$title=substr($s3_title,1,strlen($s3_title)-1);
			$title=(string)preg_replace("/<([^<>]+)>(.*)<([^<>]+)>/is","\\2",$title);
		}

		$skip=false;

		// Words that must NOT occur in title or URL.
		if($rsdb['link_noinclude_word']){
			$detail=explode("\r\n",$rsdb['link_noinclude_word']);
			foreach($detail AS $key=>$value){
				if(!$value){
					continue;
				}
				if(strpos($title,$value)!==false||strpos($url,$value)!==false){
					$skip=true;
					break;
				}
			}
		}

		// Words that MUST all occur in the URL.
		if(!$skip&&$rsdb['link_include_word']){
			$detail=explode("\r\n",$rsdb['link_include_word']);
			foreach($detail AS $key=>$value){
				if(!$value){
					continue;
				}
				if(strpos($url,$value)===false){
					$skip=true;
					break;
				}
			}
		}

		// Minimum title length (in bytes, as before).
		if(!$skip&&$rsdb['title_minleng']){
			if(strlen($title)<$rsdb['title_minleng']+1){
				$skip=true;
			}
		}

		// Make the addresses absolute and record the entry.
		if(!$skip&&$url&&$title){
			if(strpos($url,"://")===false){
				if(substr($url,0,1)=="/"){
					// Root-relative: prefix scheme and host of the list page (http or https).
					if(preg_match("/^(https?:\/\/[^\/]+)/i",$listurl,$host_match)){
						$url=$host_match[1].$url;
					}
				}else{
					// NOTE(review): simplistic directory resolution kept from the original;
					// it misbehaves when the list URL ends in "/" — confirm before relying on it.
					$url=str_replace(basename($listurl),"",$listurl).$url;
				}

			}
			if($picurl&&strpos($picurl,"://")===false){
				if(substr($picurl,0,1)=="/"){
					if(preg_match("/^(https?:\/\/[^\/]+)/i",$listurl,$host_match)){
						$picurl=$host_match[1].$picurl;
					}
				}else{
					$picurl=str_replace(basename($listurl),"",$listurl).$picurl;
				}

			}

			$url=str_replace("'","&#39;",$url);
			$picurl=str_replace("'","&#39;",$picurl);
			$title=str_replace("'","&#39;",$title);
			$_url[]=$url;
			$_title[]=$title;

			// Cache line format: url <TAB> title[@@picurl]. Backslashes are doubled so a
			// trailing "\" cannot escape the closing quote of the generated literal.
			$line=$picurl?"$url\t$title@@$picurl":"$url\t$title";
			$UT[]="\$urldb[]='".str_replace("\\","\\\\",$line)."';";
		}
	}
	$writefile=implode("\r\n",$UT);

	// Trailing hook: rule-supplied PHP that may rewrite the generated lines via $htmlcode.
	if($rsdb['list_end_preg'])
	{
		$htmlcode=$writefile;	// hooks always work on a variable named $htmlcode
		include(PHP168_PATH."cache/gather_list.end_preg.php");
		$writefile=$htmlcode;
	}

	write_file(PHP168_PATH."cache/gather_title.php","\r\n".$writefile,'a');
	$page++;

	if(!empty($urldb[$page])){
		// More list pages left: show a preview of what has been collected so far
		// and reload for the next page.
		$urldb=array();
		include(PHP168_PATH."cache/gather_title.php");
		echo "$listurl<br>正在采集第[{$page}]页的标题与内容网址,请稍候...<hr>";
		foreach( $urldb AS $key=>$value){
			if($key>50){
				break;
			}
			echo "$value<br>";
		}
		echo "<META HTTP-EQUIV=REFRESH CONTENT='0;URL=gather.php?lfj=$lfj&action=$action&id=$id&showurl=$showurl&testgather=$testgather&page=$page'>";
		exit;
	}else{
		// All list pages done: go to the title selection page.
		echo "<META HTTP-EQUIV=REFRESH CONTENT='0;URL=gather.php?lfj=$lfj&showurl=$showurl&testgather=$testgather&job=list_title&id=$id'>";
		exit;
	}
}
elseif($job=="list_title")
{
	// Show the gathered titles of rule $id so the operator can pick which to import.
	// NOTE(review): $id is interpolated into the SQL as-is; this relies on request
	// escaping done outside this file (presumably in global.php) — confirm.
	$rs=$ruledb=$db->get_one("SELECT * FROM {$pre}mv_gather_rule WHERE id='$id'");

	// Warn when the rule's display type makes most of the options on the page irrelevant.
	// Keys are quoted (bare constants as array keys are an Error on PHP 8) and the
	// lookup is skipped when no row was returned.
	if(is_array($rs)&&$rs['type']=="jump"){
		$msg="注意:当前配置文件设置的参数:点击标题后跳转到外部网址,使得下面大部分无效";
	}elseif(is_array($rs)&&$rs['type']=="iframe"){
		$msg="注意:当前配置文件设置的参数:点击标题后框架外部网址,类似大旗、奇虎,使得下面大部分无效";
	}

	// In test mode the page is given a script call to emit (used by the template).
	if($testgather){
		$autosub="autosub();";
	}

	// Load the gathered entries. Start from an empty array, as the gather_content
	// branch does with unset($urldb), so only the cache file fills $urldb.
	$urldb=array();
	include(PHP168_PATH."cache/gather_title.php");
	require("head.php");
	require("template/gather/list_title.htm");
	require("foot.php");
}
elseif($action=="list_title")
{
	// Keep only the entries the operator ticked ($postdb, keyed by entry index), rewrite
	// cache/gather_title.php with them in reverse order, then hand over to the
	// gather_content step via meta refresh.

	// Start from an empty array so count() below is always given an array and only the
	// cache file fills $urldb.
	$urldb=array();
	include(PHP168_PATH."cache/gather_title.php");

	// Walk backwards so the rewritten cache is in reverse order.
	// $UT is initialised so implode() gets an array even when nothing was selected.
	$UT=array();
	$num=count($urldb)-1;
	for($i=$num;$i>=0;$i--){
		if(!empty($postdb[$i])){
			// Escape \ and ' so the value round-trips through the generated
			// single-quoted literal unchanged.
			$UT[]="\$urldb[]='".str_replace(array("\\","'"),array("\\\\","\\'"),$urldb[$i])."';";
		}
	}

	$writefile=implode("\r\n",$UT);
	write_file(PHP168_PATH."cache/gather_title.php","<?php\r\n".$writefile);
	$action='gather_content';
	// NOTE(review): the parameters are echoed into the URL unencoded, as before.
	echo "<META HTTP-EQUIV=REFRESH CONTENT='1;URL=?lfj=$lfj&action=$action&id=$id&system_type=$system_type&GetFile=$GetFile&file_dir=$file_dir&makesmallpic=$makesmallpic&showpic=$showpic&username=$username&fid=$fid&testgather=$testgather&page=$page'>";
	exit;
}
elseif($action=="gather_content")
{
	unset($urldb);
	$rsdb=$db->get_one("SELECT * FROM {$pre}mv_gather_rule WHERE id='$id'");
	@include(PHP168_PATH."cache/gather_title.php");
	$page=intval($page);
	//$morepage大于0时.代表多页
	list($curl,$title,$morepage)=explode("\t",$urldb[$page]);

	if($show_content=file_get_contents($curl))
	{
	}
	elseif($show_content=file($curl))
	{
		$show_content=implode("",$show_content);
	}
	elseif(copy($curl,PHP168_PATH."cache/gather_cache.php"))
	{
		$show_content=read_file(PHP168_PATH."cache/gather_cache.php");
	}
	elseif($show_content=sockOpenUrl($curl))
	{
	}
	else
	{
		echo("服务器获取不了远程文件信息,因而采集失败$curl<br><br><br><br><br><br><br><br><br><br>");
	}
	
	//UTF8->GBK
	if($rsdb[charset_type]==1){
		require_once(PHP168_PATH."inc/class.chinese.php");
		$cnvert = new Chinese("UTF8","GB2312",$show_content,PHP168_PATH."./inc/gbkcode/");
		$show_content = $cnvert->ConvertIT();
	}

	if($rsdb[type]=='iframe'){//类似奇虎

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -