📄 gather.php

📁 mv 具体不是很清楚
💻 PHP
📖 第 1 页 / 共 2 页
字号:
上一页 12
		$iframeurl=$curl;
	}else{
		//开头做正则处理
		if($rsdb[show_begin_preg]){
			$htmlcode=$show_content;
			include(PHP168_PATH."cache/gather_show.begin_preg.php");
			$show_content=$htmlcode;
		}

		//对一篇文章多页的处理,只是在第一页的时候处理.第二页就不需要了
		if($rsdb[show_firstpage]&&$rsdb[show_morepage]&&!$morepage){
			$i=1;
			unset($moreurl_db);
			do{
				$i++;
				//后面页与第一页的不同之处做替换得到后页的真实地址
				$nexturl=str_replace($rsdb[show_firstpage],str_replace("[page]",$i,$rsdb[show_morepage]),$curl);
				//对一些特殊的网站处理.比如第一页是index.htm第二页竟然是index_1.htm
				if($i==2&&$rsdb[show_spe2page]){
					$tsurl=str_replace($rsdb[show_firstpage],str_replace("[page]",1,$rsdb[show_morepage]),$curl);
					if(ereg(basename($tsurl),$show_content)){
						$moreurl_db[$page][]="$tsurl\t$title\t1";
					}
				}
				if(ereg(basename($nexturl),$show_content)){
					$moreurl_db[$page][]="$nexturl\t$title\t$i";
				}else{
					$i=0;
				}
			}
			while($i!=0);
			if(is_array($moreurl_db[$page])){
				//对原要采集的文章再重新处理,因为增加了分页
				foreach($urldb AS $key=>$value){
					$_urlDB[]="\$urldb[]='$value';";
					if($page==$key&&is_array($moreurl_db[$key])){
						foreach($moreurl_db[$key] AS $key2=>$value2){
							$_urlDB[]="\$urldb[]='$value2';";
						}
					}
					
				}
				$write_file=implode("\r\n",$_urlDB);
				write_file(PHP168_PATH."cache/gather_title.php","<?php\r\n$write_file");
				unset($urldb);
				include(PHP168_PATH."cache/gather_title.php");
			}
		}

		//用户自定义正则,对文章做正则
		if($rsdb[content_rule])
		{

			//把空白都去除,方便处理
			$rsdb[content_rule]=clean_blank($rsdb[content_rule]);
			$show_content=clean_blank($show_content);

			//获取正则里的规则数组
			preg_match_all("/\{(.*?)\}/is",$rsdb[content_rule],$array);
		
			//获取变量
			foreach( $array[1] AS $key=>$value){
				if( !ereg("^NO",$value)&&!ereg("^\*",$value) ){
					$detail=explode("=",$value);
					$ruledb[++$key]=$detail[0];
				}
			}
		
			//获取处理后能使用的规则
			$rule = get_rule($rsdb[content_rule]);

			//对采集的内容跟据正则进行校正
			preg_match_all("/$rule/is",$show_content,$array2);

			//获取有用的数组
			foreach( $ruledb AS $key=>$value){
				foreach( $array2[$key] AS $key2=>$value2){
					$listdb[$value][]=$value2;
				}
			}
			
			//把用户自定义的变量都取出来
			foreach( $listdb AS $key=>$value){
				$$key=$value[0];
			}

			if($content)
			{
				$show_content=$content;
			}
			//主要是处理那种画中画的广告.把文章截成两段了
			elseif($content1)
			{
				$show_content=$content1.$content2;
			}
			echo ("$videourl<hr>");
		}


		//过滤文章前面无效内容
		if($rsdb[show_begin_code]){
			$show_content=strstr($show_content,$rsdb[show_begin_code]);
			$num_1=strlen($rsdb[show_begin_code]);
			$num_2=strlen($show_content);
			$show_content=substr($show_content,$num_1,$num_2);
		}
		//过滤文章后的无效内容
		if($rsdb[show_end_code]){
			$end_content=strstr($show_content,$rsdb[show_end_code]);
			$show_content=str_replace($end_content,"",$show_content);
		}
		//过滤文章中不想看到的文字
		if($rsdb[show_replace_word]){
			$detail=explode("\r\n",$rsdb[show_replace_word]);
			foreach($detail AS $key=>$value){
				list($oldword,$newword)=explode("|",$value);
				$show_content=str_replace($oldword,$newword,$show_content);
			}
		}
		//文章结尾做正则处理
		if($rsdb[show_end_preg]){
			$htmlcode=$show_content;
			include(PHP168_PATH."cache/gather_show.end_preg.php");
			$show_content=$htmlcode;
		}
	}
	
	//获取文件.文件切割符,图片一般src=,这里默认是图片
	if(!$rsdb[file_explode]){
		$rsdb[file_explode]='src=';
		$show_content=str_replace("SRC=","src=",$show_content);
		if( !$rsdb[file_type] && $rsdb[fixsystem]=='article' ){
			$rsdb[file_type]="jpg|gif|png";
		}
	}
	$Filedb=GetFileUrl($rsdb,$show_content,$curl);
	//结尾正则,可以直接通过正则获取文件地址,如果不获取文件的话.与上面已有结尾正则是有点雷同
	if($rsdb[show_endfile_preg]){
		$htmlcode=$show_content;
		include(PHP168_PATH."cache/gather_show.endfile_preg.php");
		$show_content=$htmlcode;
	}
	//文件本地化
	if( $Filedb && $GetFile && $fid ){
		$dir_id=$file_dir?$file_dir:$fid;
		if(!is_dir(PHP168_PATH."$webdb[updir]/$dir_id")){
			makepath(PHP168_PATH."$webdb[updir]/$dir_id");
		}
		if($GetFile){
			foreach($Filedb AS $key2=>$fileurl){
				$Filedb[$key2]="$dir_id/".rands(6).basename($fileurl);
				if(strstr($Filedb[$key2],'?')){
					$Filedb[$key2]=str_replace("?","_____",$Filedb[$key2]);
				}
				$file_Type=strrchr($Filedb[$key2],".");
				if(strlen($file_Type)>5){
					$Filedb[$key2].=".rar";
				}
				if( $getfilecontent=sockOpenUrl($fileurl) ){
					write_file(PHP168_PATH."$webdb[updir]/{$Filedb[$key2]}",$getfilecontent);
				}else{
					copy($fileurl,PHP168_PATH."$webdb[updir]/{$Filedb[$key2]}");
				}
			}
		}
	}
	//采集边浏览图片
	$Filedb || $Filedb=array();
	foreach($Filedb AS $key2=>$fileurl){
		if(eregi(".jpg$",$fileurl)||eregi(".gif$",$fileurl)){
			echo "<img src=".tempdir($fileurl)."><br>";
		}
		echo "<A HREF='$fileurl' target=_blank>$fileurl</A><hr>";
	}
	
	$detail_title=explode("@@",$title);
	if($detail_title[1]==''){
		$title=$detail_title[0];
	}

	$content=$show_content;
	//导入哪个系统进行选择
	if(!$system_type||!file_exists("inc/gather/system.$system_type.php")){
		$system_type="article";
	}
	//不测试的时候.入库
	if(!$testgather ){
		include("inc/gather/system.$system_type.php");
	}
	
	$page++;
	if($urldb[$page]){
		$p=$page-1;
		//只显示部分方便用户查看采集情况
		$testgather || $content=get_word($content,1000);
		$content=filtrate($content);
		echo "正在采集第[$page]页,请耐心等待...<A HREF={$urldb[$p]} target=_blank>{$urldb[$p]}</A><hr>$content";
		echo "<META HTTP-EQUIV=REFRESH CONTENT='1;URL=?lfj=$lfj&action=$action&id=$id&system_type=$system_type&GetFile=$GetFile&file_dir=$file_dir&makesmallpic=$makesmallpic&showpic=$showpic&username=$username&fid=$fid&testgather=$testgather&page=$page'>";
		exit;
	}else{
		$num=count($urldb);
		if($testgather){
			refreshto("gather.php?lfj=$lfj&job=list","测试采集完毕,模拟总共采集了{$num}篇,其实没有入数据库",20);
		}else{
			refreshto("gather.php?lfj=$lfj&job=list","采集完毕,总共采集了{$num}篇",10);
		}
	}
	
}
elseif($job=="list")
{
	// List view: load every gather rule, newest id first.
	$query = $db->query("SELECT * FROM {$pre}mv_gather_rule ORDER BY id DESC");
	while($rs = $db->fetch_array($query)){
		// Turn the stored posttime into a Y-m-d string for display.
		$rs[posttime]=date("Y-m-d",$rs[posttime]);
		$listdb[]=$rs;
	}
	require("head.php");
	require("template/gather/list.htm");
	require("foot.php");
}
elseif($job=="addrulesql")
{
	// Render the template/gather/addrulesql.htm page between the shared header and footer.
	require("head.php");
	require("template/gather/addrulesql.htm");
	require("foot.php");
}
elseif($action=='addrulesql')
{	
	// Import a rule from an SQL statement held in $sqlcode.
	// A statement containing a single quote has its backslashes stripped;
	// otherwise it is URL-decoded (the share branch below emits a urlencode()d statement).
	if(strstr($sqlcode,"'")){
		$sqlcode=StripSlashes($sqlcode);
	}else{
		$sqlcode=urldecode($sqlcode);
	}
	// Shared statements use the fixed table name p8_mv_gather_rule; map it to this site's prefix.
	$sqlcode=str_replace('p8_mv_gather_rule',"{$pre}mv_gather_rule",$sqlcode);
	// NOTE(review): $sqlcode is executed verbatim. Its origin is not visible in this chunk;
	// if it comes from request input this runs arbitrary SQL, so confirm that access to
	// this action is restricted to trusted administrators.
	$db->query($sqlcode);
	refreshto("gather.php?lfj=gather&job=list","如果刚才页面没有报错,那恭喜你,导入成功",1);
}
elseif($job=='sharerulesql')
{
	// Export one rule as a url-encoded INSERT statement that another site can import.
	// NOTE(review): $id is interpolated straight into the query; where it is set and
	// whether it is sanitised is not visible here - confirm.
	$rsdb=$db->get_one("SELECT * FROM {$pre}mv_gather_rule WHERE id='$id'");
	// Escape every column value before it is embedded in the generated SQL text.
	// NOTE(review): mysql_escape_string() was removed in PHP 7.
	foreach($rsdb AS $key=>$value){
		$rsdb[$key]=mysql_escape_string($value);
	}
	// Expose each column as a local variable so the statement below can interpolate it.
	// This overwrites any existing local that shares a column name.
	extract($rsdb);
	// The id column is left empty and the table name is the fixed placeholder
	// p8_mv_gather_rule, which the import branch rewrites to the local prefix.
	$SQL="INSERT INTO `p8_mv_gather_rule` (`id`, `type`, `fixsystem`, `filetype`, `webname`, `listurl`, `firstpage`, `page_begin`, `page_end`, `page_step`, `title_minleng`, `listmoreurl`, `link_include_word`, `link_noinclude_word`, `link_replace_word`, `title_replace_word`, `list_begin_code`, `list_end_code`, `list_begin_preg`, `list_end_preg`, `gatherthesame`, `show_begin_preg`, `show_end_preg`, `show_endfile_preg`, `show_begin_code`, `show_end_code`, `show_replace_word`, `show_morepage`, `show_firstpage`, `show_spe2page`, `posttime`, `list`, `copypic`, `sort`, `file_type`, `file_minleng`, `file_minsize`, `file_includeword`, `file_noincludeword`, `file_explode`, `file_picwidth`, `file_star_string`, `title_rule`, `content_rule`, `title_morepage_rull`, `content_morepage_rull`, `charset_type`) VALUES ('','$type','$fixsystem','$filetype','$webname','$listurl','$firstpage','$page_begin','$page_end','$page_step','$title_minleng','$listmoreurl','$link_include_word','$link_noinclude_word','$link_replace_word','$title_replace_word','$list_begin_code','$list_end_code','$list_begin_preg','$list_end_preg','$gatherthesame','$show_begin_preg','$show_end_preg','$show_endfile_preg','$show_begin_code','$show_end_code','$show_replace_word','$show_morepage','$show_firstpage','$show_spe2page','$posttime','$list','$copypic','$sort','$file_type','$file_minleng','$file_minsize','$file_includeword','$file_noincludeword','$file_explode','$file_picwidth','$file_star_string','$title_rule','$content_rule','$title_morepage_rull','$content_morepage_rull','$charset_type');";
	$SQL=urlencode($SQL);
	require("head.php");
	require("template/gather/sharerulesql.htm");
	require("foot.php");
}
elseif($action=="deleterule")
{
	// Delete the rule whose id matches $id, then return to the rule list.
	// NOTE(review): $id is interpolated into the DELETE unescaped; confirm it is validated upstream.
	$db->query("DELETE FROM {$pre}mv_gather_rule WHERE id='$id'");
	refreshto("gather.php?lfj=gather&job=list","删除成功",1);
}
elseif($job=="showfid")
{
	// Hand over to inc/gather/show_system_fid.php (its contents are not visible in this chunk).
	include_once("inc/gather/show_system_fid.php");
}
/**
 * Extract link/title pairs from HTML by splitting it on a URL prefix.
 *
 * $file is split on every occurrence of $word. For each resulting segment
 * (from the third one onwards) the text between "target=_blank>" and the
 * next "</a>" is taken as the title, and the text before the first
 * " class=" (after "class=a01 " has been injected in front of
 * "target=_blank") is appended to $word to form the URL.
 *
 * @param string $word URL prefix used as the split delimiter; also prepended to each URL.
 * @param string $file HTML to scan.
 * @return array List of array('url'=>string,'title'=>string,'j'=>int), where j is a
 *               1-based running counter. Empty array when $word is empty or nothing
 *               matched (the original returned an undefined variable in that case).
 */
function SinaTitle($word,$file){
	$listdb=array();
	// explode() cannot split on an empty delimiter, so there is nothing to extract.
	if(!$word){
		return $listdb;
	}
	$detail=explode($word,$file);
	$count=count($detail);
	$j=0;
	// Segments 0 and 1 are deliberately skipped: extraction starts after the
	// second occurrence of $word, exactly as the original loop did.
	for($i=2;$i<$count;$i++){
		// Inject a class attribute so " class=" can serve as the end-of-URL marker.
		$segment=str_replace("target=_blank","class=a01 target=_blank",$detail[$i]);
		$detail2=explode("target=_blank>",$segment);
		if(!isset($detail2[1])){
			// No link text follows this occurrence of $word.
			continue;
		}
		$detail3=explode("</a>",$detail2[1]);
		$title=$detail3[0];
		$detail4=explode(" class=",$segment);
		$url=$word.$detail4[0];
		if(!$url||!$title){
			continue;
		}
		// Keys are quoted: bare words as array keys are undefined constants and fatal on PHP 8.
		$listdb[]=array(
			'url'=>$url,
			'title'=>$title,
			'j'=>++$j,
		);
	}
	return $listdb;
}

/**
 * Collect the file (by default image) URLs referenced in a fetched page.
 *
 * $show_content is split on $rsdb['file_explode'] (e.g. "src="); the first
 * token after each delimiter is taken as a candidate address, filtered by
 * the optional rules in $rsdb and then made absolute relative to $curl.
 *
 * Rule keys read from $rsdb (all optional except file_explode):
 *  - file_explode       delimiter that precedes each address
 *  - file_type          "|"-separated regex fragments, one must match the END of the address
 *  - file_star_string   "|"-separated regex fragments, one must match the START of the address
 *  - file_includeword   CRLF-separated substrings, at least one must be present
 *  - file_noincludeword CRLF-separated substrings, none may be present
 *  - file_minsize       minimum size in bytes; the file is downloaded to
 *                       PHP168_PATH."cache/gather_.file" to measure it
 *
 * Changes from the previous implementation:
 *  - ereg()/eregi() (removed in PHP 7) are replaced by preg_match(); the
 *    rule fragments are still treated as case-insensitive regular expressions.
 *  - https:// and protocol-relative (//host/..) addresses are recognised
 *    instead of being treated as relative paths.
 *  - Relative addresses are resolved against the directory part of $curl,
 *    rather than by deleting every occurrence of basename($curl) from it.
 *  - The size probe clears PHP's stat cache and skips files that fail to download.
 *
 * Side effect: appends the raw (unresolved) address of every accepted file
 * to the global $oldFileDB, in the same order as the returned list.
 *
 * @param array  $rsdb         Gather rule settings (see keys above).
 * @param string $show_content HTML to scan.
 * @param string $curl         URL of the page being scanned, used to resolve relative addresses.
 * @return array List of absolute file URLs; empty array when none qualify
 *               (the original returned an undefined variable in that case).
 */
function GetFileUrl($rsdb,$show_content,$curl){
	global $oldFileDB;
	$fileDB=array();

	// Without a delimiter there is nothing to split on (explode() rejects '').
	if(!isset($rsdb['file_explode'])||(string)$rsdb['file_explode']===''){
		return $fileDB;
	}

	// Loop-invariant pieces of the page URL used to resolve relative addresses.
	$origin='';
	$scheme='';
	if(preg_match('~^((https?)://[^/?#]+)~i',$curl,$m)){
		$origin=$m[1];	// e.g. http://www.example.com
		$scheme=$m[2];	// e.g. http
	}
	$pageurl=preg_replace('~[?#].*$~s','',$curl);	// drop query string / fragment
	$lastslash=strrpos($pageurl,'/');
	if($lastslash===false){
		$basedir='';
	}elseif($origin!==''&&$lastslash<strlen($origin)){
		// URL has no path at all (http://host): the directory is the site root.
		$basedir=$pageurl.'/';
	}else{
		$basedir=substr($pageurl,0,$lastslash+1);
	}

	$segments=explode($rsdb['file_explode'],$show_content);
	// The first segment is the text before the first delimiter; it holds no address.
	array_shift($segments);

	foreach($segments AS $value){
		// Take the first token after optional quotes/spaces as the address.
		$fileurl=$oldFileurl=preg_replace("/(['\" ]*)([^'\" >]+)(.*)/is","\\2",$value);
		if(!$fileurl){
			continue;
		}
		// Required ending of the address (for images typically jpg etc.).
		if(!empty($rsdb['file_type'])){
			$CK=0;
			foreach(explode("|",$rsdb['file_type']) AS $value2){
				if($value2 && preg_match('~'.str_replace('~','\~',$value2).'$~iD',$fileurl)){
					$CK=1;
				}
			}
			if(!$CK){
				continue;
			}
		}
		// Required beginning of the address.
		if(!empty($rsdb['file_star_string'])){
			$CK=0;
			foreach(explode("|",$rsdb['file_star_string']) AS $value2){
				if($value2 && preg_match('~^'.str_replace('~','\~',$value2).'~i',$fileurl)){
					$CK=1;
				}
			}
			if(!$CK){
				continue;
			}
		}
		// Substrings of which at least one must appear in the address.
		if(!empty($rsdb['file_includeword'])){
			$CK=0;
			foreach(explode("\r\n",$rsdb['file_includeword']) AS $value2){
				if($value2 && strpos($fileurl,$value2)!==false){
					$CK=1;
				}
			}
			if(!$CK){
				continue;
			}
		}
		// Substrings that must not appear in the address.
		if(!empty($rsdb['file_noincludeword'])){
			$CK=0;
			foreach(explode("\r\n",$rsdb['file_noincludeword']) AS $value2){
				if($value2 && strpos($fileurl,$value2)!==false){
					$CK=1;
				}
			}
			if($CK){
				continue;
			}
		}
		// Turn the address into an absolute URL.
		if(!preg_match('~^https?://~i',$fileurl)){
			if(strncmp($fileurl,'//',2)===0){
				// Protocol-relative: reuse the page's scheme.
				if($scheme!==''){
					$fileurl=$scheme.':'.$fileurl;
				}
			}elseif(substr($fileurl,0,1)==='/'){
				// Root-relative: prepend scheme and host of the page.
				// If $curl has no recognisable http(s) origin the address is left as is.
				if($origin!==''){
					$fileurl=$origin.$fileurl;
				}
			}else{
				// Path-relative: resolve against the page's directory.
				$fileurl=$basedir.$fileurl;
			}
		}
		// Enforce the minimum file size by downloading the file to a scratch location.
		// NOTE(review): copying from a URL presumably relies on allow_url_fopen - confirm.
		if(!empty($rsdb['file_minsize'])){
			$probe=PHP168_PATH."cache/gather_.file";
			$copied=copy($fileurl,$probe);
			// filesize() results are cached per path; the same scratch path is
			// reused on every iteration, so the cache must be cleared first.
			clearstatcache();
			if(!$copied||filesize($probe)<$rsdb['file_minsize']){
				continue;
			}
		}
		$fileDB[]=$fileurl;
		$oldFileDB[]=$oldFileurl;
	}
	return $fileDB;
}


/**
 * Convert a user-written content rule into the body of a PCRE pattern.
 *
 * Everything outside {...} placeholders is matched literally: the regex
 * metacharacters \ ( ) [ ] " . ? $ ^ / + are backslash-escaped. Each
 * {...} placeholder is then replaced by the capture group produced by
 * replace_preg(). The result has no delimiters; callers wrap it themselves
 * (e.g. "/$rule/is").
 *
 * The placeholder expansion previously used preg_replace() with the /e
 * modifier, which was removed in PHP 7; it now uses preg_replace_callback().
 *
 * @param string $string Rule text containing literal markup and {...} placeholders.
 * @return string Pattern body.
 */
function get_rule($string){
	// Single pass; equivalent to the former chain of str_replace() calls,
	// which escaped the backslash first so that later escapes were not doubled.
	$string=strtr($string,array(
		'\\'=>'\\\\',
		'('=>'\\(',
		')'=>'\\)',
		'['=>'\\[',
		']'=>'\\]',
		'"'=>'\\"',
		'.'=>'\\.',
		'?'=>'\\?',
		'$'=>'\\$',
		'^'=>'\\^',
		'/'=>'\\/',
		'+'=>'\\+',
	));
	return preg_replace_callback("/\{(.*?)\}/is",'get_rule_expand',$string);
}

/**
 * preg_replace_callback() handler for get_rule(): expands one {...} placeholder.
 *
 * @param array $match Match array; index 1 holds the text between the braces.
 * @return string Capture group for the placeholder, or '' when the rule is not recognised.
 */
function get_rule_expand($match){
	// The /e modifier ran the capture through addslashes() and embedded it in a
	// single-quoted literal, which left double quotes and NUL bytes
	// backslash-escaped. Reproduce that so replace_preg() receives the same
	// input it always did.
	$inner=str_replace(array('"',"\0"),array('\\"','\\0'),$match[1]);
	return (string)replace_preg($inner);
}

/**
 * Translate the contents of one {...} placeholder into a regex capture group.
 *
 * Accepted forms, either bare or after a "name=" prefix:
 *  - NOxyz  => ([^xyz]*)  any run of characters other than x, y, z
 *  - *      => (.*?)      shortest possible match
 *  - **     => (.*)       longest possible match
 *
 * @param string $string Placeholder text; a backslash-escaped double quote (\") is unescaped first.
 * @return string|null Capture group, or null when the rule is not recognised.
 */
function replace_preg($string){
	$string=str_replace('\"','"',$string);

	// Bare rule, e.g. {*} or {NO<}.
	$group=replace_preg_rule($string);
	if($group!==null){
		return $group;
	}
	// Named rule, e.g. {content=*}: the rule is the part after the first "=".
	$detail=explode("=",$string);
	if(!isset($detail[1])){
		return null;
	}
	return replace_preg_rule($detail[1]);
}

/**
 * Map a single rule token to its capture group.
 *
 * @param string $rule Rule token without any "name=" prefix.
 * @return string|null Capture group, or null when $rule is not NO..., * or **.
 */
function replace_preg_rule($rule){
	if(strncmp($rule,'NO',2)===0){
		// The excluded characters are whatever sits between the leading "NO"
		// and the next "NO" (or the end of the token).
		$detail=explode("NO",$rule);
		return "([^{$detail[1]}]*)";
	}
	if($rule==='*'){
		return "(.*?)";
	}
	if($rule==='**'){
		return "(.*)";
	}
	return null;
}

/**
 * Strip layout whitespace so that rules and fetched HTML can be compared
 * without regard to formatting.
 *
 * Removes every CR and LF, removes runs of spaces/tabs that sit between a
 * closing ">" and an opening "<", and trims space characters (only spaces,
 * not tabs) from both ends.
 *
 * @param string $str Text to normalise.
 * @return string Normalised text.
 */
function clean_blank($str){
	// Drop all carriage returns and line feeds.
	$flat=str_replace(array("\r","\n"),"",$str);
	// Collapse whitespace-only gaps between adjacent tags.
	$flat=preg_replace("/>([ \t]*)</is","><",$flat);
	// Trim leading and trailing spaces.
	return trim($flat," ");
}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -