📄 gather.php
字号:
$iframeurl=$curl;
}else{
//开头做正则处理
if($rsdb[show_begin_preg]){
$htmlcode=$show_content;
include(PHP168_PATH."cache/gather_show.begin_preg.php");
$show_content=$htmlcode;
}
//对一篇文章多页的处理,只是在第一页的时候处理.第二页就不需要了
if($rsdb[show_firstpage]&&$rsdb[show_morepage]&&!$morepage){
$i=1;
unset($moreurl_db);
do{
$i++;
//后面页与第一页的不同之处做替换得到后页的真实地址
$nexturl=str_replace($rsdb[show_firstpage],str_replace("[page]",$i,$rsdb[show_morepage]),$curl);
//对一些特殊的网站处理.比如第一页是index.htm第二页竟然是index_1.htm
if($i==2&&$rsdb[show_spe2page]){
$tsurl=str_replace($rsdb[show_firstpage],str_replace("[page]",1,$rsdb[show_morepage]),$curl);
if(ereg(basename($tsurl),$show_content)){
$moreurl_db[$page][]="$tsurl\t$title\t1";
}
}
if(ereg(basename($nexturl),$show_content)){
$moreurl_db[$page][]="$nexturl\t$title\t$i";
}else{
$i=0;
}
}
while($i!=0);
if(is_array($moreurl_db[$page])){
//对原要采集的文章再重新处理,因为增加了分页
foreach($urldb AS $key=>$value){
$_urlDB[]="\$urldb[]='$value';";
if($page==$key&&is_array($moreurl_db[$key])){
foreach($moreurl_db[$key] AS $key2=>$value2){
$_urlDB[]="\$urldb[]='$value2';";
}
}
}
$write_file=implode("\r\n",$_urlDB);
write_file(PHP168_PATH."cache/gather_title.php","<?php\r\n$write_file");
unset($urldb);
include(PHP168_PATH."cache/gather_title.php");
}
}
//用户自定义正则,对文章做正则
if($rsdb[content_rule])
{
//把空白都去除,方便处理
$rsdb[content_rule]=clean_blank($rsdb[content_rule]);
$show_content=clean_blank($show_content);
//获取正则里的规则数组
preg_match_all("/\{(.*?)\}/is",$rsdb[content_rule],$array);
//获取变量
foreach( $array[1] AS $key=>$value){
if( !ereg("^NO",$value)&&!ereg("^\*",$value) ){
$detail=explode("=",$value);
$ruledb[++$key]=$detail[0];
}
}
//获取处理后能使用的规则
$rule = get_rule($rsdb[content_rule]);
//对采集的内容跟据正则进行校正
preg_match_all("/$rule/is",$show_content,$array2);
//获取有用的数组
foreach( $ruledb AS $key=>$value){
foreach( $array2[$key] AS $key2=>$value2){
$listdb[$value][]=$value2;
}
}
//把用户自定义的变量都取出来
foreach( $listdb AS $key=>$value){
$$key=$value[0];
}
if($content)
{
$show_content=$content;
}
//主要是处理那种画中画的广告.把文章截成两段了
elseif($content1)
{
$show_content=$content1.$content2;
}
echo ("$videourl<hr>");
}
//过滤文章前面无效内容
if($rsdb[show_begin_code]){
$show_content=strstr($show_content,$rsdb[show_begin_code]);
$num_1=strlen($rsdb[show_begin_code]);
$num_2=strlen($show_content);
$show_content=substr($show_content,$num_1,$num_2);
}
//过滤文章后的无效内容
if($rsdb[show_end_code]){
$end_content=strstr($show_content,$rsdb[show_end_code]);
$show_content=str_replace($end_content,"",$show_content);
}
//过滤文章中不想看到的文字
if($rsdb[show_replace_word]){
$detail=explode("\r\n",$rsdb[show_replace_word]);
foreach($detail AS $key=>$value){
list($oldword,$newword)=explode("|",$value);
$show_content=str_replace($oldword,$newword,$show_content);
}
}
//文章结尾做正则处理
if($rsdb[show_end_preg]){
$htmlcode=$show_content;
include(PHP168_PATH."cache/gather_show.end_preg.php");
$show_content=$htmlcode;
}
}
//获取文件.文件切割符,图片一般src=,这里默认是图片
if(!$rsdb[file_explode]){
$rsdb[file_explode]='src=';
$show_content=str_replace("SRC=","src=",$show_content);
if( !$rsdb[file_type] && $rsdb[fixsystem]=='article' ){
$rsdb[file_type]="jpg|gif|png";
}
}
$Filedb=GetFileUrl($rsdb,$show_content,$curl);
//结尾正则,可以直接通过正则获取文件地址,如果不获取文件的话.与上面已有结尾正则是有点雷同
if($rsdb[show_endfile_preg]){
$htmlcode=$show_content;
include(PHP168_PATH."cache/gather_show.endfile_preg.php");
$show_content=$htmlcode;
}
//文件本地化
if( $Filedb && $GetFile && $fid ){
$dir_id=$file_dir?$file_dir:$fid;
if(!is_dir(PHP168_PATH."$webdb[updir]/$dir_id")){
makepath(PHP168_PATH."$webdb[updir]/$dir_id");
}
if($GetFile){
foreach($Filedb AS $key2=>$fileurl){
$Filedb[$key2]="$dir_id/".rands(6).basename($fileurl);
if(strstr($Filedb[$key2],'?')){
$Filedb[$key2]=str_replace("?","_____",$Filedb[$key2]);
}
$file_Type=strrchr($Filedb[$key2],".");
if(strlen($file_Type)>5){
$Filedb[$key2].=".rar";
}
if( $getfilecontent=sockOpenUrl($fileurl) ){
write_file(PHP168_PATH."$webdb[updir]/{$Filedb[$key2]}",$getfilecontent);
}else{
copy($fileurl,PHP168_PATH."$webdb[updir]/{$Filedb[$key2]}");
}
}
}
}
//采集边浏览图片
$Filedb || $Filedb=array();
foreach($Filedb AS $key2=>$fileurl){
if(eregi(".jpg$",$fileurl)||eregi(".gif$",$fileurl)){
echo "<img src=".tempdir($fileurl)."><br>";
}
echo "<A HREF='$fileurl' target=_blank>$fileurl</A><hr>";
}
$detail_title=explode("@@",$title);
if($detail_title[1]==''){
$title=$detail_title[0];
}
$content=$show_content;
//导入哪个系统进行选择
if(!$system_type||!file_exists("inc/gather/system.$system_type.php")){
$system_type="article";
}
//不测试的时候.入库
if(!$testgather ){
include("inc/gather/system.$system_type.php");
}
$page++;
if($urldb[$page]){
$p=$page-1;
//只显示部分方便用户查看采集情况
$testgather || $content=get_word($content,1000);
$content=filtrate($content);
echo "正在采集第[$page]页,请耐心等待...<A HREF={$urldb[$p]} target=_blank>{$urldb[$p]}</A><hr>$content";
echo "<META HTTP-EQUIV=REFRESH CONTENT='1;URL=?lfj=$lfj&action=$action&id=$id&system_type=$system_type&GetFile=$GetFile&file_dir=$file_dir&makesmallpic=$makesmallpic&showpic=$showpic&username=$username&fid=$fid&testgather=$testgather&page=$page'>";
exit;
}else{
$num=count($urldb);
if($testgather){
refreshto("gather.php?lfj=$lfj&job=list","测试采集完毕,模拟总共采集了{$num}篇,其实没有入数据库",20);
}else{
refreshto("gather.php?lfj=$lfj&job=list","采集完毕,总共采集了{$num}篇",10);
}
}
}
elseif($job=="list")
{
$query = $db->query("SELECT * FROM {$pre}mv_gather_rule ORDER BY id DESC");
while($rs = $db->fetch_array($query)){
$rs[posttime]=date("Y-m-d",$rs[posttime]);
$listdb[]=$rs;
}
require("head.php");
require("template/gather/list.htm");
require("foot.php");
}
elseif($job=="addrulesql")
{
require("head.php");
require("template/gather/addrulesql.htm");
require("foot.php");
}
elseif($action=='addrulesql')
{
if(strstr($sqlcode,"'")){
$sqlcode=StripSlashes($sqlcode);
}else{
$sqlcode=urldecode($sqlcode);
}
$sqlcode=str_replace('p8_mv_gather_rule',"{$pre}mv_gather_rule",$sqlcode);
$db->query($sqlcode);
refreshto("gather.php?lfj=gather&job=list","如果刚才页面没有报错,那恭喜你,导入成功",1);
}
elseif($job=='sharerulesql')
{
$rsdb=$db->get_one("SELECT * FROM {$pre}mv_gather_rule WHERE id='$id'");
foreach($rsdb AS $key=>$value){
$rsdb[$key]=mysql_escape_string($value);
}
extract($rsdb);
$SQL="INSERT INTO `p8_mv_gather_rule` (`id`, `type`, `fixsystem`, `filetype`, `webname`, `listurl`, `firstpage`, `page_begin`, `page_end`, `page_step`, `title_minleng`, `listmoreurl`, `link_include_word`, `link_noinclude_word`, `link_replace_word`, `title_replace_word`, `list_begin_code`, `list_end_code`, `list_begin_preg`, `list_end_preg`, `gatherthesame`, `show_begin_preg`, `show_end_preg`, `show_endfile_preg`, `show_begin_code`, `show_end_code`, `show_replace_word`, `show_morepage`, `show_firstpage`, `show_spe2page`, `posttime`, `list`, `copypic`, `sort`, `file_type`, `file_minleng`, `file_minsize`, `file_includeword`, `file_noincludeword`, `file_explode`, `file_picwidth`, `file_star_string`, `title_rule`, `content_rule`, `title_morepage_rull`, `content_morepage_rull`, `charset_type`) VALUES ('','$type','$fixsystem','$filetype','$webname','$listurl','$firstpage','$page_begin','$page_end','$page_step','$title_minleng','$listmoreurl','$link_include_word','$link_noinclude_word','$link_replace_word','$title_replace_word','$list_begin_code','$list_end_code','$list_begin_preg','$list_end_preg','$gatherthesame','$show_begin_preg','$show_end_preg','$show_endfile_preg','$show_begin_code','$show_end_code','$show_replace_word','$show_morepage','$show_firstpage','$show_spe2page','$posttime','$list','$copypic','$sort','$file_type','$file_minleng','$file_minsize','$file_includeword','$file_noincludeword','$file_explode','$file_picwidth','$file_star_string','$title_rule','$content_rule','$title_morepage_rull','$content_morepage_rull','$charset_type');";
$SQL=urlencode($SQL);
require("head.php");
require("template/gather/sharerulesql.htm");
require("foot.php");
}
elseif($action=="deleterule")
{
$db->query("DELETE FROM {$pre}mv_gather_rule WHERE id='$id'");
refreshto("gather.php?lfj=gather&job=list","删除成功",1);
}
elseif($job=="showfid")
{
include_once("inc/gather/show_system_fid.php");
}
/**
 * Extract link entries from a chunk of HTML by splitting it on a URL marker.
 *
 * $file is split on every occurrence of $word. For each segment from index 2
 * onwards (segments 0 and 1 are never examined), the text between
 * "target=_blank>" and the next "</a>" becomes the title, and the text at the
 * start of the segment, up to the first "target=_blank", is appended to $word
 * to form the URL.
 * NOTE(review): why the first two segments are skipped is not visible here --
 * presumably page-header links; confirm against the caller.
 *
 * @param string $word Marker to split on; it is also prepended to every URL.
 * @param string $file HTML source to scan.
 * @return array List of array('url' => string, 'title' => string, 'j' => int),
 *               where 'j' is a 1-based running number. Empty array when $word
 *               is empty or nothing matched.
 */
function SinaTitle($word,$file){
    $listdb = array();
    // explode() cannot split on an empty delimiter, so there is nothing to do.
    if (!$word) {
        return $listdb;
    }
    $detail = explode($word, $file);
    $count = count($detail);
    $j = 0;
    for ($i = 2; $i < $count; $i++) {
        // Insert a " class=" marker in front of target=_blank so the URL part
        // can be cut off at a fixed delimiter below.
        $segment = str_replace("target=_blank", "class=a01 target=_blank", $detail[$i]);
        $detail2 = explode("target=_blank>", $segment);
        if (!isset($detail2[1])) {
            // No link text in this segment, hence no title.
            continue;
        }
        $detail3 = explode("</a>", $detail2[1]);
        $title = $detail3[0];
        $detail4 = explode(" class=", $segment);
        $url = $word . $detail4[0];
        if (!$url || !$title) {
            continue;
        }
        $listdb[] = array(
            'url'   => $url,
            'title' => $title,
            'j'     => ++$j,
        );
    }
    return $listdb;
}
/**
 * Collect the file (by default: image) URLs referenced in a fetched page.
 *
 * The page source is split on $rsdb['file_explode'] (the caller defaults it to
 * 'src='). The token following each marker is taken as a candidate URL and is
 * then run through the optional rule filters, all evaluated against the raw,
 * not yet absolutised, URL:
 *   - file_type          "|"-separated patterns the URL must END with
 *   - file_star_string   "|"-separated patterns the URL must START with
 *   - file_includeword   CRLF-separated substrings, at least one must occur
 *   - file_noincludeword CRLF-separated substrings, none may occur
 * file_type and file_star_string entries are regular expressions matched
 * case-insensitively, as they were under the original eregi() calls.
 * Surviving URLs are made absolute relative to $curl. When
 * $rsdb['file_minsize'] is set, each file is downloaded once to a probe file
 * under PHP168_PATH."cache/" and dropped if smaller than that many bytes.
 *
 * Side effect: the raw URL of every accepted file is appended to the global
 * $oldFileDB, in the same order as the returned list.
 *
 * @param array  $rsdb         Gather rule row (keys listed above).
 * @param string $show_content Page source to scan.
 * @param string $curl         URL of the page the source was fetched from.
 * @return array List of absolute file URLs; empty array when none qualify.
 */
function GetFileUrl($rsdb,$show_content,$curl){
    global $oldFileDB;
    $fileDB = array();
    // explode() cannot split on an empty delimiter.
    if (empty($rsdb['file_explode'])) {
        return $fileDB;
    }
    $segments = explode($rsdb['file_explode'], $show_content);
    // The first segment is the text before the first marker, never a URL.
    array_shift($segments);
    foreach ($segments as $value) {
        // Strip leading quotes/spaces and keep the run of characters up to the
        // next quote, space or ">".
        $fileurl = $oldFileurl = preg_replace("/(['\" ]*)([^'\" >]+)(.*)/is", "\\2", $value);
        if (!$fileurl) {
            continue;
        }
        // Required ending of the address (for images typically jpg).
        if (!empty($rsdb['file_type'])
            && !_gather_regex_any(explode("|", $rsdb['file_type']), $fileurl, '', '$')) {
            continue;
        }
        // Required beginning of the address.
        if (!empty($rsdb['file_star_string'])
            && !_gather_regex_any(explode("|", $rsdb['file_star_string']), $fileurl, '^', '')) {
            continue;
        }
        // Substrings of which at least one must be present.
        if (!empty($rsdb['file_includeword'])
            && !_gather_contains_any(explode("\r\n", $rsdb['file_includeword']), $fileurl)) {
            continue;
        }
        // Substrings that must not be present.
        if (!empty($rsdb['file_noincludeword'])
            && _gather_contains_any(explode("\r\n", $rsdb['file_noincludeword']), $fileurl)) {
            continue;
        }
        $fileurl = _gather_absolute_url($fileurl, $curl);
        // Minimum file size filter.
        if (!empty($rsdb['file_minsize'])) {
            $probe = PHP168_PATH."cache/gather_.file";
            // A failed download must not be judged by the previous file's bytes.
            if (!copy($fileurl, $probe)) {
                continue;
            }
            // filesize() results are cached per path and the probe path is
            // reused for every candidate, so drop the cache before reading.
            clearstatcache();
            if (filesize($probe) < $rsdb['file_minsize']) {
                continue;
            }
        }
        $fileDB[] = $fileurl;
        $oldFileDB[] = $oldFileurl;
    }
    return $fileDB;
}

/**
 * True when $subject matches at least one non-empty pattern, case-insensitively.
 * Each pattern is wrapped as $prefix . pattern . $suffix ("^" / "$" anchors).
 */
function _gather_regex_any($patterns, $subject, $prefix, $suffix){
    foreach ($patterns as $pattern) {
        if (!$pattern) {
            continue;
        }
        // \x01 is used as the PCRE delimiter because it cannot appear in a rule
        // typed into a form, so the pattern needs no delimiter escaping.
        // "D" keeps "$" at the very end of the subject, as eregi() did.
        if (preg_match("\x01" . $prefix . $pattern . $suffix . "\x01iD", $subject)) {
            return true;
        }
    }
    return false;
}

/** True when $subject contains at least one of the non-empty $needles. */
function _gather_contains_any($needles, $subject){
    foreach ($needles as $needle) {
        if ($needle && strpos($subject, $needle) !== false) {
            return true;
        }
    }
    return false;
}

/**
 * Resolve $fileurl against the page URL $curl.
 * Absolute http(s) URLs are returned unchanged. "//host/..." takes the page's
 * scheme, "/path" the page's scheme and host, and anything else is appended
 * to the page's directory. "../" segments are not normalised.
 */
function _gather_absolute_url($fileurl, $curl){
    if (preg_match('#^https?://#i', $fileurl)) {
        return $fileurl;
    }
    // Query string and fragment are not part of the directory.
    $base = preg_replace('/[?#].*$/s', '', $curl);
    if (!preg_match('#^(https?):(//[^/]+)(/.*)?$#is', $base, $m)) {
        // Page URL is not http(s): a root-relative path cannot be resolved and
        // is left as is; otherwise fall back to stripping the page's basename.
        if (strncmp($fileurl, '/', 1) === 0) {
            return $fileurl;
        }
        return str_replace(basename($curl), "", $curl) . $fileurl;
    }
    if (strncmp($fileurl, '//', 2) === 0) {
        return $m[1] . ':' . $fileurl;
    }
    $origin = $m[1] . ':' . $m[2];
    if (strncmp($fileurl, '/', 1) === 0) {
        return $origin . $fileurl;
    }
    $path = isset($m[3]) ? $m[3] : '/';
    // $path always starts with "/", so strrpos() cannot return false here.
    return $origin . substr($path, 0, strrpos($path, '/') + 1) . $fileurl;
}
/**
 * Turn a user-written content rule into the body of a PCRE pattern.
 *
 * Everything outside "{...}" is taken literally: backslashes are doubled and
 * the characters ( ) [ ] " . ? $ ^ / + are backslash-escaped. Every "{token}"
 * is then replaced by the capture group replace_preg() returns for it, or by
 * an empty string when replace_preg() does not recognise the token.
 * The result is meant to be embedded between "/" delimiters by the caller.
 * NOTE(review): "*" and "|" outside braces are not escaped and therefore keep
 * their regex meaning -- confirm whether that is intended.
 *
 * @param string $string Rule text containing {token} placeholders.
 * @return string Regular-expression body.
 */
function get_rule($string){
    // Backslashes first, so the escapes added below are not doubled again.
    $string = str_replace('\\', '\\\\', $string);
    foreach (array('(', ')', '[', ']', '"', '.', '?', '$', '^', '/', '+') as $char) {
        $string = str_replace($char, '\\' . $char, $string);
    }
    // The original used the "e" (eval) modifier, which PHP 7 removed; a
    // callback performs the same substitution without evaluating code.
    return preg_replace_callback("/\{(.*?)\}/is", '_get_rule_expand_token', $string);
}

/**
 * preg_replace_callback() handler for get_rule(): expands one {token}.
 * Under the old "e" modifier the token reached replace_preg() with an extra
 * backslash in front of every double quote; that is reproduced here so the
 * generated pattern stays byte-identical.
 */
function _get_rule_expand_token($match){
    return (string) replace_preg(str_replace('"', '\\"', $match[1]));
}
/**
 * Translate the inside of one "{...}" rule token into a regex capture group.
 *
 * Recognised forms, tried first on the whole token and then on the part
 * between the first and second "=" (tokens written as "name=rule"):
 *   NOx  -> ([^x]*)   any run of characters not listed in x
 *   *    -> (.*?)     shortest possible match
 *   **   -> (.*)      longest possible match
 *
 * @param string $string Token text without the surrounding braces.
 * @return string|null The capture group, or null for an unrecognised token.
 */
function replace_preg($string){
    // Undo the escaping the caller applied to double quotes.
    $string = str_replace('\"', '"', $string);
    $pattern = _replace_preg_translate($string);
    if ($pattern !== null) {
        return $pattern;
    }
    $detail = explode("=", $string);
    // A token without "=" has no rule part; treat it as unrecognised.
    return _replace_preg_translate(isset($detail[1]) ? $detail[1] : '');
}

/** Map a bare rule ("NOx", "*", "**") to its capture group, or null if unknown. */
function _replace_preg_translate($rule){
    if (strncmp($rule, 'NO', 2) === 0) {
        // Text between the leading "NO" and any further "NO" is the excluded set.
        $detail = explode("NO", $rule);
        return "([^{$detail[1]}]*)";
    }
    if ($rule === '*') {
        return "(.*?)";
    }
    if ($rule === '**') {
        return "(.*)";
    }
    return null;
}
/**
 * Strip layout whitespace from markup so rules can be matched against it.
 *
 * Removes every CR and LF, removes spaces/tabs that sit between a ">" and the
 * following "<", and trims space characters (spaces only, not tabs) from both
 * ends of the string.
 *
 * @param string $str Text to compact.
 * @return string Compacted text.
 */
function clean_blank($str){
    $withoutBreaks = str_replace(array("\r", "\n"), "", $str);
    $tightTags = preg_replace("/>([ \t]*)</is", "><", $withoutBreaks);
    return trim($tightTags, " ");
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -