📄 gather.php
字号:
<?php
require_once("global.php");
set_time_limit(0);
error_reporting(7);
// job=add_title: show the empty "new gather rule" form (step 1: list/title settings).
if ('add_title' == $job) {
    // Default values for a brand-new rule; presumably read by the edit_title
    // template included below (the template itself is not part of this file).
    $rulepage = '0';
    $webname = '被采集的网站名称';
    $page_step = 1;
    $link_noinclude_word = '<';

    // Render: header, form, footer.
    require 'head.php';
    require 'template/gather/edit_title.htm';
    require 'foot.php';
}
// action=add_title: store the step-1 form of a new rule. The row is identified by
// $posttime, so re-submitting the same form updates the existing row instead of
// inserting a duplicate.
// NOTE(review): all values are interpolated straight into the SQL text; this relies
// on request input having been escaped before this point (presumably in global.php) - confirm.
elseif ('add_title' == $action) {
    $existing = $db->get_one("SELECT * FROM `{$pre}mv_gather_rule` WHERE posttime='$posttime' ");

    if ($existing) {
        // A rule with this posttime already exists: refresh its list/title settings only.
        $sql = "UPDATE `{$pre}mv_gather_rule` SET "
            . "webname='$webname',listurl='$listurl',firstpage='$firstpage',"
            . "page_begin='$page_begin',page_end='$page_end',page_step='$page_step',"
            . "listmoreurl='$listmoreurl',"
            . "link_include_word='$link_include_word',link_noinclude_word='$link_noinclude_word',link_replace_word='$link_replace_word',"
            . "title_replace_word='$title_replace_word',"
            . "list_begin_code='$list_begin_code',list_end_code='$list_end_code',"
            . "list_begin_preg='$list_begin_preg',list_end_preg='$list_end_preg',"
            . "title_minleng='$title_minleng',title_rule='$title_rule',charset_type='$charset_type'"
            . " WHERE posttime='$posttime'";
    } else {
        // First submission: create the rule. The statement text (including its line
        // breaks) is kept exactly as it was.
        $sql = "INSERT INTO `{$pre}mv_gather_rule` ( `type` , `filetype` , `webname` , `listurl` , `firstpage` , `page_begin` , `page_end` , `page_step` , `listmoreurl` , `link_include_word` , `link_noinclude_word` , `link_replace_word` , `title_replace_word` , `list_begin_code` , `list_end_code` , `list_begin_preg` , `list_end_preg` , `gatherthesame` , `title_minleng` , `show_end_preg` , `show_begin_code` , `show_end_code` , `show_replace_word` , `show_morepage` , `posttime` , `list`,title_rule,charset_type )
VALUES (
'$type','$filetype','$webname','$listurl','$firstpage','$page_begin','$page_end','$page_step','$listmoreurl','$link_include_word','$link_noinclude_word','$link_replace_word','$title_replace_word','$list_begin_code','$list_end_code','$list_begin_preg','$list_end_preg','$gatherthesame','$title_minleng','$show_end_preg','$show_begin_code','$show_end_code','$show_replace_word','$show_morepage','$posttime','$timestamp','$title_rule','$charset_type'
)";
    }
    $db->query($sql);

    // Look the row up again to learn its id for the redirect.
    $rs = $db->get_one("SELECT id FROM {$pre}mv_gather_rule WHERE posttime='$posttime' ");

    // "Test gather" goes straight to a trial title run; otherwise continue with step 2.
    if ($testgather) {
        refreshto("gather.php?lfj=$lfj&action=gather_title&id={$rs['id']}&showurl=1", "正在测试采集标题,请耐心等待", 1);
    }
    refreshto("gather.php?lfj=$lfj&job=edit_content&id={$rs['id']}", "继续下一步", 1);
}
// job=edit_title: show the step-1 (list/title) form pre-filled from rule $id.
elseif($job=="edit_title")
{
    $rsdb=$db->get_one("SELECT * FROM {$pre}mv_gather_rule WHERE id='$id'");

    // Expose every column of the rule as a variable for the template.
    // Guarded instead of @-suppressed: extract() on a non-array (rule not found)
    // throws a TypeError on PHP 8, which "@" cannot silence.
    if(is_array($rsdb)){
        extract($rsdb);
    }

    // '1' when the rule uses a paged list URL, '0' otherwise.
    $rulepage=empty($listurl)?'0':'1';

    // Pre-select the stored charset option (quoted key: a bare constant is an Error on PHP 8).
    $charset_type_selected=is_array($rsdb)?intval($rsdb['charset_type']):0;
    $charset_typedb[$charset_type_selected]=' checked ';

    require("head.php");
    require("template/gather/edit_title.htm");
    require("foot.php");
}
// action=edit_title: save the step-1 (list/title) settings of the existing rule $id.
// NOTE(review): values are interpolated straight into the SQL text; this relies on
// request input having been escaped before this point (presumably in global.php) - confirm.
elseif ('edit_title' == $action) {
    $sql = "UPDATE `{$pre}mv_gather_rule` SET "
        . "webname='$webname',listurl='$listurl',firstpage='$firstpage',"
        . "page_begin='$page_begin',page_end='$page_end',page_step='$page_step',"
        . "listmoreurl='$listmoreurl',"
        . "link_include_word='$link_include_word',link_noinclude_word='$link_noinclude_word',link_replace_word='$link_replace_word',"
        . "title_replace_word='$title_replace_word',"
        . "list_begin_code='$list_begin_code',list_end_code='$list_end_code',"
        . "list_begin_preg='$list_begin_preg',list_end_preg='$list_end_preg',"
        . "title_minleng='$title_minleng',title_rule='$title_rule',charset_type='$charset_type'"
        . " WHERE id='$id'";
    $db->query($sql);

    // "Test gather" starts a trial title run; otherwise continue with step 2.
    if ($testgather) {
        refreshto("gather.php?lfj=$lfj&action=gather_title&id=$id&showurl=1", "正在测试采集标题,请耐心等待", 1);
    }
    refreshto("gather.php?lfj=$lfj&job=edit_content&id=$id", "继续下一步设置详细参数", 1);
}
// job=edit_content: show the step-2 (content) form pre-filled from rule $id.
elseif($job=="edit_content")
{
    $rsdb=$db->get_one("SELECT * FROM {$pre}mv_gather_rule WHERE id='$id'");

    // Expose every column of the rule as a variable for the template.
    // Guarded instead of @-suppressed: extract() on a non-array (rule not found)
    // throws a TypeError on PHP 8, which "@" cannot silence.
    if(is_array($rsdb)){
        extract($rsdb);
    }

    // Fall back to 'iframe' when no type is stored, then mark the matching radio button.
    if(empty($type)){
        $type='iframe';
    }
    $typedb[$type]=" checked ";

    // The two numeric options: normalise to int and mark the selected choice.
    $gatherthesame=isset($gatherthesame)?intval($gatherthesame):0;
    $gatherthesamedb[$gatherthesame]=" checked ";
    $show_spe2page=isset($show_spe2page)?intval($show_spe2page):0;
    $show_spe2pagedb[$show_spe2page]=" checked ";

    require("head.php");
    require("template/gather/edit_content.htm");
    require("foot.php");
}
// action=edit_content: save the step-2 (content) settings of rule $id.
// NOTE(review): values are interpolated straight into the SQL text; this relies on
// request input having been escaped before this point (presumably in global.php) - confirm.
elseif ('edit_content' == $action) {
    // Target system defaults to 'article' when the form did not send one.
    if (!$fixsystem) {
        $fixsystem = 'article';
    }

    $sql = "UPDATE {$pre}mv_gather_rule SET "
        . "type='$type',gatherthesame='$gatherthesame',"
        . "show_begin_preg='$show_begin_preg',show_end_preg='$show_end_preg',show_endfile_preg='$show_endfile_preg',"
        . "show_begin_code='$show_begin_code',show_end_code='$show_end_code',"
        . "show_replace_word='$show_replace_word',show_morepage='$show_morepage',show_firstpage='$show_firstpage',"
        . "copypic='$copypic',sort='$sort',"
        . "file_type='$file_type',file_minleng='$file_minleng',file_minsize='$file_minsize',"
        . "file_includeword='$file_includeword',file_noincludeword='$file_noincludeword',"
        . "file_explode='$file_explode',file_picwidth='$file_picwidth',"
        . "fixsystem='$fixsystem',file_star_string='$file_star_string',content_rule='$content_rule'"
        . " WHERE id='$id'";
    $db->query($sql);

    // A test run has to collect titles first; otherwise return to the rule list.
    if ($testgather) {
        refreshto("gather.php?lfj=gather&action=gather_title&id=$id&testgather=$testgather", "请耐心等待,先采集标题,你再点击选择测试采集", 1);
    }
    refreshto("gather.php?lfj=gather&job=list", "修改成功", 1);
}
/*
 * action=gather_title: scrape ONE list page of rule $id for article links.
 *
 * Request variables used: $id (rule id), $page (0-based index of the list page to
 * fetch), and $lfj / $showurl / $testgather (passed through in the redirect URLs).
 *
 * On the first request ($page == 0) the list-page URLs and the rule's custom PHP
 * snippets are written to cache files. Every request then fetches list page $page,
 * extracts "url<TAB>title[@@picurl]" entries, appends them to cache/gather_title.php
 * and meta-refreshes to the next page, or to job=list_title after the last one.
 *
 * NOTE(review): the rule's *_preg fields are written to .php files and include()d,
 * i.e. they run as PHP code in this scope. That is how the feature is built; make
 * sure only trusted administrators can edit rules.
 */
elseif($action=="gather_title")
{
    $rsdb=$db->get_one("SELECT * FROM {$pre}mv_gather_rule WHERE id='$id'");
    $page=intval($page);

    // ---- First request of a run: (re)build the cache files -----------------------
    if(!$page){
        $allurldb=array();
        if($rsdb['listmoreurl']){
            // Explicit list of list-page URLs, one per line.
            foreach(explode("\r\n",$rsdb['listmoreurl']) AS $value){
                // Escape ' and \ so the URL survives as a PHP single-quoted literal.
                $allurldb[]="\$urldb[]='".addcslashes($value,"'\\")."';";
            }
        }else{
            // Numbered list pages: substitute [page] in listurl for page_begin..page_end.
            if(!$rsdb['page_step']){
                $rsdb['page_step']=1;
            }
            for($i=$rsdb['page_begin'];$i<=$rsdb['page_end'];$i++){
                // NOTE(review): offset formula kept as it was; with page_begin==0
                // the first generated value is negative - confirm this is intended.
                if($rsdb['page_begin']==0){
                    $II=($i-1)*$rsdb['page_step'];
                }else{
                    $II=($i-1)*$rsdb['page_step']+1;
                }
                $value=str_replace("[page]","$II",$rsdb['listurl']);
                // Page 1 may have its own URL that does not follow the pattern.
                if($i==1&&$rsdb['firstpage']){
                    $value=$rsdb['firstpage'];
                }
                $allurldb[]="\$urldb[]='".addcslashes($value,"'\\")."';";
            }
        }
        $allurl=implode("\r\n",$allurldb);
        write_file(PHP168_PATH."cache/gather_morepage.php","<?php\r\n".$allurl);

        // Start with an empty result file; each page appends to it further down.
        write_file(PHP168_PATH."cache/gather_title.php","<?php\r\n");

        // Custom PHP snippets of the rule, one cache file per non-empty field.
        $snippet_files=array(
            'list_begin_preg'=>'gather_list.begin_preg.php',
            'list_end_preg'=>'gather_list.end_preg.php',
            'show_begin_preg'=>'gather_show.begin_preg.php',
            'show_end_preg'=>'gather_show.end_preg.php',
            'show_endfile_preg'=>'gather_show.endfile_preg.php',
        );
        foreach($snippet_files AS $field=>$cachefile){
            if(!empty($rsdb[$field])){
                write_file(PHP168_PATH."cache/".$cachefile,"<?php\r\n".$rsdb[$field]);
            }
        }
    }

    // ---- Pick the list page for this request --------------------------------------
    // Reset first so only entries coming from the cache file are used.
    $urldb=array();
    @include(PHP168_PATH."cache/gather_morepage.php");
    $listurl=isset($urldb[$page])?$urldb[$page]:'';

    // ---- Fetch it, trying several transports in turn ------------------------------
    $list_content='';
    if(!$listurl)
    {
        // No URL for this page index: report failure instead of calling
        // file_get_contents() with an empty path (a ValueError on PHP 8).
        echo("采集失败<br><br><br><br><br><br><br><br><br><br>");
    }
    elseif($list_content=file_get_contents($listurl))
    {
        // fetched directly
    }
    elseif($list_content=file($listurl))
    {
        $list_content=implode("",$list_content);
    }
    elseif(copy($listurl,PHP168_PATH."cache/gather_cache.php"))
    {
        $list_content=read_file(PHP168_PATH."cache/gather_cache.php");
    }
    elseif($list_content=sockOpenUrl($listurl))
    {
        // fetched through the socket helper
    }
    else
    {
        echo("采集失败<br><br><br><br><br><br><br><br><br><br>");
    }

    // UTF-8 -> GBK
    if($rsdb['charset_type']==1){
        require_once(PHP168_PATH."inc/class.chinese.php");
        $cnvert = new Chinese("UTF8","GB2312",$list_content,PHP168_PATH."./inc/gbkcode/");
        $list_content = $cnvert->ConvertIT();
    }

    // Rule-supplied "begin" snippet; it reads and writes $htmlcode so that all
    // snippets can use the same variable name.
    if($rsdb['list_begin_preg']){
        $htmlcode=$list_content;
        include(PHP168_PATH."cache/gather_list.begin_preg.php");
        $list_content=$htmlcode;
    }

    // Keep everything from the begin marker to the end of the page.
    if($rsdb['list_begin_code']){
        $list_content=strstr($list_content,$rsdb['list_begin_code']);
    }

    // Drop the end marker and everything after it.
    if($rsdb['list_end_code']){
        $end_content=strstr($list_content,$rsdb['list_end_code']);
        if($end_content!==false){
            $list_content=str_replace($end_content,"",$list_content);
        }
    }

    // Literal replacements, one "old|new" pair per line.
    if($rsdb['title_replace_word']){
        foreach(explode("\r\n",$rsdb['title_replace_word']) AS $value){
            $pair=explode("|",$value);
            $oldword=$pair[0];
            $newword=isset($pair[1])?$pair[1]:'';
            if($oldword===''){
                continue; // nothing to search for
            }
            $list_content=str_replace($oldword,$newword,$list_content);
        }
    }

    // ---- Split the page into link candidates --------------------------------------
    $listdb=array();
    if($rsdb['title_rule'])
    {
        // Custom rule: strip whitespace from rule and page so they can be matched.
        $rsdb['title_rule']=clean_blank($rsdb['title_rule']);
        $list_content=clean_blank($list_content);

        // Every {...} placeholder of the rule, in order.
        preg_match_all("/\{(.*?)\}/is",$rsdb['title_rule'],$array);

        // Map capture-group number => variable name. Placeholders starting with
        // "NO" or "*" carry no variable (preg_match replaces ereg(), removed in PHP 7).
        $ruledb=array();
        foreach($array[1] AS $key=>$value){
            if(!preg_match('/^(NO|\*)/',$value)){
                $detail=explode("=",$value);
                $ruledb[$key+1]=$detail[0];
            }
        }

        // Turn the rule into a regular expression and apply it to the page.
        $rule=get_rule($rsdb['title_rule']);
        preg_match_all("/$rule/is",$list_content,$array2);

        // Collect the captured values per variable name.
        foreach($ruledb AS $key=>$value){
            if(empty($array2[$key])){
                continue;
            }
            foreach($array2[$key] AS $value2){
                $listdb[$value][]=$value2;
            }
        }

        // The rule must capture "url"; it drives the loop below.
        $detail_content=isset($listdb['url'])?$listdb['url']:array();
    }
    else
    {
        // No custom rule: normalise the anchor markup, then split on href=.
        $list_content=str_replace(
            array("HREF=","</A>","href='",'href="'),
            array("href=","</a>","href=","href="),
            $list_content
        );
        $detail_content=explode("href=",$list_content);
    }

    // ---- Bases for completing relative links (same for every link of the page) ----
    // scheme://host of the list page; http and https are both recognised.
    $site_root='';
    if(preg_match('/^(https?:\/\/[^\/?#]+)/i',$listurl,$hostmatch)){
        $site_root=$hostmatch[1];
    }
    // Directory of the list page: drop query/fragment, then cut after the last "/".
    $base_dir=preg_replace('/[?#].*$/s','',$listurl);
    if($site_root!==''&&$base_dir===$site_root){
        $base_dir.='/'; // bare host without a path
    }else{
        $last_slash=strrpos($base_dir,'/');
        $base_dir=($last_slash===false)?'':substr($base_dir,0,$last_slash+1);
    }

    // ---- Filter and normalise every candidate -------------------------------------
    // $_url / $_title are not read again in this block; they are kept because the
    // rule's "end" snippet included below runs in this scope and might use them.
    unset($i,$_url,$_title);
    $UT=array();
    foreach($detail_content AS $key_content=>$value_content){
        if($rsdb['title_rule'])
        {
            $url=$value_content;
            $title=isset($listdb['title'][$key_content])?$listdb['title'][$key_content]:'';
            $picurl=isset($listdb['picurl'][$key_content])?$listdb['picurl'][$key_content]:'';
        }
        else
        {
            // Chunk 0 is the markup before the first href=.
            if($key_content==0){
                continue;
            }
            // URL: everything up to the first quote, space or ">".
            $url=preg_replace("/([^'\" >]+)(.*)/is","\\1",$value_content);
            // Title: the text between the first ">" and "</a>", outer tags removed.
            $s1_title=strstr($value_content,">");
            $s2_title=strstr($value_content,"</a>");
            $s3_title=str_replace($s2_title,"",$s1_title);
            $title=substr($s3_title,1,strlen($s3_title)-1);
            $title=preg_replace("/<([^<>]+)>(.*)<([^<>]+)>/is","\\2",$title);
        }

        // An entry needs both a URL and a title.
        if(!$url||!$title){
            continue;
        }

        // Words that must NOT appear in title or URL.
        if($rsdb['link_noinclude_word']){
            foreach(explode("\r\n",$rsdb['link_noinclude_word']) AS $value){
                if(!$value){
                    continue;
                }
                if(strpos($title,$value)!==false||strpos($url,$value)!==false){
                    continue 2;
                }
            }
        }

        // Words that MUST all appear in the URL.
        if($rsdb['link_include_word']){
            foreach(explode("\r\n",$rsdb['link_include_word']) AS $value){
                if(!$value){
                    continue;
                }
                if(strpos($url,$value)===false){
                    continue 2;
                }
            }
        }

        // Minimum title length (in bytes).
        if($rsdb['title_minleng']&&strlen($title)<$rsdb['title_minleng']+1){
            continue;
        }

        // Complete relative article URLs.
        if(strpos($url,'://')===false){
            if(substr($url,0,1)=='/'){
                if($site_root!==''){
                    $url=$site_root.$url;
                }
            }else{
                $url=$base_dir.$url;
            }
        }
        // Same for the optional picture URL.
        if(!empty($picurl)&&strpos($picurl,'://')===false){
            if(substr($picurl,0,1)=='/'){
                if($site_root!==''){
                    $picurl=$site_root.$picurl;
                }
            }else{
                $picurl=$base_dir.$picurl;
            }
        }

        // The values end up inside a single-quoted PHP literal in the cache file:
        // turn quotes into an HTML entity (the previous replace was a no-op) ...
        $url=str_replace("'","&#39;",$url);
        $picurl=empty($picurl)?'':str_replace("'","&#39;",$picurl);
        $title=str_replace("'","&#39;",$title);
        $_url[]=$url;
        $_title[]=$title;

        $entry="$url\t$title";
        if($picurl){
            $entry.="@@$picurl";
        }
        // ... and double backslashes so none can escape the closing quote.
        $UT[]="\$urldb[]='".str_replace("\\","\\\\",$entry)."';";
    }
    $writefile=implode("\r\n",$UT);

    // Rule-supplied "end" snippet, again working on $htmlcode.
    if($rsdb['list_end_preg'])
    {
        $htmlcode=$writefile;
        include(PHP168_PATH."cache/gather_list.end_preg.php");
        $writefile=$htmlcode;
    }
    write_file(PHP168_PATH."cache/gather_title.php","\r\n".$writefile,'a');

    // ---- Continue with the next list page, or finish ------------------------------
    $page++;
    if(!empty($urldb[$page])){
        // Show progress: the first entries collected so far.
        $urldb=array();
        include(PHP168_PATH."cache/gather_title.php");
        echo "$listurl<br>正在采集第[{$page}]页的标题与内容网址,请稍候...<hr>";
        foreach( $urldb AS $key=>$value){
            if($key>50){
                break;
            }
            echo "$value<br>";
        }
        echo "<META HTTP-EQUIV=REFRESH CONTENT='0;URL=gather.php?lfj=$lfj&action=$action&id=$id&showurl=$showurl&testgather=$testgather&page=$page'>";
        exit;
    }else{
        echo "<META HTTP-EQUIV=REFRESH CONTENT='0;URL=gather.php?lfj=$lfj&showurl=$showurl&testgather=$testgather&job=list_title&id=$id'>";
        exit;
    }
}
// job=list_title: show the titles collected by action=gather_title so the
// administrator can choose which ones to fetch.
elseif($job=="list_title")
{
    $rs=$ruledb=$db->get_one("SELECT * FROM {$pre}mv_gather_rule WHERE id='$id'");

    // Quoted key: a bare constant as array key is an Error on PHP 8.
    $rule_type=is_array($rs)?$rs['type']:'';

    // Warn when the rule type makes most options of this screen irrelevant.
    if($rule_type=="jump"){
        $msg="注意:当前配置文件设置的参数:点击标题后跳转到外部网址,使得下面大部分无效";
    }elseif($rule_type=="iframe"){
        $msg="注意:当前配置文件设置的参数:点击标题后框架外部网址,类似大旗、奇虎,使得下面大部分无效";
    }

    // In a test run, set the script snippet that presumably lets the template
    // submit the form automatically (the template is not part of this file).
    if($testgather){
        $autosub="autosub();";
    }

    // Load the collected entries into $urldb; reset first so only entries from the
    // cache file are listed, as the other branches do before including it.
    $urldb=array();
    include(PHP168_PATH."cache/gather_title.php");

    require("head.php");
    require("template/gather/list_title.htm");
    require("foot.php");
}
// action=list_title: keep only the entries ticked on the list_title screen
// ($postdb[index] is truthy), rewrite the cache in reverse order and hand over
// to action=gather_content.
elseif($action=="list_title")
{
    // Reset first so count() below always sees an array, even if the cache is missing.
    $urldb=array();
    include(PHP168_PATH."cache/gather_title.php");

    // Walk backwards so the rewritten cache lists the entries in reverse order.
    $UT=array();
    for($i=count($urldb)-1;$i>=0;$i--){
        if(!empty($postdb[$i])){
            // Escape ' and \ so the entry is read back unchanged from the PHP literal.
            $UT[]="\$urldb[]='".addcslashes($urldb[$i],"'\\")."';";
        }
    }
    // $UT is always an array now, so an empty selection yields an empty cache
    // instead of an implode() error.
    $writefile=implode("\r\n",$UT);
    write_file(PHP168_PATH."cache/gather_title.php","<?php\r\n".$writefile);

    // Continue with the content run, passing the chosen options along.
    $action='gather_content';
    echo "<META HTTP-EQUIV=REFRESH CONTENT='1;URL=?lfj=$lfj&action=$action&id=$id&system_type=$system_type&GetFile=$GetFile&file_dir=$file_dir&makesmallpic=$makesmallpic&showpic=$showpic&username=$username&fid=$fid&testgather=$testgather&page=$page'>";
    exit;
}
elseif($action=="gather_content")
{
unset($urldb);
$rsdb=$db->get_one("SELECT * FROM {$pre}mv_gather_rule WHERE id='$id'");
@include(PHP168_PATH."cache/gather_title.php");
$page=intval($page);
//$morepage大于0时.代表多页
list($curl,$title,$morepage)=explode("\t",$urldb[$page]);
if($show_content=file_get_contents($curl))
{
}
elseif($show_content=file($curl))
{
$show_content=implode("",$show_content);
}
elseif(copy($curl,PHP168_PATH."cache/gather_cache.php"))
{
$show_content=read_file(PHP168_PATH."cache/gather_cache.php");
}
elseif($show_content=sockOpenUrl($curl))
{
}
else
{
echo("服务器获取不了远程文件信息,因而采集失败$curl<br><br><br><br><br><br><br><br><br><br>");
}
//UTF8->GBK
if($rsdb[charset_type]==1){
require_once(PHP168_PATH."inc/class.chinese.php");
$cnvert = new Chinese("UTF8","GB2312",$show_content,PHP168_PATH."./inc/gbkcode/");
$show_content = $cnvert->ConvertIT();
}
if($rsdb[type]=='iframe'){//类似奇虎
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -