📄 spider.php
字号:
$query = "update ".$mysql_table_prefix."links set title='$title', description ='$desc', fulltxt = '$fulltxt', indexdate=now(), size = '$pageSize', md5sum='$newmd5sum', level=$thislevel where link_id=$link_id";
mysql_query($query);
echo mysql_error();
if ($command_line == 0) {
$report = " <font color=\"green\">Re-indexed</font><br>\n";
} else {
$report = " Re-indexed\n";
}
}
}else {
if ($command_line == 0) {
$report = " <font color=\"red\">Page contains less than $min_words_per_page words</font><br>\n";
} else {
$report = " Page contains less than 10 words.\n";
}
}
}
}
} else {
$deletable = 1;
if ($command_line == 0) {
$report = " <font color=red><b>" . $url_status['state'] . "</b></font><br>\n";
} else {
$report = $url_status['state']."\n";
}
}
if ($reindex ==1 && $deletable == 1) {
check_for_removal($url);
} else if ($reindex == 1) {
}
printLinksReport($numoflinks, $all_links, $command_line);
printStatusReport ($report);
}
/**
 * Crawl one site level by level and hand every eligible link to index_url().
 *
 * Work state lives in two tables: "temp" holds the links queued for this run
 * (keyed by a session id and a level), "pending" holds a resume marker
 * (level / count / num) that is updated after each indexed link and deleted
 * when the run completes.
 *
 * NOTE(review): $url, $url_inc and $url_not_inc are interpolated into SQL
 * exactly as received; whether they are already escaped depends on the
 * caller, so they are deliberately not re-escaped here. Values read back
 * from the database inside this function ARE escaped before reuse.
 *
 * @param string $url              Start URL of the site; a trailing "/" is appended when it has no path.
 * @param int    $reindex          1 to re-queue and re-index links already stored for the site.
 * @param int    $maxlevel         Deepest level to crawl when $soption is 'level'.
 * @param string $soption          'level' (stop after $maxlevel) or 'full' (stop when a level has no links).
 * @param string $url_inc          Stored in sites.required and passed to check_include().
 * @param string $url_not_inc      Stored in sites.disallowed and passed to check_include().
 * @param int    $can_leave_domain Stored in sites.can_leave_domain and passed to index_url().
 * @return void  Terminates the script via die() when the host cannot be reached.
 */
function index_site($url, $reindex, $maxlevel, $soption, $url_inc, $url_not_inc, $can_leave_domain) {
    global $mysql_table_prefix, $command_line, $mainurl;

    // Normalise "http://host" to "http://host/" so the site URL always has a path.
    $compurl = parse_url($url);
    if (!isset($compurl['path']) || $compurl['path'] == '') {
        $url = $url . "/";
    }

    // Session id that tags this run's rows in the temp table.
    $t = microtime();
    $a = getenv("REMOTE_ADDR");
    $sessid = md5($t.$a);

    $urlparts = parse_url($url);
    $domain = isset($urlparts['host']) ? $urlparts['host'] : '';
    // Only read the port when parse_url() actually produced one.
    $port = isset($urlparts['port']) ? (int)$urlparts['port'] : 80;

    // Reachability probe: open and immediately close a socket to the host.
    $errno = 0;
    $errmsg = "";
    $fp = fsockopen($domain, $port, $errno, $errmsg);
    if (!$fp) {
        printConnectErrorReport($errmsg);
        die();
    }
    fclose($fp);

    // Look up the site row; an empty $site_id means the site is not registered yet.
    $result = mysql_query("select site_id from ".$mysql_table_prefix."sites where url='$url'");
    echo mysql_error();
    $row = $result ? mysql_fetch_row($result) : false;
    $site_id = $row ? $row[0] : '';

    if ($site_id == '') {
        // New site: register it, then read back the generated id.
        mysql_query("insert into ".$mysql_table_prefix."sites (url, indexdate, spider_depth, required, disallowed, can_leave_domain) " .
            "values ('$url', now(), $maxlevel, '$url_inc', '$url_not_inc', $can_leave_domain)");
        echo mysql_error();
        $result = mysql_query("select site_ID from ".$mysql_table_prefix."sites where url='$url'");
        $row = $result ? mysql_fetch_row($result) : false;
        $site_id = $row ? $row[0] : '';
    } else {
        if ($reindex == 1) {
            // Re-index: queue the start URL plus every link already stored for the site.
            mysql_query("insert into ".$mysql_table_prefix."temp (link, level, id) values ('$url', 0, '$sessid')");
            echo mysql_error();
            $result = mysql_query("select url, level from ".$mysql_table_prefix."links where site_id = $site_id");
            while ($result && ($row = mysql_fetch_array($result))) {
                $site_link = $row['url'];
                $link_level = (int)$row['level'];
                if ($site_link != $url) {
                    // The link comes straight from the DB, so it is unescaped here.
                    $safe_link = mysql_real_escape_string($site_link);
                    mysql_query("insert into ".$mysql_table_prefix."temp (link, level, id) values ('$safe_link', $link_level, '$sessid')");
                }
            }
        }
        // Known site (re-index or not): refresh its crawl settings.
        mysql_query("update ".$mysql_table_prefix."sites set indexdate=now(), spider_depth = $maxlevel, required = '$url_inc'," .
            "disallowed = '$url_not_inc', can_leave_domain=$can_leave_domain where site_id=$site_id");
        echo mysql_error();
    }

    // A row in "pending" means an earlier run for this site was interrupted.
    $result = mysql_query("select site_id, temp_id, level, count, num from ".$mysql_table_prefix."pending where site_id='$site_id'");
    echo mysql_error();
    $row = $result ? mysql_fetch_row($result) : false;

    $level = 0;
    $num = 0;         // running number of the link being processed (used in reports)
    $pend_count = 0;  // position inside the level at which a resumed run continues
    $pending = 0;

    if (!$row || $row[0] == '') {
        // Fresh run: seed the queue with the start URL.
        mysql_query("insert into ".$mysql_table_prefix."temp (link, level, id) values ('$url', 0, '$sessid')");
        echo mysql_error();
    } else {
        // Resume: adopt the stored session id and continue after the last recorded link.
        print "<br>Continuing suspended indexing <br>";
        $sessid = $row[1];
        $level = (int)$row[2];
        $pend_count = (int)$row[3] + 1;
        $num = (int)$row[4];
        $pending = 1;
    }

    if ($reindex != 1) {
        mysql_query("insert into ".$mysql_table_prefix."pending (site_id, temp_id, level, count) values ('$site_id', '$sessid', '0', '0')");
        echo mysql_error();
    }

    $omit = check_robot_txt($url);
    printHeader($omit, $url, $command_line);
    $mainurl = $url;

    while (($level <= $maxlevel && $soption == 'level') || ($soption == 'full')) {
        // Only the first level of a resumed run starts part-way through.
        if ($pending == 1) {
            $count = $pend_count;
            $pending = 0;
        } else {
            $count = 0;
        }

        $result = mysql_query("select distinct link from ".$mysql_table_prefix."temp where level=$level && id='$sessid' order by link");
        echo mysql_error();
        if (!$result || mysql_num_rows($result) == 0) {
            // Nothing queued at this level: the crawl is finished.
            break;
        }

        $links = array();
        while ($row = mysql_fetch_array($result)) {
            $links[] = $row['link'];
        }

        while ($count < count($links)) {
            $num++;
            $thislink = $links[$count];
            $urlparts = parse_url($thislink);
            $link_host = isset($urlparts['host']) ? $urlparts['host'] : '';

            // Exclusions returned by check_robot_txt().
            $forbidden = 0;
            foreach ($omit as $omiturl) {
                $omiturl = trim($omiturl);
                $omiturl_parts = parse_url($omiturl);
                if (empty($omiturl_parts['scheme'])) {
                    // Exclusion without a scheme: prefix it with the link's host.
                    $check_omit = $link_host . $omiturl;
                } else {
                    $check_omit = $omiturl;
                }
                // strpos() returns 0 for a match at the very start, so compare
                // against false explicitly instead of relying on truthiness.
                if ($check_omit != '' && strpos($thislink, $check_omit) !== false) {
                    printRobotsReport($num, $thislink, $command_line);
                    check_for_removal($thislink);
                    $forbidden = 1;
                    break;
                }
            }

            // Per-site include / exclude filter.
            if (!check_include($thislink, $url_inc, $url_not_inc)) {
                printUrlStringReport($num, $thislink, $command_line);
                check_for_removal($thislink);
                $forbidden = 1;
            }

            if ($forbidden == 0) {
                printRetrieving($num, $thislink, $command_line);
                $safe_link = mysql_real_escape_string($thislink);
                $query = "select md5sum, indexdate from ".$mysql_table_prefix."links where url='$safe_link'";
                $result = mysql_query($query);
                echo mysql_error();
                $rows = $result ? mysql_num_rows($result) : 0;

                if ($rows == 0 || $reindex == 1) {
                    // Unknown link, or known link during a re-index run.
                    $md5sum = '';
                    $indexdate = '';
                    if ($rows != 0) {
                        $row = mysql_fetch_array($result);
                        $md5sum = $row['md5sum'];
                        $indexdate = $row['indexdate'];
                    }
                    index_url($thislink, $level+1, $site_id, $md5sum, $domain, $indexdate, $sessid, $can_leave_domain, $reindex);
                    // Record progress so an interrupted run can resume after this link.
                    mysql_query("update ".$mysql_table_prefix."pending set level = $level, count=$count, num=$num where site_id=$site_id");
                    echo mysql_error();
                } else {
                    // Already stored and this is not a re-index run.
                    printDatabaseReport($command_line);
                }
            }
            $count++;
        }
        $level++;
    }

    // Run finished: drop the queue and the resume marker.
    mysql_query("delete from ".$mysql_table_prefix."temp where id = '$sessid'");
    echo mysql_error();
    mysql_query("delete from ".$mysql_table_prefix."pending where site_id = '$site_id'");
    echo mysql_error();
    printCompletedReport($command_line);
}
/**
 * Run index_site() in re-index mode for every row of the sites table.
 *
 * Each site's stored spider_depth, required, disallowed and
 * can_leave_domain values are forwarded; a spider_depth of -1 selects the
 * 'full' crawl option, anything else selects 'level'.
 */
function index_all() {
    global $mysql_table_prefix;

    $sites = mysql_query("select url, spider_depth, required, disallowed, can_leave_domain from ".$mysql_table_prefix."sites");
    echo mysql_error();

    while ($site = mysql_fetch_row($sites)) {
        list($url, $depth, $include, $not_include, $can_leave_domain) = $site;

        // An empty flag is passed on as 0.
        if ($can_leave_domain == '') {
            $can_leave_domain = 0;
        }

        $soption = ($depth == -1) ? 'full' : 'level';

        index_site($url, 1, $depth, $soption, $include, $not_include, $can_leave_domain);
    }
}
// Top-level statement executed when the script reaches the end of the file.
// printQuitReport() is defined outside this view; presumably it emits the
// closing report for the run in the format selected by $command_line — confirm.
printQuitReport($command_line);
?>
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -