📄 spider.php
字号:
<?php
/*******************************************
* Sphider Version 1.2
* This program is licensed under the GNU GPL.
* By Ando Saabas ando(a t)cs.ioc.ee
*
* Thanks to Antoine Bajolet for ideas and
* several code pieces
********************************************/

/*
 * Spider entry point (top-level script).
 *
 * Collects the indexing options from the variables returned by
 * getHttpVars() and, when present, from the command-line arguments, then
 * hands over to index_all() or index_site().
 *
 * Command-line forms handled below:
 *   <script> all
 *   <script> <url> [full] [reindex] [<number>]
 * where <number> is whatever is_num() accepts and is used as the maximum
 * spidering level.
 */
set_time_limit (0);
error_reporting (E_ALL ^ E_NOTICE);

$include_dir = "../include";
include "auth.php";
include "$include_dir/commonfuncs.php";
include "messages.php";
include "$include_dir/conf.php";
include "spiderfuncs.php";

$all = 0;

// NOTE(review): extract() overwrites any existing variable of the same name,
// so whatever getHttpVars() returns can replace values set above or by the
// included files. Confirm that getHttpVars() only exposes expected keys.
extract (getHttpVars());

$command_line = 0;

if (isset($_SERVER['argv']) && $_SERVER['argc'] >= 2) {
	$command_line = 1;

	if (strtolower($_SERVER['argv'][1]) == 'all') {
		$all = 1;
	} else {
		$url = $_SERVER['argv'][1];

		// Every argument is scanned for option keywords.
		foreach ($_SERVER['argv'] as $arg) {
			if ($arg == 'full') {
				$soption = 'full';
			}
			if ($arg == 'reindex') {
				$reindex = 1;
			}
			if (is_num($arg)) {
				$maxlevel = $arg;
				$soption = 'level';
			}
		}
	}
}

// A full crawl is represented by a maximum level of -1.
if ($soption == 'full') {
	$maxlevel = -1;
}

if ($domaincb == '') {
	$domaincb = 0;
}

if ($all == 1) {
	index_all();
} else {
	if ($reindex == 1 && $command_line == 1) {
		// Command-line reindex: reuse the settings stored for this site.
		// $url comes straight from argv here, so it must be escaped before
		// being placed inside the SQL string literal.
		$safe_url = mysql_real_escape_string($url);
		$result = mysql_query("select url, spider_depth, required, disallowed, can_leave_domain from ".$mysql_table_prefix."sites where url='$safe_url'");
		echo mysql_error();

		// mysql_query() returns false on failure; only fetch from a real result.
		if ($result && ($row = mysql_fetch_row($result))) {
			$url = $row[0];
			$maxlevel = $row[1];
			$in = $row[2];
			$out = $row[3];
			$domaincb = $row[4];
			if ($domaincb == '') {
				$domaincb = 0;
			}

			// The stored spider depth was just loaded into $maxlevel; the
			// original tested an undefined $depth, so a stored depth of -1
			// never selected the 'full' option.
			if ($maxlevel == -1) {
				$soption = 'full';
			} else {
				$soption = 'level';
			}
		}
	}

	index_site($url, $reindex, $maxlevel, $soption, $in, $out, $domaincb);
}
function index_url($url, $level, $site_id, $md5sum, $domain, $indexdate, $sessid, $can_leave_domain, $reindex) {
global $entities;
global $command_line;
global $min_words_per_page;
global $supdomain;
global $mysql_table_prefix, $user_agent;
$needsReindex = 1;
$deletable = 0;
$url_status = url_status($url);
$thislevel = $level - 1;
if (strstr($url_status['state'], "Relocation")) {
$url = eregi_replace(" ", "", url_purify($url_status['path'], $url, $can_leave_domain));
if ($url <> '') {
$result = mysql_query("select link from ".$mysql_table_prefix."temp where link='$url' && id = '$sessid'");
echo mysql_error();
$rows = mysql_numrows($result);
if ($rows == 0) {
mysql_query ("insert into ".$mysql_table_prefix."temp (link, level, id) values ('$url', '$level', '$sessid')");
echo mysql_error();
}
}
$url_status['state'] == "redirected";
}
/*
if ($indexdate <> '' && $url_status['date'] <> '') {
if ($indexdate > $url_status['date']) {
$url_status['state'] = "Date checked. Page contents not changed";
$needsReindex = 0;
}
}*/
ini_set("user_agent", $user_agent);
if ($url_status['state'] == 'ok') {
$OKtoIndex = 1;
$fl = @fopen($url, "r");
if ($fl) {
while ($buffer = @fgets($fl, 4096)) {
$file .= $buffer;
}
fclose ($fl);
} else {
$contents = getFileContents($url);
$file = $contents['file'];
}
$pageSize = number_format(strlen($file)/1024, 2, ".", "");
printPageSizeReport($pageSize);
if ($url_status['content'] != 'text') {
$file = extract_text($file, $url_status['content']);
}
printIndexStartReport();
$newmd5sum = md5($file);
if (isDuplicateMD5($newmd5sum)) {
$OKtoIndex = 0;
if ($command_line == 0) {
$report = " <font color=\"red\"><b>Page is a duplicate.</b></font><br>\n";
} else {
$report = " Page is a duplicate.\n";
}
}
if ($md5sum == $newmd5sum) {
if ($command_line == 0) {
$report = " <font color=\"red\"><b>MD5 sum checked. Page content not changed</b></font><br>\n";
} else {
$report = " MD5 sum checked. Page content not changed.\n";
}
$OKtoIndex = 0;
}
if (($md5sum != $newmd5sum || $reindex ==1) && $OKtoIndex == 1) {
$urlparts = parse_url($url);
$newdomain = $urlparts['host'];
$type = 0;
/* if ($newdomain <> $domain)
$domainChanged = 1;
if ($domaincb==1) {
$start = strlen($newdomain) - strlen($supdomain);
if (substr($newdomain, $start) == $supdomain) {
$domainChanged = 0;
}
}*/
// remove link to css file
//get all links from file
$data = clean_file($file, $url, $url_status['content']);
if ($data['noindex'] == 1) {
$OKtoIndex = 0;
$deletable = 1;
if ($command_line == 0) {
$report = " <font color=\"red\">No-Index flag set in meta tags.</font><br>\n";
} else {
$report = " No-Index flag set in meta tags.\n";
}
}
$wordarray = unique_array(explode(" ", $data['content']));
if ($data['nofollow'] != 1) {
$links = get_links($file, $url, $can_leave_domain);
$links = distinct_array($links);
$all_links = count($links);
$numoflinks = 0;
//if there are any, add to the temp table, but only if there isnt such url already
if (is_array($links)) {
reset ($links);
while ($thislink = each($links)) {
$result = mysql_query("select link from ".$mysql_table_prefix."temp where link='$thislink[1]' && id ='$sessid'");
echo mysql_error();
$rows = mysql_numrows($result);
if ($rows == 0) {
$numoflinks++;
mysql_query ("insert into ".$mysql_table_prefix."temp (link, level, id) values ('$thislink[1]', '$level', '$sessid')");
echo mysql_error();
}
}
}
} else {
printNoFollow($command_line);
}
if ($OKtoIndex == 1) {
$title = $data['title'];
$host = $data['host'];
$path = $data['path'];
$fulltxt = $data['fulltext'];
$desc = $data['description'];
$wordarray = calc_weights ($wordarray, $title, $host, $path, $data['keywords']);
//if there are words to index, add the link to the database, get its id, and add the word + their relation
if (is_array($wordarray) && count($wordarray) > $min_words_per_page) {
if ($md5sum == '') {
mysql_query ("insert into ".$mysql_table_prefix."links (site_id, url, title, description, fulltxt, indexdate, size, md5sum, level) values ('$site_id', '$url', '$title', '$desc', '$fulltxt', curdate(), '$pageSize', '$newmd5sum', $thislevel)");
echo mysql_error();
$result = mysql_query("select link_id from ".$mysql_table_prefix."links where url='$url'");
echo mysql_error();
$row = mysql_fetch_row($result);
$link_id = $row[0];
save_keywords($wordarray, $link_id);
if ($command_line == 0) {
$report = " <font color=\"green\">Indexed</font><br>\n";
} else {
$report = " Indexed\n";
}
}else if (($md5sum <> '') && ($md5sum <> $newmd5sum)) { //if page has changed, start updating
$result = mysql_query("select link_id from ".$mysql_table_prefix."links where url='$url'");
echo mysql_error();
$row = mysql_fetch_row($result);
$link_id = $row[0];
mysql_query ("delete from ".$mysql_table_prefix."link_keyword where link_id=$link_id");
echo mysql_error();
save_keywords($wordarray, $link_id);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -