📄 spiderfuncs.php
字号:
<?
/**
 * Fetch a URL with a hand-built HTTP/1.0 GET request over a raw socket.
 *
 * Uses the global $user_agent for the User-Agent header. On connection
 * failure the socket error text is printed ("Error: ...").
 *
 * @param string $url Absolute http:// or https:// URL.
 * @return array 'state' is one of "ok", "timeout", "NOHOST" or
 *               "Cannot send request"; 'file' (set whenever a response was
 *               read) holds everything after the first blank line of the
 *               response, i.e. the body.
 */
function getFileContents($url) {
    global $user_agent;

    $contents = array();
    $urlparts = parse_url($url);
    if (!is_array($urlparts)) {
        $urlparts = array();
    }

    $host = isset($urlparts['host']) ? $urlparts['host'] : "";
    $scheme = isset($urlparts['scheme']) ? $urlparts['scheme'] : "";

    // An URL without a path (http://example.com) must still request "/",
    // otherwise the request line is malformed.
    $path = (isset($urlparts['path']) && $urlparts['path'] != "") ? $urlparts['path'] : "/";
    if (isset($urlparts['query']) && $urlparts['query'] != "") {
        $path .= "?".$urlparts['query'];
    }

    if (isset($urlparts['port'])) {
        $port = (int) $urlparts['port'];
    } elseif ($scheme == "http") {
        $port = 80;
    } elseif ($scheme == "https") {
        $port = 443;
    } else {
        // No port can be derived for other schemes; the connection attempt
        // could only fail, so report it the same way without trying.
        $contents['state'] = "NOHOST";
        print "Error: unsupported URL scheme";
        return $contents;
    }

    // The Host header only carries the port when it is not 80.
    $portq = ($port == 80) ? "" : ":$port";

    $all = "*/*";
    $request = "GET $path HTTP/1.0\r\nHost: $host$portq\r\nAccept: $all\r\nAccept-Encoding: identity\r\nUser-Agent: $user_agent\r\n\r\n";
    $fsocket_timeout = 30;

    // https URLs are tunnelled through PHP's ssl:// transport.
    if (substr($url, 0, 5) == "https") {
        $target = "ssl://".$host;
    } else {
        $target = $host;
    }

    $errno = 0;
    $errstr = "";
    $fp = @ fsockopen($target, $port, $errno, $errstr, $fsocket_timeout);
    if (!$fp) {
        $contents['state'] = "NOHOST";
        print "Error: $errstr";
        return $contents;
    }

    if (!fputs($fp, $request)) {
        // Do not leak the socket when the request cannot be written.
        fclose($fp);
        $contents['state'] = "Cannot send request";
        return $contents;
    }

    $data = "";
    stream_set_timeout($fp, $fsocket_timeout);
    $status = stream_get_meta_data($fp);
    while (!feof($fp) && !$status['timed_out']) {
        $chunk = fgets($fp, 8192);
        // Refresh the stream state after every read; a snapshot taken
        // before the loop would never show a timeout.
        $status = stream_get_meta_data($fp);
        if ($chunk === false) {
            break;
        }
        $data .= $chunk;
    }
    fclose($fp);

    if ($status['timed_out']) {
        $contents['state'] = "timeout";
    } else {
        $contents['state'] = "ok";
    }

    // The body starts after the blank line that terminates the headers.
    // Without that separator there is no body to return.
    $header_end = strpos($data, "\r\n\r\n");
    if ($header_end === false) {
        $contents['file'] = "";
    } else {
        $contents['file'] = (string) substr($data, $header_end + 4);
    }

    return $contents;
}
/*
Check whether the file at the given URL is available and in a readable form
*/
/**
 * Probe a URL with an HTTP HEAD request and classify the response.
 *
 * Reads the globals $user_agent (User-Agent header), $index_pdf and
 * $index_doc (whether PDF / MS Word content types are accepted).
 *
 * @param string $url Absolute http:// or https:// URL.
 * @return array Always contains 'state':
 *               - "ok" for an accepted content type ('content' is then
 *                 'text', 'pdf' or 'doc'),
 *               - "NOHOST" when no connection could be opened,
 *               - "Unreachable: http NNN" for status classes other than 2xx/3xx,
 *               - "Relocation: http NNN" for 3xx responses other than 302
 *                 that carry a Location header ('path' holds the target),
 *               - "Timed out (no reply from server)",
 *               - "Not text or html" otherwise.
 *               'date' holds the raw Last-Modified value when that header
 *               was seen before the Content-Type header.
 */
function url_status($url) {
    global $user_agent, $index_pdf, $index_doc;

    $status = array();
    $urlparts = parse_url($url);
    if (!is_array($urlparts)) {
        $urlparts = array();
    }

    $host = isset($urlparts['host']) ? $urlparts['host'] : "";
    $scheme = isset($urlparts['scheme']) ? $urlparts['scheme'] : "";

    // An URL without a path must still request "/" to form a valid request line.
    $path = (isset($urlparts['path']) && $urlparts['path'] != "") ? $urlparts['path'] : "/";
    if (isset($urlparts['query']) && $urlparts['query'] != "") {
        $path .= "?".$urlparts['query'];
    }

    if (isset($urlparts['port'])) {
        $port = (int) $urlparts['port'];
    } elseif ($scheme == "http") {
        $port = 80;
    } elseif ($scheme == "https") {
        $port = 443;
    } else {
        // No port can be derived for other schemes, so no connection is possible.
        $status['state'] = "NOHOST";
        return $status;
    }

    // The Host header only carries the port when it is not 80.
    $portq = ($port == 80) ? "" : ":$port";

    $all = "*/*"; //just to prevent "comment effect" in get accept
    $request = "HEAD $path HTTP/1.1\r\nHost: $host$portq\r\nAccept: $all\r\nAccept-Charset: iso-8859-1\r\nAccept-Encoding: identity\r\nUser-Agent: $user_agent\r\n\r\n";

    // https URLs are tunnelled through PHP's ssl:// transport.
    if (substr($url, 0, 5) == "https") {
        $target = "ssl://".$host;
    } else {
        $target = $host;
    }

    $fsocket_timeout = 30;
    $errno = 0;
    $errstr = "";
    $fp = fsockopen($target, $port, $errno, $errstr, $fsocket_timeout);
    if (!$fp) {
        // Return here: there is no socket to read from or to close.
        $status['state'] = "NOHOST";
        return $status;
    }

    stream_set_timeout($fp, $fsocket_timeout);
    fputs($fp, $request);
    $answer = fgets($fp, 4096);

    // Status line: $httpcode is the status class digit, $full_httpcode the
    // three-digit code. Both stay 0 when the line cannot be parsed.
    $httpcode = 0;
    $full_httpcode = 0;
    $regs = array();
    if (preg_match('/HTTP\/[0-9.]+ (([0-9])[0-9]{2})/', (string) $answer, $regs)) {
        $httpcode = $regs[2];
        $full_httpcode = $regs[1];
        if ($httpcode != 2 && $httpcode != 3) {
            $status['state'] = "Unreachable: http $full_httpcode";
            fclose($fp);
            return $status;
        }
    }

    // Walk the headers up to and including Content-Type.
    $content = "";
    while ($answer) {
        $answer = fgets($fp, 4096);
        // Stop on read failure or on the blank line that ends the headers;
        // a HEAD response has nothing after it, and on a kept-alive
        // connection a further read would block until the timeout.
        if ($answer === false || trim($answer) === "") {
            break;
        }
        // 302 is deliberately not reported as a relocation.
        if (preg_match('/Location: *([^\n\r ]+)/', $answer, $regs) && $httpcode == 3 && $full_httpcode != 302) {
            $status['path'] = $regs[1];
            $status['state'] = "Relocation: http $full_httpcode";
            fclose($fp);
            return $status;
        }
        if (preg_match('/Last-Modified: *([a-z0-9,: ]+)/i', $answer, $regs)) {
            $status['date'] = $regs[1];
        }
        if (preg_match('/Content-Type:/i', $answer)) {
            $content = $answer;
            break;
        }
    }

    $socket_status = stream_get_meta_data($fp);
    fclose($fp);

    if (preg_match('/Content-Type: *([a-z\/]*)/i', $content, $regs)) {
        // Media types are case-insensitive, so compare in lower case.
        $type = strtolower($regs[1]);
        if ($type == 'text/html' || $type == 'text/' || $type == 'text/plain') {
            $status['content'] = 'text';
            $status['state'] = 'ok';
        } elseif ($type == 'application/pdf' && $index_pdf == 1) {
            $status['content'] = 'pdf';
            $status['state'] = 'ok';
        } elseif ($type == 'application/msword' && $index_doc == 1) {
            $status['content'] = 'doc';
            $status['state'] = 'ok';
        } else {
            $status['state'] = "Not text or html";
        }
    } elseif ($socket_status['timed_out']) {
        $status['state'] = "Timed out (no reply from server)";
    } else {
        $status['state'] = "Not text or html";
    }

    return $status;
}
/*
Read the robots.txt file on the server to find any disallowed files/folders
*/
/**
 * Collect the Disallow rules from a host's robots.txt that apply to this
 * crawler (User-agent "*" or "Sphider").
 *
 * The file is always requested as http://<host>/robots.txt, regardless of
 * the scheme or port of $url.
 *
 * @param string $url Any URL on the host to check.
 * @return array|null List of disallowed path prefixes (empty when there is
 *                    no usable robots.txt); null as soon as an applicable
 *                    empty "Disallow:" line is found.
 */
function check_robot_txt($url) {
    $urlparts = parse_url($url);
    $url = 'http://'.$urlparts['host']."/robots.txt";
    $url_status = url_status($url);
    $omit = array();

    if ($url_status['state'] != "ok") {
        return $omit;
    }

    // Prefer PHP's URL wrapper; fall back to the raw socket fetch when it
    // yields nothing.
    $robot = file($url);
    if (!$robot) {
        $contents = getFileContents($url);
        $file = isset($contents['file']) ? $contents['file'] : "";
        $robot = explode("\n", $file);
    }

    $regs = array();
    // $check is 1 while the current User-agent section applies to us. It
    // starts at 0 so rules that precede any User-agent line are ignored.
    $check = 0;
    foreach ($robot as $line) {
        if (preg_match('/^user-agent: *([^#]+) */i', $line, $regs)) {
            $user_agent = trim($regs[1]);
            $check = ($user_agent == '*' || $user_agent == 'Sphider') ? 1 : 0;
        }
        if ($check == 1 && preg_match('/disallow: *([^#]+)/i', $line, $regs)) {
            // Strip blanks and line endings, including the "\r" of CRLF
            // files, which would otherwise stay glued to the path.
            $disallow_str = preg_replace('/[\r\n ]+/', "", $regs[1]);
            if (trim($disallow_str) != "") {
                $omit[] = $disallow_str;
            } else {
                // $check == 1 already implies the section is ours.
                return null;
            }
        }
    }

    return $omit;
}
/*
Remove the file part from a URL (so that a relative path can be appended to the resulting base URL)
*/
/**
 * Strip the trailing file name from a URL, leaving scheme, host and the
 * directory part of the path (including its trailing slash).
 *
 * Query string and fragment are not part of the result.
 * NOTE(review): an explicit port is dropped as well, as in the previous
 * implementation - confirm that callers do not need it.
 *
 * @param string $url Absolute URL.
 * @return string scheme://host/dir/ (or scheme://host when there is no path).
 */
function remove_file_from_url($url) {
    $url_parts = parse_url($url);
    $path = isset($url_parts['path']) ? $url_parts['path'] : "";

    // Cut everything after the last slash with plain string functions; the
    // file name may contain regex metacharacters such as "(" or "+", so it
    // must not be used as a pattern.
    $last_slash = strrpos($path, "/");
    if ($last_slash === false) {
        $path = "";
    } else {
        $path = substr($path, 0, $last_slash + 1);
    }

    return $url_parts['scheme']."://".$url_parts['host'].$path;
}
/*
Extract links from html
*/
function get_links($file, $url, $can_leave_domain) {
$chunklist = array ();
$chunklist = explode("\n", $file);
$links = array ();
$regs = Array ();
while (list ($id, $chunk) = each($chunklist)) {
if (strstr(strtolower($chunk), "href")) {
while (preg_match("/(href)\s*=\s*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/\?~=&;\\\(\),._a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?(\s*rel\s*=\s*[\'\"]?(nofollow)[\'\"]?)?/i", $chunk, $regs)) {
if (!isset ($regs[10])) { //if nofollow is not set
if (($a = url_purify($regs[2], $url, $can_leave_domain)) != '')
$links[] = $a;
}
$chunk = str_replace($regs[0], "", $chunk);
}
}
if (strstr(strtolower($chunk), "frame") && strstr(strtolower($chunk), "src")) {
while (eregi("(frame[^>]*src[[:blank:]]*)=[[:blank:]]*[\'\"]?(([[a-z]{3,5}://(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%/?=&;\\\(\),._a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?", $chunk, $regs)) {
if (($a = url_purify($regs[2], $url, $can_leave_domain)) != '')
$links[] = $a;
$chunk = str_replace($regs[0], "", $chunk);
}
}
if (strstr(strtolower($chunk), "window") && strstr(strtolower($chunk), "location")) {
while (eregi("(window[.]location)[[:blank:]]*=[[:blank:]]*[\'\"]?(([[a-z]{3,5}://(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%/?=&;\\\(\),._a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?", $chunk, $regs)) {
if (($a = url_purify($regs[2], $url, $can_leave_domain)) != '')
$links[] = $a;
$chunk = str_replace($regs[0], "", $chunk);
}
}
if (strstr(strtolower($chunk), "http-equiv")) {
while (eregi("(http-equiv=['\"]refresh['\"] *content=['\"][0-9]+;url)[[:blank:]]*=[[:blank:]]*[\'\"]?(([[a-z]{3,5}://(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%/?=&;\\\(\),._a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?", $chunk, $regs)) {
if (($a = url_purify($regs[2], $url, $can_leave_domain)) != '')
$links[] = $a;
$chunk = str_replace($regs[0], "", $chunk);
}
}
if (strstr(strtolower($chunk), "window") && strstr(strtolower($chunk), "open")) {
while (eregi("(window[.]open[[:blank:]]*[(])[[:blank:]]*[\'\"]?(([[a-z]{3,5}://(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%/?=&;\\\(\),._a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?", $chunk, $regs)) {
if (($a = url_purify($regs[2], $url, $can_leave_domain)) != '')
$links[] = $a;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -