📄 spiderfuncs.php

📁 开源的蜘蛛程序
💻 PHP
📖 第 1 页 / 共 2 页
字号:
12 下一页
<?
/**
 * Download a document over HTTP or HTTPS with a raw-socket GET request.
 *
 * Sends an HTTP/1.0 request carrying the global $user_agent, reads the whole
 * response and strips the header section from it.
 *
 * @param string $url absolute URL of the document to fetch
 * @return array 'state' is "ok", "timeout", "NOHOST" or "Cannot send request";
 *               'file' (the response body) is only set for "ok" and "timeout".
 */
function getFileContents($url) {
	global $user_agent;
	$contents = array ();

	$urlparts = parse_url($url);
	if (!is_array($urlparts)) {
		// parse_url() returns false for seriously malformed URLs
		$urlparts = array ();
	}
	$host = isset ($urlparts['host']) ? $urlparts['host'] : "";
	$scheme = isset ($urlparts['scheme']) ? strtolower($urlparts['scheme']) : "";

	// "http://example.com" has no path component; request the root document
	// instead of sending a request line with an empty target
	$path = (isset ($urlparts['path']) && $urlparts['path'] != "") ? $urlparts['path'] : "/";
	if (isset ($urlparts['query']) && $urlparts['query'] != "")
		$path .= "?".$urlparts['query'];

	if (isset ($urlparts['port'])) {
		$port = (int) $urlparts['port'];
	} else if ($scheme == "https") {
		$port = 443;
	} else {
		// http and any unrecognised scheme fall back to the plain HTTP port
		$port = 80;
	}

	if ($port == 80) {
		$portq = "";
	} else {
		$portq = ":$port";
	}

	$all = "*/*";

	$request = "GET $path HTTP/1.0\r\nHost: $host$portq\r\nAccept: $all\r\nAccept-Encoding: identity\r\nUser-Agent: $user_agent\r\n\r\n";

	$fsocket_timeout = 30;
	if ($scheme == "https") {
		$target = "ssl://".$host;
	} else {
		$target = $host;
	}

	$errno = 0;
	$errstr = "";
	if ($host == "") {
		// nothing to connect to; report it through the normal failure path
		$fp = false;
		$errstr = "no host in URL";
	} else {
		$fp = @ fsockopen($target, $port, $errno, $errstr, $fsocket_timeout);
	}

	if (!$fp) {
		$contents['state'] = "NOHOST";
		print "Error: $errstr";
		return $contents;
	}

	if (!fputs($fp, $request)) {
		// close the socket before bailing out so the connection is not leaked
		fclose($fp);
		$contents['state'] = "Cannot send request";
		return $contents;
	}

	$data = "";
	stream_set_timeout($fp, $fsocket_timeout);
	$status = stream_get_meta_data($fp);
	while (!feof($fp)) {
		$line = fgets($fp, 8192);
		if ($line !== false) {
			$data .= $line;
		}
		// the timed_out flag only changes after a read, so it has to be
		// re-fetched on every iteration
		$status = stream_get_meta_data($fp);
		if ($line === false || $status['timed_out']) {
			break;
		}
	}
	fclose($fp);

	if ($status['timed_out']) {
		$contents['state'] = "timeout";
	} else {
		$contents['state'] = "ok";
	}

	// the body starts after the first blank line; without one there is no body
	$header_end = strpos($data, "\r\n\r\n");
	if ($header_end === false) {
		$contents['file'] = "";
	} else {
		$contents['file'] = (string) substr($data, $header_end + 4);
	}

	return $contents;
}

/*
Check whether a URL is reachable and served with a content type the spider can read
*/
/**
 * Probe a URL with an HTTP HEAD request and classify the answer.
 *
 * Uses the globals $user_agent (sent as User-Agent) and $index_pdf /
 * $index_doc (when equal to 1, PDF / MS Word documents are accepted).
 *
 * @param string $url absolute URL to probe
 * @return array 'state' is one of "ok", "NOHOST", "Unreachable: http NNN",
 *               "Relocation: http NNN", "Not text or html" or
 *               "Timed out (no reply from server)".
 *               'content' ("text", "pdf" or "doc") accompanies "ok",
 *               'path' (the Location target) accompanies a relocation, and
 *               'date' is set when a Last-Modified header was received.
 */
function url_status($url) {
	global $user_agent, $index_pdf, $index_doc;
	$status = array ();

	$urlparts = parse_url($url);
	if (!is_array($urlparts)) {
		// parse_url() returns false for seriously malformed URLs
		$urlparts = array ();
	}
	$host = isset ($urlparts['host']) ? $urlparts['host'] : "";
	$scheme = isset ($urlparts['scheme']) ? strtolower($urlparts['scheme']) : "";
	if ($host == "") {
		$status['state'] = "NOHOST";
		return $status;
	}

	// a URL without a path component addresses the root document
	$path = (isset ($urlparts['path']) && $urlparts['path'] != "") ? $urlparts['path'] : "/";
	if (isset ($urlparts['query']) && $urlparts['query'] != "")
		$path .= "?".$urlparts['query'];

	if (isset ($urlparts['port'])) {
		$port = (int) $urlparts['port'];
	} else if ($scheme == "https") {
		$port = 443;
	} else {
		$port = 80;
	}

	if ($port == 80) {
		$portq = "";
	} else {
		$portq = ":$port";
	}

	$all = "*/*"; //just to prevent "comment effect" in get accept
	$request = "HEAD $path HTTP/1.1\r\nHost: $host$portq\r\nAccept: $all\r\nAccept-Charset: iso-8859-1\r\nAccept-Encoding: identity\r\nUser-Agent: $user_agent\r\n\r\n";

	if ($scheme == "https") {
		$target = "ssl://".$host;
	} else {
		$target = $host;
	}

	$fsocket_timeout = 30;
	$errno = 0;
	$errstr = "";
	$fp = fsockopen($target, $port, $errno, $errstr, $fsocket_timeout);
	if (!$fp) {
		// return right away: there is no handle to read from or to close
		$status['state'] = "NOHOST";
		return $status;
	}

	stream_set_timeout($fp, $fsocket_timeout);
	fputs($fp, $request);
	$answer = fgets($fp, 4096);
	$socket_status = stream_get_meta_data($fp);

	// first digit and full value of the HTTP status code (empty if unparsable)
	$httpcode = "";
	$full_httpcode = "";
	$regs = array ();
	if ($answer !== false && preg_match('/HTTP\/[0-9.]+ (([0-9])[0-9]{2})/', $answer, $regs)) {
		$httpcode = $regs[2];
		$full_httpcode = $regs[1];

		if ($httpcode != 2 && $httpcode != 3) {
			fclose($fp);
			$status['state'] = "Unreachable: http $full_httpcode";
			return $status;
		}
	}

	// Read the complete header section first so the verdict does not depend
	// on the order in which the server emits its headers.
	$location = "";
	$content_type = null;
	while ($answer !== false && !feof($fp)) {
		$answer = fgets($fp, 4096);
		$socket_status = stream_get_meta_data($fp);
		// a blank line ends the headers; a HEAD reply carries no body, so
		// reading on would only block until the socket times out
		if ($answer === false || $socket_status['timed_out'] || trim($answer) == "") {
			break;
		}

		if ($location == "" && preg_match('/^Location: *([^\n\r ]+)/i', $answer, $regs)) {
			$location = $regs[1];
		}

		if (preg_match('/^Last-Modified: *([a-z0-9,: ]+)/i', $answer, $regs)) {
			$status['date'] = $regs[1];
		}

		if ($content_type === null && preg_match('/^Content-Type: *([a-z\/]*)/i', $answer, $regs)) {
			$content_type = strtolower($regs[1]);
		}
	}
	fclose($fp);

	// 3xx answers other than 302 are reported as a relocation to the new path
	if ($location != "" && $httpcode == 3 && $full_httpcode != 302) {
		$status['path'] = $location;
		$status['state'] = "Relocation: http $full_httpcode";
		return $status;
	}

	if ($content_type !== null) {
		if ($content_type == 'text/html' || $content_type == 'text/' || $content_type == 'text/plain') {
			$status['content'] = 'text';
			$status['state'] = 'ok';
		} else if ($content_type == 'application/pdf' && $index_pdf == 1) {
			$status['content'] = 'pdf';
			$status['state'] = 'ok';
		} else if ($content_type == 'application/msword' && $index_doc == 1) {
			$status['content'] = 'doc';
			$status['state'] = 'ok';
		} else {
			$status['state'] = "Not text or html";
		}
	} else if ($socket_status['timed_out']) {
		$status['state'] = "Timed out (no reply from server)";
	} else {
		$status['state'] = "Not text or html";
	}

	return $status;
}

/*
Read robots.txt file in the server, to find any disallowed files/folders
*/
/**
 * Collect the paths that the site's robots.txt disallows for this spider.
 *
 * Only rule groups whose User-agent is "*" or "Sphider" are honoured. The
 * robots.txt is requested from the same scheme, host and port as $url.
 *
 * @param string $url any URL on the site to be checked
 * @return array|null list of disallowed path strings (empty when robots.txt
 *                    is unavailable or has no applicable rules); null as soon
 *                    as an applicable group contains an empty "Disallow:"
 */
function check_robot_txt($url) {
	$omit = array ();

	$urlparts = parse_url($url);
	if (!is_array($urlparts) || !isset ($urlparts['host']) || $urlparts['host'] == "") {
		// without a host there is no robots.txt to consult
		return $omit;
	}

	// robots.txt applies per scheme/host/port; asking plain http for an https
	// site usually answers with a redirect, which url_status() does not
	// report as "ok", so the rules would be silently skipped
	$scheme = (isset ($urlparts['scheme']) && strtolower($urlparts['scheme']) == "https") ? "https" : "http";
	$robots_url = $scheme."://".$urlparts['host'];
	if (isset ($urlparts['port'])) {
		$robots_url .= ":".$urlparts['port'];
	}
	$robots_url .= "/robots.txt";

	$url_status = url_status($robots_url);

	if ($url_status['state'] == "ok") {
		$robot = file($robots_url);
		if (!$robot) {
			// file() needs allow_url_fopen; fall back to the socket fetcher
			$contents = getFileContents($robots_url);
			$file = isset ($contents['file']) ? $contents['file'] : "";
			$robot = explode("\n", $file);
		}

		$regs = array ();
		$user_agent = "";
		// stays 0 until a User-agent line addressing this spider is seen
		$check = 0;
		foreach ($robot as $line) {
			if (preg_match('/^user-agent: *([^#]+) */i', $line, $regs)) {
				$user_agent = trim($regs[1]);
				if ($user_agent == '*' || $user_agent == 'Sphider')
					$check = 1;
				else
					$check = 0;
			}

			if ($check == 1 && preg_match('/disallow: *([^#]+)/i', $line, $regs)) {
				// strip every whitespace character, including the "\r" that
				// files with CRLF line endings leave at the end of the path
				$disallow_str = preg_replace('/\s+/', "", $regs[1]);
				if ($disallow_str != "") {
					$omit[] = $disallow_str;
				} else {
					// an empty Disallow in an applicable group returns null
					return null;
				}
			}
		}
	}

	return $omit;
}

/*
Remove the file part from a URL (to build a new URL from a base URL and a given relative path)
*/
/**
 * Reduce a URL to its directory: scheme, host, optional port and the path up
 * to and including the last "/". Query string and fragment are dropped.
 *
 * @param string $url absolute URL, e.g. "http://example.com/dir/page.html"
 * @return string base URL, e.g. "http://example.com/dir/"; a URL without any
 *                path is returned as scheme://host with no trailing slash
 */
function remove_file_from_url($url) {
	$url_parts = parse_url($url);
	if (!is_array($url_parts)) {
		// parse_url() returns false for seriously malformed URLs
		$url_parts = array ();
	}
	$scheme = isset ($url_parts['scheme']) ? $url_parts['scheme'] : "";
	$host = isset ($url_parts['host']) ? $url_parts['host'] : "";
	$path = isset ($url_parts['path']) ? $url_parts['path'] : "";

	// Cut after the last slash with plain string functions: the file name can
	// contain regex metacharacters ("+", "(", ".") and must not be spliced
	// into a pattern.
	$last_slash = strrpos($path, "/");
	if ($last_slash === false) {
		$path = "";
	} else {
		$path = substr($path, 0, $last_slash + 1);
	}

	// keep an explicit non-default port so URLs built from the result still
	// point at the same server
	$portq = "";
	if (isset ($url_parts['port'])) {
		$port = (int) $url_parts['port'];
		$default_port = (strtolower($scheme) == "https") ? 443 : 80;
		if ($port != $default_port) {
			$portq = ":".$port;
		}
	}

	return $scheme."://".$host.$portq.$path;
}

/*
Extract links from html
*/
function get_links($file, $url, $can_leave_domain) {
	$chunklist = array ();

	$chunklist = explode("\n", $file);
	$links = array ();
	$regs = Array ();
	while (list ($id, $chunk) = each($chunklist)) {
		if (strstr(strtolower($chunk), "href")) {
			while (preg_match("/(href)\s*=\s*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/\?~=&;\\\(\),._a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?(\s*rel\s*=\s*[\'\"]?(nofollow)[\'\"]?)?/i", $chunk, $regs)) {
				if (!isset ($regs[10])) { //if nofollow is not set
					if (($a = url_purify($regs[2], $url, $can_leave_domain)) != '')
						$links[] = $a;
				}

				$chunk = str_replace($regs[0], "", $chunk);
			}
		}

		if (strstr(strtolower($chunk), "frame") && strstr(strtolower($chunk), "src")) {
			while (eregi("(frame[^>]*src[[:blank:]]*)=[[:blank:]]*[\'\"]?(([[a-z]{3,5}://(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%/?=&;\\\(\),._a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?", $chunk, $regs)) {
				if (($a = url_purify($regs[2], $url, $can_leave_domain)) != '')
					$links[] = $a;

				$chunk = str_replace($regs[0], "", $chunk);
			}
		}

		if (strstr(strtolower($chunk), "window") && strstr(strtolower($chunk), "location")) {
			while (eregi("(window[.]location)[[:blank:]]*=[[:blank:]]*[\'\"]?(([[a-z]{3,5}://(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%/?=&;\\\(\),._a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?", $chunk, $regs)) {
				if (($a = url_purify($regs[2], $url, $can_leave_domain)) != '')
					$links[] = $a;

				$chunk = str_replace($regs[0], "", $chunk);
			}
		}

		if (strstr(strtolower($chunk), "http-equiv")) {
			while (eregi("(http-equiv=['\"]refresh['\"] *content=['\"][0-9]+;url)[[:blank:]]*=[[:blank:]]*[\'\"]?(([[a-z]{3,5}://(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%/?=&;\\\(\),._a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?", $chunk, $regs)) {
				if (($a = url_purify($regs[2], $url, $can_leave_domain)) != '')
					$links[] = $a;

				$chunk = str_replace($regs[0], "", $chunk);
			}
		}

		if (strstr(strtolower($chunk), "window") && strstr(strtolower($chunk), "open")) {
			while (eregi("(window[.]open[[:blank:]]*[(])[[:blank:]]*[\'\"]?(([[a-z]{3,5}://(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%/?=&;\\\(\),._a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?", $chunk, $regs)) {
				if (($a = url_purify($regs[2], $url, $can_leave_domain)) != '')
					$links[] = $a;
12 下一页
💿 文件大小 41 K
👤 上传用户 cal04
📂 所属分类 *行业应用
🏷️ 相关标签

#开源 #程序
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -