⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 spider.php

📁 开源的蜘蛛程序
💻 PHP
📖 第 1 页 / 共 2 页
字号:
							$query = "update ".$mysql_table_prefix."links set title='$title', description ='$desc', fulltxt = '$fulltxt', indexdate=now(), size = '$pageSize', md5sum='$newmd5sum', level=$thislevel where link_id=$link_id";
							mysql_query($query);
							echo mysql_error();
							if ($command_line == 0) {
								$report = " <font color=\"green\">Re-indexed</font><br>\n";
							} else {
								$report = " Re-indexed\n";
							}
						}
					}else {
						if ($command_line == 0) { 
							$report = " <font color=\"red\">Page contains less than $min_words_per_page words</font><br>\n";
						} else {
						   $report = " Page contains less than 10 words.\n";
						}
					}
				}
			}
		} else {
			$deletable = 1;
			if ($command_line == 0) { 
				$report = " <font color=red><b>" . $url_status['state'] . "</b></font><br>\n";
			} else {
			   $report = $url_status['state']."\n";
			}
		}
		if ($reindex ==1 && $deletable == 1) {
			check_for_removal($url); 
		} else if ($reindex == 1) {
			
		}
		printLinksReport($numoflinks, $all_links, $command_line);
		printStatusReport ($report);
	}


	/**
	 * Crawl one site level by level, starting at $url, and hand every
	 * eligible link to index_url().
	 *
	 * Steps, in order:
	 *  1. Normalise $url (append "/" when it has no path) and check that the
	 *     host accepts a TCP connection; on failure the connect error is
	 *     reported and the whole script terminates via die().
	 *  2. Look up the site row by URL: insert it when missing, otherwise
	 *     refresh its settings. When $reindex == 1 the links already stored
	 *     for the site are queued in the temp table under a new session id.
	 *  3. If a row exists in the pending table for the site, resume from the
	 *     session id / level / count / num saved there; otherwise queue $url
	 *     at level 0.
	 *  4. Walk the temp table one level at a time. A link is skipped when it
	 *     matches an entry returned by check_robot_txt() or is rejected by
	 *     check_include(); otherwise it is passed to index_url() if it is
	 *     not in the links table yet, or if $reindex == 1.
	 *  5. Delete this run's rows from the temp and pending tables.
	 *
	 * @param string $url              Start URL of the site.
	 * @param int    $reindex          1 to re-index links that are already stored.
	 * @param int    $maxlevel         Highest level crawled when $soption is 'level';
	 *                                 also stored as the site's spider_depth.
	 * @param string $soption          'level' (stop after $maxlevel) or 'full'
	 *                                 (continue until a level has no queued links).
	 * @param string $url_inc          Stored as "required" and passed to check_include().
	 * @param string $url_not_inc      Stored as "disallowed" and passed to check_include().
	 * @param int    $can_leave_domain Stored with the site and forwarded to index_url().
	 */
	function index_site($url, $reindex, $maxlevel, $soption, $url_inc, $url_not_inc, $can_leave_domain) {
		global $mysql_table_prefix, $command_line, $mainurl;

		// A URL without a path component gets a trailing slash.
		$urlparts = parse_url($url);
		if (!isset($urlparts['path']) || $urlparts['path'] == '') {
			$url = $url . "/";
			$urlparts = parse_url($url);
		}

		// Identifier that tags this run's rows in the temp table.
		$sessid = md5(microtime() . getenv("REMOTE_ADDR"));

		$domain = isset($urlparts['host']) ? $urlparts['host'] : '';
		$port = isset($urlparts['port']) ? (int)$urlparts['port'] : 80;

		// Reachability check only: open a socket and close it again.
		$errno = 0;
		$errmsg = "";
		$fp = fsockopen($domain, $port, $errno, $errmsg);
		if (!$fp) {
			printConnectErrorReport($errmsg);
			die();
		}
		fclose($fp);

		// SQL-safe copies. The originals are still used for everything that
		// is not SQL (parsing, reporting, calls to the other spider functions).
		$url_sql = mysql_real_escape_string($url);
		$url_inc_sql = mysql_real_escape_string($url_inc);
		$url_not_inc_sql = mysql_real_escape_string($url_not_inc);
		$maxlevel_sql = (int)$maxlevel;
		$can_leave_domain_sql = (int)$can_leave_domain;

		$result = mysql_query("select site_id from ".$mysql_table_prefix."sites where url='$url_sql'");
		echo mysql_error();
		$row = $result ? mysql_fetch_row($result) : false;
		$site_id = $row ? $row[0] : '';
		$site_id_sql = (int)$site_id;

		if ($site_id == '') {
			// Unknown site: register it, then read back the generated id.
			mysql_query ("insert into ".$mysql_table_prefix."sites (url, indexdate, spider_depth, required, disallowed, can_leave_domain) " .
					"values ('$url_sql', now(), $maxlevel_sql, '$url_inc_sql', '$url_not_inc_sql', $can_leave_domain_sql)");
			echo mysql_error();
			$result = mysql_query("select site_ID from ".$mysql_table_prefix."sites where url='$url_sql'");
			$row = $result ? mysql_fetch_row($result) : false;
			$site_id = $row ? $row[0] : '';
			$site_id_sql = (int)$site_id;
		} else {
			if ($reindex == 1) {
				// Queue the start URL plus every link already stored for the site.
				mysql_query ("insert into ".$mysql_table_prefix."temp (link, level, id) values ('$url_sql', 0, '$sessid')");
				echo mysql_error();
				$result = mysql_query("select url, level from ".$mysql_table_prefix."links where site_id = $site_id_sql");
				if ($result) {
					while ($row = mysql_fetch_array($result)) {
						$site_link = $row['url'];
						if ($site_link != $url) {
							$site_link_sql = mysql_real_escape_string($site_link);
							$link_level = (int)$row['level'];
							mysql_query ("insert into ".$mysql_table_prefix."temp (link, level, id) values ('$site_link_sql', $link_level, '$sessid')");
						}
					}
				}
			}

			// Known site: refresh its stored settings.
			mysql_query ("update ".$mysql_table_prefix."sites set indexdate=now(), spider_depth = $maxlevel_sql, required = '$url_inc_sql'," .
					"disallowed = '$url_not_inc_sql', can_leave_domain=$can_leave_domain_sql where site_id=$site_id_sql");
			echo mysql_error();
		}

		// Resume a suspended run when the pending table has a row for this site.
		$result = mysql_query("select site_id, temp_id, level, count, num from ".$mysql_table_prefix."pending where site_id=$site_id_sql");
		echo mysql_error();
		$row = $result ? mysql_fetch_row($result) : false;

		$pending = 0;
		$level = 0;
		$pend_count = 0;
		$num = 0;
		if (!$row || $row[0] == '') {
			mysql_query ("insert into ".$mysql_table_prefix."temp (link, level, id) values ('$url_sql', 0, '$sessid')");
			echo mysql_error();
		} else {
			print "<br>Continuing suspended indexing <br>";
			$sessid = $row[1];
			$level = (int)$row[2];
			$pend_count = $row[3] + 1;	// continue with the link after the saved one
			$num = (int)$row[4];
			$pending = 1;
		}

		// From here on $sessid may come from the database, so escape it.
		$sessid_sql = mysql_real_escape_string($sessid);

		if ($reindex != 1) {
			mysql_query ("insert into ".$mysql_table_prefix."pending (site_id, temp_id, level, count) values ('$site_id_sql', '$sessid_sql', '0', '0')");
			echo mysql_error();
		}

		$omit = check_robot_txt($url);

		printHeader ($omit, $url, $command_line);

		$mainurl = $url;

		while (($level <= $maxlevel && $soption == 'level') || ($soption == 'full')) {
			// Only the first pass of a resumed run starts part-way through a level.
			if ($pending == 1) {
				$count = $pend_count;
				$pending = 0;
			} else {
				$count = 0;
			}

			$result = mysql_query("select distinct link from ".$mysql_table_prefix."temp where level=$level && id='$sessid_sql' order by link");
			echo mysql_error();
			if (!$result || mysql_num_rows($result) == 0) {
				// Nothing queued at this level: the crawl is finished.
				break;
			}

			$links = array();
			while ($row = mysql_fetch_array($result)) {
				$links[] = $row['link'];
			}
			$total = count($links);

			while ($count < $total) {
				$num++;
				$thislink = $links[$count];
				$linkparts = parse_url($thislink);
				$link_host = isset($linkparts['host']) ? $linkparts['host'] : '';

				$forbidden = 0;
				foreach ($omit as $omiturl) {
					$omiturl = trim($omiturl);

					// Entries without a scheme are treated as paths on the link's host.
					$omiturl_parts = parse_url($omiturl);
					if (empty($omiturl_parts['scheme'])) {
						$check_omit = $link_host . $omiturl;
					} else {
						$check_omit = $omiturl;
					}

					// strpos() returns 0 for a match at the start of the link, so
					// compare against false explicitly. An empty needle is never a match.
					if ($check_omit !== '' && strpos($thislink, $check_omit) !== false) {
						printRobotsReport($num, $thislink, $command_line);
						check_for_removal($thislink);
						$forbidden = 1;
						break;
					}
				}

				if (!check_include($thislink, $url_inc, $url_not_inc )) {
					printUrlStringReport($num, $thislink, $command_line);
					check_for_removal($thislink);
					$forbidden = 1;
				}

				if ($forbidden == 0) {
					printRetrieving($num, $thislink, $command_line);
					$thislink_sql = mysql_real_escape_string($thislink);
					$link_result = mysql_query("select md5sum, indexdate from ".$mysql_table_prefix."links where url='$thislink_sql'");
					echo mysql_error();
					$rows = $link_result ? mysql_num_rows($link_result) : 0;

					$indexed = false;
					if ($rows == 0) {
						// Link not stored yet: index it as new.
						index_url($thislink, $level+1, $site_id, '',  $domain, '', $sessid, $can_leave_domain, $reindex);
						$indexed = true;
					} else if ($reindex == 1) {
						// Link already stored: pass its checksum and date on to index_url().
						$row = mysql_fetch_array($link_result);
						$md5sum = $row['md5sum'];
						$indexdate = $row['indexdate'];
						index_url($thislink, $level+1, $site_id, $md5sum,  $domain, $indexdate, $sessid, $can_leave_domain, $reindex);
						$indexed = true;
					} else {
						printDatabaseReport($command_line);
					}

					if ($indexed) {
						// Save progress so a suspended run can be continued later.
						mysql_query("update ".$mysql_table_prefix."pending set level = $level, count=$count, num=$num where site_id=$site_id_sql");
						echo mysql_error();
					}
				}
				$count++;
			}
			$level++;
		}

		// The run is complete: drop its queue and its progress marker.
		mysql_query ("delete from ".$mysql_table_prefix."temp where id = '$sessid_sql'");
		echo mysql_error();
		mysql_query ("delete from ".$mysql_table_prefix."pending where site_id = '$site_id_sql'");
		echo mysql_error();
		printCompletedReport($command_line);

	}

	/**
	 * Run index_site() in re-index mode for every row of the sites table.
	 *
	 * Each site's stored spider_depth, required, disallowed and
	 * can_leave_domain values are passed through. A spider_depth of -1
	 * selects the 'full' option, any other value selects 'level'. An empty
	 * can_leave_domain is passed as 0.
	 */
	function index_all() {
		global $mysql_table_prefix;

		$sites = mysql_query("select url, spider_depth, required, disallowed, can_leave_domain from ".$mysql_table_prefix."sites");
		echo mysql_error();

		while ($site = mysql_fetch_row($sites)) {
			list($site_url, $spider_depth, $must_include, $must_not_include, $leave_domain) = $site;

			// An unset flag is passed on as 0.
			if ($leave_domain == '') {
				$leave_domain = 0;
			}

			$mode = ($spider_depth == -1) ? 'full' : 'level';

			index_site($site_url, 1, $spider_depth, $mode, $must_include, $must_not_include, $leave_domain);
		}
	}

// Top-level call that follows the function definitions above. printQuitReport()
// is defined outside this excerpt; presumably it emits the closing part of the
// report for the mode given by $command_line -- TODO confirm.
printQuitReport($command_line);
?>

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -