⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 spider.php

📁 开源的蜘蛛程序
💻 PHP
📖 第 1 页 / 共 2 页
字号:
<?
/*******************************************
* Sphider Version 1.2
* This program is licensed under the GNU GPL.
* By Ando Saabas		  ando(a t)cs.ioc.ee
*
* Thanks to Antoine Bajolet for ideas and
* several code pieces
********************************************/
	
	/*
	 * Entry point of the spider.
	 *
	 * Can be driven by request variables (imported via getHttpVars()) or from
	 * the command line:
	 *
	 *   php spider.php all                        - calls index_all()
	 *   php spider.php <url> [full] [reindex] [N] - calls index_site() for <url>
	 *
	 * Command-line options, as parsed below:
	 *   full    - sets $soption to 'full' (which forces $maxlevel to -1)
	 *   reindex - sets $reindex to 1; the stored settings of the site are then
	 *             loaded from the "sites" table
	 *   N       - any argument accepted by is_num() becomes $maxlevel and
	 *             switches $soption to 'level'
	 */

	// Remove the script execution time limit.
	set_time_limit (0);
	// Report everything except notices.
	error_reporting (E_ALL ^ E_NOTICE);
	$include_dir = "../include";
	include "auth.php";
	include "$include_dir/commonfuncs.php";
	include "messages.php";
	include "$include_dir/conf.php";
	include "spiderfuncs.php";

	$all = 0; 
	
	// NOTE(review): extract() overwrites existing globals by default, so any
	// key returned by getHttpVars() (presumably the request variables - confirm
	// in commonfuncs.php) replaces a same-named variable, including $all above.
	extract (getHttpVars());
	
	$command_line = 0;
	
	if (isset($_SERVER['argv']) && $_SERVER['argc'] >= 2) {
		// Any invocation carrying at least one argument is a command-line run.
		$command_line = 1;
		if (strtolower($_SERVER['argv'][1]) == 'all') {
			$all = 1;
		} else {
			$url = $_SERVER['argv'][1];
			// Every argument is inspected (script name and url included),
			// exactly as before; only the option keywords and numeric values
			// have any effect.
			foreach ($_SERVER['argv'] as $arg) {
				if ($arg == 'full') {
					$soption = 'full';
				}
				if ($arg == 'reindex') {
					$reindex = 1;
				}
				if (is_num($arg)) {
					$maxlevel = $arg;
					$soption = 'level';
				}
			}
		}
	}

	// A full crawl is passed on as a depth of -1.
	if ($soption == 'full') {
		$maxlevel = -1;
	}
	
	if ($domaincb == '') {
		$domaincb = 0;
	}

	if ($all ==  1) {
		index_all();
	} else {
		if ($reindex == 1 && $command_line == 1) {
			// The url comes straight from argv, so escape it before it is
			// placed inside the SQL string literal.
			$safe_url = mysql_real_escape_string($url);
			$result=mysql_query("select url, spider_depth, required, disallowed, can_leave_domain from ".$mysql_table_prefix."sites where url='$safe_url'");
			echo mysql_error();
			// Only fetch when the query succeeded; mysql_query() returns
			// false on error and that must not be handed to mysql_fetch_row().
			if ($result && ($row=mysql_fetch_row($result))) {
				$url = $row[0];
				$maxlevel = $row[1];
				$in= $row[2];
				$out = $row[3];
				$domaincb = $row[4];
				if ($domaincb=='') {
					$domaincb=0;
				}
				// Derive the spidering mode from the stored depth. The
				// original tested $depth, which is never assigned in this
				// scope, so a stored depth of -1 was never recognised as a
				// full crawl.
				if ($maxlevel == -1) {
					$soption = 'full';
				} else {
					$soption = 'level';
				}
			}

		}

		index_site($url, $reindex, $maxlevel, $soption, $in, $out, $domaincb);
	}

	
	function index_url($url, $level, $site_id, $md5sum, $domain, $indexdate, $sessid, $can_leave_domain, $reindex) {
		global $entities;
		global $command_line;
		global $min_words_per_page;
		global $supdomain;
		global $mysql_table_prefix, $user_agent;
		$needsReindex = 1;
		$deletable = 0;
		$url_status = url_status($url);
		$thislevel = $level - 1;

		if (strstr($url_status['state'], "Relocation")) {
			$url = eregi_replace(" ", "", url_purify($url_status['path'], $url, $can_leave_domain));

			if ($url <> '') {
				$result = mysql_query("select link from ".$mysql_table_prefix."temp where link='$url' && id = '$sessid'");
				echo mysql_error();
				$rows = mysql_numrows($result);
				if ($rows == 0) {
					mysql_query ("insert into ".$mysql_table_prefix."temp (link, level, id) values ('$url', '$level', '$sessid')");
					echo mysql_error();
				}
			}

			$url_status['state'] == "redirected";
		}

		/*
		if ($indexdate <> '' && $url_status['date'] <> '') {
			if ($indexdate > $url_status['date']) {
				$url_status['state'] = "Date checked. Page contents not changed";
				$needsReindex = 0;
			}
		}*/
		ini_set("user_agent", $user_agent);
		if ($url_status['state'] == 'ok') {
			$OKtoIndex = 1;
			$fl = @fopen($url, "r");
			if ($fl) {
				while ($buffer = @fgets($fl, 4096)) {
					$file .= $buffer;
				
				}
				fclose ($fl);
			} else {
				$contents = getFileContents($url);
				$file = $contents['file'];
			}
			
		
			$pageSize = number_format(strlen($file)/1024, 2, ".", "");
			printPageSizeReport($pageSize);

			if ($url_status['content'] != 'text') {
				$file = extract_text($file, $url_status['content']);
			}
			
			printIndexStartReport();
		

			$newmd5sum = md5($file);
			
			if (isDuplicateMD5($newmd5sum)) {
				$OKtoIndex = 0;
				if ($command_line == 0) { 
					$report = " <font color=\"red\"><b>Page is a duplicate.</b></font><br>\n";
				} else {
					$report = " Page is a duplicate.\n";
				}
			}

			if ($md5sum == $newmd5sum) {
				if ($command_line == 0) { 
					$report = " <font color=\"red\"><b>MD5 sum checked. Page content not changed</b></font><br>\n";
				} else {
					$report = " MD5 sum checked. Page content not changed.\n";
				}
				$OKtoIndex = 0;
			}
			if (($md5sum != $newmd5sum || $reindex ==1) && $OKtoIndex == 1) {
				$urlparts = parse_url($url);
				$newdomain = $urlparts['host'];
				$type = 0;


				
		/*		if ($newdomain <> $domain)
					$domainChanged = 1;

				if ($domaincb==1) {
					$start = strlen($newdomain) - strlen($supdomain);
					if (substr($newdomain, $start) == $supdomain) {
						$domainChanged = 0;
					}
				}*/

				// remove link to css file
				//get all links from file
				$data = clean_file($file, $url, $url_status['content']);

				if ($data['noindex'] == 1) {
					$OKtoIndex = 0;
					$deletable = 1;
					if ($command_line == 0) {
						$report = " <font color=\"red\">No-Index flag set in meta tags.</font><br>\n";
					} else {
						$report = " No-Index flag set in meta tags.\n";
					}
				}

				$wordarray = unique_array(explode(" ", $data['content']));



				if ($data['nofollow'] != 1) {
					$links = get_links($file, $url, $can_leave_domain);
					$links = distinct_array($links);
					$all_links = count($links);
					$numoflinks = 0;
					//if there are any, add to the temp table, but only if there isnt such url already
					if (is_array($links)) {
						reset ($links);

						while ($thislink = each($links)) {
							$result = mysql_query("select link from ".$mysql_table_prefix."temp where link='$thislink[1]' && id ='$sessid'");
							echo mysql_error();
							$rows = mysql_numrows($result);
							if ($rows == 0) {
								$numoflinks++;
								mysql_query ("insert into ".$mysql_table_prefix."temp (link, level, id) values ('$thislink[1]', '$level', '$sessid')");
								echo mysql_error();
							}
						}
					}
				} else {
					printNoFollow($command_line);
				}
				
				if ($OKtoIndex == 1) {
					
					$title = $data['title'];
					$host = $data['host'];
					$path = $data['path'];
					$fulltxt = $data['fulltext'];
					$desc = $data['description'];
					$wordarray = calc_weights ($wordarray, $title, $host, $path, $data['keywords']);


					//if there are words to index, add the link to the database, get its id, and add the word + their relation
					if (is_array($wordarray) && count($wordarray) > $min_words_per_page) {
						if ($md5sum == '') {
							mysql_query ("insert into ".$mysql_table_prefix."links (site_id, url, title, description, fulltxt, indexdate, size, md5sum, level) values ('$site_id', '$url', '$title', '$desc', '$fulltxt', curdate(), '$pageSize', '$newmd5sum', $thislevel)");
							echo mysql_error();
							$result = mysql_query("select link_id from ".$mysql_table_prefix."links where url='$url'");
							echo mysql_error();
							$row = mysql_fetch_row($result);
							$link_id = $row[0];
							save_keywords($wordarray, $link_id);
							if ($command_line == 0) {
								$report = " <font color=\"green\">Indexed</font><br>\n";
							} else {
								$report = " Indexed\n";
							}
						}else if (($md5sum <> '') && ($md5sum <> $newmd5sum)) { //if page has changed, start updating

							$result = mysql_query("select link_id from ".$mysql_table_prefix."links where url='$url'");
							echo mysql_error();
							$row = mysql_fetch_row($result);
							$link_id = $row[0];
							mysql_query ("delete from ".$mysql_table_prefix."link_keyword where link_id=$link_id");
							echo mysql_error();
							save_keywords($wordarray, $link_id);

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -