📄 spiderfuncs.php

📁 开源的蜘蛛程序
💻 PHP
📖 第 1 页 / 共 2 页
字号:
上一页 12

				$chunk = str_replace($regs[0], "", $chunk);
			}
		}

	}

	return $links;
}

/*
Builds the list of distinct indexable words from the words of a webpage, together with
the number of times each word occurs.

$arr is the flat list of words found on the page (duplicates allowed, any order).

Returns a 0-based list with one entry per kept word, where entry[1] is the word and
entry[2] is its occurrence count. The count is never reported above $word_upper_bound
and never below 1.

A word is kept when it is at least $min_word_length bytes long, contains a letter (or a
digit when $index_numbers is 1) once remove_accents() has been applied, and is not
flagged with 1 in $common. One leading and one trailing '-' or \' (a slash-escaped
apostrophe) is stripped from the word that is stored; counting is done on the
unstripped form.
*/
function unique_array($arr) {
	global $min_word_length;
	global $common;
	global $word_upper_bound;
	global $index_numbers;

	//string ordering puts byte-identical words next to each other; the default flags
	//compare numeric-looking words by value, which would treat "007" and "7" as equal
	sort($arr, SORT_STRING);

	if ($index_numbers == 1) {
		$pattern = '/[a-z0-9]/i';
	} else {
		$pattern = '/[a-z]/i';
	}

	$newarr = array ();
	$total = count($arr);
	$pos = 0;

	while ($pos < $total) {
		$element = $arr[$pos];

		//length of the run of identical words starting at $pos
		$run = 1;
		while ($pos + $run < $total && $arr[$pos + $run] === $element) {
			$run ++;
		}
		$pos += $run;

		//check if word is long enough, contains alphabetic characters and is not a common word
		if (strlen($element) < $min_word_length) {
			continue;
		}
		if (!preg_match($pattern, remove_accents($element))) {
			continue;
		}
		if (isset ($common[$element]) && $common[$element] == 1) {
			continue;
		}

		//cap the count at the upper bound, but always report at least one occurrence
		$counter = max(1, min($run, (int) $word_upper_bound));

		//drop a single leading dash or escaped apostrophe
		if (substr($element, 0, 1) === '-') {
			$element = (string) substr($element, 1);
		} else if (substr($element, 0, 2) === "\\'") {
			$element = (string) substr($element, 2);
		}

		//drop a single trailing dash or escaped apostrophe
		if (substr($element, -1) === '-') {
			$element = (string) substr($element, 0, -1);
		} else if (substr($element, -2) === "\\'") {
			$element = (string) substr($element, 0, -2);
		}

		$newarr[] = array (1 => $element, 2 => $counter);
	}

	return $newarr;
}

/*
Checks if url is legal, relative to the main url, and returns it in absolute form.

$url is the link as found in the page, $parent_url the url of the page it was found on,
and $can_leave_domain is 1 when links outside the starting url may be followed.

Returns the absolute url, or '' when the link must not be followed: its extension is
listed in $ext, it is a mailto:/javascript:/news: link, its scheme is not http or https,
or (unless $can_leave_domain is 1) it does not contain the starting url.

Side effect: when $can_leave_domain is not 1, the global $mainurl is overwritten with
remove_file_from_url($mainurl).
*/
function url_purify($url, $parent_url, $can_leave_domain) {
	global $ext;
	global $mainurl;

	$url = convert_url($url);

	//links to files with an excluded extension are dropped; the extension is matched literally
	if (is_array($ext)) {
		foreach ($ext as $excl) {
			if (preg_match('/\.'.preg_quote($excl, '/').'$/iD', $url)) {
				return '';
			}
		}
	}

	if (preg_match('~[/]?mailto:|[/]?javascript:|[/]?news:~i', $url)) {
		return '';
	}

	$urlparts = parse_url($url);
	$scheme = isset ($urlparts['scheme']) ? $urlparts['scheme'] : '';

	//only http and https links are followed
	if (!($scheme == 'http' || $scheme == '' || $scheme == 'https')) {
		return '';
	}

	//parent url might be used to build an url from relative path
	$parent_url = remove_file_from_url($parent_url);
	$parent_url_parts = parse_url($parent_url);

	if (substr($url, 0, 1) == '/') {
		//path relative to the server root: reuse scheme, host and port of the parent
		$parent_scheme = isset ($parent_url_parts['scheme']) ? $parent_url_parts['scheme'] : '';
		$parent_host = isset ($parent_url_parts['host']) ? $parent_url_parts['host'] : '';
		$parent_port = isset ($parent_url_parts['port']) ? ":".$parent_url_parts['port'] : '';
		$url = $parent_scheme."://".$parent_host.$parent_port.$url;
	} else
		if ($scheme == '') {
			$url = $parent_url.$url;
		}

	$url_parts = parse_url($url);

	$urlpath = isset ($url_parts['path']) ? $url_parts['path'] : '';

	//collapse "dir/../" pairs until none are left
	$regs = array ();
	while (preg_match('~[^/]*/[.]{2}/~', $urlpath, $regs)) {
		$urlpath = str_replace($regs[0], "", $urlpath);
	}

	//remove relative path instructions like ../ etc
	$urlpath = preg_replace('~/+~', '/', $urlpath);
	$urlpath = preg_replace('~^[.]/~', '', $urlpath);
	$urlpath = preg_replace('~[^/]*/[.]{2}/~', '', $urlpath);
	$urlpath = preg_replace('~^[.]/~', '', $urlpath);
	$urlpath = str_replace("./", "", $urlpath);

	$query = "";
	if (isset ($url_parts['query']) && $url_parts['query'] != "") {
		$query = "?".$url_parts['query'];
	}

	$final_scheme = isset ($url_parts['scheme']) ? $url_parts['scheme'] : '';
	$final_host = isset ($url_parts['host']) ? $url_parts['host'] : '';

	//keep an explicit port, except when it is the default one for the scheme
	$port = "";
	if (isset ($url_parts['port'])) {
		$default_port = ($final_scheme == 'https') ? 443 : 80;
		if ($url_parts['port'] != $default_port) {
			$port = ":".$url_parts['port'];
		}
	}

	$url = $final_scheme."://".$final_host.$port.$urlpath.$query;

	//if we index sub-domains
	if ($can_leave_domain == 1) {
		return $url;
	}

	$mainurl = remove_file_from_url($mainurl);

	//only urls staying in the starting domain/directory are followed
	if (strpos($url, $mainurl) === false) {
		return '';
	}
	return $url;
}

/*
Stores the words of a page and ties them to the page's link.

$wordarray is a list of entries where entry[1] is the word and entry[2] its weight;
$link_id identifies the link the words belong to.

For every entry the keyword_ID of the word is looked up in the keywords table, the word
is inserted first when no row exists, and a (link_id, keyword_id, weight) row is then
added to the link_keyword table. The text returned by mysql_error() is echoed after
each step.

NOTE(review): all values are placed into the SQL text as they are - presumably the
caller has already escaped them; confirm before passing anything unescaped.
*/
function save_keywords($wordarray, $link_id) {
	global $mysql_table_prefix;

	$keywords_table = $mysql_table_prefix."keywords";
	$link_keyword_table = $mysql_table_prefix."link_keyword";

	foreach ($wordarray as $entry) {
		$word = $entry[1];
		$weight = $entry[2];

		$lookup = "select keyword_ID from ".$keywords_table." where keyword='$word'";

		$result = mysql_query($lookup);
		echo mysql_error();

		//an unknown word is inserted first and then looked up again to learn its id
		$is_new_word = (mysql_numrows($result) == 0);
		if ($is_new_word) {
			mysql_query("insert into ".$keywords_table." (keyword) values ('$word')");
			echo mysql_error();
			$result = mysql_query($lookup);
			echo mysql_error();
		}

		$row = mysql_fetch_row($result);
		$keyword_id = $row[0];
		if ($is_new_word) {
			echo mysql_error();
		}

		mysql_query("insert into ".$link_keyword_table." (link_id, keyword_id, weight) values ('$link_id', '$keyword_id', '$weight')");
		echo mysql_error();
	}
}

/*
Reads the robots, description and keywords meta tags from the head section of a page.

$file is the raw html of the page.

Always returns an array with four keys:
  'description' - content of the description meta tag, passed through addslashes() ('' if absent)
  'keywords'    - content of the keywords meta tag with runs of commas/spaces turned into
                  single spaces, passed through addslashes() ('' if absent)
  'nofollow'    - 1 when the robots meta tag lists nofollow, otherwise 0
  'noindex'     - 1 when the robots meta tag lists noindex, otherwise 0

Only the text from "<head" up to (not including) "</head>" is searched. When "</head>"
is present but "<head" is not, the search starts at the beginning of the file.
*/
function get_head_data($file) {
	$data = array (
		'description' => "",
		'keywords' => "",
		'nofollow' => 0,
		'noindex' => 0
	);

	$lowered = strtolower($file);
	$first = strpos($lowered, "<head");
	$next = strpos($lowered, "</head>");
	if ($next === false) {
		return $data;
	}

	$start = ($first === false) ? 0 : $first;
	if ($next <= $start) {
		return $data;
	}

	//the third argument of substr() is a length, not an end position
	$headdata = substr($file, $start, $next - $start);

	//a tag that is not found keeps its empty default
	$meta = array ('robots' => "", 'description' => "", 'keywords' => "");
	foreach (array_keys($meta) as $name) {
		$res = array ();
		if (preg_match("/<meta +name *=[\"']?".$name."[\"']? *content=[\"']?([^<>'\"]+)[\"']?/i", $headdata, $res)) {
			$meta[$name] = $res[1];
		}
	}

	$keywords = preg_replace("/[, ]+/", " ", $meta['keywords']);

	foreach (explode(",", strtolower($meta['robots'])) as $directive) {
		$directive = trim($directive);
		if ($directive == "noindex") {
			$data['noindex'] = 1;
		}
		if ($directive == "nofollow") {
			$data['nofollow'] = 1;
		}
	}

	$data['description'] = addslashes($meta['description']);
	$data['keywords'] = addslashes($keywords);

	return $data;
}

function clean_file($file, $url, $type) {

	global $entities, $index_host, $index_meta_keywords;

	$urlparts = parse_url($url);
	$host = $urlparts['host'];
	//remove filename from path
	$path = eregi_replace('([^/]+)$', "", $urlparts['path']);

	$file = eregi_replace("<link rel[^<>]*>", " ", $file);

	$first = strpos(strtolower($file), "<!--sphider_noindex-->");
	$count = 0;
	while (!($first === FALSE) && $count < 20) {
		$count ++;
		$next = strpos(strtolower($file), "<!--/sphider_noindex-->");
		$file = str_replace(substr($file, $first, $next - $first +23), " ", $file);
		$first = strpos(strtolower($file), "<!--sphider_noindex-->");
	}
	
	$first = strpos(strtolower($file), "<!--");
	$count = 0;
	while(!($first === FALSE) && $count < 20) {
		$count ++;
		$next = strpos(strtolower($file), "-->");
		$file = str_replace(substr($file, $first, $next - $first +3), " ", $file);
		$first = strpos(strtolower($file), "<!--");
	}

	

	$first = strpos(strtolower($file), "<script");
	$count = 0;
	while (!($first === FALSE) && $count < 20) {
		$count ++;
		$next = strpos(strtolower($file), "</script");
		$file = str_replace(substr($file, $first, $next - $first +9), " ", $file);
		$first = strpos(strtolower($file), "<script");
	}

	$headdata = get_head_data($file);

	$regs = Array ();
	if (eregi("<title *>([^<>]*)</title *>", $file, $regs)) {
		$title = $regs[1];
		$file = str_replace($regs[0], "", $file);
	} else if ($type == 'pdf' || $type == 'doc') { //the title of a non-html file is its first few words
		$title = substr($file, 0, strrpos(substr($file, 0, 40), " "));
	}

	$file = eregi_replace("(<style[^>]*>[^<>]*</style>)", " ", $file);

	//create spaces between tags, so that remove tags doesnt concatenate strings
	$file = preg_replace("/<[\w ]+>/", "\\0 ", $file);

	$file = preg_replace("/<\/[\w ]+>/", "\\0 ", $file);

	$file = preg_replace("/\s+/", " ", $file);

	$file = strip_tags($file);

	$file = preg_replace("/&nbsp;/", " ", $file);

	$file = preg_replace("/ +/", " ", $file);

	$fulltext = $file;
	$file .= " ".$title;
	if ($index_host == 1) {
		$file = $file." ".$host." ".$path;
	}
	if ($index_meta_keywords == 1) {
		$file = $file." ".$headdata['keywords'];
	}
	reset($entities);
	while ($char = each($entities)) {
		$file = eregi_replace($char[0], $char[1], $file);
	}

	//replace codes with ascii chars
	$file = ereg_replace('&#([0-9]+);', chr('\1'), $file);
	$file = strtolower($file);

	$file = eregi_replace("&[a-z]{1,6};", " ", $file);

	$file = preg_replace("/[\*\^\+\?\\\.\[\]\^\$\|\{\)\(\}~!\"\/@#
上一页 12
💿 文件大小 41 K
👤 上传用户 cal04
📂 所属分类 *行业应用
🏷️ 相关标签

#开源 #程序
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -