📄 spiderfuncs.php
字号:
$chunk = str_replace($regs[0], "", $chunk);
}
}
}
return $links;
}
/*
Function to build a unique word array from the text of a webpage, together with the count of each word.

Parameters:
  $arr - list of words (strings). Duplicates are expected; they are what gets counted.

Globals read:
  $min_word_length  - words shorter than this (in bytes) are dropped
  $common           - map word => 1 of words that must not be indexed
  $word_upper_bound - the per-word count stops growing once it reaches this value
  $index_numbers    - when 1, a word made only of digits is accepted as well

Returns a zero-based list with one entry per distinct accepted word:
  entry[1] is the word, entry[2] is its (capped) number of occurrences.
*/
function unique_array($arr) {
    global $min_word_length;
    global $common;
    global $word_upper_bound;
    global $index_numbers;

    // Plain string ordering guarantees that identical words end up next to each other.
    // The default sort flags compare numeric strings as numbers, which is not a
    // consistent ordering once digits are indexed ("10" < "1a" < "9" < "10") and could
    // leave equal words apart, so that the same word was reported more than once.
    sort($arr, SORT_STRING);

    // A word only has to CONTAIN one of these characters (the pattern is not anchored).
    if ($index_numbers == 1) {
        $pattern = "/[a-z0-9]+/i";
    } else {
        $pattern = "/[a-z]+/i";
    }

    $newarr = array();
    $regs = array();
    $counter = 1;
    $total = count($arr);

    for ($n = 0; $n < $total; $n++) {
        $element = $arr[$n];

        //check if word is long enough, contains alphabetic characters and is not a common word
        if (strlen($element) < $min_word_length) {
            continue;
        }
        if (!preg_match($pattern, remove_accents($element))) {
            continue;
        }
        if (isset($common[$element]) && $common[$element] == 1) {
            continue;
        }

        //to eliminate/count multiple instance of words
        // Strict comparison: "1e3" and "1000" are different words, and a trailing "0"
        // must not be confused with "no next element".
        if ($n + 1 < $total && $arr[$n + 1] === $element) {
            if ($counter < $word_upper_bound) {
                $counter++;
            }
            continue;
        }

        // Last occurrence of this word: trim one leading and one trailing "-" or
        // backslash-escaped apostrophe (\') before storing it.
        // NOTE(review): the \' form suggests the words were passed through addslashes()
        // by the caller - confirm.
        if (preg_match("/^(-|\\\\')(.*)/s", $element, $regs)) {
            $element = $regs[2];
        }
        if (preg_match("/(.*)(\\\\'|-)$/sD", $element, $regs)) {
            $element = $regs[1];
        }

        $newarr[] = array(1 => $element, 2 => $counter);
        $counter = 1;
    }

    return $newarr;
}
/*
Checks if url is legal, relative to the main url.

Parameters:
  $url              - link as found in a page (absolute, root-relative or relative)
  $parent_url       - url of the page the link was found on; used to resolve relative links
  $can_leave_domain - when 1, the link is returned without checking it against $mainurl

Globals:
  $ext     - list of file extensions that must not be followed; each entry is used as a
             regular-expression fragment matched case-insensitively at the end of the url
  $mainurl - starting url of the crawl; it is overwritten with
             remove_file_from_url($mainurl) when the domain check runs

Returns the absolute, normalised url, or '' when the link must not be followed.

convert_url() and remove_file_from_url() are defined elsewhere in the project.
*/
function url_purify($url, $parent_url, $can_leave_domain) {
    global $ext;
    global $mainurl;

    $url = convert_url($url);

    // Links pointing at an excluded file type are dropped.
    if (is_array($ext)) {
        foreach ($ext as $excl) {
            // "~" is the delimiter, so escape it in case an entry contains one.
            $suffix_pattern = '~\.'.str_replace('~', '\~', $excl).'$~iD';
            if (preg_match($suffix_pattern, $url)) {
                return '';
            }
        }
    }

    if (preg_match('~[/]?mailto:|[/]?javascript:|[/]?news:~i', $url)) {
        return '';
    }

    // parse_url() returns false for badly malformed input and omits absent components.
    $urlparts = parse_url($url);
    $scheme = (is_array($urlparts) && isset($urlparts['scheme'])) ? $urlparts['scheme'] : '';

    //only http and https links are followed
    if (!($scheme == 'http' || $scheme == '' || $scheme == 'https')) {
        return '';
    }

    //parent url might be used to build an url from relative path
    $parent_url = remove_file_from_url($parent_url);
    $parent_url_parts = parse_url($parent_url);
    if (substr($url, 0, 1) == '/') {
        $url = $parent_url_parts['scheme']."://".$parent_url_parts['host'].$url;
    } else if ($scheme == '') {
        $url = $parent_url.$url;
    }

    $url_parts = parse_url($url);
    if (!is_array($url_parts) || !isset($url_parts['scheme']) || !isset($url_parts['host'])) {
        // Nothing usable can be rebuilt from an url without scheme or host.
        return '';
    }
    $urlpath = isset($url_parts['path']) ? $url_parts['path'] : '';

    // Collapse "dir/../" pairs until none is left.
    $regs = array();
    while (preg_match('~[^/]*/[.]{2}/~', $urlpath, $regs)) {
        $urlpath = str_replace($regs[0], "", $urlpath);
    }

    //remove relative path instructions like ../ etc
    $urlpath = preg_replace('~/+~', '/', $urlpath);
    $urlpath = preg_replace('~^[.]/~', '', $urlpath);
    $urlpath = preg_replace('~[^/]*/[.]{2}/~', '', $urlpath);
    $urlpath = preg_replace('~^[.]/~', '', $urlpath);
    $urlpath = str_replace("./", "", $urlpath);

    // A "../" that climbs above the root ("/../x") is removed together with its leading
    // slash by the steps above; restore it so the path is not glued onto the host name.
    if ($urlpath != '' && substr($urlpath, 0, 1) != '/') {
        $urlpath = '/'.$urlpath;
    }

    $query = "";
    if (isset($url_parts['query']) && $url_parts['query'] != "") {
        $query = "?".$url_parts['query'];
    }

    // NOTE(review): only scheme, host, path and query are carried over; port, user info
    // and fragment of the original url are not part of the result.
    $url = $url_parts['scheme']."://".$url_parts['host'].$urlpath.$query;

    //if we index sub-domains
    if ($can_leave_domain == 1) {
        return $url;
    }

    $mainurl = remove_file_from_url($mainurl);

    //only urls in staying in the starting domain/directory are followed
    // NOTE(review): this accepts $mainurl anywhere inside $url, not only as its prefix.
    if (strpos($url, $mainurl) === false) {
        return '';
    }
    return $url;
}
/*
Stores the words of one page and links them to that page.

For every entry of $wordarray the keyword is looked up in the "keywords" table (and
inserted first when it is not there yet); then a row (link_id, keyword_id, weight) is
inserted into the "link_keyword" table. Both table names are prefixed with the global
$mysql_table_prefix. MySQL error messages are echoed.

Parameters:
  $wordarray - list of entries where entry[1] is the word and entry[2] its weight
  $link_id   - id of the page the words belong to

NOTE(review): the statements are built by string interpolation. The values are used
as given, so they must already be escaped for SQL by the caller - confirm; escaping
them again here could double-escape words that were already escaped upstream.
*/
function save_keywords($wordarray, $link_id) {
    global $mysql_table_prefix;

    $keywords_table = $mysql_table_prefix."keywords";
    $link_keyword_table = $mysql_table_prefix."link_keyword";

    foreach ($wordarray as $entry) {
        $word = $entry[1];
        $weight = $entry[2];
        $lookup = "select keyword_ID from ".$keywords_table." where keyword='$word'";

        $result = mysql_query($lookup);
        echo mysql_error();

        if ($result === false || mysql_num_rows($result) == 0) {
            // Keyword not stored yet: insert it, then read back the id it was given.
            mysql_query("insert into ".$keywords_table." (keyword) values ('$word')");
            echo mysql_error();
            $result = mysql_query($lookup);
            echo mysql_error();
        }

        $row = ($result === false) ? false : mysql_fetch_row($result);
        if (!$row) {
            // No keyword id could be resolved (the error was echoed above); skip this
            // word instead of linking the page to an empty keyword id.
            continue;
        }
        $keyword_id = $row[0];

        mysql_query("insert into ".$link_keyword_table." (link_id, keyword_id, weight) values ('$link_id', '$keyword_id', '$weight')");
        echo mysql_error();
    }
}
/*
Extracts the description, keywords and robots meta tags from the <head> section of
an HTML document.

Parameters:
  $file - the HTML source of the page

Returns an array with these keys (always present):
  'description' - content of the description meta tag, passed through addslashes(); '' if absent
  'keywords'    - content of the keywords meta tag with runs of commas/spaces collapsed
                  to a single space, passed through addslashes(); '' if absent
  'nofollow'    - 1 when the robots meta tag lists "nofollow", otherwise 0
  'noindex'     - 1 when the robots meta tag lists "noindex", otherwise 0

Only tags written as <meta name=... content=...> (name first) are recognised, and only
inside the text between "<head" and "</head>".
*/
function get_head_data($file) {
    $data = array(
        'description' => '',
        'keywords' => '',
        'nofollow' => 0,
        'noindex' => 0,
    );

    $first = stripos($file, "<head");
    $next = stripos($file, "</head>");
    if ($first === false || $next === false || $next <= $first) {
        return $data;
    }

    // The third argument of substr() is a length, not an end offset.
    $headdata = substr($file, $first, $next - $first);

    $meta = array('robots' => '', 'description' => '', 'keywords' => '');
    $res = array();
    foreach (array_keys($meta) as $name) {
        $pattern = "/<meta +name *=[\"']?".$name."[\"']? *content=[\"']?([^<>'\"]+)[\"']?/i";
        if (preg_match($pattern, $headdata, $res)) {
            $meta[$name] = $res[1];
        }
    }

    $keywords = preg_replace("/[, ]+/", " ", $meta['keywords']);

    foreach (explode(",", strtolower($meta['robots'])) as $directive) {
        $directive = trim($directive);
        if ($directive == "noindex") {
            $data['noindex'] = 1;
        }
        if ($directive == "nofollow") {
            $data['nofollow'] = 1;
        }
    }

    $data['description'] = addslashes($meta['description']);
    $data['keywords'] = addslashes($keywords);

    return $data;
}
function clean_file($file, $url, $type) {
global $entities, $index_host, $index_meta_keywords;
$urlparts = parse_url($url);
$host = $urlparts['host'];
//remove filename from path
$path = eregi_replace('([^/]+)$', "", $urlparts['path']);
$file = eregi_replace("<link rel[^<>]*>", " ", $file);
$first = strpos(strtolower($file), "<!--sphider_noindex-->");
$count = 0;
while (!($first === FALSE) && $count < 20) {
$count ++;
$next = strpos(strtolower($file), "<!--/sphider_noindex-->");
$file = str_replace(substr($file, $first, $next - $first +23), " ", $file);
$first = strpos(strtolower($file), "<!--sphider_noindex-->");
}
$first = strpos(strtolower($file), "<!--");
$count = 0;
while(!($first === FALSE) && $count < 20) {
$count ++;
$next = strpos(strtolower($file), "-->");
$file = str_replace(substr($file, $first, $next - $first +3), " ", $file);
$first = strpos(strtolower($file), "<!--");
}
$first = strpos(strtolower($file), "<script");
$count = 0;
while (!($first === FALSE) && $count < 20) {
$count ++;
$next = strpos(strtolower($file), "</script");
$file = str_replace(substr($file, $first, $next - $first +9), " ", $file);
$first = strpos(strtolower($file), "<script");
}
$headdata = get_head_data($file);
$regs = Array ();
if (eregi("<title *>([^<>]*)</title *>", $file, $regs)) {
$title = $regs[1];
$file = str_replace($regs[0], "", $file);
} else if ($type == 'pdf' || $type == 'doc') { //the title of a non-html file is its first few words
$title = substr($file, 0, strrpos(substr($file, 0, 40), " "));
}
$file = eregi_replace("(<style[^>]*>[^<>]*</style>)", " ", $file);
//create spaces between tags, so that remove tags doesnt concatenate strings
$file = preg_replace("/<[\w ]+>/", "\\0 ", $file);
$file = preg_replace("/<\/[\w ]+>/", "\\0 ", $file);
$file = preg_replace("/\s+/", " ", $file);
$file = strip_tags($file);
$file = preg_replace("/ /", " ", $file);
$file = preg_replace("/ +/", " ", $file);
$fulltext = $file;
$file .= " ".$title;
if ($index_host == 1) {
$file = $file." ".$host." ".$path;
}
if ($index_meta_keywords == 1) {
$file = $file." ".$headdata['keywords'];
}
reset($entities);
while ($char = each($entities)) {
$file = eregi_replace($char[0], $char[1], $file);
}
//replace codes with ascii chars
$file = ereg_replace('&#([0-9]+);', chr('\1'), $file);
$file = strtolower($file);
$file = eregi_replace("&[a-z]{1,6};", " ", $file);
$file = preg_replace("/[\*\^\+\?\\\.\[\]\^\$\|\{\)\(\}~!\"\/@#
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -