⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 search.rediris.in

📁 harvest是一个下载html网页的机器人
💻 IN
📖 第 1 页 / 共 3 页
字号:
#----------------------------------------------------------------------# JMM - 20010529# We consider some metadata in Dublin Core format the most important#---------------------------------------------------------------------,		$rank = $rank + 10000 - length($line)		   if (index($line,"dc.subject#") > 0);		$rank = $rank + 5000 - length($line)	      	   if (index($line,"dc.title#") > 0);		$rank = $rank + 1000 - length($line)		   if (index($line,"dc.description#") > 0);		$rank = $rank + 500 - length($line)	      	   if (index($line,"dc.creator#") > 0);		$rank = $rank + 100 - length($line)	      	   if (index($line,"dc.publisher#") > 0);		$rank = $rank + 100 - length($line)		   if (index($line,"dc.language#") > 0);#---------------------------------------------------------------------'		#title is most important		$rank += 600 - length($line)*3		    if (index($line,"title#") > 0 && length($line) < 200);		#url is also important		$rank += 150 - length($line)/1.5		    if (index($line,"url#") > 0 && length($line) < 200);		#there are some very long headings...		$rank += 110 - length($line)/2		    if (index($line,"headings#") > 0 && length($line)<200);		# used for PDF, PS etc.		$rank += 80 - length($line)/3		    if (index($line,"description#") > 0 && length($line)<200);		$rank += 60 if (index($line,"subject#") > 0);		$rank += 30 if (index($line,"images#") > 0);		# these are sometimes too long.		$rank += 25 - length($line)/8		    if (index($line,"url-references#") > 0 );		# somehow redundant, not as important as it may seem...		$rank += 20 if (index($line,"keywords#") > 0);		$rank += 10 if (index($line,"body#") > 0);		# Consider length of URL of every Page		$rank += 400 - length($line)*2		    if (index($line,"120 - ") == 0 && length($line) < 200);	    }	    $lastline = $line;	}	$ratings{$objnum} = $rank;	$object_index[$objnum] = $objnum;	$objnum++;    }#   pop off the "126" and "103" results (I hope they're at the end!)    
# (continuation of the ranking routine whose beginning lies above this span;
#  statements are unchanged, only the lost line breaks are restored)
    pop(@object_index);
    pop(@object_index);
    @sorted_index= sort byrate @object_index;
    $objnum=0;
#----------------------------------------------------------------------
# JMM - 20010605
# The first element has the higher weight. We need it to calculate
# the number of balls to show with an resource.
#---------------------------------------------------------------------,
    $rGreaterWeight = $ratings{$sorted_index[0]};
#---------------------------------------------------------------------'
    foreach $index (@sorted_index) {
        $OBJ[$objnum] = $objects[$index];
#----------------------------------------------------------------------
# JMM - 20010529
# We save the weight of each object.
#---------------------------------------------------------------------,
        $raWeight[$objnum] = $ratings{$index};
#---------------------------------------------------------------------'
        $objnum++;
    }
}

# Sort comparator: orders keys of %ratings by descending rating.
# Used as "sort byrate LIST"; reads the package variables $a and $b.
sub byrate {
    return $ratings{$b} <=> $ratings{$a};
}

# Build the entity-name => code-point table used by entities().
#
# The table was originally produced from the HTML entity declarations with
#   perl -ne 'if (/^ *<.ENTITY *([\S]*) *CDATA *"&#([\d]+);".*>/)
#              { if ($2 <0x100) {printf "s/&$1;/\\x%x/g;\n", $2;}
#                else {printf "s/&$1;/\\\\#$2/g;\n";}}'
# and is kept here as data instead of one substitution per entity.
#
# Returns: a flat (name, code point, ...) list suitable for a hash.
sub _entity_code_points {
    my %code_point_of;

    # ISO Latin 1: these names map, in order, onto 0xA0 .. 0xFF.
    my $code = 0xA0;
    foreach my $name (qw(
        nbsp   iexcl  cent   pound  curren yen    brvbar sect
        uml    copy   ordf   laquo  not    shy    reg    macr
        deg    plusmn sup2   sup3   acute  micro  para   middot
        cedil  sup1   ordm   raquo  frac14 frac12 frac34 iquest
        Agrave Aacute Acirc  Atilde Auml   Aring  AElig  Ccedil
        Egrave Eacute Ecirc  Euml   Igrave Iacute Icirc  Iuml
        ETH    Ntilde Ograve Oacute Ocirc  Otilde Ouml   times
        Oslash Ugrave Uacute Ucirc  Uuml   Yacute THORN  szlig
        agrave aacute acirc  atilde auml   aring  aelig  ccedil
        egrave eacute ecirc  euml   igrave iacute icirc  iuml
        eth    ntilde ograve oacute ocirc  otilde ouml   divide
        oslash ugrave uacute ucirc  uuml   yacute thorn  yuml
    )) {
        $code_point_of{$name} = $code++;
    }

    # Cougar HTMLsym and HTMLmisc: name / decimal code point pairs.
    # qw() is used so that names such as "and", "or", "sub", "ne", "lt"
    # are never parsed as Perl keywords.
    %code_point_of = (%code_point_of, qw(
        fnof     402
        Alpha    913   Beta     914   Gamma    915   Delta    916
        Epsilon  917   Zeta     918   Eta      919   Theta    920
        Iota     921   Kappa    922   Lambda   923   Mu       924
        Nu       925   Xi       926   Omicron  927   Pi       928
        Rho      929   Sigma    931   Tau      932   Upsilon  933
        Phi      934   Chi      935   Psi      936   Omega    937
        alpha    945   beta     946   gamma    947   delta    948
        epsilon  949   zeta     950   eta      951   theta    952
        iota     953   kappa    954   lambda   955   mu       956
        nu       957   xi       958   omicron  959   pi       960
        rho      961   sigmaf   962   sigma    963   tau      964
        upsilon  965   phi      966   chi      967   psi      968
        omega    969   thetasym 977   upsih    978   piv      982
        bull     8226  hellip   8230  prime    8242  Prime    8243
        oline    8254  frasl    8260  weierp   8472  image    8465
        real     8476  trade    8482  alefsym  8501
        larr     8592  uarr     8593  rarr     8594  darr     8595
        harr     8596  crarr    8629  lArr     8656  uArr     8657
        rArr     8658  dArr     8659  hArr     8660
        forall   8704  part     8706  exist    8707  empty    8709
        nabla    8711  isin     8712  notin    8713  ni       8715
        prod     8719  sum      8721  minus    8722  lowast   8727
        radic    8730  prop     8733  infin    8734  ang      8736
        and      8743  or       8744  cap      8745  cup      8746
        int      8747  there4   8756  sim      8764  cong     8773
        asymp    8776  ne       8800  equiv    8801  le       8804
        ge       8805  sub      8834  sup      8835  nsub     8836
        sube     8838  supe     8839  oplus    8853  otimes   8855
        perp     8869  sdot     8901  lceil    8968  rceil    8969
        lfloor   8970  rfloor   8971  lang     9001  rang     9002
        loz      9674  spades   9824  clubs    9827  hearts   9829
        diams    9830

        quot     34    amp      38    lt       60    gt       62
        OElig    338   oelig    339   Scaron   352   scaron   353
        Yuml     376   circ     710   tilde    732
        ensp     8194  emsp     8195  thinsp   8201  zwnj     8204
        zwj      8205  lrm      8206  rlm      8207  ndash    8211
        mdash    8212  lsquo    8216  rsquo    8217  sbquo    8218
        ldquo    8220  rdquo    8221  bdquo    8222  dagger   8224
        Dagger   8225  permil   8240  lsaquo   8249  rsaquo   8250
    ));

    return %code_point_of;
}

# translate SGML entities
#
# Argument: the text to decode.
# Returns:  a copy of the text in which every known named entity "&name;"
#           is replaced: code points below 0x100 become that single
#           character, larger ones become the literal text "\#NNN"
#           (backslash, '#', decimal code point).  Unknown entities and
#           numeric references are left untouched; undef is returned as is.
#
# Differences from the previous one-substitution-per-entity version:
#   * the text is decoded in a single pass, so "&amp;lt;" yields "&lt;"
#     rather than being decoded a second time into "<";
#   * "&and;" / "&or;" use 8743 / 8744 (they were 8869 / 8870, the former
#     colliding with "&perp;");
#   * the global $_ is no longer overwritten.
sub entities {
    my $text = $_[0];
    return $text unless defined $text;

    # Built on first use: a file-level initialisation placed after the
    # main program might not have run yet when this sub is first called.
    %ENTITY_CODE_POINT = &_entity_code_points() unless %ENTITY_CODE_POINT;

    $text =~ s{&([A-Za-z][A-Za-z0-9]*);}{
        my $cp = $ENTITY_CODE_POINT{$1};
        !defined $cp ? "&$1;"
      : $cp < 0x100  ? chr($cp)
      :                "\\#$cp";
    }ge;

    return $text;
}

# David Hoekman  Oct 1, 1997
# Split search results into segments, 'n' results at a time.  The value of
# 'n' is set by the $perpage variable, which is defined via the 'perpageflag'
# query option.  Each segment is written to a separate temporary file; the
# location of these files must be configured locally.  This routine also
# removes old temporary files, when they are older than a certain age.
#
# Argument: $more - true when further results follow the current page.
# Globals read:    $tmp_dir, $expire, %CFG, @first_page
# Globals written: $current_page, @first_page (emptied), the TEMP handle,
#                  and the currently selected output handle (set to TEMP
#                  when a further page is started).
#
# NOTE(review): $more, $previous_page, $next_page and $tmp_prefix are
# deliberately declared with local(), not my(): &expand and the eval'ed
# $CFG{'CreateNavBars'} text presumably read them through dynamic scope
# -- confirm before changing.
sub N_at_a_time {
    local($more) = @_;
    local($previous_page, $next_page);
    local($tmp_prefix) = "$^T-${$}";    # prefix for temporary files

    # if this is the last page of results, remove old temporary files
    # (if $expire=3, then 3/24 sets a 3 hour expiration cycle)
    unless ($more) {
        # Only purge when we really are inside $tmp_dir: an unchecked
        # chdir (or an empty $tmp_dir) would make the loop below delete
        # matching files from whatever directory happens to be current.
        if (defined $tmp_dir && $tmp_dir ne '' && chdir $tmp_dir) {
            foreach my $file (<[1-9]*-[1-9]*-[1-9]*\.html>) {
                unlink $file if -M $file > $expire/24;
            }
        }
    }

    # write stored 'first' page (duplicates data sent to STDOUT/browser)
    if (! $current_page) {
        $current_page = 1;
        open(TEMP, '>', "$tmp_dir/$tmp_prefix-1.html") ||
            &fatal("<b>Error: cannot write temporary files at '$tmp_dir'!</b>");
        print TEMP "<html>\n";
        print TEMP &expand ($CFG{'ResultHeader'});
        print TEMP @first_page;
        print TEMP &expand ($CFG{'ResultSetEnd'});
        # JMM - 20010531: the "Total", "Next" and "New Search" links that
        # used to be printed between the two lines below were disabled
        # (as was the print of $rNavigationBar).
        print TEMP "<p>\n<center>";
        print TEMP "<p>\n</center>";
        # JMM - 20010531: $CFG{'ResultTrailer'} finishes the page instead
        # of a hard-coded "</body></html>".
        print TEMP &expand ($CFG{'ResultTrailer'});
        undef @first_page;
    }

    # finish off current page (goes to the currently selected handle)
    print &expand ($CFG{'ResultSetEnd'}) if $more;
    $previous_page = $current_page - 1;
    $next_page     = $current_page + 1;

    # JMM - 20010531: the total and the Previous/Next/New Search links are
    # no longer printed here; $CFG{'ResultTrailer'} finishes the page.
    print &expand ($CFG{'ResultTrailer'});

    return unless $more; # stop here if no more results!

    # increment page counter, open next results page
    $current_page++;
    close(TEMP);
    open(TEMP, '>', "$tmp_dir/$tmp_prefix-$current_page.html") ||
        &fatal("<b>Error: cannot write temporary files at '$tmp_dir'!</b>");
    select(TEMP);

    # print beginning of next page
    print &expand ($CFG{'ResultHeader'});
    # NOTE(review): string eval of configuration text; any error left in
    # $@ is ignored here.  Only safe while %CFG comes from a trusted file.
    eval ($CFG{'CreateNavBars'});
    print &expand ($CFG{'ResultSetBegin'});
}

# Round a number up to the nearest integer.
#
# Argument: $x - a number.
# Returns:  the smallest integer that is not less than $x.
#
# int() truncates towards zero, so one is added only when $x lies above
# its truncated value; this also gives the right answer for negative
# non-integers (-1.5 => -1), which the earlier "add 1, then truncate"
# version got wrong.
sub ceil {
    my $x = shift;
    my $whole = int($x);
    return ($x > $whole) ? $whole + 1 : $whole;
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -