⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 search.rediris.in

📁 harvest是一个下载html网页得机器人
💻 IN
📖 第 1 页 / 共 3 页
字号:
	    # 111 - Error Message that ends Broker Results	    if (/^103 - (.*)$/o || /^111 - (.*)$/o) {		print &expand ($CFG{'ResultSetEnd'});		push(@first_page, &expand ($CFG{'ResultSetEnd'}))		    if $perpage && ! $current_page;		$msg = $1;		if ($msg =~ /PARSE ERROR/o) {		    print &expand ($CFG{'ParseError'});		} elsif ($msg =~ /heavily loaded/o) {		    print &expand ($CFG{'BrokerLoad'});		} elsif ($msg =~ /(\d+) Returned Object/o) {#----------------------------------------------------------------------# JMM - 20010604# We write the number of objects returned in the ResultsBar#----------------------------------------------------------------------			$msg = "";#----------------------------------------------------------------------		    $nreturned = $1;		    print &expand ($CFG{'EndBrokerResults'});		    print &expand ($CFG{'EmptySetWarning'})			if ($nreturned == 0);		    print &expand ($CFG{'TruncateWarning'})			if ($nopaquelines >= $maxresult);		} else {		    print &expand ($CFG{'FailBrokerResults'});		}		last;	    }	    # 120 - The URL of the match	    if (/^120 - (.*)$/o) {		$url = $1;		$url =~ /^([^:]+):\/*([^\/]+)(.*)$/o;		$A = $1;		$H = $2;		$P = $3;		@X = split ('/', $P);		$F = pop (@X);		$D = join ("/", @X, "");		$F = $url if ($url =~ /\/$/o);		if ($A eq 'news') {		    $F = $H;		    $H = $P = $D = undef;		}		$objectnum = sprintf ($CFG{'ObjectNumPrintf'},				      ++$nobjects);		#print &expand ($CFG{'PrintUrl'});		next;	    }	    # content summary URL	    if (/^125 - (.*)$/o) {		$cs_url = $1;		$cs_url =~ /^([^:]+):\/*([^\/]+)(.*)$/o;		$cs_a = $1;		$cs_h = $2;		$cs_p = $3;		@X    = split ('/', $cs_p);		$cs_f = pop (@X);		$cs_d = join ("/", @X, "");		$cs_f = $cs_url if ($cs_url =~ /\/$/o);		next;	    }	    # 130 - Object Ends, start a new one	    if (/^130/o) {#----------------------------------------------------------------------# JMM - 20010531# We use the array "aWeight" to mantain the weigth of this object. We# use it later to draw the balls.#---------------------------------------------------------------------,		$rWeight = $raWeight[$rIndex];		$rIndex++;#---------------------------------------------------------------------'		eval $CFG{'PerObjectFunction'}		if (defined ($CFG{'PerObjectFunction'}));		$attributes = '';		foreach $k (sort keys %SOIF) {		    $att = $k;		    $val = $SOIF{$k};		    eval ($CFG{'PerAttributeFunction'});		    $attributes .= &expand($CFG{'FormatAttribute'})			if ($att ne ''); # $att may be deleted in search.cf		}		print &expand ($CFG{'PrintObject'});		push(@first_page, &expand ($CFG{'PrintObject'}))		    if $perpage && ! $current_page;		&N_at_a_time(1)		    if $perpage && 0==$nobjects%$perpage && $nobjects>0 && $nobjects != $totnumber;#----------------------------------------------------------------------# JMM - 20010531# We have used $rWeight in $CFG{'PerObjectFunction'} and then we reset# it#---------------------------------------------------------------------,		$rWeight = 0;#---------------------------------------------------------------------'		$opaque = "";		$url	= "";		$cs_url = "";		$desc	= "";		%SOIF	= ();		next;	    }	}	&N_at_a_time(0) if $perpage;}sub fatal {	local ($key) = shift;	local ($msg) = defined ($CFG{$key}) ? &expand ($CFG{$key}) : $key;	print STDOUT "$msg";	if (open(BQLOG, ">>$BQLOG")) {		my $now = localtime;		printf BQLOG ("[%s] %s: %s", $now, $0, $msg);		close BQLOG;	}	exit(1);}# Perform simple variable substitution.  See Wall & Schwartz# 'Programming Perl' book, p 217.#sub expand2 {	$_ = shift;	s/"/\\"/g;	eval qq/"$_"/;}# Gross regexp's to do some ifdef-type substitutions# Jonathan Rochkind, jrochkin@cs.oberlin.edu, 21 March 1996.## Use:#       <?foo>abcxyz</?foo>     becomes       abcxyz#       <?>abcxyz</?>           is deleted## so the user can conditionally include a section of text by# enclosing it in <?$var>...</?$var>.  The section will be# included if $var is not the null string.  If a newline# follows </?> the newline is removed.#sub expand {	$_ = &expand2 (shift);	# First get rid of everything of the form "<?>Some Stuff</?>"	local($begin) = 0;	local($end) = 0;	while ($begin != -1) {           # While there is still a <?>	    $begin=index($_,'<?>');      # Find it's location.	    unless ($begin == -1) {      # As long as we did find it....	             # Find it's paired </?>	        $end=index($_,'</?>',$begin)+4;	             # Include the following \n too, if there...	        if (substr($_,$end,1) eq "\n") { ++$end; }	             # Now delete the whole "<?> ... </?>"	        substr($_,$begin,$end-$begin) = "";	    }	}	# Now just delete _all_ tags of the form <?x>, </?x>	# Don't even bother matching them up in pairs,	#     just delete them where you see em.	s/<\?[^>]+>//g;     # delete all <?x>	s/<\/\?[^>]+>//g;   # delete all </?x>	s/\n\n*/\n/g;	$_;}sub html_escape {	$_ = shift;	s/\&/\&amp;/g;		# do ampersand first!	s/</\&lt;/g;	s/>/\&gt;/g;	s/\&amp;(\w+;)/\&$1/g;	# try putting some HTML back   &Circ;	s/\&amp;(#\d+;)/\&$1/g;	# try putting some HTML back   &#38;	s/"/\&#34;/g;	$_;}sub parse_config {	local ($CF)	= shift;	local ($out)	= 1;	undef (local ($key));	undef (local ($val));	open (CF) || &fatal ("$CF: $!\n");	while (<CF>) {		next if (/^#/o);		if ($out && /<(\w+)>/o) {			$key = $1;			$val = "";			$out = 0;			next;		}		if (!$out && /<\/$key>/) {			#eval ("\$$key=$val");			print "setting $key...\n" if ($debug);			#print "to $val\n";			chop ($CFG{$key} = $val);			undef ($val);			undef ($key);			$out = 1;			next;		}		chop, chop if (/[^\\]\\$/o);	# chop newline if esc'd		$val .= $_ if (!$out && defined ($val));	}	close CF;	1;}sub client_socket {	local ($host, $port) = @_;	local ($sockaddr) = 'S n a4 x8';	local ($name, $aliases, $proto) = getprotobyname('tcp');	local ($connected) = 0;	# Lookup addresses for remote hostname	#	local($w,$x,$y,$z,@thataddrs) = gethostbyname($host);	&fatal("Unknown Host: $host\n") unless (@thataddrs);	# bind local socket to INADDR_ANY	#	local ($thissock) = pack($sockaddr, &AF_INET, 0, "\0\0\0\0");	&fatal("socket: $!\n") unless		socket (SOCK, &AF_INET, &SOCK_STREAM, $proto);	&fatal("bind: $!\n") unless		bind (SOCK, $thissock);	# Try all addresses	#	foreach $thataddr (@thataddrs) {		local ($that) = pack($sockaddr, &AF_INET, $port, $thataddr);		@IP = unpack('C4', $thataddr);		printf ("Trying connection to %d.%d.%d.%d<BR>\n", @IP)			if ($debug);		if (connect (SOCK, $that)) {			$connected = 1;			printf ("Connected to %d.%d.%d.%d!<BR>\n", @IP)				if ($debug);			last;		}	}	return () unless ($connected);	# Set socket to flush-after-write and return it	#	select (SOCK); $| = 1;	select (STDOUT);	return (SOCK);}sub sigharddie {	kill ('KILL', $$);}sub sigdie {	local ($sig) = @_;	&fatal ('sigdie');}sub dump_array {	local (%A) =  @_;	local ($key);	print "<PRE>\n";	foreach $key ( sort keys %A) {	        print "$key=$A{$key}\n";	}	print "</PRE>\n";	1;}sub broker_down {	local($host, $port) = @_;	print STDOUT <<EOM;<p>Sorry, but the Broker on<STRONG>$host, port $port</STRONG>is currently unavailable.  Please try again later.<p>EOM	&fatal("$host:$port: $!\n");}sub not_configured {	print <<"EOM";Content-type: text/html<html><head><TITLE>$0 not properly installed</TITLE></head><body><STRONG>WARNING:</STRONG> You have not installed the WWW interfaceto the Harvest Broker correctly.  The \$HARVEST_HOME directory<PRE>        <STRONG>$ENV{'HARVEST_HOME'}</STRONG></PRE> does not exist.<P>Please refer to the current<A HREF="http://harvest.sourceforge.net/harvest/doc/html/manual.html">HarvestUser's Manual</A>.</body></html>EOM	exit 0;}# Read defaults from .cfsub parse_defaults {   local($attr,$value);   local(@vars)=split(/\n/,$CFG{'Default'});   while (@vars) {	($attr,$value)=split(/:/,shift(@vars));	$attr=~s/\s//g;	$value=~s/^\s*//;	$DEF{$attr}=$value;   }}sub option {   local($attr)=@_[0];   return (defined $RQ{$attr}) ? $RQ{$attr} : $DEF{$attr};}####################################################################### Below is cgi.pl...## &get_request;# $area=$rqpairs{'area'};# $title=$rqpairs{'title'};# $name=$rqpairs{'name'};# $msg = $rqpairs{'message'};# $date = &get_date;# chop($thishost = `hostname`);## # Check for blank title and URL# ## &failure ("Blank title")     	if ($title eq "");# &failure ("Blank name")     	if ($name  eq "");# &failure ("Empty message")   	if ($msg   eq "");# &failure ("Empty subject area") if ($area  eq "");# The CGI_HANDLERS deal with basic CGI POST or GET method request# elements such as those delivered by an HTTPD form, i.e. a url# encoded line of "=" separated key=value pairs separated by &'s# Routines:# get_request:	reads the request and returns both the raw and#               processed version.# url_decode:	URL decodes a string or array of strings# html_header:	Transmits a HTML header back to the caller# html_trailer: Transmits a HTML trailer back to the caller# Author:# 	James Tappin: sjt@xun8.sr.bham.ac.uk#	School of Physics & Space Research University of Birmingham#	Feb 1993.# Copyright & Disclaimer.#	This set of routines may be freely distributed, modified and#	used, provided this copyright & disclaimer remains intact.#	This package is used at your own risk, if it does what you#	want, good; if it doesn't, modify it or use something else--but#	don't blame me. Support level = negligable (i.e. mail bugs but#	not requests for extensions)# Usage:#	needs a 'require "cgi_handlers.pl";' line in the main script##	&get_request;    will get the request and decode it into an#			 indexed array %rqpairs, the raw request is in#			 $request##	... = &url_decode(LIST); will return a URL decoded version of#			         the contents of LIST##	&html_header(TITLE); 	will write to standard output an HTML#				header (including the content-type#				field) giving the document the title#				specified by TITLE.##	&html_trailer;		Writes a trailer to the html document#				with the name of the script generating#				it and the date (in UT).sub get_request {    # Subroutine get_request reads the POST or GET form request from STDIN    # into the variable  $request, and then splits it into its    # name=value pairs in the associative array %rqpairs.    # The number of bytes is given in the environment variable    # CONTENT_LENGTH which is automatically set by the request generator.    # Encoded HEX values and spaces are decoded in the values at this    # stage.    # $request will contain the RAW request. N.B. spaces and other    # special characters are not handler in the name field.    if ($ENV{'REQUEST_METHOD'} eq "POST") {	read(STDIN, $request, $ENV{'CONTENT_LENGTH'});    } elsif ($ENV{'REQUEST_METHOD'} eq "GET" ) {	$request = $ENV{'QUERY_STRING'};    }    @F = split(/[&=]/, $request);    &url_decode(@F);}sub url_decode {#	Decode a URL encoded string or array of strings#		+ -> space#		%xx -> character xx    foreach (@_) {	tr/+/ /;	s/%(..)/pack("c",hex($1))/ge;    }    #@_;    # This gross stuff handles multiply defined attributes.  For example:    #    foo=abc&foo=xyz    # comes back as    #    $RQ{'foo'} eq 'abc xyz'    # Otherwise, the previous method just gave us one or the other.    -DW    #    local ($k, $v);    local (%Y);    while (($k=shift @_) ne '' && ($#_ >= 0)) {	$v=shift @_;	$Y{$k} = defined $Y{$k} ? join (' ', $Y{$k}, $v) : $v;    }    %Y;}# sort the objects by number of matched lines##sub bynml {#	split(/\n/, $b) <=> split(/\n/, $a);	# number of lines in object#}# sort the objects by "rank". This is based on code from Wesley Alan Wrightsub rank_objects {    local (@objects) = @_;    local ($objnum) = 0;    local (%ratings);    local (@object_index);    foreach $object (@objects) {	$rank = 1;	$lastline = ""; # hw - don't count repeated lines...	split(/\n/,$object);	foreach $line (@_) {	    # $rawline=$line;	    $line =~ s/\s{2,}/ /g;                # Remove multiple whitespaces	    $line =~ s/(Matched line: )(.*)$/$2/;	    $line =~ s/^(.*)\{(\d+)\}(.*)$/$1$3/; # Remove "{12}:" etc.	    # if (open(BQLOG, ">>$BQLOG")) {	    #	printf BQLOG ($rawline."\n");	    #	printf BQLOG ($line."\n");	    #	printf BQLOG ($rank."\n");	    #	close BQLOG;	    # }	    if ($line ne $lastline) {

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -