⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 search.in

📁 harvest是一个下载html网页得机器人
💻 IN
📖 第 1 页 / 共 3 页
字号:
		$hp_url = $1 if ($hp_url eq '');		next;	    }	    # Read relevant data into @OBJ	    if (/^120 -/o || /^103 -/o || /^111 -/o) {		$OBJ[++$#OBJ] = $_;		last if (/^103 -/o || /^111 -/o);	    } else {		$OBJ[$#OBJ] .= $_;	    }	}	close ($INPUT) || &fatal ("socket: $!\n");	close TEMPFILE;	$totnumber = (scalar (@OBJ)) - 1;	# obtain the navigator bar	$totalPages = $perpage == 0 ? 1 : ceil ($totnumber/$perpage);	# set default page	$page = 1 if ($page eq '');	eval $CFG{'CreateNavBars'};	print &expand ($CFG{'ResultSetBegin'});	# @OBJ = sort bynml @OBJ if ($sort eq 'by-NML');	&rank_objects(@OBJ,@RANK) if ($sort eq 'by-rank');	while (($curobj || (undef %SOIF,$curobj = shift(@OBJ))) &&	       (($_, $curobj) = split(/\n/, $curobj, 2))) {	    # skip all lines before first match to be displayed	    if ($nobjects<=(($page-1)*$perpage) && $nobjects>0) {		# New objects start with the URL of match. Here $nobjects is increased		if (!(/^120 - (.*)$/o )) {		    $nopaquelines++ if (/^122 - (.*)$/o);		    next;		}	    }	    # skip all lines after last match to be displayed	    next if ($nobjects>(($page)*$perpage) && !(/^103 - (.*)$/o ));	    # 120 - The URL of the match	    if (/^120 - (.*)$/o) {		$url = $1;		$url =~ /^([^:]+):\/*([^\/]+)(.*)$/o;		$A = $1;		$H = $2;		$P = $3;		@X = split ('/', $P);		$F = pop (@X);		$D = join ("/", @X, "");		$F = $url if ($url =~ /\/$/o);		if ($A eq 'news') {		    $F = $H;		    $H = $P = $D = undef;		}		$objectnum = sprintf ($CFG{'ObjectNumPrintf'}, ++$nobjects);		#print &expand ($CFG{'PrintUrl'});		next;	    }	    # 124 - The description line of the match	    if (/^124 - (\d+)/o) {		$n = $1;		next if ($n == 0);		$desc = &html_escape (substr($curobj, 0, $n));		$curobj = substr($curobj, $n);	# delete the data		next;	    }	    # 127 - Attribute request	    if (/^127 - ([^\s]+) (\d+)/o) {		$att	= $1;		$n	= $2;		next if ($n == 0);		$SOIF{$att} = &html_escape (substr($curobj, 0, $n));		$curobj = substr($curobj, $n);	# delete the data		next;	    }	    # 122 - Opaque data	    if (/^122 - (.*)$/o) {		$_ = &html_escape ($1);		eval $CFG{'MatchedLineSub'};		$opaque .= $_ . "\n" if (/\S/);		$nopaquelines++;		next;	    }	    # 101 - message to the user	    if (/^101 - (.*)$/o) {		$usermsg = $1;		#print &expand ($CFG{'UserMessage'});		next;	    }	    # 103 - end of broker results	    # 111 - Error Message that ends Broker Results	    if (/^103 - (.*)$/o || /^111 - (.*)$/o) {		$msg = $1;		if ($msg =~ /PARSE ERROR/o) {		    if ($query eq '') {			print &expand ($CFG{'NoQuery'});		    } else {			print &expand ($CFG{'ParseError'});		    }		} elsif ($msg =~ /heavily loaded/o) {		    print &expand ($CFG{'BrokerLoad'});		} elsif ($msg =~ /(\d+) Returned Object/o) {		    $nreturned = $1;		    print &expand ($CFG{'EndBrokerResults'});		    print &expand ($CFG{'EmptySetWarning'})			if ($nreturned == 0);		    print &expand ($CFG{'TruncateWarning'})			if ($nopaquelines >= $maxresult || $nreturned == $maxfiles);		} else {		    print &expand ($CFG{'FailBrokerResults'});		}		print &expand ($CFG{'ResultSetEnd'});		print &expand ($CFG{'ResultTrailer'});		last;	    }	    # content summary URL	    if (/^125 - (.*)$/o) {		$cs_url = $1;		$cs_url =~ /^([^:]+):\/*([^\/]+)(.*)$/o;		$cs_a = $1;		$cs_h = $2;		$cs_p = $3;		@X    = split ('/', $cs_p);		$cs_f = pop (@X);		$cs_d = join ("/", @X, "");		$cs_f = $cs_url if ($cs_url =~ /\/$/o);		next;	    }	    # 130 - Object Ends, start a new one	    if (/^130/o) {		$weight = $aWeight[$nobjects-1];		eval $CFG{'PerObjectFunction'}		if (defined ($CFG{'PerObjectFunction'}));		$attributes = '';		foreach $k (sort keys %SOIF) {		    $att = $k;		    $val = $SOIF{$k};		    eval ($CFG{'PerAttributeFunction'});		    $attributes .= &expand($CFG{'FormatAttribute'})			if ($att ne ''); # $att may be deleted in search.cf		}		print &expand ($CFG{'PrintObject'});		$weight = 0;		$opaque = "";		$url	= "";		$cs_url = "";		$desc	= "";		%SOIF	= ();		next;	    }	}}sub fatal {	local ($key) = shift;	local ($msg) = defined ($CFG{$key}) ? &expand ($CFG{$key}) : $key;	print STDOUT "$msg";	if (open(BQLOG, ">>$BQLOG")) {		my $now = localtime;		printf BQLOG ("[%s] %s: %s", $now, $0, $msg);		close BQLOG;	}	exit(1);}# Perform simple variable substitution.  See Wall & Schwartz# 'Programming Perl' book, p 217.#sub expand2 {	$_ = shift;	s/"/\\"/g;	eval qq/"$_"/;}# Gross regexp's to do some ifdef-type substitutions# Jonathan Rochkind, jrochkin@cs.oberlin.edu, 21 March 1996.## Use:#       <?foo>abcxyz</?foo>     becomes       abcxyz#       <?>abcxyz</?>           is deleted## so the user can conditionally include a section of text by# enclosing it in <?$var>...</?$var>.  The section will be# included if $var is not the null string.  If a newline# follows </?> the newline is removed.#sub expand {	$_ = &expand2 (shift);	# First get rid of everything of the form "<?>Some Stuff</?>"	local($begin) = 0;	local($end) = 0;	while ($begin != -1) {           # While there is still a <?>	    $begin=index($_,'<?>');      # Find it's location.	    unless ($begin == -1) {      # As long as we did find it....		# Find it's paired </?>		$end=index($_,'</?>',$begin)+4;		# Include the following \n too, if there...		if (substr($_,$end,1) eq "\n") { ++$end; }		# Now delete the whole "<?> ... </?>"		substr($_,$begin,$end-$begin) = "";	    }	}	# Now just delete _all_ tags of the form <?x>, </?x>	# Don't even bother matching them up in pairs,	#     just delete them where you see em.	s/<\?[^>]+>//g;     # delete all <?x>	s/<\/\?[^>]+>//g;   # delete all </?x>	s/\n\n*/\n/g;	$_;}# escape special HTML characterssub html_escape {	$_ = shift;	s/\&/\&amp;/g;		# do ampersand first!	s/</\&lt;/g;	s/>/\&gt;/g;	s/\&amp;(\w+;)/\&$1/g;	# try putting some HTML back   &Circ;	s/\&amp;(#\d+;)/\&$1/g;	# try putting some HTML back   &#38;	s/"/\&#34;/g;	$_;}# create search URIs used for the navigation bars - by h.weinreichsub create_link {    my $pageno = shift;    my $searchurl = "/Harvest/cgi-bin/search.cgi";    $searchurl .= "?query=".url_encode($RQ{'query'});    $searchurl .= "&broker=".$RQ{'broker'}               if ($RQ{'broker'} ne '');    $searchurl .= "&hp_url=".$RQ{'hp_url'}               if ($RQ{'hp_url'} ne '');    $searchurl .= "&caseflag=".$RQ{'caseflag'}           if ($RQ{'caseflag'} ne '');    $searchurl .= "&wordflag=".$RQ{'wordflag'}           if ($RQ{'wordflag'} ne '');    $searchurl .= "&opaqueflag=".$RQ{'opaqueflag'}       if ($RQ{'opaqueflag'} ne '');    $searchurl .= "&descflag=".$RQ{'descflag'}           if ($RQ{'descflag'} ne '');    $searchurl .= "&maxresultflag=".$RQ{'maxresultflag'} if ($RQ{'maxresultflag'} ne '');    $searchurl .= "&maxobjflag=".$RQ{'maxobjflag'}       if ($RQ{'maxobjflag'} ne '');    $searchurl .= "&maxlineflag=".$RQ{'maxlineflag'}     if ($RQ{'maxlineflag'} ne '');    $searchurl .= "&weightflag=".$RQ{'weightflag'}       if ($RQ{'weightflag'} ne '');    $searchurl .= "&perpageflag=".$RQ{'perpageflag'}     if ($RQ{'perpageflag'} ne '');    $searchurl .= "&sort=".$RQ{'sort'}                   if ($RQ{'sort'} ne '');    $searchurl .= "&brokerqueryconfig=".$RQ{'brokerqueryconfig'}                                                         if ($RQ{'brokerqueryconfig'} ne '');    $searchurl .= "&attribute=".url_encode("@atts")      if ("@atts" ne '');    foreach (@filter) {	$searchurl .= "&filter=".url_encode($_);    }    $searchurl .= "&pageflag=$pageno"                    if ($pageno > 0);    return $searchurl;}sub parse_config {	local ($CF)	= shift;	local ($out)	= 1;	undef (local ($key));	undef (local ($val));	open (CF) || &fatal ("$CF: $!\n");	while (<CF>) {		next if (/^#/o);		if ($out && /<(\w+)>/o) {			$key = $1;			$val = "";			$out = 0;			next;		}		if (!$out && /<\/$key>/) {			#eval ("\$$key=$val");			print "setting $key...\n" if ($debug);			#print "to $val\n";			chop ($CFG{$key} = $val);			undef ($val);			undef ($key);			$out = 1;			next;		}		chop, chop if (/[^\\]\\$/o);	# chop newline if esc'd		$val .= $_ if (!$out && defined ($val));	}	close CF;	1;}sub client_socket {	local ($host, $port) = @_;	local ($sockaddr) = 'S n a4 x8';	local ($name, $aliases, $proto) = getprotobyname('tcp');	local ($connected) = 0;	# Lookup addresses for remote hostname	#	local($w,$x,$y,$z,@thataddrs) = gethostbyname($host);	&fatal("Unknown Host: $host\n") unless (@thataddrs);	# bind local socket to INADDR_ANY	#	local ($thissock) = pack($sockaddr, &AF_INET, 0, "\0\0\0\0");	&fatal("socket: $!\n") unless		socket (SOCK, &AF_INET, &SOCK_STREAM, $proto);	&fatal("bind: $!\n") unless		bind (SOCK, $thissock);	# Try all addresses	#	foreach $thataddr (@thataddrs) {		local ($that) = pack($sockaddr, &AF_INET, $port, $thataddr);		@IP = unpack('C4', $thataddr);		printf ("Trying connection to %d.%d.%d.%d<BR>\n", @IP)			if ($debug);		if (connect (SOCK, $that)) {			$connected = 1;			printf ("Connected to %d.%d.%d.%d!<BR>\n", @IP)				if ($debug);			last;		}	}	return () unless ($connected);	# Set socket to flush-after-write and return it	#	select (SOCK); $| = 1;	select (STDOUT);	return (SOCK);}sub sigharddie {	kill ('KILL', $$);}sub sigdie {	local ($sig) = @_;	&fatal ('sigdie');}sub dump_array {	local (%A) =  @_;	local ($key);	print "<PRE>\n";	foreach $key ( sort keys %A) {	    print "$key=$A{$key}\n";	}	print "</PRE>\n";	1;}sub broker_down {	local($host, $port) = @_;	print &expand ($CFG{'BrokerDown'});	&fatal("$host:$port: $!\n");}sub not_configured {	print <<"EOM";Content-type: text/html<html><head><TITLE>$0 not properly installed</TITLE></head><body><STRONG>WARNING:</STRONG> You have not installed the WWW interfaceto the Harvest Broker correctly.  The \$HARVEST_HOME directory<PRE>        <STRONG>$ENV{'HARVEST_HOME'}</STRONG></PRE> does not exist.<P>Please refer to the current<A HREF="http://harvest.sourceforge.net/harvest/doc/html/manual.html">HarvestUser's Manual</A>.</body></html>EOM	exit 0;}# Read defaults from .cfsub parse_defaults {   local($attr,$value);   local(@vars)=split(/\n/,$CFG{'Default'});   while (@vars) {	($attr,$value)=split(/:/,shift(@vars));	$attr=~s/\s//g;	$value=~s/^\s*//;	$DEF{$attr}=$value;   }}sub option {   local($attr)=@_[0];   return (defined $RQ{$attr} && $RQ{$attr} ne '') ? $RQ{$attr} : $DEF{$attr};}# calculate from the query-string a (hopefully) unique string that can be used# as temporary filename...sub get_hashcode {    my $querystring = shift;    my $h = 0;    my $t = 0;    my $hashcode = "";    my $len = length($querystring);    # calculate hash number from querystring    for ($i = 0; $i < $len; $i++) {	$h = 137*$h + ord(substr($querystring, $i, 1));	$h = sprintf ("%016.0f",$h);	$h = substr($h,-15);    }    # convert hash-number to hash-string.    while ($h > 1) {	$t = ($h % 25) + 97;	$hashcode .= pack("c",$t);	$h = $h / 25;    }    return $hashcode;}####################################################################### Below is cgi.pl...## The CGI_HANDLERS deal with basic CGI POST or GET method request# elements such as those delivered by an HTTPD form, i.e. a url# encoded line of "=" separated key=value pairs separated by &'s# Routines:# get_request:	reads the request and returns both the raw and#               processed version.# url_decode:	URL decodes a string or array of strings## Author:# 	James Tappin: sjt@xun8.sr.bham.ac.uk#	School of Physics & Space Research University of Birmingham#	Feb 1993.

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -