⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 gatherer.in

📁 harvest是一个下载html网页的机器人
💻 IN
📖 第 1 页 / 共 2 页
字号:
		$essencecmd .= $vals{"Data-Directory"};
	}
	if (defined($vals{"Working-Directory"})) {
		$essencecmd .= " --tmpdir ";
		$essencecmd .= $vals{"Working-Directory"};
	}
	if (defined($vals{"Lib-Directory"})) {
		$essencecmd .= " --libdir ";
		$essencecmd .= $vals{"Lib-Directory"};
	}
	if (defined($vals{"Log-File"})) {
		$essencecmd .= " --log ";
		$essencecmd .= $vals{"Log-File"};
		$ENV{'HARVEST_GATHERER_LOGFILE'} = $vals{"Log-File"};
	}
	if (defined($vals{"Gatherer-Host"})) {
		$essencecmd .= " --gatherer-host ";
		$essencecmd .= "'" . $vals{"Gatherer-Host"} . "'";
	}
	if (defined($vals{"Gatherer-Name"})) {
		$essencecmd .= " --gatherer-name ";
		$tmp = $vals{"Gatherer-Name"};
		$tmp =~ s/'/'\\''/g;	# protect quotes, ptooey!
		$essencecmd .= "'" . $tmp . "'";
	}
	if (defined($vals{"Gatherer-Version"})) {
		$essencecmd .= " --gatherer-version ";
		$essencecmd .= "'" . $vals{"Gatherer-Version"} . "'";
	}
	if (defined($vals{"Post-Summarizing"})) {
		$essencecmd .= " --post-process ";
		$essencecmd .= "'" . $vals{"Post-Summarizing"} . "'";
	}
	if (defined($vals{"Refresh-Rate"})) {
		$essencecmd .= " --default-refresh ";
		$essencecmd .= "'" . $vals{"Refresh-Rate"} . "'";
	}
	if (defined($vals{"Time-To-Live"})) {
		$essencecmd .= " --default-ttl ";
		$essencecmd .= "'" . $vals{"Time-To-Live"} . "'";
	}
	if (defined($vals{"Essence-Options"})) {
		$essencecmd .= " " . $vals{"Essence-Options"};
	}
	if (defined($vals{"Debug-Options"})) {
		$essencecmd .= " " . $vals{"Debug-Options"};
		$ENV{'HARVEST_DEBUG'} = $vals{"Debug-Options"};
	}
	$essencecmd .= " --verbose";
	$essencecmd .= " -f -";
}

#
#  startup_prepurls - one-time setup before URLs are fed to the Gatherer.
#
#  Reads the global %vals (plus %mapping, $HTTPAuth, $FTPAuth, $debug) and:
#    - calls &init_dir for each configured directory,
#    - exports settings to child programs through %ENV,
#    - writes localmap.cf / HTTPAuth.cf / FTPAuth.cf into Working-Directory
#      when the corresponding data is present,
#    - builds $prepcmd and (via &init_essence) $essencecmd,
#    - writes the csh-syntax script Working-Directory/gathercmd.$$ ($gcmd),
#    - opens the URL filehandle on Working-Directory/gatherinput.$$
#      ($gcmdinput) and leaves it open; presumably the caller prints the
#      URL list to it afterwards -- TODO confirm against the caller.
#
#  Sets $setupdone to 1 on completion.  Dies if any of the files cannot
#  be opened for writing.
#
sub startup_prepurls {
	&init_dir('Lib-Directory', 0)			# dont test writable
		if (defined($vals{'Lib-Directory'}));
	&init_dir('Data-Directory', 1)
		if (defined($vals{'Data-Directory'}));
	&init_dir('Working-Directory', 1)
		if (defined($vals{'Working-Directory'}));

#	foreach $f ("$vals{'Data-Directory'}/index.html", "$vals{'Working-Directory'}/index.html") {
#		open(INDEXHTML, "> $f");
#		print INDEXHTML <<EOM;
#<html>
#Please use the Harvest Gatherer's interface to retrieve these files.
#</html>
#EOM
#		close(INDEXHTML);
#		chmod(0644, $f);
#	}
	$ENV{'TMPDIR'} = $vals{"Working-Directory"};

	# don't need this kjl/25oct2000
	#$templatename = $vals{"Data-Directory"} . "/All-Templates.gz";
	#if (-e $templatename) {
	#	system("gzip -dc $templatename > $ENV{'TMPDIR'}/All-Files");
	#}

	# Plain truth test on the hash: defined(%hash) is a fatal error on
	# Perl 5.22 and later.  An empty map falls through to /dev/null.
	if (%mapping) {
		$tfile = "$vals{'Working-Directory'}/localmap.cf";
		open(MAPPING, "> $tfile") ||
			die "Gatherer: Cannot write Mapping: $tfile: $!\n";
		foreach $k (sort keys %mapping) {
			print MAPPING "$k\t$mapping{$k}\n";
		}
		close(MAPPING);
		$ENV{'HARVEST_URL_LOCAL_MAPPINGS'} = $tfile;
	} else {
		$ENV{'HARVEST_URL_LOCAL_MAPPINGS'} = "/dev/null";
	}
	if (defined($HTTPAuth)) {
		$tfile = "$vals{'Working-Directory'}/HTTPAuth.cf";
		open(AUTH, "> $tfile") ||
			die "Gatherer: Cannot write Auth file: $tfile: $!\n";
		print AUTH $HTTPAuth;
		close(AUTH);
		$ENV{'HARVEST_HTTP_AUTHENTICATIONS'} = $tfile;
	} else {
		$ENV{'HARVEST_HTTP_AUTHENTICATIONS'} = '/dev/null';
	}
	if (defined($FTPAuth)) {
		$tfile = "$vals{'Working-Directory'}/FTPAuth.cf";
		open(AUTH, "> $tfile") ||
			die "Gatherer: Cannot write Auth file: $tfile: $!\n";
		print AUTH $FTPAuth;
		close(AUTH);
		$ENV{'HARVEST_FTP_AUTHENTICATIONS'} = $tfile;
	} else {
		$ENV{'HARVEST_FTP_AUTHENTICATIONS'} = '/dev/null';
	}
	if (defined($vals{"User-Agent"})) {
		$ENV{'HARVEST_USER_AGENT'} = $vals{"User-Agent"};
	}
	if (defined($vals{"Transfer-Timeout"})) {
		$ENV{'HARVEST_XFER_TIMEOUT'} = $vals{'Transfer-Timeout'};
	}
	if (defined($vals{"Maintainer"})) {
		$ENV{'HARVEST_MAINTAINER_ADDRESS'} = $vals{"Maintainer"};
	}
	if (defined($vals{"Locale"})) {
		$ENV{'LC_CTYPE'} = $vals{"Locale"};
	}
	if ($vals{"HTTP-If-Modified-Since"} =~ /^y.*/io) {
		$ENV{'HARVEST_GATHERER_DBS'} = $vals{"Data-Directory"};
	}

	$proddb = $vals{"Data-Directory"} . "/PRODUCTION.gdbm";
	$indexdb = $vals{"Data-Directory"} . "/INDEX.gdbm";
	$mddb = $vals{"Data-Directory"} . "/MD5.gdbm";

	# The --leaf and --root arguments are each one single-quoted shell
	# word; the quote opened by 'enum is closed by the -db clause below.
	$prepcmd = "prepurls --leaf 'staturl";
	$prepcmd .= " $vals{'Debug-Options'}" if (defined ($vals{'Debug-Options'}));
	$prepcmd .= "' --root 'enum";
	$prepcmd .= " $vals{'Debug-Options'}" if (defined ($vals{'Debug-Options'}));
	$prepcmd .= " -tmpdb " . $vals{"Working-Directory"} . "/tmpdb.gdbm";
	$prepcmd .= " -log " . $vals{"Log-File"};
	if (-r $proddb) {
		$prepcmd .= " -db " . $proddb . "'";
	} else {
		$prepcmd .= " -db /dev/null'";
	}

	&init_essence();

	#
	#  Create a script that will run Essence and the rest of the
	#  programs need to gather.  This gets around a Solaris 2.3 bug.
	#  when trying to use fork/exec to &this...
	#
	$gcmd = $vals{"Working-Directory"} . "/gathercmd.$$";
	$gcmdinput = $vals{"Working-Directory"} . "/gatherinput.$$";
	open(GCMD, "> $gcmd") || die "Gatherer: Cannot write $gcmd: $!\n";
	print GCMD "#\n#  This is the command to run the Gatherer\n#\n";
	# Only HARVEST*, TMPDIR* and LC_CTYPE* variables are copied into
	# the script.
	foreach $k (sort keys %ENV) {
		next if ($k !~ /^(HARVEST|TMPDIR|LC_CTYPE)/o);
		print GCMD "setenv $k '$ENV{$k}'\n";
	}
	# csh wants a space-separated word list for path, not a colon list.
	$okpath = $ENV{'PATH'};
	$okpath =~ s/:/ /g;
	print GCMD "set path = ( $okpath )\n";
	print GCMD "set clobber\n";
	print GCMD "set noglob\n";
	print GCMD "\n";
	if ($vals{'HTTP-Proxy'} eq "") {
		print GCMD "unsetenv http_proxy\n";
	} else {
		print GCMD "setenv http_proxy http://$vals{'HTTP-Proxy'}/\n";
	}

	# Expire objects from the URL cache
	print GCMD "urlpurge < /dev/null\n";

	# Expire objects from the production database
	print GCMD <<EOM;
if (-r $proddb) then
	chmod -R +w $proddb
	expiredb -log $vals{'Log-File'} $proddb
	if (\$status == 1) then
		# expiredb expired some objects, rebuild index
		chmod -R -w $proddb
		rm -rf $indexdb $mddb
		mkindex $proddb $indexdb $mddb
		chmod -R a-w $indexdb $mddb
	else
		chmod -R -w $proddb
	endif
endif
EOM

	# Run the Gatherer
	print GCMD "\n";
	print GCMD "cat $gcmdinput" . " |  \\\n";
	print GCMD $prepcmd . " |  \\\n";
	print GCMD $essencecmd . "\n";
	print GCMD "\n";
	# Cleanup commands are only emitted when not debugging, so the
	# intermediate files survive a debug run.
	unless ($debug) {
		unless ($vals{'Keep-Cache'} =~ /^y.*/io) {
			print GCMD "if (\$status == 0) then\n";
			print GCMD "\trm -rf $vals{'Working-Directory'}/cache-liburl/\n";
			print GCMD "endif\n";
			print GCMD "\n";
		}
		print GCMD "rm -f $vals{'Working-Directory'}/localmap.cf\n";
		print GCMD "rm -f $vals{'Working-Directory'}/HTTPAuth.cf\n";
		print GCMD "rm -f $vals{'Working-Directory'}/FTPAuth.cf\n";
	}
	print GCMD "exit 0\n";
	close(GCMD);

	# URL is deliberately not closed in this sub.
	open(URL, "> $gcmdinput") ||
		die "Gatherer: Cannot write $gcmdinput: $!\n";
	$setupdone = 1;
}

#
#  Once the Gatherer has run, install the PRODUCTION database and
#  run gatherd.
#
#  Builds a folddb command and a gatherd command from %vals, creates a
#  default Data-Directory/gatherd.cf when none is readable, writes both
#  commands to the /bin/sh script Working-Directory/exportcmd.$$ and runs
#  it through &run_system -- in the background when $do_background is
#  set, otherwise in the foreground (the script is then removed unless
#  $debug is set).  Dies if a file cannot be written or chmod'ed.
#
sub install_gatherer {
	# prepare the database
	if (defined($vals{'Gatherer-Host'})) {
		$gid = "$vals{'Gatherer-Host'}:$vals{'Gatherer-Port'}";
	} else {
		#chop($h = &grab_cmd_output("hostname"));
		chop($h = `hostname`);
		($fullh, @blah) = gethostbyname($h);
		undef @blah;
		# fall back to the short name when the lookup yields nothing
		$fullh = $h if ($fullh eq "");
		$gid = "$fullh:$vals{'Gatherer-Port'}";
	}

	$folddbcmd = "folddb ";
	# only pass certain options to folddb
	foreach $o (split (/\s+/, $vals{'Gatherer-Options'})) {
		$folddbcmd .= "$o " if ($o eq '--save-space');
	}
	$folddbcmd .= "\"$gid\" ";
	$folddbcmd .= "$vals{'Data-Directory'}";

	# prepare the access control list if needed
	$gatherdcf = $vals{'Data-Directory'} . "/gatherd.cf";
	if (! -r $gatherdcf) {
		open(GCF, "> $gatherdcf") ||
			die "Gatherer: Cannot create $gatherdcf: $!\n";
		print GCF <<EOM;
#
#  gatherd.cf - Access Control List for gatherd
#
Allow all
EOM
		close(GCF);
	}

	# NOTE(review): @CMD_TRUE@ looks like a placeholder substituted when
	# this .in template is configured -- confirm.
	if ($vals{"Gatherd-Inetd"} eq "yes") {
		$gatherdcmd = "@CMD_TRUE@";	# ignore it
	} else {
		$gatherdcmd = "gatherd -d " . $vals{"Data-Directory"} . " ";
		$gatherdcmd .= $vals{"Gatherer-Port"};
	}

	$ecmd = $vals{"Working-Directory"} . "/exportcmd.$$";
	open(ECMD, "> $ecmd") || die "Cannot write $ecmd: $!\n";
	print ECMD "#!/bin/sh\n";
	print ECMD "$folddbcmd\n";
	print ECMD "$gatherdcmd\n";
	close(ECMD);
	chmod(0755, $ecmd) || die "Cannot chmod $ecmd: $!\n";
	if ($do_background) {
		# the script is not unlinked on this path
		&run_system("$ecmd &");
	} else {
		&run_system("$ecmd");
		unlink($ecmd) if (!$debug);
	}
}

#
#  run_system - run $cmd with system(), echoing it first when $debug is
#  set.  Returns whatever system() returns.
#
sub run_system {
	local($cmd) = @_;

	print "RUNNING: $cmd\n" if ($debug);
	system($cmd);
}

#
#  This is an ugly hack so that it works with Perl 4.036 on Solaris 2.3.
#  The backticks (`) don't work on Solaris like they should. -Darren.
#
#sub grab_cmd_output {
#	local($the_cmd) = @_;
#	undef $the_var;
#	unlink("/tmp/cmdoutput.$$");
#	system("$the_cmd > /tmp/cmdoutput.$$");
#	open(CMDOUT, "< /tmp/cmdoutput.$$") || return "none";
#	$the_var = <CMDOUT>;
#	close(CMDOUT);
#	unlink("/tmp/cmdoutput.$$");
#	return $the_var;
#}

#
#  set_defaults - reset the option globals that parse_options may
#  override ($urlmax, $urlfilter, $hostmax, $hostfilter, $delay, $depth,
#  $accesstypes, $enumeratepgm, $searchtype) to their default values.
#  The default URL filter file under $HARVEST_HOME is used when it exists.
#
sub set_defaults {
	$urlmax = 250;
	$urlfilter = "/dev/null";
	$urlfilter = "$ENV{'HARVEST_HOME'}/lib/gatherer/URL-filter-default"
		if (-f "$ENV{'HARVEST_HOME'}/lib/gatherer/URL-filter-default");
	$hostmax = 1;
	$hostfilter = "/dev/null";
	$delay = 1;
	$depth = 0;
	$accesstypes  = "HTTP";		# maybe "HTTP|FTP|Gopher" ?
	$enumeratepgm = "@CMD_FALSE@";
	$searchtype = "Breadth";
}

#
#  parse_options - apply a list of "Name=value" option strings to the
#  option globals.  Names are matched case-insensitively:
#
#	URL=max[,filter]	$urlmax, $urlfilter
#	Host=max[,filter]	$hostmax, $hostfilter
#	Site=max[,filter]	same globals as Host
#	Access=types		$accesstypes
#	Delay=n			$delay
#	Depth=n			$depth
#	Enumeration=pgm		$enumeratepgm
#	Search=type		$searchtype
#
#  Anything else is reported on STDERR as "Illegal Option" and skipped.
#
sub parse_options {
	local(@options) = @_;

	foreach $opt (@options) {
		if ($opt =~ /^URL=(\d+)/io) {
			$urlmax = $1;
			$urlfilter = $1 if ($opt =~ /^URL=\d+,(\S+)/io);
			next;
		}
		if ($opt =~ /^Host=(\d+)/io) {
			$hostmax = $1;
			$hostfilter = $1 if ($opt =~ /^Host=\d+,(\S+)/io);
			next;
		}
		if ($opt =~ /^Site=(\d+)/io) {
			$hostmax = $1;
			$hostfilter = $1 if ($opt =~ /^Site=\d+,(\S+)/io);
			next;
		}
		if ($opt =~ /^Access=(.*)/io)  {
			$accesstypes = $1;
			next;
		}
		if ($opt =~ /^Delay=(\d+)/io) {
			$delay = $1;
			next;
		}
		if ($opt =~ /^Depth=(\d+)/io) {
			$depth = $1;
			next;
		}
		if ($opt =~ /^Enumeration=(\S+)/io) {
			$enumeratepgm = $1;
			next;
		}
		if ($opt =~ /^Search=(\S+)/io) {
			$searchtype = $1;
			next;
		}
		print STDERR "Illegal Option: $opt\n";
	}
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -