📄 gatherer.in
字号:
$essencecmd .= $vals{"Data-Directory"}; } if (defined($vals{"Working-Directory"})) { $essencecmd .= " --tmpdir "; $essencecmd .= $vals{"Working-Directory"}; } if (defined($vals{"Lib-Directory"})) { $essencecmd .= " --libdir "; $essencecmd .= $vals{"Lib-Directory"}; } if (defined($vals{"Log-File"})) { $essencecmd .= " --log "; $essencecmd .= $vals{"Log-File"}; $ENV{'HARVEST_GATHERER_LOGFILE'} = $vals{"Log-File"}; } if (defined($vals{"Gatherer-Host"})) { $essencecmd .= " --gatherer-host "; $essencecmd .= "'" . $vals{"Gatherer-Host"} . "'"; } if (defined($vals{"Gatherer-Name"})) { $essencecmd .= " --gatherer-name "; $tmp = $vals{"Gatherer-Name"}; $tmp =~ s/'/'\\''/g; # protect quotes, ptooey! $essencecmd .= "'" . $tmp . "'"; } if (defined($vals{"Gatherer-Version"})) { $essencecmd .= " --gatherer-version "; $essencecmd .= "'" . $vals{"Gatherer-Version"} . "'"; } if (defined($vals{"Post-Summarizing"})) { $essencecmd .= " --post-process "; $essencecmd .= "'" . $vals{"Post-Summarizing"} . "'"; } if (defined($vals{"Refresh-Rate"})) { $essencecmd .= " --default-refresh "; $essencecmd .= "'" . $vals{"Refresh-Rate"} . "'"; } if (defined($vals{"Time-To-Live"})) { $essencecmd .= " --default-ttl "; $essencecmd .= "'" . $vals{"Time-To-Live"} . "'"; } if (defined($vals{"Essence-Options"})) { $essencecmd .= " " . $vals{"Essence-Options"}; } if (defined($vals{"Debug-Options"})) { $essencecmd .= " " . 
$vals{"Debug-Options"};
        $ENV{'HARVEST_DEBUG'} = $vals{"Debug-Options"};
    }
    $essencecmd .= " --verbose";
    $essencecmd .= " -f -";
}

#
# startup_prepurls - set up everything needed before URLs are fed in.
#
# Reads the global %vals (plus %mapping, $HTTPAuth, $FTPAuth and $debug)
# and, in order:
#   - calls init_dir() for each configured directory,
#   - exports settings through %ENV,
#   - writes the local-mapping and authentication files into the
#     Working-Directory,
#   - builds the prepurls command line in $prepcmd and calls
#     init_essence(),
#   - writes a csh-syntax command script to $gcmd,
#   - opens $gcmdinput for writing on the global URL filehandle.
# Dies if any of those files cannot be written.  Sets $setupdone to 1.
#
sub startup_prepurls {
    init_dir('Lib-Directory', 0)            # dont test writable
        if (defined($vals{'Lib-Directory'}));
    init_dir('Data-Directory', 1)
        if (defined($vals{'Data-Directory'}));
    init_dir('Working-Directory', 1)
        if (defined($vals{'Working-Directory'}));

#    foreach $f ("$vals{'Data-Directory'}/index.html", "$vals{'Working-Directory'}/index.html") {
#        open(INDEXHTML, "> $f");
#        print INDEXHTML <<EOM;
#<html>
#Please use the Harvest Gatherer's interface to retrieve these files.
#</html>
#EOM
#        close(INDEXHTML);
#        chmod(0644, $f);
#    }

    $ENV{'TMPDIR'} = $vals{"Working-Directory"};

    # don't need this kjl/25oct2000
    #$templatename = $vals{"Data-Directory"} . "/All-Templates.gz";
    #if (-e $templatename) {
    #    system("gzip -dc $templatename > $ENV{'TMPDIR'}/All-Files");
    #}

    # A hash in boolean context is true when it has entries.  The older
    # spelling defined(%mapping) is a fatal error on Perl 5.22 and later.
    if (%mapping) {
        $tfile = "$vals{'Working-Directory'}/localmap.cf";
        open(MAPPING, "> $tfile")
            || die "Gatherer: Cannot write Mapping: $tfile: $!\n";
        foreach $k (sort keys %mapping) {
            print MAPPING "$k\t$mapping{$k}\n";
        }
        # Buffered write errors only show up at close time.
        close(MAPPING)
            || die "Gatherer: Cannot write Mapping: $tfile: $!\n";
        $ENV{'HARVEST_URL_LOCAL_MAPPINGS'} = $tfile;
    }
    else {
        $ENV{'HARVEST_URL_LOCAL_MAPPINGS'} = "/dev/null";
    }

    if (defined($HTTPAuth)) {
        $tfile = "$vals{'Working-Directory'}/HTTPAuth.cf";
        open(AUTH, "> $tfile")
            || die "Gatherer: Cannot write Auth file: $tfile: $!\n";
        print AUTH $HTTPAuth;
        close(AUTH)
            || die "Gatherer: Cannot write Auth file: $tfile: $!\n";
        $ENV{'HARVEST_HTTP_AUTHENTICATIONS'} = $tfile;
    }
    else {
        $ENV{'HARVEST_HTTP_AUTHENTICATIONS'} = '/dev/null';
    }

    if (defined($FTPAuth)) {
        $tfile = "$vals{'Working-Directory'}/FTPAuth.cf";
        open(AUTH, "> $tfile")
            || die "Gatherer: Cannot write Auth file: $tfile: $!\n";
        print AUTH $FTPAuth;
        close(AUTH)
            || die "Gatherer: Cannot write Auth file: $tfile: $!\n";
        $ENV{'HARVEST_FTP_AUTHENTICATIONS'} = $tfile;
    }
    else {
        $ENV{'HARVEST_FTP_AUTHENTICATIONS'} = '/dev/null';
    }

    # Optional settings are exported only when they are configured.
    $ENV{'HARVEST_USER_AGENT'} = $vals{"User-Agent"}
        if (defined($vals{"User-Agent"}));
    $ENV{'HARVEST_XFER_TIMEOUT'} = $vals{'Transfer-Timeout'}
        if (defined($vals{"Transfer-Timeout"}));
    $ENV{'HARVEST_MAINTAINER_ADDRESS'} = $vals{"Maintainer"}
        if (defined($vals{"Maintainer"}));
    $ENV{'LC_CTYPE'} = $vals{"Locale"}
        if (defined($vals{"Locale"}));
    $ENV{'HARVEST_GATHERER_DBS'} = $vals{"Data-Directory"}
        if ($vals{"HTTP-If-Modified-Since"} =~ /^y.*/i);

    $proddb  = $vals{"Data-Directory"} . "/PRODUCTION.gdbm";
    $indexdb = $vals{"Data-Directory"} . "/INDEX.gdbm";
    $mddb    = $vals{"Data-Directory"} . "/MD5.gdbm";

    # prepurls receives two quoted sub-commands: --leaf 'staturl ...' and
    # --root 'enum ...'.  Both get the debug options when they are set.
    my $debug_args = defined($vals{'Debug-Options'})
                   ? " $vals{'Debug-Options'}"
                   : "";
    $prepcmd  = "prepurls --leaf 'staturl" . $debug_args;
    $prepcmd .= "' --root 'enum" . $debug_args;
    $prepcmd .= " -tmpdb " . $vals{"Working-Directory"} . "/tmpdb.gdbm";
    $prepcmd .= " -log " . $vals{"Log-File"};
    # The trailing quote closes the --root argument.
    if (-r $proddb) {
        $prepcmd .= " -db " . $proddb . "'";
    }
    else {
        $prepcmd .= " -db /dev/null'";
    }

    init_essence();

    #
    # Create a script that will run Essence and the rest of the
    # programs needed to gather.  This gets around a Solaris 2.3 bug
    # when trying to use fork/exec to do this.
    #
    $gcmd      = $vals{"Working-Directory"} . "/gathercmd.$$";
    $gcmdinput = $vals{"Working-Directory"} . "/gatherinput.$$";
    open(GCMD, "> $gcmd") || die "Gatherer: Cannot write $gcmd: $!\n";
    print GCMD "#\n# This is the command to run the Gatherer\n#\n";
    foreach $k (sort keys %ENV) {
        next if ($k !~ /^(HARVEST|TMPDIR|LC_CTYPE)/);
        # Protect embedded single quotes (close quote, escaped quote,
        # reopen) so the value cannot break out of the quoted setenv
        # argument; the same substitution is applied to Gatherer-Name
        # when the essence command is built.
        my $value = $ENV{$k};
        $value =~ s/'/'\\''/g;
        print GCMD "setenv $k '$value'\n";
    }
    $okpath = $ENV{'PATH'};
    $okpath =~ s/:/ /g;
    print GCMD "set path = ( $okpath )\n";
    print GCMD "set clobber\n";
    print GCMD "set noglob\n";
    print GCMD "\n";
    if ($vals{'HTTP-Proxy'} eq "") {
        print GCMD "unsetenv http_proxy\n";
    }
    else {
        print GCMD "setenv http_proxy http://$vals{'HTTP-Proxy'}/\n";
    }

    # Expire objects from the URL cache
    print GCMD "urlpurge < /dev/null\n";

    # Expire objects from the production database.
    # \$status is escaped so that it reaches the script literally.
    print GCMD <<EOM;
if (-r $proddb) then
    chmod -R +w $proddb
    expiredb -log $vals{'Log-File'} $proddb
    if (\$status == 1) then
        # expiredb expired some objects, rebuild index
        chmod -R -w $proddb
        rm -rf $indexdb $mddb
        mkindex $proddb $indexdb $mddb
        chmod -R a-w $indexdb $mddb
    else
        chmod -R -w $proddb
    endif
endif
EOM

    # Run the Gatherer: input file | prepurls | essence
    print GCMD "\n";
    print GCMD "cat $gcmdinput" . " | \\\n";
    print GCMD $prepcmd . " | \\\n";
    print GCMD $essencecmd . "\n";
    print GCMD "\n";

    # Outside debug mode the script removes its own temporary files.
    unless ($debug) {
        unless ($vals{'Keep-Cache'} =~ /^y.*/i) {
            print GCMD "if (\$status == 0) then\n";
            print GCMD "\trm -rf $vals{'Working-Directory'}/cache-liburl/\n";
            print GCMD "endif\n";
            print GCMD "\n";
        }
        print GCMD "rm -f $vals{'Working-Directory'}/localmap.cf\n";
        print GCMD "rm -f $vals{'Working-Directory'}/HTTPAuth.cf\n";
        print GCMD "rm -f $vals{'Working-Directory'}/FTPAuth.cf\n";
    }
    print GCMD "exit 0\n";
    close(GCMD) || die "Gatherer: Cannot write $gcmd: $!\n";

    # URL is deliberately left open here.
    # NOTE(review): presumably written to by code outside this section.
    open(URL, "> $gcmdinput")
        || die "Gatherer: Cannot write $gcmdinput: $!\n";
    $setupdone = 1;
}

#
# Once the Gatherer has run, install the PRODUCTION database and
# run gatherd.
#
# Builds a folddb command and a gatherd command from %vals, creates a
# default gatherd.cf when none is readable, writes both commands to a
# /bin/sh script in the Working-Directory and runs it through
# run_system() (in the background when $do_background is set).
# Dies if gatherd.cf or the script cannot be written.
#
sub install_gatherer {
    # prepare the database
    if (defined($vals{'Gatherer-Host'})) {
        $gid = "$vals{'Gatherer-Host'}:$vals{'Gatherer-Port'}";
    }
    else {
        #chop($h = &grab_cmd_output("hostname"));
        # chomp removes only a trailing newline; chop would eat the last
        # character of the name if the output had no newline.
        chomp($h = `hostname`);
        # Only the canonical name (first element) is of interest.
        ($fullh) = gethostbyname($h);
        # Fall back to the short name when the lookup yields nothing.
        $fullh = $h if (!defined($fullh) || $fullh eq "");
        $gid = "$fullh:$vals{'Gatherer-Port'}";
    }

    $folddbcmd = "folddb ";
    # only pass certain options to folddb
    foreach $o (split(/\s+/, $vals{'Gatherer-Options'})) {
        $folddbcmd .= "$o " if ($o eq '--save-space');
    }
    $folddbcmd .= "\"$gid\" ";
    $folddbcmd .= "$vals{'Data-Directory'}";

    # prepare the access control list if needed
    $gatherdcf = $vals{'Data-Directory'} . "/gatherd.cf";
    if (! -r $gatherdcf) {
        open(GCF, "> $gatherdcf")
            || die "Gatherer: Cannot create $gatherdcf: $!\n";
        print GCF <<EOM;
#
# gatherd.cf - Access Control List for gatherd
#
Allow all
EOM
        close(GCF) || die "Gatherer: Cannot create $gatherdcf: $!\n";
    }

    if ($vals{"Gatherd-Inetd"} eq "yes") {
        $gatherdcmd = "@CMD_TRUE@";     # ignore it
    }
    else {
        $gatherdcmd  = "gatherd -d " . $vals{"Data-Directory"} . " ";
        $gatherdcmd .= $vals{"Gatherer-Port"};
    }

    $ecmd = $vals{"Working-Directory"} . "/exportcmd.$$";
    open(ECMD, "> $ecmd") || die "Cannot write $ecmd: $!\n";
    print ECMD "#!/bin/sh\n";
    print ECMD "$folddbcmd\n";
    print ECMD "$gatherdcmd\n";
    close(ECMD) || die "Cannot write $ecmd: $!\n";
    chmod(0755, $ecmd) || die "Cannot chmod $ecmd: $!\n";

    if ($do_background) {
        # The script is not removed here; it is still running.
        run_system("$ecmd &");
    }
    else {
        run_system("$ecmd");
        unlink($ecmd) if (!$debug);
    }
}

#
# run_system - run a command line with system(), echoing it to STDOUT
# first when $debug is set.  Returns whatever system() returns.
#
sub run_system {
    my ($cmd) = @_;

    print "RUNNING: $cmd\n" if ($debug);
    return system($cmd);
}

#
# This is an ugly hack so that it works with Perl 4.036 on Solaris 2.3.
# The backticks (`) don't work on Solaris like they should. -Darren.
#
#sub grab_cmd_output {
#    local($the_cmd) = @_;
#    undef $the_var;
#    unlink("/tmp/cmdoutput.$$");
#    system("$the_cmd > /tmp/cmdoutput.$$");
#    open(CMDOUT, "< /tmp/cmdoutput.$$") || return "none";
#    $the_var = <CMDOUT>;
#    close(CMDOUT);
#    unlink("/tmp/cmdoutput.$$");
#    return $the_var;
#}

#
# set_defaults - reset the global enumeration settings that
# parse_options() may later override.
#
sub set_defaults {
    my $default_filter = "$ENV{'HARVEST_HOME'}/lib/gatherer/URL-filter-default";

    $urlmax       = 250;
    # Use the installed default filter only when the file exists.
    $urlfilter    = (-f $default_filter) ? $default_filter : "/dev/null";
    $hostmax      = 1;
    $hostfilter   = "/dev/null";
    $delay        = 1;
    $depth        = 0;
    $accesstypes  = "HTTP";     # maybe "HTTP|FTP|Gopher" ?
    $enumeratepgm = "@CMD_FALSE@";
    $searchtype   = "Breadth";
}

#
# parse_options - apply a list of "Name=value" option strings to the
# global enumeration settings.
#
# Recognised names (matched case-insensitively at the start of the
# string):
#   URL=<n>[,<filter>]     sets $urlmax and optionally $urlfilter
#   Host=<n>[,<filter>]    sets $hostmax and optionally $hostfilter
#   Site=<n>[,<filter>]    same as Host=
#   Access=<types>         sets $accesstypes
#   Delay=<n>              sets $delay
#   Depth=<n>              sets $depth
#   Enumeration=<program>  sets $enumeratepgm
#   Search=<type>          sets $searchtype
# Anything else is reported on STDERR and skipped.
#
sub parse_options {
    my (@options) = @_;

    foreach my $opt (@options) {
        # The filter part is optional; without it the current filter
        # is kept.
        if ($opt =~ /^URL=(\d+)(?:,(\S+))?/i) {
            $urlmax    = $1;
            $urlfilter = $2 if (defined($2));
            next;
        }
        if ($opt =~ /^(?:Host|Site)=(\d+)(?:,(\S+))?/i) {
            $hostmax    = $1;
            $hostfilter = $2 if (defined($2));
            next;
        }
        if ($opt =~ /^Access=(.*)/i) {
            $accesstypes = $1;
            next;
        }
        if ($opt =~ /^Delay=(\d+)/i) {
            $delay = $1;
            next;
        }
        if ($opt =~ /^Depth=(\d+)/i) {
            $depth = $1;
            next;
        }
        if ($opt =~ /^Enumeration=(\S+)/i) {
            $enumeratepgm = $1;
            next;
        }
        if ($opt =~ /^Search=(\S+)/i) {
            $searchtype = $1;
            next;
        }
        print STDERR "Illegal Option: $opt\n";
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -