📄 gatherer.in
字号:
#!@PERL@
#
#  Gatherer - Main interface to the Gatherer. Parses the configuration file
#  and starts up the Gatherering process.
#
#  Usage: Gatherer [-manual | -export | -debug | -background] file.cf
#
#  $Id: Gatherer.in,v 2.7 2000/02/03 12:45:56 sxw Exp $
#
#######################################################################
#
#  Harvest Indexer http://harvest.sourceforge.net/
#  -----------------------------------------------
#
#  The Harvest Indexer is a continued development of code developed by
#  the Harvest Project. Development is carried out by numerous individuals
#  in the Internet community, and is not officially connected with the
#  original Harvest Project or its funding sources.
#
#  Please mail lee@arco.de if you are interested in participating
#  in the development effort.
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
#######################################################################
#
#  Copyright (c) 1994, 1995. All rights reserved.
#
#  The Harvest software was developed by the Internet Research Task
#  Force Research Group on Resource Discovery (IRTF-RD):
#
#        Mic Bowman of Transarc Corporation.
#        Peter Danzig of the University of Southern California.
#        Darren R. Hardy of the University of Colorado at Boulder.
#        Udi Manber of the University of Arizona.
#        Michael F. Schwartz of the University of Colorado at Boulder.
#        Duane Wessels of the University of Colorado at Boulder.
#
#  This copyright notice applies to software in the Harvest
#  ``src/'' directory only. Users should consult the individual
#  copyright notices in the ``components/'' subdirectories for
#  copyright information about other software bundled with the
#  Harvest source code distribution.
#
#  TERMS OF USE
#
#  The Harvest software may be used and re-distributed without
#  charge, provided that the software origin and research team are
#  cited in any use of the system. Most commonly this is
#  accomplished by including a link to the Harvest Home Page
#  (http://harvest.cs.colorado.edu/) from the query page of any
#  Broker you deploy, as well as in the query result pages. These
#  links are generated automatically by the standard Broker
#  software distribution.
#
#  The Harvest software is provided ``as is'', without express or
#  implied warranty, and with no support nor obligation to assist
#  in its use, correction, modification or enhancement. We assume
#  no liability with respect to the infringement of copyrights,
#  trade secrets, or any patents, and are not responsible for
#  consequential damages. Proper use of the Harvest software is
#  entirely the responsibility of the user.
#
#  DERIVATIVE WORKS
#
#  Users may make derivative works from the Harvest software, subject
#  to the following constraints:
#
#    - You must include the above copyright notice and these
#      accompanying paragraphs in all forms of derivative works,
#      and any documentation and other materials related to such
#      distribution and use acknowledge that the software was
#      developed at the above institutions.
#
#    - You must notify IRTF-RD regarding your distribution of
#      the derivative work.
#
#    - You must clearly notify users that your are distributing
#      a modified version and not the original Harvest software.
#
#    - Any derivative product is also subject to these copyright
#      and use restrictions.
#
#  Note that the Harvest software is NOT in the public domain. We
#  retain copyright, as specified above.
#
#  HISTORY OF FREE SOFTWARE STATUS
#
#  Originally we required sites to license the software in cases
#  where they were going to build commercial products/services
#  around Harvest. In June 1995 we changed this policy. We now
#  allow people to use the core Harvest software (the code found in
#  the Harvest ``src/'' directory) for free. We made this change
#  in the interest of encouraging the widest possible deployment of
#  the technology. The Harvest software is really a reference
#  implementation of a set of protocols and formats, some of which
#  we intend to standardize. We encourage commercial
#  re-implementations of code complying to this set of standards.
#
#

# Fall back to the configure-time prefix when HARVEST_HOME is not set, then
# put the Harvest bin/ and lib/ directories at the front of PATH.
$ENV{'HARVEST_HOME'} = "@prefix@" if (!defined($ENV{'HARVEST_HOME'}));
$ENV{'PATH'} = "$ENV{'HARVEST_HOME'}/bin" . ":" .
               "$ENV{'HARVEST_HOME'}/lib/gatherer" . ":" .
               "$ENV{'HARVEST_HOME'}/lib" . ":" .
               "$ENV{'PATH'}";

$debug = 0;

# Print the usage message on STDERR and exit with status 1.
sub usage {
    print STDERR "Usage: Gatherer [options] config-file\n";
    exit(1);
}

#
#  Set the default values. Basically, everything is set off of the
#  Top-Directory.
#
#chop($cdir = &grab_cmd_output("pwd"));
chomp($cdir = `pwd`);           # chomp: strip only the trailing newline
undef %vals;
$vals{"Top-Directory"}     = $cdir;
$vals{"Data-Directory"}    = $cdir . "/data";
$vals{"Working-Directory"} = $cdir . "/tmp";
$vals{"Lib-Directory"}     = $cdir . "/lib";
$vals{"Log-File"}          = $cdir . "/log.gatherer";
$vals{"Errorlog-File"}     = $cdir . "/log.errors";
$vals{"Gatherer-Port"}     = "8500";
$vals{"Gatherd-Inetd"}     = "no";

$automatic     = 1;
$setupdone     = 0;
$do_export     = 0;
$do_background = 0;

$configfile = shift(@ARGV);

# These are some voodoo flags for those who know what they're doing
while ($configfile =~ /^-/) {
    if ($configfile eq "-debug") {
        $debug = 1;
        $configfile = shift(@ARGV);
    } elsif ($configfile eq "-manual") {
        $automatic = 0;
        $configfile = shift(@ARGV);
    } elsif ($configfile eq "-export") {
        $do_export = 1;
        $configfile = shift(@ARGV);
    } elsif ($configfile eq "-background") {
        $do_background = 1;
        $configfile = shift(@ARGV);
    } else {
        usage();
    }
}

# are args OK?
usage() if ($#ARGV > -1);       # still args left?
usage() if ($configfile eq ""); # configfile bogus?

$| = 1 if ($debug);

# Valid tags for attribute-value pairs in configuration files
@tags = (
    "Access-Delay",
    "Data-Directory",
    "Debug-Options",
    "Essence-Options",
    "Gatherer-Options",
    "Gatherd-Inetd",
    "Gatherer-Host",
    "Gatherer-Name",
    "Gatherer-Port",
    "Gatherer-Version",
    "HTTP-Basic-Auth",
    "FTP-Auth",
    "HTTP-Proxy",
    "HTTP-If-Modified-Since",
    "Keep-Cache",
    "Lib-Directory",
    "Local-Mapping",
    "Locale",
    "Log-File",
    "Maintainer",
    "Errorlog-File",
    "Post-Summarizing",
    "Refresh-Rate",
    "Time-To-Live",
    "Top-Directory",
    "Transfer-Timeout",
    "User-Agent",
    "Working-Directory",
);

print "Data Directory is ", $vals{"Data-Directory"}, "\n" if ($debug);
print "PATH is $ENV{'PATH'}\n" if ($debug);

#
#  Looks like this: Gatherer
#                     |-> prepurls -> essence
#                           |-> enum & staturl
#
#  Read in the configuration of the gatherer
#
# Three-argument open: the file name is never interpreted as a mode or pipe.
open(CONFIG, '<', $configfile)
    || die "Gatherer: Cannot read configuration file: $configfile: $!\n";
while (<CONFIG>) {
    $recog = 0;
    next if (/^#/o);
    next if (/^\s+$/o);
    # In export mode, stop at the first node section.
    last if ($do_export && (/^<RootNodes>/io || /^<LeafNodes>/io));
    chomp;
    # The section handlers read further lines from CONFIG themselves.
    &process_rootnodes(), next if (/^<RootNodes>/io);
    &process_leafnodes(), next if (/^<LeafNodes>/io);
    foreach $tag (@tags) {
        if (/^$tag:\s+(.*)$/i) {
            $vals{$tag} = $1;
            if ($tag eq 'Local-Mapping') {
                ($url, $path) = split(/\s+/, $vals{$tag});
                # Normalize a leading "~" in the URL path to lowercase "%7e".
                $url =~ s#/~#/%7e#;
                $url =~ s#/%7E#/%7e#;
                $mapping{$url} = $path;
            }
            if ($tag eq 'HTTP-Basic-Auth') {
                $HTTPAuth .= 'Basic ' . $vals{$tag} . "\n";
            }
            if ($tag eq 'FTP-Auth') {
                $FTPAuth .= $vals{$tag} . "\n";
            }
            if ($tag eq 'Top-Directory') {
                # A new Top-Directory resets every path derived from it.
                &init_dir("Top-Directory", 1);
                $vals{"Data-Directory"}    = $vals{"Top-Directory"} . "/data";
                $vals{"Working-Directory"} = $vals{"Top-Directory"} . "/tmp";
                $vals{"Lib-Directory"}     = $vals{"Top-Directory"} . "/lib";
                $vals{"Log-File"}          = $vals{"Top-Directory"} . "/log.gatherer";
                $vals{"Errorlog-File"}     = $vals{"Top-Directory"} . "/log.errors";
            }
            if ($tag eq 'Access-Delay') {
                $delay = int($vals{$tag});
                $ENV{'HARVEST_URL_DELAY'} = $delay;
            }
            $recog = 1;
            last;
        }
    }
    print "WARNING: Unrecognized line: $_\n" if (!$recog);
}
close(CONFIG);
close(URL);     # must close URL for process to stop

chdir($vals{'Top-Directory'})
    || die "Gatherer: Cannot chdir to $vals{'Top-Directory'}: $!";

&run_system("/bin/csh -f $gcmd 2>> $vals{'Errorlog-File'}");   # actually run the Gatherer

unlink($gcmd) if ($debug == 0);
unlink($gcmdinput) if ($debug == 0);
# don't need this kjl/25oct2000
#unlink("$ENV{'TMPDIR'}/All-Files") if ( -e "$ENV{'TMPDIR'}/All-Files");

&install_gatherer() if ($automatic == 1);
exit(0);        # END OF PROGRAM

#
#  process_rootnodes - Read the body of a <RootNodes> section from CONFIG,
#  up to the closing </RootNodes> tag, and print one "ROOT" record per URL
#  to the URL filehandle.  A line ending in a backslash is joined with the
#  following line.  A root URL of the form "|command" runs the command and
#  treats each line of its output as a root URL.
#
sub process_rootnodes {
    &startup_prepurls() if (!$setupdone);
    while (<CONFIG>) {
        chomp;
        next if (/^#/o);
        last if (/^<\/RootNodes>/io);
        # Join continuation lines.  Index -1 addresses the last character
        # of $_ explicitly rather than relying on @_ being empty.
        while (substr($_, -1, 1) eq "\\") {
            chop($_);                       # drop the trailing backslash
            chomp($nextline = <CONFIG>);
            $_ .= $nextline;
        }
        ($rooturl, @options) = split;
        &set_defaults();
        &parse_options(@options);
        next if ($rooturl =~ /^\s*$/io);    # empty rooturl
        if ($rooturl =~ /^\|(.*)$/) {       # generate URLs from pgm
            $pgm = $1;
            die "$pgm: $!\n" unless open (PGM, "$pgm|");
            while (<PGM>) {
                if (substr($_, 0, 1) eq '#') {  # Pass comments
                    print URL $_;
                    next;
                }
                chomp;
                $rootargs = "$_ $urlmax $urlfilter $hostmax $hostfilter $delay $depth $accesstypes $enumeratepgm $searchtype";
                print URL "ROOT\t$rootargs\n";
                print "ROOT\t$rootargs\n" if ($debug);
            }
            close PGM;
        } else {
            $rootargs = "$rooturl $urlmax $urlfilter $hostmax $hostfilter $delay $depth $accesstypes $enumeratepgm $searchtype";
            print URL "ROOT\t$rootargs\n";
            print "ROOT\t$rootargs\n" if ($debug);
        }
    }
}

#
#  process_leafnodes - Read the body of a <LeafNodes> section from CONFIG,
#  up to the closing </LeafNodes> tag, and print one "LEAF" record per line
#  to the URL filehandle.  A line of the form "|command" runs the command
#  and passes each line of its output on as a leaf URL.  Lines keep their
#  trailing newline, so none is appended here.
#
sub process_leafnodes {
    &startup_prepurls() if (!$setupdone);
    while (<CONFIG>) {
        next if (/^#/o);
        last if (/^<\/LeafNodes>/io);
        if (/^\|(.*)$/) {                   # generate URLs from pgm
            $pgm = $1;
            die "$pgm: $!\n" unless open (PGM, "$pgm|");
            while (<PGM>) {
                if (substr($_, 0, 1) eq '#') {  # Pass comments
                    print URL $_;
                    next;
                }
                print URL "LEAF\t$_";
                print "LEAF\t$_" if ($debug);
            }
            close PGM;
        } else {
            print URL "LEAF\t$_";
            print "LEAF\t$_" if ($debug);
        }
    }
}

#
#  init_dir - Make the directory stored in $vals{$k} usable.
#
#    $k          - key into %vals naming the directory (e.g. "Top-Directory")
#    $write_test - when true, also check that a file can be created in it
#
#  A relative path (optionally starting with "./") is rewritten in %vals as
#  an absolute path under $cdir.  The directory is created with mode 0755 if
#  it does not exist.  Dies if mkdir fails, or if the write test fails and
#  chmod 0755 on the directory fails as well.
#
sub init_dir {
    my ($k, $write_test) = @_;

    return if (!defined($vals{$k}));
    if ($vals{$k} !~ /^\//o) {
        $vals{$k} = $1 if ($vals{$k} =~ /^\.\/(.*)$/o);
        $vals{$k} = $cdir . "/" . $vals{$k};
    }
    print "init_dir($vals{$k})\n" if ($debug);
    mkdir($vals{$k}, 0755) || die "Gatherer: mkdir: $vals{$k}: $!\n"
        if (! -d $vals{$k});

    return unless ($write_test);
    if (open (TEST, ">$vals{$k}/.write_test")) {
        close TEST;
        unlink "$vals{$k}/.write_test";
        return;
    }
    # The write test failed: try to fix the permissions.
    die "Gatherer: Unable to make directory writable.\n\tchmod: $vals{$k}: $!\n"
        unless (chmod(0755, $vals{$k}));
}

sub init_essence {
    $essencecmd = "essence";
    if (defined($vals{"Data-Directory"})) {
        $essencecmd .= " --dbdir ";
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -