📄 search.in
字号:
#!@PERL@
##############################################################################
#
#  search.cgi - Customizable WWW Interface to the Harvest Broker
#
#  Usage: Called as a CGI process from httpd
#
##############################################################################
#
#  Harvest Indexer http://harvest.sourceforge.net/
#  -----------------------------------------------
#
#  The Harvest Indexer is a continued development of code developed by
#  the Harvest Project. Development is carried out by numerous individuals
#  in the Internet community, and is not officially connected with the
#  original Harvest Project or its funding sources.
#
#  Please mail lee@arco.de if you are interested in participating
#  in the development effort.
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
#  Changes by Harald Weinreich <harald@weinreichs.de> for version 1.8 are
#  marked with "h.weinreich"

$ENV{'HARVEST_HOME'} = "@prefix@" unless defined($ENV{'HARVEST_HOME'});

# Uncomment this if you have problems with European 8-BIT characters on SunOS.
#$ENV{'LANG'} = "C";
#$ENV{'LC_CTYPE'} = "iso_8859_1";

# Set this to a location for search.log, or to /dev/null
#$BQLOG = "@prefix@/logs/search.log";
$BQLOG = "/dev/null";

$rcsid = '$Id: search.in,v 2.30 2002/08/30 12:25:00 sxw Exp $';

# $expire
#   Temporary files are deleted after this many hours. Or, more precisely, when
#   this program is run, files older than $expire hours are deleted.
#
# $tmp_dir
#   This is the location where temporary files are kept. This directory needs a
#   permissive access mode, so that the 'nobody' user (or whatever
#   user httpd uses) can write and delete files there. It is a fatal error if
#   this directory does not exist, or cannot be written in.

####### configure these locally! #######
$expire = 1;                            # expiration interval, in hours
$tmp_dir = "$ENV{HARVEST_HOME}/tmp";    # dir for temp files
###### end configuration #######

$harvestIcon = "/Harvest/brokers/images/harvest-ic1.gif";   # logo
$weightIcon = "/Harvest/brokers/images/harvest-weight.gif"; # weight image

# Split the script path into its directory ($DIR) and file name ($MYNAME).
@X = split ('/', $0);
$MYNAME = pop @X;
$DIR = join ('/', @X);

$ENV{'TMPDIR'} = "/tmp" unless defined($ENV{'TMPDIR'});
unshift(@INC, "$ENV{'HARVEST_HOME'}/lib");
not_configured() unless (-d $ENV{'HARVEST_HOME'});
require 'socket.ph';    # not sys/socket.ph, we use $HARVEST_HOME/lib/socket.ph

$debug = 0;
$hp_url = '';
$brokers = $ENV{'HARVEST_HOME'} . '/brokers/Brokers.cf';

# show weight of result and navigation bar by Javier Masa Marin <masa@rediris.es>
$weight = 0;        # weight of current object
$maxWeight = 0;
$totalPages = 0;    # number of Result pages

# ===== MAIN =================================================================

foreach $sig ('HUP', 'QUIT', 'TSTP', 'TERM', 'ABRT') {
    $SIG{$sig} = 'sigdie';
}
foreach $sig ('ALRM') {
    $SIG{$sig} = 'sigharddie';
}

# Parse the CGI request.
#
%RQ = &get_request;
$debug = 1 if defined $RQ{'debug'};
# Flatten embedded newlines so that no request value spans several lines.
foreach $key (keys %RQ) { $RQ{$key} =~ s/\n/ /g; }

# use unbuffered output
#
$unbuffered = 1;

# If name is nph-search.cgi, more headers have to be sent.
#
$nph = $0 =~ /nph-/;

# Send the MIME header *now* if we're in debug mode; this means it will always
# be text/html, but means we can see debugging information from parse_config
#
send_header() if $debug;

# Read in the 'master config' file.
#
&parse_config ("$DIR/lib/search.cf");
&parse_defaults;
$CFG{'rcsid'} = $rcsid;

# Parse a broker-specific config file (included as the following HTML - )
# <INPUT TYPE="hidden" NAME="brokerqueryconfig" VALUE="foo.cf">
#
# The file name comes from the request, so refuse anything that could climb
# out of $DIR/lib (a ".." path component) or that contains a NUL byte.  An
# empty name makes the -f test below fail, exactly as when no name was given.
$bq_config = &option('brokerqueryconfig');
$bq_config = "" if ($bq_config =~ m{(?:^|/)\.\.(?:/|$)} || $bq_config =~ /\0/);
print "opening $DIR/lib/", $bq_config, "\n" if ($debug);
&parse_config ("$DIR/lib/" . $bq_config) if ( -f "$DIR/lib/" . $bq_config );
&parse_defaults;

# determine character set for the HTTP-header
#
# get the charset from search form or .cf-file
# The value ends up verbatim in the Content-Type header, so anything that is
# not a plain charset token falls back to the default as well.
$charset = &option('charset');
if ($charset eq "" || $charset !~ /^[\w.:+-]+\z/) {
    $charset = "ISO-8859-1";    # default
}

# We can't send the MIME header *until* we've read the broker-specific file
# (as they might change it in there) - so we have to wait until now to send
# it.
#
send_header() unless $debug;

# Now - once we're sure that a MIME header has been sent, we can validate
# their input
#
&fatal ('NoQuery') unless (%RQ);            # empty request
&fatal ('rcsid') if ($RQ{'version'} ne "");
&dump_array (%RQ) if ($debug);
&dump_array (%DEF) if ($debug);
#&dump_array (%ENV) if ($debug);

# EXTRACT QUERY OPTIONS
#
$lifetime = &option('lifetime');
$userquery = &option('query');
$category = &option('category');
$userclass = &option('class');
$caseflag = &option('caseflag') eq 'on' ? 1 : 0;        # case sensitivity
$wordflag = &option('wordflag') eq 'on' ? 1 : 0;        # match on word boundary
$csumflag = &option('csumflag') eq 'on' ? 1 : 0;        # show links to indexing data
$opaqueflag = &option('opaqueflag') eq 'on' ? 1 : 0;    # return opaque data (matched lines)
$descflag = &option('descflag') eq 'on' ? 1 : 0;        # return object description
$noregexflag = &option('noregexflag') eq 'on' ? 1 : 0;  #
$maxresult = &option('maxresultflag');  # max. num of result lines to be returned by broker
$maxfiles = &option('maxobjflag');      # max. num of objects to be returned by broker
$maxlines = &option('maxlineflag');     # max. num of matched lines per object
$perpage = &option('perpageflag');      # show n objects per page (0 = don't split output)
$page = &option('pageflag');            # show page nr x
$errors = &option('errorflag');         # number of errors allowed
$weightflag = &option('weightflag') eq 'on' ? 1 : 0;    # show weight balls
$broker = &option('broker') || &option('host');         # name of broker
#$version = &option('version') eq "" ? 0 : 1; # ?
$hp_url = &option('hp_url');            # URL of search page
$sort = &option('sort');                # sort options
@atts = split (/\s+/, &option('attribute'));    # list of attributes to be displayed
# ($a is reserved for sort blocks, so use a dedicated loop variable.)
foreach $att (@atts) {
    $attributes .= " #attribute \"$att\"";
}

# Build filter strings. Filters are e.g. used to specify a host.
# They are attached to the query but not displayed as query string
# Filters consist of a query string and a user hint to be displayed
# on the search results page, separated by a pipe symbol. Example:
# <INPUT NAME="filter" value="type: html|<H3>Search only HTML documents</h3>" type="hidden">
# See query-glimpse.html for more examples - by h.weinreich
@filter = split (/\0/, &option('filter'));
$filter = "";       # The filter is attached to $query
$filterhint = "";   # Filterhint is the text shown in the output to the user...
foreach (@filter) {
    @t = split(/\|/);
    $filter .= " AND $t[0]" if $t[0];
    $filterhint .= "$t[1]\n" if $t[1];
}

# SECURITY CHECKS AND TRANSLATION ON BROKER HOST,PORT
#
# NOTE(review): $broker is request data and is interpolated into this HTML
# unescaped - consider passing it through html_escape; confirm against fatal().
$errmsg = <<"EOF";
<TITLE>$broker not found</TITLE>
<PRE>
$MYNAME doesn't know the broker <B>$broker</B>
Either it is not in the allowed list, or perhaps
$brokers is not readable.
</PRE>
EOF
&fatal($errmsg) unless (@hostport = &get_host_port ($broker));

# HACKS FOR BROKEN LYNX BROWSER
#
$errors = 0 if ($errors eq 'None');
$errors = 1 if ($errors eq '1 Error');
$errors = 2 if ($errors eq '2 Errors');

# SET THE LIFETIME
#
$BQlife = $CFG{'Timeout'};
$BQlife = $lifetime + 300 if ($lifetime ne "");
alarm ($BQlife);

### # SANITY CHECKS
### #
### &fatal ('NoReplica') if ($host eq "No Replicas");
### &fatal ('Misconfig') if ($host eq "" || $port == 0);

# CHECK QUERY STRING FOR COMMON MISTAKES
#
$userquery =~ s/^\s+//;                 # remove leading whitespace
$userquery =~ s/\s+$//;                 # remove trailing whitespace
$userquery = &entities($userquery);     # translate SGML entities
# Only rewrite "simple" queries: anything that already uses and/or, a
# "field: value" form or double quotes is passed through untouched.
unless ( $userquery =~ /\s+and\s+/i ||
         $userquery =~ /\s+or\s+/i ||
         $userquery =~ /:\s+/ ||
         $userquery =~ /\"/) {
    if ($category ne "anytext") {
        @X = split (/\s+/, $userquery);
        for ($i=0; $i<=$#X; $i++) {
            # Support for + and - in front of search words - h.weinreich
            $X[$i] =~ s/^\+(.*)/$1/;        # remove leading +
            $X[$i] =~ s/^\-(.*)/NOT $1/;    # replace leading - by NOT
            $X[$i] =~ s/,//g;               # remove all commata: not supported by broker
            # put quotes around unknown characters
            $X[$i] = "\"$X[$i]\"" if ($X[$i] =~ /[^ \w\d-]/);
            $X[$i] = "$category:$X[$i]" if (($category ne "any") && ($category ne ""));
        }
        $userquery = join (' AND ', @X);
    } else {
        # "anytext": look for every word in each of the listed attributes
        @X = split (/\s+/, $userquery);
        for ($i=0; $i<=$#X; $i++) {
            $X[$i] = "\"$X[$i]\"" if ($X[$i] =~ /\W/);
            $X[$i] = "(keywords:$X[$i] OR title:$X[$i] OR body:$X[$i] OR headings:$X[$i] OR address:$X[$i])"
        }
        $userquery = join (' AND ', @X);
    }
}

# BUILD QUERY STRING
#
$query = "";
$query .= $userclass . " AND " if ($userclass ne "");
$query .= $userquery;

# BUILD BROKER QUERY
#
$bquery = "#USER";
$bquery .= " #opaque" if ($opaqueflag);
$bquery .= " #desc" if ($descflag);
$bquery .= " #index timeout $lifetime" if ($lifetime ne "");
$bquery .= " #index error $errors" if ($errors ne "");
$bquery .= " #index maxresult $maxresult" if ($maxresult ne "");
$bquery .= " #index maxfiles $maxfiles" if ($maxfiles ne "");
$bquery .= " #index maxlines $maxlines" if ($maxlines ne "");
$bquery .= " #index case";
$bquery .= $caseflag ? " insensitive" : " sensitive";
$bquery .= " #index matchword" if ($wordflag);
$bquery .= " #index noregex" if ($noregexflag);
$bquery .= $attributes;
$bquery .= " #END ";
$bquery .= $query;
$bquery .= $filter;
$simple_query = $1 if ($query =~ /^.*partial-text\s*:\s+"(.*)".*$/io);

# Call Init Function
#
eval $CFG{'InitFunction'} if (defined ($CFG{'InitFunction'}));

# DO THE QUERY
#
$html_query = &html_escape ($query);
print &expand ($CFG{'ResultHeader'});
$connected = 0;

# Split query into words for highlighting...
$searchwords = &option('query');
$searchwords =~ s/-|\.|\,|;/ /g;
$searchwords =~ s/ AND | OR | NOT |\(|\)|:|\"|\+/ /g;
@searchwords = split(/ +/,$searchwords);

&cleanup();     # remove outdated files...
&do_query ($bquery, @hostport);
exit 0;         # END OF PROGRAM

# ===== SUBROUTINES ==========================================================

# Send the appropriate HTTP headers
sub send_header {
    my $content = (defined $CFG{'ContentType'}) ? &expand($CFG{'ContentType'}) : "text/html";
    $content .= "; charset=".$charset if ($charset ne "");
    $|=1 if ($unbuffered); # use unbuffered output.
# (send_header, continued) If the script name contains "nph-", more headers have to be sent.
    if ($nph) {
#       $|=1;
        print "HTTP/1.0 200 OK\n";
        print "Server: $ENV{SERVER_SOFTWARE}\n";
    }
    print "Content-Type: $content\n\n";
}

# get_host_port (historically documented as "broker_host_port"):
#
# If given a name, return the corresponding (host,port) pair.
# If given a host:port string, make sure it is a valid broker.
#
# Argument: $broker - either a broker name or a "host:port" string.
# Returns:  a flat list (host, port, host, port, ...) with one pair for every
#           matching line of the file named by the global $brokers; the empty
#           list if nothing matched, or if a name was given and $brokers is
#           not readable.  A host:port string is returned as is when $brokers
#           is not readable.
# Calls &fatal if $brokers is readable but cannot be opened.
sub get_host_port {
    local ($broker) = @_;       # dynamically scoped, as before
    my ($host, $port, $pattern);
    my @hostport = ();

    if ($broker =~ /([^:]+):(\d+)/) {           # given host:port
        $host = $1;
        $port = $2;
        return ($host,$port) unless ( -r $brokers );
        # Match the host literally (a "." must not act as a wildcard) and
        # require the port to be a complete field, so 85 does not match 8501.
        $pattern = '^\S+\s+' . quotemeta($host) . '\s+' . $port . '(?:\s|$)';
    } else {                                    # given a name
        return () unless ( -r $brokers );       # cant translate name
        $pattern = '^' . quotemeta($broker) . '\s+\S+\s+\d+';
    }
    # Compiled per call; the former /o flag would have frozen the pattern of
    # the first call for the lifetime of the process.
    my $pattern_re = qr/$pattern/i;

    open (my $brokers_fh, '<', $brokers) || &fatal ("$brokers: $!\n");
    while (my $line = <$brokers_fh>) {
        chomp $line;            # chomp, not chop: the last line may lack a newline
        $line =~ s/#.*//;       # strip comments
        $line =~ s/^\s+//;      # leading whitespace
        $line =~ s/\s+$//;      # trailing whitespace
        next unless ($line =~ $pattern_re);
        my @fields = split (' ', $line);        # name, host, port
        push (@hostport, $fields[1]);           # add host to array
        push (@hostport, $fields[2]);           # add port to array
    }
    close $brokers_fh;
    return (@hostport);         # empty if no line matched
}

#### cleanup: remove old temporary files - h.weinreich
#
# Changes the working directory to $tmp_dir/$broker and deletes every file
# there matching [a-z]*.search that is older than $expire hours.  Does
# nothing if the directory cannot be entered.
sub cleanup {
    # -M gives the file age in days and $expire is in hours
    # (if $expire=12, then 12/24 sets a 12 hour expiration cycle)
    if (chdir "$tmp_dir/$broker") {
        foreach my $file (<[a-z]*\.search>) {
            unlink $file if -M $file > $expire/24;
        }
    }
}

#### do_query: send query to broker and write it to a temporary file
#### or read from temporary file if it already exists.
sub do_query {
    local ($bquery, @hostport) = @_;
    local ($nobjects) = 0;
    local ($nopaquelines) = 0;
    # The temporary files search results are used as buffers to reduce requests
    # to the broker, e.g. necessary for splitted result pages - h.weinreich
    local ($filename) = &get_hashcode($bquery); # create filename for the buffer file
    $filename = "$tmp_dir/$broker/$filename.search";
    local $tempfile_exists = 0;
    if (-e $filename) { # does temp file exist?
open($INPUT, "<$filename") || &fatal ("Could not open file for read: $filename\n"); my $oldquery = (<$INPUT>); chop $oldquery; if ($oldquery eq $bquery) { # hash code OK $tempfile_exists = 1; # use tempfile as <INPUT> } else { close $INPUT; } } print "$tempfile_exists<BR>" if ($debug); print "$filename<BR>" if ($debug); if (!$tempfile_exists) { # tempfile does not exist: use broker as <INPUT> # read from broker and write to tempfile - h.weinreich while ($#hostport > $[) { $host = shift (@hostport); $port = shift (@hostport); if ($INPUT = &client_socket ($host, $port)) { $connected = 1; last; } } &broker_down ($host, $port) unless ($connected); print "Sending <PRE> $bquery </PRE> to $host:$port<HR>\n" if ($debug); alarm(300); # after 5 minutes just kill it, CERN httpd won't do it # Send query to broker print $INPUT $bquery; # Open temporary file for output open(TEMPFILE, ">$filename") || \ &fatal ("Could not open temporary file for write: $filename\n"); # Write query string first print TEMPFILE "$bquery\n"; } # Read input and pre-format for output. while (<$INPUT>) { # print to tempfile if $INPUT is broker print TEMPFILE; if ($debug) { chop; print "|$_|\n"; } # Ignore status messages next if (/^200 -/o); # Read Broker homepage URL (see file admin/broker.conf) if (/^126 - (.*)$/o) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -