📄 search.rediris.in
字号:
#!@PERL@############################################################################### search.cgi - Customizable WWW Interface to the Harvest Broker## Usage: Called as a CGI proccess from httpd################################################################################ Harvest Indexer http://harvest.sourceforge.net/# -----------------------------------------------## The Harvest Indexer is a continued development of code developed by# the Harvest Project. Development is carried out by numerous individuals# in the Internet community, and is not officially connected with the# original Harvest Project or its funding sources.## Please mail lee@arco.de if you are interested in participating# in the development effort.## This program is free software; you can redistribute it and/or modify# it under the terms of the GNU General Public License as published by# the Free Software Foundation; either version 2 of the License, or# (at your option) any later version.## This program is distributed in the hope that it will be useful,# but WITHOUT ANY WARRANTY; without even the implied warranty of# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the# GNU General Public License for more details.## You should have received a copy of the GNU General Public License# along with this program; if not, write to the Free Software# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.#$ENV{'HARVEST_HOME'} = "@prefix@" unless defined($ENV{'HARVEST_HOME'});# Uncomment this if you have problems with European 8-BIT characters on SunOS.#$ENV{'LANG'} = "C";#$ENV{'LC_CTYPE'} = "iso_8859_1";# Set this to a location for search.log, or to /dev/null#$BQLOG = "@prefix@/logs/search.log";$BQLOG = "/dev/null";$rcsid = '$Id: search.in,v 2.20 2002/07/27 16:25:00 sxw Exp $';# $expire# Temporary files are deleted after this many hours. Or, more precisely, when# this program is run, files older than $expire hours are deleted.## $tmp_dir# This is the location where temporary files are kept. This directory needs a# permissive access mode, so that the 'nobody' user (or whatever# user httpd uses) can write and delete files there. It is a fatal error if# this directory does not exist, or cannot be written in.## $tmp_www# This prefaces the URL file reference that leads to the 'Next' and 'Previous'# pages of results, e.g.:# http://www.some.host/tmp/875722836-6913-2.html####### configure these locally! #######$expire = 1; # expiration interval, in hours$tmp_dir = "$ENV{HARVEST_HOME}/tmp"; # dir for temp files$tmp_www = "/Harvest/tmp"; # URL path pointing to $tmp_dir###### end configuration ########--- Image "powered by Harvest"$rHarvestIcon = "http://www.rediris.es/poweredHarvest18.gif";#--- Image to use as weigth icon$rWeightIcon = "http://www.rediris.es/weightHarvest01.gif";#--- URL of the search home page$rSearchPage = "http://www.xxx.com/search.html";#--- URL of displaySOIF script$rDisplaySoifPage = "http://www.rediris.es/harvest/cgi-bin/displaySOIF.rediris";#--- Elements before and after de current page in the navigation bar$rNavPart = 5;# If name is nph-search.cgi, then we behave as an nph- cgi (ie send more# headers, and use unbuffered output).$nph = $0 =~ /nph-/;@X = split ('/', $0);$MYNAME = pop @X;$DIR = join ('/', @X);$ENV{'TMPDIR'} = "/tmp" unless defined($ENV{'TMPDIR'});unshift(@INC, "$ENV{'HARVEST_HOME'}/lib");not_configured() unless (-d $ENV{'HARVEST_HOME'});require 'socket.ph'; # not sys/socket.ph, we use $HARVEST_HOME/lib/socket.ph$debug = 0;$hp_url = '';$brokers = $ENV{'HARVEST_HOME'} . '/brokers/Brokers.cf';$rIndex = 0;$rWeight = 0;$rGreaterWeight = 0;$rResult = "";$rNavigationBar = "";$rTotHtmlFiles = 0;# ===== MAIN =================================================================foreach $sig ('HUP', 'QUIT', 'TSTP', 'TERM', 'ABRT') { $SIG{$sig} = 'sigdie';}foreach $sig ('ALRM') { $SIG{$sig} = 'sigharddie';}# Parse the CGI request.%RQ = &get_request;$debug = 1 if defined $RQ{'debug'};foreach $key (keys %RQ) { $RQ{$key} =~ s/\n/ /g; }# Send the MIME header *now* if we're in debug mode this means it will always# be text/html, but means when can see debugging information from parse_configsend_header() if $debug;# Read in the 'master config' file.&parse_config ("$DIR/lib/search.cf");&parse_defaults;$CFG{'rcsid'} = $rcsid;# Parse a broker-specific config file (included as the following HTML - )# <INPUT TYPE="hidden" NAME="brokerqueryconfig" VALUE="foo.cf">print "opening $DIR/lib/",&option('brokerqueryconfig'),"\n" if ($debug);&parse_config ("$DIR/lib/".&option('brokerqueryconfig')) if ( -f "$DIR/lib/".&option('brokerqueryconfig') );&parse_defaults;# We can't send the MIME header *until* we've read the broker-specific file# (as they might change it in there) - so we have to wait until now to send# it.send_header() unless $debug;# Now - once we're sure that a MIME header has been sent, we can validate# their input&fatal ('NoQuery') if (%RQ == ());&fatal ('rcsid') if ($RQ{'version'} ne "");&dump_array (%RQ) if ($debug);&dump_array (%DEF) if ($debug);#&dump_array (%ENV) if ($debug);# EXTRACT QUERY OPTIONS#$lifetime = &option('lifetime');$userquery = &option('query');$category = &option('category');$userclass = &option('class');$caseflag = &option('caseflag') eq 'on' ? 1 : 0;$wordflag = &option('wordflag') eq 'on' ? 1 : 0;$csumflag = &option('csumflag') eq 'on' ? 1 : 0;$opaqflag = &option('opaqueflag') eq 'on' ? 1 : 0;$descflag = &option('descflag') eq 'on' ? 1 : 0;$noregex = &option('noregexflag') eq 'on' ? 1 : 0;$maxresult = &option('maxresultflag');$maxfiles = &option('maxobjflag');$maxlines = &option('maxlineflag');$perpage = &option('perpageflag');$errors = &option('errorflag');$broker = &option('broker') || &option('host');#$verbflag = &option('verbose') eq "" ? 0 : 1;$version = &option('version') eq "" ? 0 : 1;$hp_url = &option('hp_url');@atts = split (/\s+/, &option('attribute'));$sort = &option('sort');foreach $a (@atts) { $attributes .= " #attribute \"$a\"";}# SECURITY CHECKS AND TRANSLATION ON BROKER HOST,PORT#$errmsg = <<"EOF";<TITLE>$broker not found</TITLE><PRE>$MYNAME doesn't know the broker <B>$broker</B>Either it is not in the allowed list, or perhaps$brokers is not readable.</PRE>EOF&fatal($errmsg) unless (@hostport = &get_host_port ($broker));# HACKS FOR BROKEN LYNX BROWSER#$errors = 0 if ($errors eq 'None');$errors = 1 if ($errors eq '1 Error');$errors = 2 if ($errors eq '2 Errors');# SET THE LIFETIME#$BQlife = $CFG{'Timeout'};$BQlife = $lifetime + 300 if ($lifetime ne "");alarm ($BQlife);### # SANITY CHECKS### #### &fatal ('NoReplica') if ($host eq "No Replicas");### &fatal ('Misconfig') if ($host eq "" || $port == 0);# CHECK QUERY STRING FOR COMMON MISTAKES#$userquery =~ s/^\s+//; # remove leading whitespace$userquery =~ s/\s+$//; # remove trailing whitespace$userquery = &entities($userquery); # translate SGML entitiesunless ( $userquery =~ /\s+and\s+/i || $userquery =~ /\s+or\s+/i || $userquery =~ /:\s+/ || $userquery =~ /\"/) { if ($category ne "anytext") { @X = split (/\s+/, $userquery); for ($i=0; $i<=$#X; $i++) { $X[$i] =~ s/^\+(.*)/$1/; # remove leading + $X[$i] =~ s/^\-(.*)/NOT $1/; # replace leading - by NOT $X[$i] =~ s/\,//; # remove commata: not supported by broker # put quotes around unknown characters $X[$i] = "\"$X[$i]\"" if ($X[$i] =~ /[^ \w\d-]/); $X[$i] = "$category:$X[$i]" if (($category ne "any") && ($category ne "")); } $userquery = join (' AND ', @X); } else { @X = split (/\s+/, $userquery); for ($i=0; $i<=$#X; $i++) { $X[$i] = "\"$X[$i]\"" if ($X[$i] =~ /\W/); $X[$i] = "(keywords:$X[$i] OR title:$X[$i] OR body:$X[$i] OR headings:$X[$i] OR address:$X[$i])" } $userquery = join (' AND ', @X); }}# BUILD QUERY STRING#$query = "";$query .= $userclass . " AND " if ($userclass ne "");#----------------------------------------------------------------------# JMM - 19960619# We use the metatag "DC.Language" to specify the language os the page.# We can search only the pages in a specific language if user selects# it in the form (using the searchLang variable)#---------------------------------------------------------------------,#$query .= $userquery;$rLang = $RQ{'searchLang'};if ($rLang eq "es") { $query .= $userquery . " AND (\"DC.Language\" : es)"; }elsif ($rLang eq "en") { $query .= $userquery . " AND (\"DC.Language\" : en)"; }elsif ($rLang eq "ca") { $query .= $userquery . " AND (\"DC.Language\" : ca)"; }elsif ($rLang eq "gl") { $query .= $userquery . " AND (\"DC.Language\" : gl)"; }else { $query .= $userquery; }#---------------------------------------------------------------------'# BUILD BROKER QUERY#$bquery = "#USER";$bquery .= " #opaque" if ($opaqflag);$bquery .= " #desc" if ($descflag);$bquery .= " #index timeout $lifetime" if ($lifetime ne "");$bquery .= " #index error $errors" if ($errors ne "");$bquery .= " #index maxresult $maxresult" if ($maxresult ne "");$bquery .= " #index maxfiles $maxfiles" if ($maxfiles ne "");$bquery .= " #index maxlines $maxlines" if ($maxlines ne "");$bquery .= " #index case";$bquery .= $caseflag ? " insensitive" : " sensitive";$bquery .= " #index matchword" if ($wordflag);$bquery .= " #index noregex" if ($noregex);$bquery .= $attributes;$bquery .= " #END ";$bquery .= $query;$simple_query = $1 if ($query =~ /^.*partial-text\s*:\s+"(.*)".*$/io);# Call Init Function#eval $CFG{'InitFunction'} if (defined ($CFG{'InitFunction'}));# DO THE QUERY#$html_query = &html_escape ($query);print &expand ($CFG{'ResultHeader'});$connected = 0;while ($#hostport > $[) { $host = shift (@hostport); $port = shift (@hostport); if ($SOCK = &client_socket ($host, $port)) { $connected = 1; last; }}&broker_down ($host, $port) unless ($connected);print "Sending <PRE> $bquery </PRE> to $host:$port<HR>\n" if ($debug);&do_query ($SOCK, $bquery);#print &expand ($CFG{'ResultTrailer'});exit 0; # END OF PROGRAM# ===== SUBROUTINES ==========================================================# Send the appropriate HTTP headerssub send_header { my $content = (defined $CFG{'ContentType'}) ? &expand($CFG{'ContentType'}) : "text/html"; if ($nph) { $|=1; print "HTTP/1.0 200 OK\n"; print "Server: $ENV{SERVER_SOFTWARE}\n"; } print "Content-Type: $content\n\n";}# broker_host_port:## If given a name, return the corresponding (host,port) pair.# If given a host:port string, make sure it is a valid broker.sub get_host_port { local ($broker) = @_; local ($name,$host,$port); local ($pattern); local (@hostport) = (); if ($broker =~ /([^:]+):(\d+)/) { # given host:port $host = $1; $port = $2; return ($host,$port) unless ( -r $brokers ); $pattern = '\S+\s+' . $host . '\s+' . $port; } else { # given a name return () unless ( -r $brokers ); # cant translate name $broker_re = $broker; $broker_re =~ s/\W/\\$&/g; # escape specials $pattern = '^' . $broker_re . '\s+\S+\s+\d+'; } open (brokers) || &fatal ("$brokers: $!\n"); while (<brokers>) { chop; s/#.*//; # strip comments s/^\s+//; # leading whitespace s/\s+$//; # trailing whitespace next unless (/$pattern/io); ($name,$host,$port) = split; # found match push (@hostport, $host); # add host to array push (@hostport, $port); # add port to array } close brokers; return (@hostport); # not found}# do_query:sub do_query { local ($S, $bquery) = @_; local ($nobjects) = 0; local ($nopaquelines) = 0; alarm(1600); # after 30 minutes just kill it, CERN httpd won't do it print $S $bquery;# @T = (<$S>);# close ($S) || &fatal ("socket: $!\n"); $/="\n"; while (<$S>) { if ($debug) { chop; print "|$_|\n"; } next if (/^200 -/o); # Broker homepage URL if (/^126 - (.*)$/o) { $hp_url = $1 if ($hp_url eq ''); next; } if (/^120 -/o || /^103 -/o || /^111 -/o) { $OBJ[++$#OBJ] = $_; last if (/^103 -/o || /^111 -/o); } else { $OBJ[$#OBJ] .= $_; } } close ($S) || &fatal ("socket: $!\n"); $totnumber = (scalar (@OBJ)) - 1;#----------------------------------------------------------------------# JMM - 20010605# We make calculations to obtain the navigator bar#---------------------------------------------------------------------, $rTotHtmlFiles = ceil ($totnumber/$perpage); eval $CFG{'CreateNavBars'}; print &expand ($CFG{'ResultSetBegin'}); @first_page = &expand ($CFG{'ResultSetBegin'}) if $perpage && ! $current_page;#---------------------------------------------------------------------' # @OBJ = sort bynml @OBJ if ($sort eq 'by-NML'); &rank_objects(@OBJ,@RANK) if ($sort eq 'by-rank'); while (($curobj || (undef %SOIF,$curobj = shift(@OBJ))) && (($_, $curobj) = split(/\n/, $curobj, 2))) { # 124 - The description line of the match if (/^124 - (\d+)/o) { $n = $1; next if ($n == 0); $desc = &html_escape (substr($curobj, 0, $n)); $curobj = substr($curobj, $n); # delete the data next; } # 127 - Attribute request if (/^127 - ([^\s]+) (\d+)/o) { $att = $1; $n = $2; next if ($n == 0); $SOIF{$att} = &html_escape (substr($curobj, 0, $n)); $curobj = substr($curobj, $n); # delete the data next; } # 122 - Opaque data if (/^122 - (.*)$/o) { $_ = &html_escape ($1); eval $CFG{'MatchedLineSub'}; $opaque .= $_ . "\n" if (/\S/); $nopaquelines++; next; } # 101 - message to the user if (/^101 - (.*)$/o) { $usermsg = $1; #print &expand ($CFG{'UserMessage'}); next; } # 103 - end of broker results
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -