📄 zquery.pl.in

📁 harvest是一个下载html网页的机器人
💻 IN
📖 第 1 页 / 共 2 页
字号:
上一页 12
sub print_html_error {
    # Emit a complete "service not available" page carrying the given
    # reason text, then terminate the script.
    my $reason = shift;

    print_html_header ("Service temporarily not available");
    print "<h1 align=center>Service temporarily not available</h1>\n";
    print "<center>Reason: <b>$reason</b>.</center>\n";
    print_html_footer ();
    exit;
}

#############################################
# print timeout message and exit            #
#############################################
sub print_html_timeout {
    # Installed as the SIGALRM handler by the read_hash_* functions.
    # Only signal a child whose pid was actually recorded: an undefined
    # pid is treated as 0, which would deliver the hangup to our own
    # process group.
    kill (1, $::zoompid) if ($::zoompid);
    print <<"EOF";
<p>
Query timed out.
<p>
Please:
<ul>
<li>Choose a more specific search term.
<li>Choose more search terms.
</ul>
<p>
EOF
    print_html_footer ();
    exit 1;
}

#############################################
# print brief query hint                    #
#############################################
sub print_html_nomatch {
    print <<"EOF";
<p>
No documents were found matching your query.
<p>
Please:
<ul>
<li>Verify that the spelling of the search terms were correct.
<li>Choose fewer search terms.
<li>Choose more general search terms.
</ul>
<p>
EOF
}

##########################################################
# test that everything necessary is in place and running #
# args: none                                             #
# returns: nothing; prints an error page and exits when  #
#          zoomsh or the zebra server is unusable        #
##########################################################
sub test_zebra {
    my ($zoomsh, $answer);

    # can't find zoomsh
    print_html_error ("Can't run $::ZOOMSH") if (!-x "$::ZOOMSH");

    # can't connect to zebrasrv: the first line zoomsh prints, if any,
    # is treated as a failure report
    open ($zoomsh, '-|', "$::ZOOMSH \"connect $::ZURL\" quit")
        or print_html_error ("Can't run $::ZOOMSH");
    $answer = <$zoomsh>;
    close ($zoomsh);
    print_html_error ("Can't connect to zebrasrv at $::ZURL") if ($answer);
}

##################################################
# join a continuation line onto a field value    #
# args:                                          #
#   value read so far (may be undef)             #
#   raw continuation line                        #
#   true if the parts are separated by a blank   #
# returns: the extended value                    #
##################################################
sub _join_continuation {
    my ($value, $cont, $use_blank) = @_;

    if ($value) {
        if ($value =~ /\S=$/) {
            # a trailing "=" marks a line broken in the middle: drop it
            # and glue the next part on without a separator
            chop ($value);
        } elsif ($use_blank) {
            $value .= " ";
        }
    }
    $value = "" if (!defined ($value));
    $cont =~ s/^\s+//;
    # chomp, not chop: the last line of output may lack a newline
    chomp ($cont);
    return $value . $cont;
}

#
# This reads the output from zoomsh and builds a hash containing data received
# from z39.50 server. It uses a simple parser without validity check of the
# returned data, but works well enough to experiment with the search system.
#
# Here are some caveats when parsing the results produced by zoomsh:
#
# - lines are continued at next line with leading whitespaces like this:
#   title: This is a very long title line which does not fit in one line and
#     it will be continued at next line, like this
#
# - long lines can look like this:
#   linkage: http://do.main/path/page.html
#              or
#   linkage:
#     http://do.main/path/page.html
#
# - lines longer than 77 characters are broken into two lines by putting a
#   "=" at position 77 and continued at next line.
#

##################################################
# read search results from zoomsh                #
# args: none (runs the command in $::zoomq)      #
# returns:                                       #
#   number of total hits (undef if not reported) #
#   reference to a list of hashes, one per hit,  #
#   with the keys rank, last-modification-time,  #
#   file-size, title, full-text and url          #
##################################################
sub read_hash_zoomsh {
    my ($num_hit, @hits, $zoomsh, $line);
    my $i = -1;

    # exit after this number of seconds; the handler is installed before
    # the alarm is armed
    $SIG{ALRM} = \&print_html_timeout;
    alarm (60);

    # remember the child so that the timeout handler can stop it
    $::zoompid = open ($zoomsh, '-|', $::zoomq);
    if (!$::zoompid) {
        alarm (0);
        return $num_hit, \@hits;
    }

    # parse number of hits
    $line = <$zoomsh>;
    ($num_hit) = $line =~ /^.*: (\d+) hits$/ if (defined ($line));

    # build a list of hashes with hits
    $line = <$zoomsh>;
    while (defined ($line)) {
        # start of the next record
        if ($line =~ /^xsoif:$/) {
            $i++;
            $line = <$zoomsh>;
            next;
        }

        # nothing can be stored before the first record has started
        if ($i < 0) {
            $line = <$zoomsh>;
            next;
        }

        # one liners
        if ($line =~ /^\s+(rank|last-modification-time|file-size): (\d+)$/) {
            $hits[$i]{$1} = $2;
            $line = <$zoomsh>;
            next;
        }

        # multi liners
        if ($line =~ /^\s+(title|full-text|url):/) {
            my $field = $1;
            my ($value) = $line =~ /^\s+\Q$field\E: (.+)$/;
            # only the sample text is limited to $::n_lines continuation
            # lines; -1 means no limit
            my $limit = ($field eq 'full-text') ? ($::n_lines || 0) : -1;
            my $seen = 0;

            $line = <$zoomsh>;
            while (defined ($line) && ($limit < 0 || $seen < $limit)
                   && $line =~ /^\s{4}.+$/) {
                # urls are glued together, text is joined with a blank
                $value = _join_continuation ($value, $line, $field ne 'url');
                $line = <$zoomsh>;
                $seen++;
            }
            $value = remove_quirks ($value) if ($field ne 'url');
            $hits[$i]{$field} = $value;
            # $line already holds the next unparsed line
            next;
        }

        $line = <$zoomsh>;
    }
    close ($zoomsh);
    alarm (0);

    #######################################################################
    # Postprocessing                                                      #
    # we have to filter this after a full build of the hash because       #
    # we don't know when both, title and sample fields are initialized.   #
    #######################################################################
    # title is sometimes repeated at the beginning of sampletext and some #
    # documents doesn't have any title at all.                            #
    # to do search and replace, meta characters must be escaped           #
    #######################################################################
    foreach my $n (0 .. $#hits) {
        my $hit = ($hits[$n] ||= {});

        if ($hit->{'title'}) {
            my $title = quotemeta ($hit->{'title'});
            $hit->{'full-text'} =~ s/^$title //
                if (defined ($hit->{'full-text'}));
        } else {
            $hit->{'title'} = $hit->{'url'};
        }
        if (defined ($hit->{'title'}) && length ($hit->{'title'}) > 60) {
            $hit->{'title'} = substr ($hit->{'title'}, 0, 60) . "...";
        }
    }

    return $num_hit, \@hits;
}

##################################################
# read search results from zoomsh (XML records)  #
# args: none (runs the command in $::zoomq)      #
# returns:                                       #
#   number of total hits (undef if not reported) #
#   reference to a list of hashes, one per hit,  #
#   with the keys url, title, full-text,         #
#   file-size, last-modification-time, filename  #
#   and score                                    #
##################################################
sub read_hash_zoomsh_xml {
    my ($num_hit, @hits, $zoomsh, $line);
    my %wanted = map { $_ => 1 } ("url", "title", "full-text", "file-size",
                                  "last-modification-time");
    my $i = -1;
    my $in_idzebra = 0;    # inside <idzebra>: elements sit on one line
    my $curr_elem = "";    # element whose text lines are being collected
    my $skip = 0;          # current field is long enough, drop the rest

    # exit after this number of seconds; the handler is installed before
    # the alarm is armed
    $SIG{ALRM} = \&print_html_timeout;
    alarm (60);

    # remember the child so that the timeout handler can stop it
    $::zoompid = open ($zoomsh, '-|', $::zoomq);
    if (!$::zoompid) {
        alarm (0);
        return $num_hit, \@hits;
    }

    # parse number of hits
    $line = <$zoomsh>;
    ($num_hit) = $line =~ /^.*: (\d+) hits$/ if (defined ($line));

    # the second line of output is read and thrown away
    # NOTE(review): presumably a header line of the record dump - confirm
    $line = <$zoomsh>;

    while (defined ($line = <$zoomsh>)) {
        my ($elem) = $line =~ /<(.*?)>/;

        if ($elem) {
            if ($elem eq "xsoif") {
                $i++;
                next;
            }
            if ($elem eq "/idzebra") {
                $in_idzebra = 0;
                next;
            }
            if ($elem =~ /^idzebra/) {
                $in_idzebra = 1;
                next;
            }
            if (!$in_idzebra) {
                # an opening tag selects the field, a closing tag ends it
                $curr_elem = ($elem =~ /^\//) ? "" : $elem;
            } elsif ($i >= 0) {
                ($hits[$i]{'filename'}) =
                    $line =~ /<filename>(\S+)<\/filename>/
                    if ($elem eq "filename");
                ($hits[$i]{'score'}) = $line =~ /<score>(\d+)<\/score>/
                    if ($elem eq "score");
            }
            next;
        }

        # plain text line: keep it only inside a wanted element of a record
        next if ($i < 0 || !$wanted{$curr_elem});

        # chomp, not chop: the last line of output may lack a newline
        chomp ($line);
        if (!$hits[$i]{$curr_elem}) {
            $skip = 0;
            $hits[$i]{$curr_elem} = $line;
        } elsif (!$skip) {
            # stop collecting once the field exceeds what will be shown
            $skip = 1
                if (length ($hits[$i]{$curr_elem}) > ($::n_lines + 1) * 80);
            $hits[$i]{$curr_elem} .= " $line";
        }
    }
    close ($zoomsh);
    alarm (0);

    #######################################################################
    # Postprocessing                                                      #
    # we have to filter this after a full build of the hash because       #
    # we don't know when both, title and sample fields are initialized.   #
    #######################################################################
    # title is sometimes repeated at the beginning of sampletext and some #
    # documents doesn't have any title at all.                            #
    # to do search and replace, meta characters must be escaped           #
    #######################################################################
    foreach my $n (0 .. $#hits) {
        my $hit = ($hits[$n] ||= {});

        if ($hit->{'title'}) {
            my $title = $hit->{'title'};
            # \Q...\E: the title is literal text, not a pattern
            $hit->{'full-text'} =~ s/^\Q$title\E\s*//
                if (defined ($hit->{'full-text'}));
        } else {
            $hit->{'title'} = $hit->{'url'};
        }
        $hit->{'title'} = trim_line ($hit->{'title'}, 80);
        $hit->{'full-text'} = trim_line ($hit->{'full-text'},
                                         $::n_lines * 80);
    }

    return $num_hit, \@hits;
}

#################################################
# build query string from input                 #
# args:                                         #
#   query string                                #
#   boolean operation                           #
#   array of search terms                       #
# returns: query string in Zebra's client       #
#          notation ("" without search terms)   #
#################################################
sub build_query {
    my $query = shift;
    my $op    = shift;
    my @token = @_;
    my ($attr, $z_query);

    return "" if ($#token < 0);

    # request ranking
    $attr = "\@attr 2=102 ";
    #$attr .= "\@attr 1=/xsoif ";
    #$attr .= "\@attr 1=/xsoif/url ";
    #$attr .= "\@attr 1=/xsoif/author ";
    #$attr .= "\@attr 1=/xsoif/title ";
    #$attr .= "\@attr 1=/xsoif/abstract ";
    #$attr .= "\@attr 1=/xsoif/headings ";
    #$attr .= "\@attr 1=/xsoif/keywords ";
    #$attr .= "\@attr 1=/xsoif/description ";
    #$attr .= "\@attr 1=/xsoif/full-text ";

    # make it a regexp search, if any of these special characters are found
    # in query string: ".[]*+?|"
    $attr .= "\@attr 5=102 "
        if (defined ($query) && $query =~ /[.\[\]*+?|]/);

    foreach my $term (@token) {
        # terms containing whitespace are wrapped in backslash-escaped
        # double quotes
        $term = "\\\"$term\\\"" if ($term =~ /\s/);

        if (!$z_query) {
            # the first term starts the query as it is
            $z_query = $attr . $term;
        } elsif ($term =~ s/^-//) {
            # a leading "-" excludes the term from the result so far
            $z_query = "\@not $z_query " . $attr . $term;
        } else {
            $z_query = "\@$op $z_query " . $attr . $term;
        }
    }

    return $z_query;
}

#############################################
# strip html tags still left in summarized  #
# data and map dangerous letters            #
# args: text to strip                       #
# returns: text with "<" and ">" replaced   #
#          by their entities (undef is      #
#          passed through)                  #
#############################################
sub remove_quirks {
    my $line = shift;

    return $line if (!defined ($line));
    $line =~ s/</\&lt;/g;
    $line =~ s/>/\&gt;/g;
    return $line;
}

#################################################
# truncate a line reasonably                    #
# args: string, length                          #
# returns: the string unchanged if it is not    #
#          longer than length (or undef),       #
#          otherwise its first length           #
#          characters cut back to the last      #
#          whitespace, with " ..." appended     #
#          unless the result ends in "."        #
#################################################
sub trim_line {
    my $str = shift;
    my $len = shift;

    return $str if (!defined ($str) || length ($str) <= $len);

    $str = substr ($str, 0, $len);
    # drop the last whitespace and the (possibly cut) word after it
    $str =~ s/(.*)\s.*/$1/g;
    $str .= " ..." if (!($str =~ /\.$/));
    return $str;
}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -