📄 zquery-gils.pl.in

📁 harvest是一个下载html网页的机器人
💻 IN
📖 第 1 页 / 共 2 页
字号:
上一页 12
Last modified: <b>$dateOfLastModification</b><br>
EOF
        if ($::DEBUG) {
            print "Relevance ranking value: $rank\n";
            print "<br>\n";
            # show raw data for debugging
            # how do I get the filename of a hit?
            # $obj_data = ... ?
            # print "<a href=" . $::DATA_STORE . $obj_data . ">";
            # print "Object data</a>\n";
            # print "<br>\n";
        }
        print "<br>\n";
    }
}

##############################################
# print page-navigation links                #
# args:                                      #
#   querystring                              #
#   number of hits                           #
#   page number (zero-based)                 #
#   number of items per page                 #
# returns: nothing, prints HTML to STDOUT    #
##############################################
sub print_html_page_index {
    my ($query, $num_hit, $page, $per_page) = @_;

    my $url = "$::ME";
    $url .= "?query=$query";
    $url .= "&per_page=$per_page";
    $url .= "&DEBUG=$::DEBUG" if ($::DEBUG);

    # The URL goes into a double-quoted href attribute below, so a literal
    # double quote must not be able to terminate the attribute early.
    $url =~ s/"/&quot;/g;

    # Number of pages, rounded up.  The division has to be truncated to an
    # integer first; otherwise e.g. 25 hits at 10 per page give 3.5 "pages"
    # and a link to a non-existent fourth page is printed.
    my $num_pages = 0;
    if ($per_page && $per_page > 0) {
        $num_pages = int (($num_hit || 0) / $per_page);
        $num_pages++ if (($num_hit || 0) % $per_page);
    }

    # First page shown in the (at most 11 entries wide) window: normally
    # five before the current page, shifted further back when the current
    # page is close to the last one, and never negative.
    my $start = ($page > 5) ? $page - 5 : 0;
    $start -= ($page + 6 - $num_pages) if ($num_pages <= $page + 5);
    $start = 0 if ($start < 0);

    # builds one navigation anchor
    my $link = sub {
        my ($target, $label) = @_;
        return "<a href=\"$url&page=$target\">$label</a>";
    };

    print "<center>\n";
    print "<div class=pager>\n";
    print $link->($page - 1, "Previous") . "&nbsp;\n" if ($page > 0);

    # pages before the current one
    for (my $i = $start; $i < $page; $i++) {
        print $link->($i, $i + 1) . "&nbsp;\n";
    }

    # current page is highlighted and not linked
    print "<font color=red><b>" . ($page + 1) . "</b></font>&nbsp;\n";

    # pages after the current one
    for (my $i = $page + 1; $i < $num_pages && $i < $start + 11; $i++) {
        print $link->($i, $i + 1) . "&nbsp;\n";
    }

    print $link->($page + 1, "Next") . "\n" if ($page < $num_pages - 1);
    print "</div>\n";
    print "</center>\n";
}

##############################################
# print error page and exit                  #
# args: error message                        #
# returns: never, the script exits           #
##############################################
sub print_html_error {
    my $reason = shift;

    print_html_header ("Service temporarily not available");
    print "<h1 align=center>Service temporarily not available</h1>\n";
    print "<center>Reason: <b>$reason</b>.</center>\n";
    print_html_footer ();
    exit;
}

##################################################
# append one wrapped continuation line to a      #
# field value                                    #
# args:                                          #
#   value collected so far (may be undef)        #
#   raw continuation line                        #
#   separator used to join the two parts         #
# returns: the joined value                      #
##################################################
sub _append_wrapped {
    my ($value, $line, $sep) = @_;

    if (defined $value && length $value) {
        if ($value =~ /\S=$/) {
            # a trailing "=" marks a soft line break: drop it and join
            # the two parts without any separator
            chop $value;
        } else {
            $value .= $sep;
        }
    } else {
        $value = "";
    }
    $line =~ s/^\s+//;
    chomp $line;        # chomp, so an unterminated last line stays intact
    return $value . $line;
}

##################################################
# read the continuation lines of a wrapped field #
# args:                                          #
#   filehandle to read from                      #
#   value found on the field's first line        #
#   separator used to join wrapped lines         #
#   max. number of continuation lines to join    #
#     (undef means no limit)                     #
# returns:                                       #
#   the complete field value                     #
#   the first line not consumed (undef at EOF)   #
##################################################
sub _read_wrapped_field {
    my ($in, $value, $sep, $max_lines) = @_;
    my $line = <$in>;
    my $count = 0;

    # continuation lines are indented by at least four whitespaces
    while (defined $line
           && (!defined $max_lines || $count < $max_lines)
           && $line =~ /^\s{4}.+$/) {
        $value = _append_wrapped ($value, $line, $sep);
        $line = <$in>;
        $count++;
    }
    return ($value, $line);
}

##################################################
# read search results from zoomsh                #
# args: none (runs the command in $::zoomsh)     #
# returns:                                       #
#   number of total hits (undef if the first     #
#     output line could not be parsed)           #
#   reference to a list of hashes, one per hit,  #
#     with the keys rank, dateOfLastModification,#
#     bytes, title, sampleText and linkage as    #
#     far as they were found in the output       #
##################################################
sub read_hash_zoomsh {
    my ($num_hit, @hits);

    open (my $in, "$::zoomsh|") || print_html_error ("Can't run zoomsh");

    # parse number of hits
    my $line = <$in>;
    ($num_hit) = $line =~ /^.*: (\d+) hits$/ if (defined $line);

    # build a list of hashes with hits
    my $i = -1;
    $line = <$in>;
    while (defined $line) {
        # a new record starts
        if ($line =~ /^gils:$/) {
        # if ($line =~ /^local-control-number: \d+$/) {
            $i++;
            $line = <$in>;
            next;
        }

        # ignore everything in front of the first record; storing it
        # would address $hits [-1] of an empty list, which is fatal
        if ($i < 0) {
            $line = <$in>;
            next;
        }

        # one liners
        if ($line =~ /^\s+rank: (\d+)$/) {
            $hits [$i] { 'rank' } = $1;
            $line = <$in>;
            next;
        }
        if ($line =~ /^\s+dateOfLastModification: (\d+)$/) {
            $hits [$i] { 'dateOfLastModification' } = $1;
            $line = <$in>;
            next;
        }

        # two liners
        if ($line =~ /^\s+supplementalInformation:$/) {
            # the following line is always consumed, even if it does not
            # carry the size
            $line = <$in>;
            if (defined $line && $line =~ /^\s+bytes: (\d+)$/) {
                $hits [$i] { 'bytes' } = $1;
            }
            $line = <$in>;
            next;
        }

        # multi liners
        if ($line =~ /^\s+title:/) {
            my ($title) = $line =~ /^\s+title: (.+)$/;
            ($title, $line) = _read_wrapped_field ($in, $title, " ");
            $hits [$i] { 'title' } = remove_quirks ($title);
            next;
        }
        if ($line =~ /^\s+sampleText:/) {
            my ($sample) = $line =~ /^\s+sampleText: (.+)$/;
            # join at most $::n_lines continuation lines; the remaining
            # ones match nothing below and are skipped by the main loop
            ($sample, $line) =
                _read_wrapped_field ($in, $sample, " ", $::n_lines || 0);
            $hits [$i] { 'sampleText' } = remove_quirks ($sample);
            next;
        }
        if ($line =~ /^\s+availability:$/) {
            $line = <$in>;
            if (defined $line && $line =~ /^\s+linkage:/) {
                my ($linkage) = $line =~ /^\s+linkage: (.+)$/;
                # wrapped URLs are joined without a separator
                ($linkage, $line) = _read_wrapped_field ($in, $linkage, "");
                $hits [$i] { 'linkage' } = $linkage;
            }
            # a line that is not a linkage is handed back to the main loop
            next;
        }

        # anything else is not of interest
        $line = <$in>;
    }
    close ($in);

    # print all parsed lines for debugging
    # keys used to print search results
    #@::keys = ("rank", "linkage", "dateOfLastModification",
    #           "bytes", "title", "sampleText");
    #foreach $i (0 .. $#hits) {
    #    foreach my $key (@::keys) {
    #        print $i, " ->> ", $key, " ->> ", $hits [$i] { $key }, "\n";
    #    }
    #    print "---------------------\n";
    #}

    #######################################################################
    # Postprocessing                                                      #
    # we have to filter this after a full build of the hash because      #
    # we don't know when both, title and sample fields are initialized.  #
    #######################################################################
    # title is sometimes repeated at the beginning of sampletext and some #
    # documents don't have any title at all.                              #
    # to do search and replace, meta characters must be escaped           #
    #######################################################################
    foreach my $n (0 .. $#hits) {
        # records without any recognized field are still undefined here
        my $hit = ($hits [$n] ||= {});

        if (defined $hit->{ 'title' } && length $hit->{ 'title' }) {
            my $prefix = quotemeta ($hit->{ 'title' });
            $hit->{ 'sampleText' } =~ s/^$prefix //
                if (defined $hit->{ 'sampleText' });
        } else {
            $hit->{ 'title' } = $hit->{ 'linkage' };
        }

        # NOTE: the title is already HTML-escaped at this point, so the
        # cut may fall inside an entity such as "&lt;"
        if (defined $hit->{ 'title' } && length ($hit->{ 'title' }) > 60) {
            $hit->{ 'title' } = substr ($hit->{ 'title' }, 0, 60) . "...";
        }
    }
    return $num_hit, \@hits;
}

##############################################
# build a query string from input            #
# args:                                      #
#   complete query string                    #
#   operator used to combine the terms,      #
#     without the leading "@" (defaults to   #
#     "and" if missing or empty)             #
# returns: query string in Zebra's client    #
#          notation, "" for an empty query   #
##############################################
sub build_query {
    my ($query, $op) = @_;

    $query = "" unless (defined $query);
    $op = "and" unless (defined $op && length $op);

    $query =~ s/\s+/ /g;        # collapse multiple whitespaces to one
    $query =~ s/^\s+//;         # remove leading whitespaces, they would
                                # yield an empty first search term
    $query =~ s/\s+$//;         # remove trailing whitespaces
    $query =~ s/\"/\\\"/g;      # escape double quotes

    # request ranking
    my $attr = "\@attr 2=102 ";

    # make it a regexp search, if any of these special characters are found
    # in query string: ".[]*+?|"
    $attr .= "\@attr 5=102 " if ($query =~ /[.\[\]*+?|]/);

    my $z_query = "";
    my $quoted = 0;
    foreach my $term (split /\s+/, $query) {
        if ($quoted) {
            # inside a quoted phrase: just append the word
            $z_query .= " $term";
        } elsif ($z_query) {
            if ($term =~ s/^\-//) {
                # a leading "-" excludes the term
                $z_query = "\@not $z_query " . $attr . $term;
            } else {
                $z_query = "\@$op $z_query " . $attr . $term;
            }
        } else {
            $z_query = $attr . $term;
        }

        # an escaped quote at the start of a term opens a phrase, one at
        # the end of a term closes it
        $quoted = 1 if ($term =~ /^\\\"/);
        $quoted = 0 if ($term =~ /\S+\\\"$/);
    }
    return $z_query;
}

##############################################
# strip html tags still left in summarized   #
# data and map dangerous letters             #
# args: text to strip (may be undef)         #
# returns: text with "<" and ">" replaced by #
#          "&lt;" and "&gt;"; undef is       #
#          returned unchanged                #
##############################################
sub remove_quirks {
    my $line = shift;

    return $line unless (defined $line);
    $line =~ s/</&lt;/g;
    $line =~ s/>/&gt;/g;
    return $line;
}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -