📄 tutorial.pod

📁 外国人写的Perl搜索引擎程序
💻 POD
字号:
=head1 NAME

KinoSearch::Docs::Tutorial - sample indexing and search applications

=head1 DESCRIPTION

The following sample code for invindexer.plx and search.cgi can be used to
create a simple search engine. It requires the html presentation of the US
Constitution included in the distribution for KinoSearch, under
C<t/us_constitution>.

Note that a proper indexer for html documents would not rely on quick-n-dirty
regular expressions for stripping tags, as this one does for the sake of
brevity -- it would use a dedicated parsing module such as
L<HTML::Parser|HTML::Parser>.

=head2 invindexer.plx

    #!/usr/bin/perl
    use strict;
    use warnings;

    use File::Spec;
    use KinoSearch::InvIndexer;
    use KinoSearch::Analysis::PolyAnalyzer;

    ### In order for invindexer.plx to work correctly, you must modify
    ### $source_dir, $path_to_invindex, and possibly $base_url.
    ###
    ### $source_dir must lead to the directory containing the US
    ### Constitution html files.
    ###
    ### $path_to_invindex is the future location of the invindex.
    ###
    ### $base_url should reflect the location of the us_constitution directory
    ### when accessed via a web browser.
    my $source_dir       = '';
    my $path_to_invindex = '';
    my $base_url         = '/us_constitution';

    # Collect the names of the html files.  The pattern is anchored with \z
    # so that only names *ending* in ".html" qualify (not "foo.html.bak").
    opendir( my $source_dh, $source_dir )
        or die "Couldn't opendir '$source_dir': $!";
    my @filenames = grep { /\.html\z/ } readdir $source_dh;
    closedir $source_dh or die "Couldn't closedir '$source_dir': $!";

    ### STEP 1: Choose an Analyzer.
    my $analyzer = KinoSearch::Analysis::PolyAnalyzer->new(
        language => 'en',
    );

    ### STEP 2: Create an InvIndexer object.
    my $invindexer = KinoSearch::InvIndexer->new(
        analyzer => $analyzer,
        invindex => $path_to_invindex,
        create   => 1,
    );

    ### STEP 3: Define fields.
    $invindexer->spec_field( name => 'title' );
    $invindexer->spec_field(
        name       => 'bodytext',
        vectorized => 1,
    );
    $invindexer->spec_field(
        name    => 'url',
        indexed => 0,
    );

    foreach my $filename (@filenames) {
        # index.html is skipped; only the other html files are indexed.
        next if $filename eq 'index.html';

        # Slurp the whole file into $content.
        my $filepath = File::Spec->catfile( $source_dir, $filename );
        open( my $fh, '<', $filepath )
            or die "couldn't open file '$filepath': $!";
        my $content = do { local $/; <$fh> };
        close $fh or die "couldn't close file '$filepath': $!";

        ### STEP 4: Start a new document.
        my $doc = $invindexer->new_doc;

        $content =~ m#<title>(.*?)</title>#s
            or die "couldn't isolate title in '$filepath'";
        my $title = $1;
        $content =~ m#<div id="bodytext">(.*?)</div><!--bodytext-->#s
            or die "couldn't isolate bodytext in '$filepath'";
        my $bodytext = $1;
        $bodytext =~ s/<.*?>/ /gsm;    # quick and dirty tag stripping

        ### STEP 5: Set the value for each field.
        $doc->set_value( url      => "$base_url/$filename" );
        $doc->set_value( title    => $title );
        $doc->set_value( bodytext => $bodytext );

        ### STEP 6: Add the document to the invindex.
        $invindexer->add_doc($doc);

        ### STEP 7: Repeat steps 4-6 for each document in the collection.
    }

    ### STEP 8: Finalize the invindex.
    $invindexer->finish;

=head2 search.cgi

    #!/usr/bin/perl -T
    use strict;
    use warnings;

    use CGI;
    use List::Util qw( max min );
    use POSIX qw( ceil );
    use KinoSearch::Searcher;
    use KinoSearch::Analysis::PolyAnalyzer;
    use KinoSearch::Highlight::Highlighter;

    my $cgi           = CGI->new;
    my $q             = $cgi->param('q');
    my $offset        = $cgi->param('offset');
    my $hits_per_page = 10;
    $q = '' unless defined $q;

    # The offset arrives from the browser, so accept it only if it is a plain
    # non-negative integer (reassigning from the capture also untaints it
    # under -T).  Anything else falls back to the first page.
    $offset = ( defined $offset && $offset =~ /\A(\d+)\z/ ) ? $1 : 0;

    ### In order for search.cgi to work, $path_to_invindex must be modified so
    ### that it points to the invindex created by invindexer.plx, and
    ### $base_url may have to change to reflect where a web-browser should
    ### look for the us_constitution directory.
    my $path_to_invindex = '';
    my $base_url         = '/us_constitution';

    ### STEP 1: Specify the same Analyzer used to create the invindex.
    my $analyzer = KinoSearch::Analysis::PolyAnalyzer->new(
        language => 'en',
    );

    ### STEP 2: Create a Searcher object.
    my $searcher = KinoSearch::Searcher->new(
        invindex => $path_to_invindex,
        analyzer => $analyzer,
    );

    ### STEP 3: Feed a query to the Searcher object.
    my $hits = $searcher->search($q);

    ### STEP 4: Arrange for highlighted excerpts to be created.
    my $highlighter = KinoSearch::Highlight::Highlighter->new(
        excerpt_field => 'bodytext' );
    $hits->create_excerpts( highlighter => $highlighter );

    ### STEP 5: Process the search.
    $hits->seek( $offset, $hits_per_page );

    ### STEP 6: Format the results however you like.

    # create result list
    # NOTE(review): url, title and excerpt are interpolated exactly as they
    # come back from the index -- confirm they are HTML-safe for your data.
    my $report = '';
    while ( my $hit = $hits->fetch_hit_hashref ) {
        my $score = sprintf( "%0.3f", $hit->{score} );
        $report .= qq|
            <p>
                <a href="$hit->{url}"><strong>$hit->{title}</strong></a>
                <em>$score</em>
                <br>
                $hit->{excerpt}
                <br>
                <span class="excerptURL">$hit->{url}</span>
            </p>
            |;
    }

    # From here on $q is only used for display, so make it HTML-safe.
    $q = CGI::escapeHTML($q);

    # display info about the number of hits, paging links
    my $total_hits = $hits->total_hits;
    my $num_hits_info;
    if ( !length $q ) {
        # no query, no display
        $num_hits_info = '';
    }
    elsif ( $total_hits == 0 ) {
        # alert the user that their search failed
        $num_hits_info = qq|<p>No matches for <strong>$q</strong></p>|;
    }
    else {
        # calculate the nums for the first and last hit to display
        my $last_result = min( ( $offset + $hits_per_page ), $total_hits );
        my $first_result = min( ( $offset + 1 ), $last_result );

        # display the result nums, start paging info
        $num_hits_info = qq|
            <p>
                Results <strong>$first_result-$last_result</strong>
                of <strong>$total_hits</strong> for <strong>$q</strong>.
            </p>
            <p>
                Results Page:
            |;

        # calculate first and last hits pages to display / link to
        # ($first_result is 1-based, so subtract 1 before dividing; otherwise
        # a first result that is an exact multiple of $hits_per_page would be
        # assigned to the following page)
        my $current_page
            = int( ( $first_result - 1 ) / $hits_per_page ) + 1;
        my $last_page  = ceil( $total_hits / $hits_per_page );
        my $first_page = max( 1, ( $current_page - 9 ) );
        $last_page = min( $last_page, ( $current_page + 10 ) );

        # create a url for use in paging links
        my $href = $cgi->url( -relative => 1 ) . "?" . $cgi->query_string;
        $href .= ";offset=0" unless $href =~ /offset=/;

        # Matches the current value of the offset parameter, whatever it
        # holds, so the links still work if the request carried a bad offset.
        my $offset_value = qr/(?<=offset=)[^;&]*/;

        # generate the "Prev" link
        if ( $current_page > 1 ) {
            my $new_offset = ( $current_page - 2 ) * $hits_per_page;
            $href =~ s/$offset_value/$new_offset/;
            $num_hits_info .= qq|<a href="$href">&lt;= Prev</a>\n|;
        }

        # generate paging links
        for my $page_num ( $first_page .. $last_page ) {
            if ( $page_num == $current_page ) {
                $num_hits_info .= qq|$page_num \n|;
            }
            else {
                my $new_offset = ( $page_num - 1 ) * $hits_per_page;
                $href =~ s/$offset_value/$new_offset/;
                $num_hits_info .= qq|<a href="$href">$page_num</a>\n|;
            }
        }

        # generate the "Next" link
        if ( $current_page < $last_page ) {
            my $new_offset = $current_page * $hits_per_page;
            $href =~ s/$offset_value/$new_offset/;
            $num_hits_info .= qq|<a href="$href">Next =&gt;</a>\n|;
        }

        # finish paging links
        $num_hits_info .= "</p>\n";
    }

    # blast it all out
    print "Content-type: text/html\n\n";
    print <<END_HTML;
    <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
        "http://www.w3.org/TR/html4/loose.dtd">
    <html>
    <head>
        <meta http-equiv="Content-type"
            content="text/html;charset=ISO-8859-1">
        <link rel="stylesheet" type="text/css" href="$base_url/uscon.css">
        <title>KinoSearch: $q</title>
    </head>

    <body>

        <div id="navigation">
            <form id="usconSearch" action="">
                <strong>
                Search the <a href="$base_url/index.html">US Constitution</a>:
                </strong>
                <input type="text" name="q" id="q" value="$q">
                <input type="submit" value="=&gt;">
                <input type="hidden" name="offset" value="0">
            </form>
        </div><!--navigation-->

        <div id="bodytext">

        $report

        $num_hits_info

        <p style="font-size: smaller; color: #666">
            <em>Powered by
                <a href="http://www.rectangular.com/kinosearch/">
                    KinoSearch
                </a>
            </em>
        </p>

        </div><!--bodytext-->

    </body>

    </html>
    END_HTML

=head1 COPYRIGHT

Copyright 2005-2007 Marvin Humphrey

=head1 LICENSE, DISCLAIMER, BUGS, etc.

See L<KinoSearch|KinoSearch> version 0.163.
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -